diff --git a/2017/05/11/Internals.html b/2017/05/11/Internals.html new file mode 100644 index 000000000000..a535db31eec8 --- /dev/null +++ b/2017/05/11/Internals.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2017/06/27/Internals2.html b/2017/06/27/Internals2.html new file mode 100644 index 000000000000..d5b7eb345060 --- /dev/null +++ b/2017/06/27/Internals2.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2018/01/19/a-year-in.html b/2018/01/19/a-year-in.html new file mode 100644 index 000000000000..4980385057d9 --- /dev/null +++ b/2018/01/19/a-year-in.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2018/03/05/tensor-comprehensions.html b/2018/03/05/tensor-comprehensions.html new file mode 100644 index 000000000000..736fa2110f1d --- /dev/null +++ b/2018/03/05/tensor-comprehensions.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2018/04/22/0_4_0-migration-guide.html b/2018/04/22/0_4_0-migration-guide.html new file mode 100644 index 000000000000..635d247ff5ee --- /dev/null +++ b/2018/04/22/0_4_0-migration-guide.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2018/05/02/road-to-1.0.html b/2018/05/02/road-to-1.0.html new file mode 100644 index 000000000000..78e399bc998a --- /dev/null +++ b/2018/05/02/road-to-1.0.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2019/04/29/road-to-1.0.html b/2019/04/29/road-to-1.0.html new file mode 100644 index 000000000000..b19638439684 --- /dev/null +++ b/2019/04/29/road-to-1.0.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2019/05/08/model-serving-in-pyorch.html b/2019/05/08/model-serving-in-pyorch.html new file mode 100644 index 000000000000..a578a77ac542 --- /dev/null +++ b/2019/05/08/model-serving-in-pyorch.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2019/05/23/torchvision03.html b/2019/05/23/torchvision03.html new file mode 100644 index 000000000000..8547da0cdc64 --- /dev/null +++ b/2019/05/23/torchvision03.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2019/06/10/pytorch_hub.html b/2019/06/10/pytorch_hub.html new file mode 100644 index 000000000000..de517a02dd78 --- /dev/null +++ b/2019/06/10/pytorch_hub.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2019/07/23/mapillary-research.html b/2019/07/23/mapillary-research.html new file mode 100644 index 000000000000..90b4512c4c08 --- /dev/null +++ b/2019/07/23/mapillary-research.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/2019/08/06/pytorch_aug2019_releases.html b/2019/08/06/pytorch_aug2019_releases.html new file mode 100644 index 000000000000..29dddfd8c52c --- /dev/null +++ b/2019/08/06/pytorch_aug2019_releases.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

Redirecting…

+ Click here if you are not redirected. + diff --git a/404.html b/404.html new file mode 100644 index 000000000000..1d1727fd5d38 --- /dev/null +++ b/404.html @@ -0,0 +1,685 @@ + + + + + + + + + + + + + Oops! | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ + + + + + + + +
+ +
+
+ +
+
+
+
+ + +
+ + +

Oops!

+ +

You've reached a dead end.

+ +

+ If you feel like something should be here, you can open an issue on GitHub. +

+ +

+ Click here to go back to the main page. +

+
+
+
+
+
+ +
+
+
+
+

Docs

+

Access comprehensive developer documentation for PyTorch

+ View Docs +
+ +
+

Tutorials

+

Get in-depth tutorials for beginners and advanced developers

+ View Tutorials +
+ +
+

Resources

+

Find development resources and get your questions answered

+ View Resources +
+
+
+
+ + + +
+
+
+
+ + +
+
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + diff --git a/CNAME b/CNAME index c101f6da020d..583993f7b85f 100644 --- a/CNAME +++ b/CNAME @@ -1 +1 @@ -pytorch.org \ No newline at end of file +docs.pytorch.org diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000000..b91e23b17c02 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000000..90e93bd32f19 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to hub +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## License +By contributing to hub, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. \ No newline at end of file diff --git a/Gemfile b/Gemfile deleted file mode 100644 index 076585ad2e5c..000000000000 --- a/Gemfile +++ /dev/null @@ -1,4 +0,0 @@ -source 'https://rubygems.org' - -gem 'github-pages', :group => :jekyll_plugins -gem 'breakpoint' \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000000..673979d26033 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2018, Facebook Inc +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile deleted file mode 100644 index e111cceae1f3..000000000000 --- a/Makefile +++ /dev/null @@ -1,7 +0,0 @@ - -serve: - bundle exec jekyll serve --watch --trace - -setup: - gem install bundler - bundle install \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 100644 index fa3e2c4cc9c3..000000000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# PyTorch Website -http://pytorch.org diff --git a/_config.yml b/_config.yml index fe3854d67820..4b319e0d4e19 100644 --- a/_config.yml +++ b/_config.yml @@ -1,28 +1 @@ -url: http://pytorch.org -name: PyTorch -description: 'Scientific Computing...' -latest_version: 1.0 -baseurl: -relative_permalinks: false -timezone: America/Los_Angeles -sass: - sass_dir: _sass # default - style: compressed -safe: true -highlighter: rouge -markdown: kramdown -future: true -include: - - _static - - _modules - - _sources - - docs/_sources -exclude: - - node_modules - - README.md - - CNAME - - Gemfile - - Gemfile.lock - - package.json - -# google_site_verification: eOAFtDphTbbm4OPKva2d3Z0Z_2bBxWMGdkD0IRQ6VeA +include: [_static, _images, _modules, _sources, _asserts.html, _creation.html, _comparison.html, _lowrank.html, _script.html, _diagnostic.html, _dynamo.html, _serialization.html, _type_utils, _tensor_str.html, _trace.html, _utils.html, _internal, _C, _distributed_autograd.html, _distributed_c10d.html, _distributed_rpc.html, _fft.html, _linalg.html, _monitor.html, _nested.html, _nn.html, _profiler.html, _sparse.html, _special.html, __config__.html, _dynamo, _lobpcg.html, _jit_internal.html, _numeric_suite.html, _numeric_suite_fx.html, _sanitizer.html, _symbolic_trace.html, _async.html, _freeze.html, _fuser.html, _type_utils.html, _utils ] diff --git a/_data/apps.yml b/_data/apps.yml deleted file mode 100644 index 07e2e7d55f15..000000000000 --- a/_data/apps.yml +++ /dev/null @@ -1,60 +0,0 @@ -- - name: "Slack" - desc: "A messaging app for teams" - url: "https://slack.com" - repository: "" - image: "/static/img/apps-test-img.png" - rating: 5 - tags: ["foo", "bar", "baz", "bing"] - downloads: "58,783" - featured: false -- - name: "Some Cool App" - desc: "A messaging app for teams with a longer desciription." - url: "https://slack.com" - repository: "" - image: "/static/img/apps-test-img.png" - rating: 4 - tags: ["foo", "bar", "baz", "bing"] - downloads: "11,783" - featured: false -- - name: "Slack" - desc: "A messaging app for teams" - url: "https://slack.com" - repository: "" - image: "/static/img/apps-test-img.png" - rating: 5 - tags: ["foo", "bar", "baz", "bing"] - downloads: "5,355" - featured: false -- - name: "An even longer cool name" - desc: "But a short description" - url: "https://slack.com" - repository: "" - image: "/static/img/apps-test-img.png" - rating: 1 - tags: ["foo", "bar", "baz", "bing", "foo", "bar", "baz", "bing"] - downloads: "8,783" - featured: false -- - name: "Really Neat App" - desc: "To monitor your children when you are not home and should be." 
- url: "https://slack.com" - repository: "" - image: "/static/img/apps-test-img.png" - rating: 5 - tags: ["foo", "bar", "baz", "bing"] - downloads: "878" - featured: false -- - name: "Slack" - desc: "A messaging app for teams" - url: "https://slack.com" - repository: "" - image: "/static/img/apps-test-img.png" - rating: 3 - tags: ["foo", "bar", "baz", "bing"] - downloads: "83" - featured: false \ No newline at end of file diff --git a/_data/wizard.yml b/_data/wizard.yml deleted file mode 100644 index 99af39bd9088..000000000000 --- a/_data/wizard.yml +++ /dev/null @@ -1,113 +0,0 @@ -############ conda section ######################### -- - matcher: 'conda,linux,cuda7.5,python2.7' - cmd: 'conda install pytorch torchvision -c soumith' -- - matcher: 'conda,linux,cuda8.0,python2.7' - cmd: 'conda install pytorch torchvision cuda80 -c soumith' -- - matcher: 'conda,linux,cudanone,python2.7' - cmd: 'conda install pytorch torchvision -c soumith' -- - matcher: 'conda,linux,cuda7.5,python3.5' - cmd: 'conda install pytorch torchvision -c soumith' -- - matcher: 'conda,linux,cuda8.0,python3.5' - cmd: 'conda install pytorch torchvision cuda80 -c soumith' -- - matcher: 'conda,linux,cudanone,python3.5' - cmd: 'conda install pytorch torchvision -c soumith' -- - matcher: 'conda,linux,cuda7.5,python3.6' - cmd: 'conda install pytorch torchvision -c soumith' -- - matcher: 'conda,linux,cuda8.0,python3.6' - cmd: 'conda install pytorch torchvision cuda80 -c soumith' -- - matcher: 'conda,linux,cudanone,python3.6' - cmd: 'conda install pytorch torchvision -c soumith' -- - matcher: 'conda,osx,cuda7.5,python2.7' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'conda,osx,cuda8.0,python2.7' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'conda,osx,cudanone,python2.7' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'conda,osx,cuda7.5,python3.5' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'conda,osx,cuda8.0,python3.5' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'conda,osx,cudanone,python3.5' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'conda,osx,cuda7.5,python3.6' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'conda,osx,cuda8.0,python3.6' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'conda,osx,cudanone,python3.6' - cmd: 'conda install pytorch torchvision -c soumith
# OSX Binaries dont support CUDA, install from source if CUDA is needed' - -############ pip section ######################### -######### OSX ###################### -- - matcher: 'pip,osx,cuda7.5,python2.7' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp27-none-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'pip,osx,cuda8.0,python2.7' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp27-none-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'pip,osx,cudanone,python2.7' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp27-none-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'pip,osx,cuda7.5,python3.5' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp35-cp35m-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'pip,osx,cuda8.0,python3.5' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp35-cp35m-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'pip,osx,cudanone,python3.5' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp35-cp35m-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'pip,osx,cuda7.5,python3.6' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp36-cp36m-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'pip,osx,cuda8.0,python3.6' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp36-cp36m-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -- - matcher: 'pip,osx,cudanone,python3.6' - cmd: 'pip install http://download.pytorch.org/whl/torch-0.1.11.post5-cp36-cp36m-macosx_10_7_x86_64.whl
pip install torchvision
# OSX Binaries dont support CUDA, install from source if CUDA is needed' -######### Linux ###################### -- - matcher: 'pip,linux,cuda7.5,python2.7' - cmd: 'pip install http://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp27-none-linux_x86_64.whl
pip install torchvision' -- - matcher: 'pip,linux,cudanone,python2.7' - cmd: 'pip install http://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp27-none-linux_x86_64.whl
pip install torchvision' -- - matcher: 'pip,linux,cuda8.0,python2.7' - cmd: 'pip install http://download.pytorch.org/whl/cu80/torch-0.1.11.post5-cp27-none-linux_x86_64.whl
pip install torchvision' -- - matcher: 'pip,linux,cuda7.5,python3.5' - cmd: 'pip install http://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp35-cp35m-linux_x86_64.whl
pip install torchvision' -- - matcher: 'pip,linux,cudanone,python3.5' - cmd: 'pip install http://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp35-cp35m-linux_x86_64.whl
pip install torchvision' -- - matcher: 'pip,linux,cuda8.0,python3.5' - cmd: 'pip install http://download.pytorch.org/whl/cu80/torch-0.1.11.post5-cp35-cp35m-linux_x86_64.whl
pip install torchvision' -- - matcher: 'pip,linux,cuda7.5,python3.6' - cmd: 'pip install http://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp36-cp36m-linux_x86_64.whl
pip install torchvision' -- - matcher: 'pip,linux,cudanone,python3.6' - cmd: 'pip install http://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp36-cp36m-linux_x86_64.whl
pip install torchvision' -- - matcher: 'pip,linux,cuda8.0,python3.6' - cmd: 'pip install http://download.pytorch.org/whl/cu80/torch-0.1.11.post5-cp36-cp36m-linux_x86_64.whl
pip install torchvision' diff --git a/_includes/footer.html b/_includes/footer.html deleted file mode 100644 index 2b96a18c857e..000000000000 --- a/_includes/footer.html +++ /dev/null @@ -1,21 +0,0 @@ -
- - {% if page.id != 'home' %} -
- {% endif %} - -
- -

- Maintained by the PyTorch core team.
- ©2017 PyTorch -

-
- - {% include primary-nav.html %} - - {% if page.id != 'home' %} -
- {% endif %} - -
diff --git a/_includes/header.html b/_includes/header.html deleted file mode 100644 index 41f96e30a6e6..000000000000 --- a/_includes/header.html +++ /dev/null @@ -1,16 +0,0 @@ -
- - {% if page.id != 'home' and page.id != 'docs' %} -
- {% endif %} - - - Docs - - {% include primary-nav.html %} - - {% if page.id != 'home' and page.id != 'docs' %} -
- {% endif %} - -
\ No newline at end of file diff --git a/_includes/primary-nav.html b/_includes/primary-nav.html deleted file mode 100644 index 233382440bea..000000000000 --- a/_includes/primary-nav.html +++ /dev/null @@ -1,6 +0,0 @@ - diff --git a/_layouts/about.html b/_layouts/about.html deleted file mode 100644 index 187ac113f5f4..000000000000 --- a/_layouts/about.html +++ /dev/null @@ -1,28 +0,0 @@ - - - - {{ page.title }} - - - - - - - - - - {% include header.html %} - -
-
- {{ content }} -
-
- - {% include footer.html %} - - - \ No newline at end of file diff --git a/_layouts/default.html b/_layouts/default.html deleted file mode 100644 index 0d46c4c251a1..000000000000 --- a/_layouts/default.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - {{ page.title }} - - - - - - - - - - {% include header.html %} - - {{ content }} - - {% include footer.html %} - - - \ No newline at end of file diff --git a/_layouts/tutorial.html b/_layouts/tutorial.html deleted file mode 100644 index 187ac113f5f4..000000000000 --- a/_layouts/tutorial.html +++ /dev/null @@ -1,28 +0,0 @@ - - - - {{ page.title }} - - - - - - - - - - {% include header.html %} - -
-
- {{ content }} -
-
- - {% include footer.html %} - - - \ No newline at end of file diff --git a/_sass/_about.scss b/_sass/_about.scss deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/_sass/_apps.scss b/_sass/_apps.scss deleted file mode 100644 index 0646a8f1644a..000000000000 --- a/_sass/_apps.scss +++ /dev/null @@ -1,127 +0,0 @@ - -#apps { - p { - margin-bottom: 60px; - } -} - -.apps { - overflow: hidden; - @include border-radius(3px); - - .app-row { - background-color: $grey-light; - font-size: 14px; - line-height: 18px; - color: #808080; - border-bottom: 1px solid #d9d9d9; - width: 100%; - float: left; - - &.apps-header { - background-color: $grey; - border-bottom: none; - color: #fff; - font-weight: 200; - - .name-cell { - color: #fff; - font-weight: 200; - } - - .downloads-cell { - color: #fff; - font-weight: 200; - font-size: 14px; - } - - .app-cell { - height: auto; - } - } - - .app-cell { - padding: 16px 16px 16px 0; - width: 15%; - float: left; - height: 100%; - height: 90px; - - &:first-child { - padding-left: 16px; - } - - > div { - @extend %vertical-align; - } - } - - .img-cell { - width: 10%; - - > div { - background-repeat: no-repeat; - background-size: cover; - background-position: center; - background-color: $grey; - height: 58px; - width: 58px; - } - } - - .name-cell { - color: $grey; - font-weight: 600; - padding-left: 5px; - } - - .desc-cell { - width: 19%; - font-weight: 200; - } - - .downloads-cell { - font-size: 16px; - font-weight: 600; - color: $grey-medium; - width: 11%; - } - - .rating-cell { - .star { - float: left; - color: $grey-medium; - font-size: 20px; - - &.active { - color: $orange; - } - } - } - - .tags-cell { - font-weight: 200; - } - - .learn-cell {} - - .btn { - font-weight: 600; - font-size: 14px; - height: 38px; - line-height: 36px; - padding: 0 30px; - border: 2px solid $grey-medium; - color: $grey-medium; - @extend %vertical-align; - @include border-radius(19px); - - &:hover { - background-color: $orange; - border-color: $orange; - color: #fff; - } - } - } -} - diff --git a/_sass/_base.scss b/_sass/_base.scss deleted file mode 100644 index be9f6d9b26b3..000000000000 --- a/_sass/_base.scss +++ /dev/null @@ -1,143 +0,0 @@ -@import "normalize"; -@import "mixins"; - - -$grey: #333333; -$grey-medium: #999999; -$grey-light: #f7f7f7; -$red-orange: #f15532; -$text-color: #000000; -$orange: #f4921f; -$header-height: 75px; - - -html, body, ul, li { - margin: 0; - padding: 0; -} - -body { - background-color: $grey; - font-family: "Open Sans", "Helvetica", sans-serif; - font-size: 12px; - color: $text-color; -} - -h1, h2, h3, h4 { - -webkit-margin-before: 0em; - -webkit-margin-after: 0em; - font-weight: 200; -} - -a:hover { - text-decoration: none; -} - -.wrap {} - -.btn { - cursor: pointer; - @include transition(background-color 200ms, color 200ms, linear); -} - -.content { - background-color: #fff; - padding: 98px 0 123px; - - h1 { - font-size: 45px; - margin-bottom: 30px; - } - - h3 { - margin-top: 25px; - margin-bottom: 15px; - } - - p { - font-size: 18px; - color: $text-color; - font-weight: 200; - margin-bottom: 30px; - } - - img { - max-width: 100%; - } - - code { - font-family: monospace !important; - font-weight: 400; - font-size: 0.9em; - background-color: #eee; - padding: 3px 7px; - border-radius: 3px; - color: #666; - word-break: break-all; - -webkit-hyphens: auto; - -moz-hyphens: auto; - -ms-hyphens: auto; - hyphens: auto; - text-shadow: 0px 1px 0px #fff; - } - - ul { - margin-left: 25px; - margin-bottom: 35px; - font-weight: 200; - font-size: 
16px; - - li { - color: $text-color; - margin-bottom: 15px; - } - } -} - -table { - margin-bottom: 30px; - overflow: auto; - font-size: 1.1em; - - th, td { - padding: 10px; - text-align: left; - vertical-align: top; - line-height: 1.6; - } - - th { - background-color: $grey; - color: #fff; - border-bottom: none; - font-weight: 600; - - &:first-child { - border-top-left-radius: 6px; - } - - &:last-child { - border-top-right-radius: 6px; - } - } - - td { - color: $text-color; - } - - tr, tr:last-child { - border-bottom: 1px solid #e6e6e6; - } - - tr:nth-child(odd)>td { - background-color: #fff; - } - - tr:nth-child(even)>td { - background-color: #fcfcfc; - } - - tr > td:first-child { - font-weight: 400; - } - } diff --git a/_sass/_chrome.scss b/_sass/_chrome.scss deleted file mode 100644 index 54f93dfd6a44..000000000000 --- a/_sass/_chrome.scss +++ /dev/null @@ -1,124 +0,0 @@ - -.logo { - background-image: url(../img/pytorch-logo-light.svg); - background-repeat: no-repeat; -} - -ul.primary-nav { - text-align: center; - float: right; - - li { - list-style-type: none; - float: left; - - a { - padding: 0 30px; - text-decoration: none; - color: $grey-medium; - height: 100%; - width: 100%; - float: left; - @include transition(color 200ms, linear); - - &.active, - &:hover { - color: #fff; - text-decoration: none; - } - } - } -} - -header { - background-color: #262626; - height: $header-height; - width: 100%; - - .logo { - width: 151px; - height: 31px; - float: left; - position: relative; - top: 18px; - - a { - display: block; - height: 100%; - } - } - - a { - font-size: 14px; - font-weight: 400; - } - - ul.primary-nav { - height: 100%; - line-height: 76px; - margin-right: 22px; - } - - .btn { - display: block; - height: 40px; - line-height: 38px; - padding: 0 21px; - border: 2px solid $grey-medium; - color: $grey-medium; - float: right; - position: relative; - top: 17px; - @include transition(border-color 200ms, color 200ms, linear); - @include border-radius(25px); - - &.active, - &:hover { - border-color: #fff; - color: #fff; - } - } -} - -footer { - background-color: $grey; - height: 150px; - padding: 33px 0 31px; - - .left { - color: #808080; - font-size: 14px; - width: 245px; - line-height: 27px; - font-weight: 200; - float: left; - } - - .logo { - width: 135px; - height: 28px; - margin-bottom: 10px; - - a { - display: block; - height: 100%; - } - } - - a { - font-size: 14px; - font-weight: 400; - } - - ul.primary-nav { - height: 50px; - line-height: 50px; - position: relative; - top: 18px; - - li:last-child a { - padding-right: 0; - } - } -} - diff --git a/_sass/_docs.scss b/_sass/_docs.scss deleted file mode 100644 index 3721d4bac062..000000000000 --- a/_sass/_docs.scss +++ /dev/null @@ -1,20 +0,0 @@ - -#docs { - - header { - padding-right: 2%; - padding-left: 2%; - } - - footer { - display: none; - } - - iframe { - position: fixed; - left: 0; - bottom: 0; - width: 100%; - height: calc(100% - #{$header-height}); - } -} diff --git a/_sass/_home.scss b/_sass/_home.scss deleted file mode 100644 index 8f461cc4de65..000000000000 --- a/_sass/_home.scss +++ /dev/null @@ -1,324 +0,0 @@ - -#home { - header, footer { - padding-right: 8%; - padding-left: 8%; - } - - .container { - max-width: 970px !important; - } -} - -.hero { - background-image: url(../img/hero.png); - background-repeat: no-repeat; - background-size: cover; - background-position: center; - background-color: $grey; - height: 500px; - position: relative; - - .container { - height: 100%; - text-align: center; - } - - .inner { - @extend 
%vertical-align; - } - - h1 { - color: #fff; - font-size: 40px; - letter-spacing: -0.01em; - line-height: 1.22em; - margin-bottom: 30px; - } - - h2 { - color: #fff; - font-size: 18px; - line-height: 1.55em; - letter-spacing: 0em; - margin-bottom: 25px; - } - - @media screen and (max-width: 400px) { - h1 { - font-size: 30px; - } - - h2 { - font-size: 15px; - } - - .btn { - font-size: 13px; - } - } - - .btn { - font-weight: 600; - font-size: 14px; - height: 48px; - line-height: 46px; - padding: 0 45px; - border: 2px solid #fff; - margin-top: 10px; - color: #fff; - @include border-radius(24px); - - &:hover { - background-color: #fff; - color: $red-orange; - } - } -} - -.install-wizard { - background-color: #fff; - padding: 108px 0; - - h3 { - font-size: 50px; - margin-bottom: 13px; - } - - h4 { - font-size: 16px; - line-height: 22px; - color: $grey-medium; - } - - .row { - margin-bottom: 30px; - } - - .title { - padding-top: 10px; - } - - .options-sets { - text-align: right; - color: $grey-medium; - font-size: 14px; - - .option-row { - float: left; - width: 100%; - margin-bottom: 15px; - - .option-label { - float: left; - width: 25%; - text-align: right; - padding-right: 15px; - padding-top: 10px; - display: table; - vertical-align: middle; - } - - .option-set { - float: left; - width: 75%; - } - - .btn { - font-weight: 600; - font-size: 13px; - height: 38px; - line-height: 36px; - padding: 0; - border: 2px solid #ccc; - color: $grey-medium; - @include border-radius(19px); - - &.selected, - &:hover { - background-color: $grey; - color: #fff; - border-color: $grey; - } - - &:last-child { - margin-right: 0 !important; - } - } - - &.cuda { - .btn { - width: (91% / 3); - margin-right: 2%; - } - } - - &.os { - .btn { - width: (94% / 2); - margin-right: 2%; - } - } - - &.pm { - .btn { - width: (91% / 3); - margin-right: 2%; - } - } - - &.python { - .btn { - width: (91% / 3); - margin-right: 2%; - } - } - - } - } - - .command { - background-color: $grey; - overflow: hidden; - @include border-radius(7px); - - .label { - font-weight: 600; - font-size: 14px; - float: left; - width: 180px; - padding: 18px 10px 12px; - text-align: center; - } - - .text { - background-color: $grey-light; - font-weight: 200; - font-size: 15px; - float: left; - width: calc(100% - 180px); - padding: 15px 25px;; - } - } -} - -.projects-tutorials { - background-color: $grey-light; - padding: 90px 0 20px; - - h3 { - font-size: 40px; - margin-bottom: 20px; - } - - h4 { - font-size: 17px; - margin-bottom: 45px; - color: $grey-medium; - } - - .box { - text-align: center; - padding: 10px 0; - margin-bottom: 70px; - - img { - width: 60px; - height: 60px; - margin-bottom: 35px; - @include opacity(0.5); - } - - &.projects { - img { - width: 50px; - height: 50px; - } - } - - &:first-child { - border-right: 1px solid #d9d9d9; - } - } - - .btn { - font-weight: 600; - font-size: 14px; - height: 50px; - line-height: 48px; - padding: 0 60px; - border: 2px solid $red-orange; - color: $grey; - @include border-radius(25px); - - &:hover { - background-color: $red-orange; - color: #fff; - } - } -} - -.logos { - background-color: #fff; - text-align: center; - padding: 121px 0 100px; - - h3 { - font-size: 40px; - margin-bottom: 85px; - } - - .logos-wrapper { - width: 960px; - height: 312px; - margin: 0 auto; - - img { - float: left; - } - } - - .row { - height: 150px; - - div { - height: 100%; - } - } - - @media screen and (max-width: 768px) { - .row { - height: 100px; - - img { - max-width: 70%; - } - - img.smaller { - max-width: 40%; - } - - 
img.smallerer { - max-width: 30%; - } - } - } - - img { - @extend %vertical-align; - max-width: 90%; - } - - .smaller { - max-width: 55%; - } - - .smallerer { - max-width: 40%; - } - - .larger { - max-width: 100%; - } -} - diff --git a/_sass/_mixins.scss b/_sass/_mixins.scss deleted file mode 100644 index 0adca7acda51..000000000000 --- a/_sass/_mixins.scss +++ /dev/null @@ -1,57 +0,0 @@ - -%vertical-align { - position: relative; - top: 50%; - -webkit-transform: translateY(-50%); - -ms-transform: translateY(-50%); - transform: translateY(-50%); -} - -%no-select { - -webkit-touch-callout: none; - -webkit-user-select: none; - -khtml-user-select: none; - -moz-user-select: none; - -ms-user-select: none; - user-select: none; -} - -%truncate-text { - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; -} - -@mixin border-radius($radius) { - -webkit-border-radius: $radius; - -moz-border-radius: $radius; - -ms-border-radius: $radius; - border-radius: $radius; -} - -@mixin transition($args...) { - -webkit-transition: $args; - -moz-transition: $args; - -ms-transition: $args; - -o-transition: $args; - transition: $args; -} - -@mixin animation($str) { - -webkit-animation: #{$str}; - -moz-animation: #{$str}; - -ms-animation: #{$str}; - -o-animation: #{$str}; - animation: #{$str}; -} - -@mixin box-shadow($horiz:1px, $vert:1px, $blur:3px, $spread:1px, $color:rgba(0,0,0, 0.2)) { - -webkit-box-shadow: $horiz $vert $blur $spread $color; - -moz-box-shadow: $horiz $vert $blur $spread $color; - box-shadow: $horiz $vert $blur $spread $color; -} - -@mixin opacity($opacity) { - opacity: $opacity; - filter: alpha(opacity=($opacity * 100)); //IE8 -} \ No newline at end of file diff --git a/_sass/_normalize.scss b/_sass/_normalize.scss deleted file mode 100644 index 01ee6971c4d1..000000000000 --- a/_sass/_normalize.scss +++ /dev/null @@ -1,461 +0,0 @@ -/*! normalize.css v5.0.0 | MIT License | github.com/necolas/normalize.css */ - -/** - * 1. Change the default font family in all browsers (opinionated). - * 2. Correct the line height in all browsers. - * 3. Prevent adjustments of font size after orientation changes in - * IE on Windows Phone and in iOS. - */ - -/* Document - ========================================================================== */ - -html { - font-family: sans-serif; /* 1 */ - line-height: 1.15; /* 2 */ - -ms-text-size-adjust: 100%; /* 3 */ - -webkit-text-size-adjust: 100%; /* 3 */ -} - -/* Sections - ========================================================================== */ - -/** - * Remove the margin in all browsers (opinionated). - */ - -body { - margin: 0; -} - -/** - * Add the correct display in IE 9-. - */ - -article, -aside, -footer, -header, -nav, -section { - display: block; -} - -/** - * Correct the font size and margin on `h1` elements within `section` and - * `article` contexts in Chrome, Firefox, and Safari. - */ - -h1 { - font-size: 2em; - margin: 0.67em 0; -} - -/* Grouping content - ========================================================================== */ - -/** - * Add the correct display in IE 9-. - * 1. Add the correct display in IE. - */ - -figcaption, -figure, -main { /* 1 */ - display: block; -} - -/** - * Add the correct margin in IE 8. - */ - -figure { - margin: 1em 40px; -} - -/** - * 1. Add the correct box sizing in Firefox. - * 2. Show the overflow in Edge and IE. - */ - -hr { - box-sizing: content-box; /* 1 */ - height: 0; /* 1 */ - overflow: visible; /* 2 */ -} - -/** - * 1. Correct the inheritance and scaling of font size in all browsers. 
- * 2. Correct the odd `em` font sizing in all browsers. - */ - -pre { - font-family: monospace, monospace; /* 1 */ - font-size: 1em; /* 2 */ -} - -/* Text-level semantics - ========================================================================== */ - -/** - * 1. Remove the gray background on active links in IE 10. - * 2. Remove gaps in links underline in iOS 8+ and Safari 8+. - */ - -a { - background-color: transparent; /* 1 */ - -webkit-text-decoration-skip: objects; /* 2 */ -} - -/** - * Remove the outline on focused links when they are also active or hovered - * in all browsers (opinionated). - */ - -a:active, -a:hover { - outline-width: 0; -} - -/** - * 1. Remove the bottom border in Firefox 39-. - * 2. Add the correct text decoration in Chrome, Edge, IE, Opera, and Safari. - */ - -abbr[title] { - border-bottom: none; /* 1 */ - text-decoration: underline; /* 2 */ - text-decoration: underline dotted; /* 2 */ -} - -/** - * Prevent the duplicate application of `bolder` by the next rule in Safari 6. - */ - -b, -strong { - font-weight: inherit; -} - -/** - * Add the correct font weight in Chrome, Edge, and Safari. - */ - -b, -strong { - font-weight: bolder; -} - -/** - * 1. Correct the inheritance and scaling of font size in all browsers. - * 2. Correct the odd `em` font sizing in all browsers. - */ - -code, -kbd, -samp { - font-family: monospace, monospace; /* 1 */ - font-size: 1em; /* 2 */ -} - -/** - * Add the correct font style in Android 4.3-. - */ - -dfn { - font-style: italic; -} - -/** - * Add the correct background and color in IE 9-. - */ - -mark { - background-color: #ff0; - color: #000; -} - -/** - * Add the correct font size in all browsers. - */ - -small { - font-size: 80%; -} - -/** - * Prevent `sub` and `sup` elements from affecting the line height in - * all browsers. - */ - -sub, -sup { - font-size: 75%; - line-height: 0; - position: relative; - vertical-align: baseline; -} - -sub { - bottom: -0.25em; -} - -sup { - top: -0.5em; -} - -/* Embedded content - ========================================================================== */ - -/** - * Add the correct display in IE 9-. - */ - -audio, -video { - display: inline-block; -} - -/** - * Add the correct display in iOS 4-7. - */ - -audio:not([controls]) { - display: none; - height: 0; -} - -/** - * Remove the border on images inside links in IE 10-. - */ - -img { - border-style: none; -} - -/** - * Hide the overflow in IE. - */ - -svg:not(:root) { - overflow: hidden; -} - -/* Forms - ========================================================================== */ - -/** - * 1. Change the font styles in all browsers (opinionated). - * 2. Remove the margin in Firefox and Safari. - */ - -button, -input, -optgroup, -select, -textarea { - font-family: sans-serif; /* 1 */ - font-size: 100%; /* 1 */ - line-height: 1.15; /* 1 */ - margin: 0; /* 2 */ -} - -/** - * Show the overflow in IE. - * 1. Show the overflow in Edge. - */ - -button, -input { /* 1 */ - overflow: visible; -} - -/** - * Remove the inheritance of text transform in Edge, Firefox, and IE. - * 1. Remove the inheritance of text transform in Firefox. - */ - -button, -select { /* 1 */ - text-transform: none; -} - -/** - * 1. Prevent a WebKit bug where (2) destroys native `audio` and `video` - * controls in Android 4. - * 2. Correct the inability to style clickable types in iOS and Safari. - */ - -button, -html [type="button"], /* 1 */ -[type="reset"], -[type="submit"] { - -webkit-appearance: button; /* 2 */ -} - -/** - * Remove the inner border and padding in Firefox. 
- */ - -button::-moz-focus-inner, -[type="button"]::-moz-focus-inner, -[type="reset"]::-moz-focus-inner, -[type="submit"]::-moz-focus-inner { - border-style: none; - padding: 0; -} - -/** - * Restore the focus styles unset by the previous rule. - */ - -button:-moz-focusring, -[type="button"]:-moz-focusring, -[type="reset"]:-moz-focusring, -[type="submit"]:-moz-focusring { - outline: 1px dotted ButtonText; -} - -/** - * Change the border, margin, and padding in all browsers (opinionated). - */ - -fieldset { - border: 1px solid #c0c0c0; - margin: 0 2px; - padding: 0.35em 0.625em 0.75em; -} - -/** - * 1. Correct the text wrapping in Edge and IE. - * 2. Correct the color inheritance from `fieldset` elements in IE. - * 3. Remove the padding so developers are not caught out when they zero out - * `fieldset` elements in all browsers. - */ - -legend { - box-sizing: border-box; /* 1 */ - color: inherit; /* 2 */ - display: table; /* 1 */ - max-width: 100%; /* 1 */ - padding: 0; /* 3 */ - white-space: normal; /* 1 */ -} - -/** - * 1. Add the correct display in IE 9-. - * 2. Add the correct vertical alignment in Chrome, Firefox, and Opera. - */ - -progress { - display: inline-block; /* 1 */ - vertical-align: baseline; /* 2 */ -} - -/** - * Remove the default vertical scrollbar in IE. - */ - -textarea { - overflow: auto; -} - -/** - * 1. Add the correct box sizing in IE 10-. - * 2. Remove the padding in IE 10-. - */ - -[type="checkbox"], -[type="radio"] { - box-sizing: border-box; /* 1 */ - padding: 0; /* 2 */ -} - -/** - * Correct the cursor style of increment and decrement buttons in Chrome. - */ - -[type="number"]::-webkit-inner-spin-button, -[type="number"]::-webkit-outer-spin-button { - height: auto; -} - -/** - * 1. Correct the odd appearance in Chrome and Safari. - * 2. Correct the outline style in Safari. - */ - -[type="search"] { - -webkit-appearance: textfield; /* 1 */ - outline-offset: -2px; /* 2 */ -} - -/** - * Remove the inner padding and cancel buttons in Chrome and Safari on macOS. - */ - -[type="search"]::-webkit-search-cancel-button, -[type="search"]::-webkit-search-decoration { - -webkit-appearance: none; -} - -/** - * 1. Correct the inability to style clickable types in iOS and Safari. - * 2. Change font properties to `inherit` in Safari. - */ - -::-webkit-file-upload-button { - -webkit-appearance: button; /* 1 */ - font: inherit; /* 2 */ -} - -/* Interactive - ========================================================================== */ - -/* - * Add the correct display in IE 9-. - * 1. Add the correct display in Edge, IE, and Firefox. - */ - -details, /* 1 */ -menu { - display: block; -} - -/* - * Add the correct display in all browsers. - */ - -summary { - display: list-item; -} - -/* Scripting - ========================================================================== */ - -/** - * Add the correct display in IE 9-. - */ - -canvas { - display: inline-block; -} - -/** - * Add the correct display in IE. - */ - -template { - display: none; -} - -/* Hidden - ========================================================================== */ - -/** - * Add the correct display in IE 10-. 
- */ - -[hidden] { - display: none; -} \ No newline at end of file diff --git a/_sass/_responsive.scss b/_sass/_responsive.scss deleted file mode 100644 index 4639207eb57b..000000000000 --- a/_sass/_responsive.scss +++ /dev/null @@ -1,35 +0,0 @@ -$tablet-width: 0px; -$phone-width: 0px; - - -@media (max-width: $tablet-width) { - -} - -@media (max-width: $phone-width) { - -} - -@media (max-width: 905px) { - - .primary-nav { - display: none; - } - - .col-md-5.title { - margin-bottom: 4em; - } -} - -@media (max-width: 991px) { - - #home { - .box.projects { - border-right: none; - border-bottom: 1px solid #d9d9d9; - padding-bottom: 50px; - margin-bottom: 50px; - } - } -} - diff --git a/_sass/_support.scss b/_sass/_support.scss deleted file mode 100644 index 48e3d1a2d0a1..000000000000 --- a/_sass/_support.scss +++ /dev/null @@ -1,48 +0,0 @@ - -#support { - .content { - - } - - .support-group { - margin-bottom: 45px; - - h3 { - font-size: 24px; - font-weight: 700; - margin-bottom: 20px; - } - - p { - color: #808080; - font-size: 18px; - margin-bottom: 15px; - - a { - font-weight: 700; - color: $orange; - - &:hover { - color: $red-orange; - } - } - } - - .btn { - font-size: 14px; - height: 46px; - line-height: 44px; - padding: 0 35px; - border: 2px solid #000; - color: #000; - font-weight: 700; - @include border-radius(23px); - - &:hover { - background-color: #000; - color: #fff; - } - } - } -} - diff --git a/about.md b/about.md deleted file mode 100644 index c9fb9e31dd7c..000000000000 --- a/about.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: PyTorch | About -id: about -permalink: /about/ -layout: about ---- - -PyTorch is a python package that provides two high-level features: - -- Tensor computation (like numpy) with strong GPU acceleration -- Deep Neural Networks built on a tape-based autograd system - -You can reuse your favorite python packages such as numpy, scipy and Cython to extend PyTorch when needed. - -At a granular level, PyTorch is a library that consists of the following components: - -| Package | Description | -| ------------------------ | --- | -| torch | a Tensor library like NumPy, with strong GPU support | -| torch.autograd | a tape based automatic differentiation library that supports all differentiable Tensor operations in torch | -| torch.nn | a neural networks library deeply integrated with autograd designed for maximum flexibility | -| torch.optim | an optimization package to be used with torch.nn with standard optimization methods such as SGD, RMSProp, LBFGS, Adam etc. | -| torch.multiprocessing | python multiprocessing, but with magical memory sharing of torch Tensors across processes. Useful for data loading and hogwild training. | -| torch.utils | DataLoader, Trainer and other utility functions for convenience | -| torch.legacy(.nn/.optim) | legacy code that has been ported over from torch for backward compatibility reasons | - -Usually one uses PyTorch either as: - -- A replacement for numpy to use the power of GPUs. -- a deep learning research platform that provides maximum flexibility and speed - -Elaborating further: - -### A GPU-ready Tensor library - -If you use numpy, then you have used Tensors (a.k.a ndarray). - -![tensor_illustration](/static/img/tensor_illustration.png) - -PyTorch provides Tensors that can live either on the CPU or the GPU, and accelerate -compute by a huge amount. - -We provide a wide variety of tensor routines to accelerate and fit your scientific computation needs -such as slicing, indexing, math operations, linear algebra, reductions. 
-And they are fast! - -### Dynamic Neural Networks: Tape based Autograd - -PyTorch has a unique way of building neural networks: using and replaying a tape recorder. - -Most frameworks such as `TensorFlow`, `Theano`, `Caffe` and `CNTK` have a static view of the world. -One has to build a neural network, and reuse the same structure again and again. -Changing the way the network behaves means that one has to start from scratch. - -With PyTorch, we use a technique called Reverse-mode auto-differentiation, which allows you to -change the way your network behaves arbitrarily with zero lag or overhead. Our inspiration comes -from several research papers on this topic, as well as current and past work such as -[autograd](https://github.com/twitter/torch-autograd), -[autograd](https://github.com/HIPS/autograd), -[Chainer](http://chainer.org), etc. - -While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date. -You get the best of speed and flexibility for your crazy research. - -![dynamic_graph](/static/img/dynamic_graph.gif) - -### Python first - -PyTorch is not a Python binding into a monolothic C++ framework. -It is built to be deeply integrated into Python. -You can use it naturally like you would use numpy / scipy / scikit-learn etc. -You can write your new neural network layers in Python itself, using your favorite libraries -and use packages such as Cython and Numba. -Our goal is to not reinvent the wheel where appropriate. - -### Imperative experiences - -PyTorch is designed to be intuitive, linear in thought and easy to use. -When you execute a line of code, it gets executed. There isn't an asynchronous view of the world. -When you drop into a debugger, or receive error messages and stack traces, understanding them is straight-forward. -The stack-trace points to exactly where your code was defined. -We hope you never spend hours debugging your code because of bad stack traces or asynchronous and opaque execution engines. - -### Fast and Lean - -PyTorch has minimal framework overhead. We integrate acceleration libraries -such as Intel MKL and NVIDIA (CuDNN, NCCL) to maximize speed. -At the core, it's CPU and GPU Tensor and Neural Network backends -(TH, THC, THNN, THCUNN) are written as independent libraries with a C99 API. -They are mature and have been tested for years. - -Hence, PyTorch is quite fast -- whether you run small or large neural networks. - -The memory usage in PyTorch is extremely efficient compared to Torch or some of the alternatives. -We've written custom memory allocators for the GPU to make sure that -your deep learning models are maximally memory efficient. -This enables you to train bigger deep learning models than before. - -### Extensions without pain - -Writing new neural network modules, or interfacing with PyTorch's Tensor API was designed to be straight-forward -and with minimal abstractions. - -You can write new neural network layers in Python using the torch API -[or your favorite numpy based libraries such as SciPy](https://github.com/pytorch/tutorials/blob/master/Creating%20extensions%20using%20numpy%20and%20scipy.ipynb) - -If you want to write your layers in C/C++, we provide an extension API based on -[cffi](http://cffi.readthedocs.io/en/latest/) that is efficient and with minimal boilerplate. -There is no wrapper code that needs to be written. [You can see an example here](https://github.com/pytorch/extension-ffi). 
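The removed about.md above describes PyTorch's NumPy-like tensors with GPU acceleration and its tape-based ("define-by-run") autograd in prose only. A minimal sketch of that behavior, in present-day PyTorch syntax and assuming a recent `torch` install, looks like this:

```python
# Minimal sketch of the two features described above: NumPy-like tensors that
# can live on CPU or GPU, and define-by-run ("tape-based") autograd.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Tensors behave much like NumPy ndarrays, but can be placed on the GPU.
x = torch.randn(3, 3, device=device, requires_grad=True)
w = torch.randn(3, 3, device=device, requires_grad=True)

# Operations are recorded on the tape as they execute, so ordinary Python
# control flow (loops, ifs) shapes the graph dynamically at runtime.
y = (x @ w).relu()
loss = y.sum()

# Replaying the tape backwards populates .grad on every tracked leaf tensor.
loss.backward()
print(x.grad.shape, w.grad.shape)  # torch.Size([3, 3]) torch.Size([3, 3])
```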
- diff --git a/ai-powered-competitive-programming.html b/ai-powered-competitive-programming.html new file mode 100644 index 000000000000..aae968437f6f --- /dev/null +++ b/ai-powered-competitive-programming.html @@ -0,0 +1,641 @@ + + + + + + + + + + + + + AI-Powered Competitive Programming: My HackerCup 2024 Experience | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
+
+
+
+ + +
+ + + + + + + + +
+ +
+
+ + +
+ +
+
+
+

PyTorch Webinars

+
+
+
+ +
+
+
+
+ AI-Powered Competitive Programming +

AI-Powered Competitive Programming: My HackerCup 2024 Experience

+

+ Date: January 24, 2025, 1PM ET +
+ Speaker: Anton Pidkuiko, Software Engineer, Meta +
+ Location: Online +
+
+ In this talk, Anton shared how he built an AI agent that ranked #1 in the finals of Meta HackerCup 2024 (AI division), developing a workflow that could solve the hardest competitive programming problems quickly and reliably. He walked through how he combined state-of-the-art reasoning LLMs, curated retrieval-augmented generation (RAG), and cloud infrastructure to safely test and execute solutions at scale. The approach highlights the massive potential of test-time compute scaling and offers insights into AI's future role in programming. +
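The description above names the general ingredients (candidate generation with a reasoning model, retrieval, and sandboxed execution) without showing how they fit together. The sketch below is a generic, hypothetical illustration of that generate-and-verify pattern, not Anton's actual pipeline; `generate_candidate` is a stand-in for an LLM call and simply returns a canned program so the example runs.

```python
# Hypothetical sketch of the generate-and-verify loop behind "test-time compute
# scaling": sample many candidate programs, run each in an isolated process,
# and keep the first one that reproduces the sample output.
# Illustration only -- not the pipeline described in the talk.
import subprocess
import sys
import tempfile
import textwrap


def generate_candidate(problem_statement: str, attempt: int) -> str:
    """Stand-in for a reasoning-LLM call (optionally RAG-augmented).
    Here it just returns a canned program so the sketch is runnable."""
    return textwrap.dedent("""\
        import sys
        print(sum(int(tok) for tok in sys.stdin.read().split()))
    """)


def run_sandboxed(source: str, stdin_text: str, timeout_s: float = 2.0) -> str:
    """Execute a candidate in a separate process with a timeout.
    A real setup would add much stronger isolation (containers, seccomp, ...)."""
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(source)
        path = f.name
    proc = subprocess.run([sys.executable, path], input=stdin_text,
                          capture_output=True, text=True, timeout=timeout_s)
    return proc.stdout.strip()


def solve(problem: str, sample_in: str, sample_out: str, budget: int = 8):
    """Spend the attempt budget; accept the first candidate that passes."""
    for attempt in range(budget):
        candidate = generate_candidate(problem, attempt)
        try:
            if run_sandboxed(candidate, sample_in) == sample_out.strip():
                return candidate
        except subprocess.TimeoutExpired:
            continue  # too slow -- try another candidate
    return None


if __name__ == "__main__":
    print(solve("Sum the integers on stdin.", "1 2 3\n", "6") is not None)  # True
```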

+ Anton Pidkuiko is a Software Engineer at Meta Reality Labs in London. He is currently working on applying the power of Large Language Models to Metaverse Avatar product experiences. +

+ Watch the recording now and access Anton's presentation slides here. +

+

+
+
+
+
+ +
+
+
+
+

Docs

+

Access comprehensive developer documentation for PyTorch

+ View Docs +
+ +
+

Tutorials

+

Get in-depth tutorials for beginners and advanced developers

+ View Tutorials +
+ +
+

Resources

+

Find development resources and get your questions answered

+ View Resources +
+
+
+
+ +
+ +
+ +
+
+
+
+ + +
+
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/HackerCup-AI-PS.pdf b/assets/HackerCup-AI-PS.pdf new file mode 100644 index 000000000000..2841752ef6c5 Binary files /dev/null and b/assets/HackerCup-AI-PS.pdf differ diff --git a/assets/Happiest_Minds_Technologies_Logo.jpeg b/assets/Happiest_Minds_Technologies_Logo.jpeg new file mode 100644 index 000000000000..ade14b94d646 Binary files /dev/null and b/assets/Happiest_Minds_Technologies_Logo.jpeg differ diff --git "a/assets/Lyft logo \342\200\223 pink \342\200\223 rgb.png" "b/assets/Lyft logo \342\200\223 pink \342\200\223 rgb.png" new file mode 100644 index 000000000000..31b0eab4b6d8 Binary files /dev/null and "b/assets/Lyft logo \342\200\223 pink \342\200\223 rgb.png" differ diff --git a/assets/brand-guidelines/PyTorch-Brand-Guidelines.pdf b/assets/brand-guidelines/PyTorch-Brand-Guidelines.pdf new file mode 100644 index 000000000000..b752dd31cd63 Binary files /dev/null and b/assets/brand-guidelines/PyTorch-Brand-Guidelines.pdf differ diff --git a/assets/cookie-banner.js b/assets/cookie-banner.js new file mode 100644 index 000000000000..d9b7acfab1bc --- /dev/null +++ b/assets/cookie-banner.js @@ -0,0 +1,42 @@ +var cookieBanner = { + init: function() { + cookieBanner.bind(); + + var cookieExists = cookieBanner.cookieExists(); + + if (!cookieExists) { + cookieBanner.setCookie(); + cookieBanner.showCookieNotice(); + } + }, + + bind: function() { + $(".close-button").on("click", cookieBanner.hideCookieNotice); + }, + + cookieExists: function() { + var cookie = localStorage.getItem("returningPytorchUser"); + + if (cookie) { + return true; + } else { + return false; + } + }, + + setCookie: function() { + localStorage.setItem("returningPytorchUser", true); + }, + + showCookieNotice: function() { + $(".cookie-banner-wrapper").addClass("is-visible"); + }, + + hideCookieNotice: function() { + $(".cookie-banner-wrapper").removeClass("is-visible"); + } +}; + +$(function() { + cookieBanner.init(); +}); diff --git a/assets/css/style.css b/assets/css/style.css new file mode 100644 index 000000000000..1f9ba713ded3 --- /dev/null +++ b/assets/css/style.css @@ -0,0 +1 @@ +/*! 
normalize.css v4.1.1 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%}body{margin:0}article,aside,details,figcaption,figure,footer,header,main,menu,nav,section{display:block}summary{display:list-item}audio,canvas,progress,video{display:inline-block}audio:not([controls]){display:none;height:0}progress{vertical-align:baseline}template,[hidden]{display:none !important}a{background-color:transparent}a:active,a:hover{outline-width:0}abbr[title]{border-bottom:none;text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted}b,strong{font-weight:inherit}b,strong{font-weight:bolder}dfn{font-style:italic}h1{font-size:2em;margin:0.67em 0}mark{background-color:#ff0;color:#000}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-0.25em}sup{top:-0.5em}img{border-style:none}svg:not(:root){overflow:hidden}code,kbd,pre,samp{font-family:monospace, monospace;font-size:1em}figure{margin:1em 40px}hr{box-sizing:content-box;height:0;overflow:visible}button,input,select,textarea{font:inherit;margin:0}optgroup{font-weight:bold}button,input{overflow:visible}button,select{text-transform:none}button,html [type="button"],[type="reset"],[type="submit"]{-webkit-appearance:button}button::-moz-focus-inner,[type="button"]::-moz-focus-inner,[type="reset"]::-moz-focus-inner,[type="submit"]::-moz-focus-inner{border-style:none;padding:0}button:-moz-focusring,[type="button"]:-moz-focusring,[type="reset"]:-moz-focusring,[type="submit"]:-moz-focusring{outline:1px dotted ButtonText}fieldset{border:1px solid #c0c0c0;margin:0 2px;padding:0.35em 0.625em 0.75em}legend{box-sizing:border-box;color:inherit;display:table;max-width:100%;padding:0;white-space:normal}textarea{overflow:auto}[type="checkbox"],[type="radio"]{box-sizing:border-box;padding:0}[type="number"]::-webkit-inner-spin-button,[type="number"]::-webkit-outer-spin-button{height:auto}[type="search"]{-webkit-appearance:textfield;outline-offset:-2px}[type="search"]::-webkit-search-cancel-button,[type="search"]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-input-placeholder{color:inherit;opacity:0.54}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}*{box-sizing:border-box}input,select,textarea,button{font-family:inherit;font-size:inherit;line-height:inherit}body{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";font-size:14px;line-height:1.5;color:#24292e;background-color:#fff}a{color:#0366d6;text-decoration:none}a:hover{text-decoration:underline}b,strong{font-weight:600}hr,.rule{height:0;margin:15px 0;overflow:hidden;background:transparent;border:0;border-bottom:1px solid #dfe2e5}hr::before,.rule::before{display:table;content:""}hr::after,.rule::after{display:table;clear:both;content:""}table{border-spacing:0;border-collapse:collapse}td,th{padding:0}button{cursor:pointer;border-radius:0}[hidden][hidden]{display:none !important}details summary{cursor:pointer}details:not([open])>*:not(summary){display:none 
!important}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:0}h1{font-size:32px;font-weight:600}h2{font-size:24px;font-weight:600}h3{font-size:20px;font-weight:600}h4{font-size:16px;font-weight:600}h5{font-size:14px;font-weight:600}h6{font-size:12px;font-weight:600}p{margin-top:0;margin-bottom:10px}small{font-size:90%}blockquote{margin:0}ul,ol{padding-left:0;margin-top:0;margin-bottom:0}ol ol,ul ol{list-style-type:lower-roman}ul ul ol,ul ol ol,ol ul ol,ol ol ol{list-style-type:lower-alpha}dd{margin-left:0}tt,code{font-family:"SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;font-size:12px}pre{margin-top:0;margin-bottom:0;font-family:"SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;font-size:12px}.octicon{vertical-align:text-bottom}.anim-fade-in{-webkit-animation-name:fade-in;animation-name:fade-in;-webkit-animation-duration:1s;animation-duration:1s;-webkit-animation-timing-function:ease-in-out;animation-timing-function:ease-in-out}.anim-fade-in.fast{-webkit-animation-duration:300ms;animation-duration:300ms}@-webkit-keyframes fade-in{0%{opacity:0}100%{opacity:1}}@keyframes fade-in{0%{opacity:0}100%{opacity:1}}.anim-fade-out{-webkit-animation-name:fade-out;animation-name:fade-out;-webkit-animation-duration:1s;animation-duration:1s;-webkit-animation-timing-function:ease-out;animation-timing-function:ease-out}.anim-fade-out.fast{-webkit-animation-duration:0.3s;animation-duration:0.3s}@-webkit-keyframes fade-out{0%{opacity:1}100%{opacity:0}}@keyframes fade-out{0%{opacity:1}100%{opacity:0}}.anim-fade-up{opacity:0;-webkit-animation-name:fade-up;animation-name:fade-up;-webkit-animation-duration:0.3s;animation-duration:0.3s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-out;animation-timing-function:ease-out;-webkit-animation-delay:1s;animation-delay:1s}@-webkit-keyframes fade-up{0%{opacity:0.8;transform:translateY(100%)}100%{opacity:1;transform:translateY(0)}}@keyframes fade-up{0%{opacity:0.8;transform:translateY(100%)}100%{opacity:1;transform:translateY(0)}}.anim-fade-down{-webkit-animation-name:fade-down;animation-name:fade-down;-webkit-animation-duration:0.3s;animation-duration:0.3s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in;animation-timing-function:ease-in}@-webkit-keyframes fade-down{0%{opacity:1;transform:translateY(0)}100%{opacity:0.5;transform:translateY(100%)}}@keyframes fade-down{0%{opacity:1;transform:translateY(0)}100%{opacity:0.5;transform:translateY(100%)}}.anim-grow-x{width:0%;-webkit-animation-name:grow-x;animation-name:grow-x;-webkit-animation-duration:0.3s;animation-duration:0.3s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease;animation-timing-function:ease;-webkit-animation-delay:0.5s;animation-delay:0.5s}@-webkit-keyframes grow-x{to{width:100%}}@keyframes grow-x{to{width:100%}}.anim-shrink-x{-webkit-animation-name:shrink-x;animation-name:shrink-x;-webkit-animation-duration:0.3s;animation-duration:0.3s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in-out;animation-timing-function:ease-in-out;-webkit-animation-delay:0.5s;animation-delay:0.5s}@-webkit-keyframes shrink-x{to{width:0%}}@keyframes shrink-x{to{width:0%}}.anim-scale-in{-webkit-animation-name:scale-in;animation-name:scale-in;-webkit-animation-duration:0.15s;animation-duration:0.15s;-webkit-animation-timing-function:cubic-bezier(0.2, 0, 0.13, 
1.5);animation-timing-function:cubic-bezier(0.2, 0, 0.13, 1.5)}@-webkit-keyframes scale-in{0%{opacity:0;transform:scale(0.5)}100%{opacity:1;transform:scale(1)}}@keyframes scale-in{0%{opacity:0;transform:scale(0.5)}100%{opacity:1;transform:scale(1)}}.anim-pulse{-webkit-animation-name:pulse;animation-name:pulse;-webkit-animation-duration:2s;animation-duration:2s;-webkit-animation-timing-function:linear;animation-timing-function:linear;-webkit-animation-iteration-count:infinite;animation-iteration-count:infinite}@-webkit-keyframes pulse{0%{opacity:0.3}10%{opacity:1}100%{opacity:0.3}}@keyframes pulse{0%{opacity:0.3}10%{opacity:1}100%{opacity:0.3}}.anim-pulse-in{-webkit-animation-name:pulse-in;animation-name:pulse-in;-webkit-animation-duration:0.5s;animation-duration:0.5s}@-webkit-keyframes pulse-in{0%{transform:scale3d(1, 1, 1)}50%{transform:scale3d(1.1, 1.1, 1.1)}100%{transform:scale3d(1, 1, 1)}}@keyframes pulse-in{0%{transform:scale3d(1, 1, 1)}50%{transform:scale3d(1.1, 1.1, 1.1)}100%{transform:scale3d(1, 1, 1)}}.hover-grow{transition:transform 0.3s;-webkit-backface-visibility:hidden;backface-visibility:hidden}.hover-grow:hover{transform:scale(1.025)}.border{border:1px #e1e4e8 solid !important}.border-y{border-top:1px #e1e4e8 solid !important;border-bottom:1px #e1e4e8 solid !important}.border-0{border:0 !important}.border-dashed{border-style:dashed !important}.border-blue{border-color:#0366d6 !important}.border-blue-light{border-color:#c8e1ff !important}.border-green{border-color:#34d058 !important}.border-green-light{border-color:#a2cbac !important}.border-red{border-color:#d73a49 !important}.border-red-light{border-color:#cea0a5 !important}.border-purple{border-color:#6f42c1 !important}.border-yellow{border-color:#d9d0a5 !important}.border-gray-light{border-color:#eaecef !important}.border-gray-dark{border-color:#d1d5da !important}.border-black-fade{border-color:rgba(27,31,35,0.15) !important}.border-top{border-top:1px #e1e4e8 solid !important}.border-right{border-right:1px #e1e4e8 solid !important}.border-bottom{border-bottom:1px #e1e4e8 solid !important}.border-left{border-left:1px #e1e4e8 solid !important}.border-top-0{border-top:0 !important}.border-right-0{border-right:0 !important}.border-bottom-0{border-bottom:0 !important}.border-left-0{border-left:0 !important}.rounded-0{border-radius:0 !important}.rounded-1{border-radius:3px !important}.rounded-2{border-radius:6px !important}.rounded-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-left-2{border-bottom-left-radius:6px 
!important;border-top-left-radius:6px !important}@media (min-width: 544px){.border-sm-top{border-top:1px #e1e4e8 solid !important}.border-sm-right{border-right:1px #e1e4e8 solid !important}.border-sm-bottom{border-bottom:1px #e1e4e8 solid !important}.border-sm-left{border-left:1px #e1e4e8 solid !important}.border-sm-top-0{border-top:0 !important}.border-sm-right-0{border-right:0 !important}.border-sm-bottom-0{border-bottom:0 !important}.border-sm-left-0{border-left:0 !important}.rounded-sm-0{border-radius:0 !important}.rounded-sm-1{border-radius:3px !important}.rounded-sm-2{border-radius:6px !important}.rounded-sm-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-sm-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-sm-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-sm-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-sm-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-sm-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-sm-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-sm-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-sm-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-sm-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-sm-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-sm-left-2{border-bottom-left-radius:6px !important;border-top-left-radius:6px !important}}@media (min-width: 768px){.border-md-top{border-top:1px #e1e4e8 solid !important}.border-md-right{border-right:1px #e1e4e8 solid !important}.border-md-bottom{border-bottom:1px #e1e4e8 solid !important}.border-md-left{border-left:1px #e1e4e8 solid !important}.border-md-top-0{border-top:0 !important}.border-md-right-0{border-right:0 !important}.border-md-bottom-0{border-bottom:0 !important}.border-md-left-0{border-left:0 !important}.rounded-md-0{border-radius:0 !important}.rounded-md-1{border-radius:3px !important}.rounded-md-2{border-radius:6px !important}.rounded-md-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-md-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-md-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-md-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-md-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-md-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-md-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-md-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-md-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-md-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-md-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-md-left-2{border-bottom-left-radius:6px !important;border-top-left-radius:6px !important}}@media (min-width: 
1012px){.border-lg-top{border-top:1px #e1e4e8 solid !important}.border-lg-right{border-right:1px #e1e4e8 solid !important}.border-lg-bottom{border-bottom:1px #e1e4e8 solid !important}.border-lg-left{border-left:1px #e1e4e8 solid !important}.border-lg-top-0{border-top:0 !important}.border-lg-right-0{border-right:0 !important}.border-lg-bottom-0{border-bottom:0 !important}.border-lg-left-0{border-left:0 !important}.rounded-lg-0{border-radius:0 !important}.rounded-lg-1{border-radius:3px !important}.rounded-lg-2{border-radius:6px !important}.rounded-lg-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-lg-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-lg-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-lg-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-lg-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-lg-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-lg-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-lg-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-lg-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-lg-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-lg-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-lg-left-2{border-bottom-left-radius:6px !important;border-top-left-radius:6px !important}}@media (min-width: 1280px){.border-xl-top{border-top:1px #e1e4e8 solid !important}.border-xl-right{border-right:1px #e1e4e8 solid !important}.border-xl-bottom{border-bottom:1px #e1e4e8 solid !important}.border-xl-left{border-left:1px #e1e4e8 solid !important}.border-xl-top-0{border-top:0 !important}.border-xl-right-0{border-right:0 !important}.border-xl-bottom-0{border-bottom:0 !important}.border-xl-left-0{border-left:0 !important}.rounded-xl-0{border-radius:0 !important}.rounded-xl-1{border-radius:3px !important}.rounded-xl-2{border-radius:6px !important}.rounded-xl-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-xl-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-xl-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-xl-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-xl-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-xl-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-xl-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-xl-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-xl-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-xl-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-xl-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-xl-left-2{border-bottom-left-radius:6px !important;border-top-left-radius:6px !important}}.circle{border-radius:50% !important}.box-shadow{box-shadow:0 1px 1px rgba(27,31,35,0.1) 
!important}.box-shadow-medium{box-shadow:0 1px 5px rgba(27,31,35,0.15) !important}.box-shadow-large{box-shadow:0 1px 15px rgba(27,31,35,0.15) !important}.box-shadow-extra-large{box-shadow:0 10px 50px rgba(27,31,35,0.07) !important}.box-shadow-none{box-shadow:none !important}.bg-white{background-color:#fff !important}.bg-blue{background-color:#0366d6 !important}.bg-blue-light{background-color:#f1f8ff !important}.bg-gray-dark{background-color:#24292e !important}.bg-gray{background-color:#f6f8fa !important}.bg-gray-light{background-color:#fafbfc !important}.bg-green{background-color:#28a745 !important}.bg-green-light{background-color:#dcffe4 !important}.bg-red{background-color:#d73a49 !important}.bg-red-light{background-color:#ffdce0 !important}.bg-yellow{background-color:#ffd33d !important}.bg-yellow-light{background-color:#fff5b1 !important}.bg-purple{background-color:#6f42c1 !important}.bg-purple-light{background-color:#f5f0ff !important}.bg-shade-gradient{background-image:linear-gradient(180deg, rgba(27,31,35,0.065), rgba(27,31,35,0)) !important;background-repeat:no-repeat !important;background-size:100% 200px !important}.text-blue{color:#0366d6 !important}.text-red{color:#cb2431 !important}.text-gray-light{color:#6a737d !important}.text-gray{color:#586069 !important}.text-gray-dark{color:#24292e !important}.text-green{color:#28a745 !important}.text-orange{color:#a04100 !important}.text-orange-light{color:#e36209 !important}.text-purple{color:#6f42c1 !important}.text-white{color:#fff !important}.text-inherit{color:inherit !important}.text-pending{color:#b08800 !important}.bg-pending{color:#dbab09 !important}.link-gray{color:#586069 !important}.link-gray:hover{color:#0366d6 !important}.link-gray-dark{color:#24292e !important}.link-gray-dark:hover{color:#0366d6 !important}.link-hover-blue:hover{color:#0366d6 !important}.muted-link{color:#586069 !important}.muted-link:hover{color:#0366d6 !important;text-decoration:none}.details-overlay[open]>summary::before{position:fixed;top:0;right:0;bottom:0;left:0;z-index:80;display:block;cursor:default;content:" ";background:transparent}.details-overlay-dark[open]>summary::before{z-index:99;background:rgba(27,31,35,0.5)}.flex-row{flex-direction:row !important}.flex-row-reverse{flex-direction:row-reverse !important}.flex-column{flex-direction:column !important}.flex-wrap{flex-wrap:wrap !important}.flex-nowrap{flex-wrap:nowrap !important}.flex-justify-start{justify-content:flex-start !important}.flex-justify-end{justify-content:flex-end !important}.flex-justify-center{justify-content:center !important}.flex-justify-between{justify-content:space-between !important}.flex-justify-around{justify-content:space-around !important}.flex-items-start{align-items:flex-start !important}.flex-items-end{align-items:flex-end !important}.flex-items-center{align-items:center !important}.flex-items-baseline{align-items:baseline !important}.flex-items-stretch{align-items:stretch !important}.flex-content-start{align-content:flex-start !important}.flex-content-end{align-content:flex-end !important}.flex-content-center{align-content:center !important}.flex-content-between{align-content:space-between !important}.flex-content-around{align-content:space-around !important}.flex-content-stretch{align-content:stretch !important}.flex-auto{flex:1 1 auto !important}.flex-shrink-0{flex-shrink:0 !important}.flex-self-auto{align-self:auto !important}.flex-self-start{align-self:flex-start !important}.flex-self-end{align-self:flex-end !important}.flex-self-center{align-self:center 
!important}.flex-self-baseline{align-self:baseline !important}.flex-self-stretch{align-self:stretch !important}.flex-item-equal{flex-grow:1;flex-basis:0}@media (min-width: 544px){.flex-sm-row{flex-direction:row !important}.flex-sm-row-reverse{flex-direction:row-reverse !important}.flex-sm-column{flex-direction:column !important}.flex-sm-wrap{flex-wrap:wrap !important}.flex-sm-nowrap{flex-wrap:nowrap !important}.flex-sm-justify-start{justify-content:flex-start !important}.flex-sm-justify-end{justify-content:flex-end !important}.flex-sm-justify-center{justify-content:center !important}.flex-sm-justify-between{justify-content:space-between !important}.flex-sm-justify-around{justify-content:space-around !important}.flex-sm-items-start{align-items:flex-start !important}.flex-sm-items-end{align-items:flex-end !important}.flex-sm-items-center{align-items:center !important}.flex-sm-items-baseline{align-items:baseline !important}.flex-sm-items-stretch{align-items:stretch !important}.flex-sm-content-start{align-content:flex-start !important}.flex-sm-content-end{align-content:flex-end !important}.flex-sm-content-center{align-content:center !important}.flex-sm-content-between{align-content:space-between !important}.flex-sm-content-around{align-content:space-around !important}.flex-sm-content-stretch{align-content:stretch !important}.flex-sm-auto{flex:1 1 auto !important}.flex-sm-shrink-0{flex-shrink:0 !important}.flex-sm-self-auto{align-self:auto !important}.flex-sm-self-start{align-self:flex-start !important}.flex-sm-self-end{align-self:flex-end !important}.flex-sm-self-center{align-self:center !important}.flex-sm-self-baseline{align-self:baseline !important}.flex-sm-self-stretch{align-self:stretch !important}.flex-sm-item-equal{flex-grow:1;flex-basis:0}}@media (min-width: 768px){.flex-md-row{flex-direction:row !important}.flex-md-row-reverse{flex-direction:row-reverse !important}.flex-md-column{flex-direction:column !important}.flex-md-wrap{flex-wrap:wrap !important}.flex-md-nowrap{flex-wrap:nowrap !important}.flex-md-justify-start{justify-content:flex-start !important}.flex-md-justify-end{justify-content:flex-end !important}.flex-md-justify-center{justify-content:center !important}.flex-md-justify-between{justify-content:space-between !important}.flex-md-justify-around{justify-content:space-around !important}.flex-md-items-start{align-items:flex-start !important}.flex-md-items-end{align-items:flex-end !important}.flex-md-items-center{align-items:center !important}.flex-md-items-baseline{align-items:baseline !important}.flex-md-items-stretch{align-items:stretch !important}.flex-md-content-start{align-content:flex-start !important}.flex-md-content-end{align-content:flex-end !important}.flex-md-content-center{align-content:center !important}.flex-md-content-between{align-content:space-between !important}.flex-md-content-around{align-content:space-around !important}.flex-md-content-stretch{align-content:stretch !important}.flex-md-auto{flex:1 1 auto !important}.flex-md-shrink-0{flex-shrink:0 !important}.flex-md-self-auto{align-self:auto !important}.flex-md-self-start{align-self:flex-start !important}.flex-md-self-end{align-self:flex-end !important}.flex-md-self-center{align-self:center !important}.flex-md-self-baseline{align-self:baseline !important}.flex-md-self-stretch{align-self:stretch !important}.flex-md-item-equal{flex-grow:1;flex-basis:0}}@media (min-width: 1012px){.flex-lg-row{flex-direction:row !important}.flex-lg-row-reverse{flex-direction:row-reverse 
!important}.flex-lg-column{flex-direction:column !important}.flex-lg-wrap{flex-wrap:wrap !important}.flex-lg-nowrap{flex-wrap:nowrap !important}.flex-lg-justify-start{justify-content:flex-start !important}.flex-lg-justify-end{justify-content:flex-end !important}.flex-lg-justify-center{justify-content:center !important}.flex-lg-justify-between{justify-content:space-between !important}.flex-lg-justify-around{justify-content:space-around !important}.flex-lg-items-start{align-items:flex-start !important}.flex-lg-items-end{align-items:flex-end !important}.flex-lg-items-center{align-items:center !important}.flex-lg-items-baseline{align-items:baseline !important}.flex-lg-items-stretch{align-items:stretch !important}.flex-lg-content-start{align-content:flex-start !important}.flex-lg-content-end{align-content:flex-end !important}.flex-lg-content-center{align-content:center !important}.flex-lg-content-between{align-content:space-between !important}.flex-lg-content-around{align-content:space-around !important}.flex-lg-content-stretch{align-content:stretch !important}.flex-lg-auto{flex:1 1 auto !important}.flex-lg-shrink-0{flex-shrink:0 !important}.flex-lg-self-auto{align-self:auto !important}.flex-lg-self-start{align-self:flex-start !important}.flex-lg-self-end{align-self:flex-end !important}.flex-lg-self-center{align-self:center !important}.flex-lg-self-baseline{align-self:baseline !important}.flex-lg-self-stretch{align-self:stretch !important}.flex-lg-item-equal{flex-grow:1;flex-basis:0}}@media (min-width: 1280px){.flex-xl-row{flex-direction:row !important}.flex-xl-row-reverse{flex-direction:row-reverse !important}.flex-xl-column{flex-direction:column !important}.flex-xl-wrap{flex-wrap:wrap !important}.flex-xl-nowrap{flex-wrap:nowrap !important}.flex-xl-justify-start{justify-content:flex-start !important}.flex-xl-justify-end{justify-content:flex-end !important}.flex-xl-justify-center{justify-content:center !important}.flex-xl-justify-between{justify-content:space-between !important}.flex-xl-justify-around{justify-content:space-around !important}.flex-xl-items-start{align-items:flex-start !important}.flex-xl-items-end{align-items:flex-end !important}.flex-xl-items-center{align-items:center !important}.flex-xl-items-baseline{align-items:baseline !important}.flex-xl-items-stretch{align-items:stretch !important}.flex-xl-content-start{align-content:flex-start !important}.flex-xl-content-end{align-content:flex-end !important}.flex-xl-content-center{align-content:center !important}.flex-xl-content-between{align-content:space-between !important}.flex-xl-content-around{align-content:space-around !important}.flex-xl-content-stretch{align-content:stretch !important}.flex-xl-auto{flex:1 1 auto !important}.flex-xl-shrink-0{flex-shrink:0 !important}.flex-xl-self-auto{align-self:auto !important}.flex-xl-self-start{align-self:flex-start !important}.flex-xl-self-end{align-self:flex-end !important}.flex-xl-self-center{align-self:center !important}.flex-xl-self-baseline{align-self:baseline !important}.flex-xl-self-stretch{align-self:stretch !important}.flex-xl-item-equal{flex-grow:1;flex-basis:0}}.position-static{position:static !important}.position-relative{position:relative !important}.position-absolute{position:absolute !important}.position-fixed{position:fixed !important}.top-0{top:0 !important}.right-0{right:0 !important}.bottom-0{bottom:0 !important}.left-0{left:0 !important}.v-align-middle{vertical-align:middle !important}.v-align-top{vertical-align:top !important}.v-align-bottom{vertical-align:bottom 
!important}.v-align-text-top{vertical-align:text-top !important}.v-align-text-bottom{vertical-align:text-bottom !important}.v-align-baseline{vertical-align:baseline !important}.overflow-hidden{overflow:hidden !important}.overflow-scroll{overflow:scroll !important}.overflow-auto{overflow:auto !important}.clearfix::before{display:table;content:""}.clearfix::after{display:table;clear:both;content:""}.float-left{float:left !important}.float-right{float:right !important}.float-none{float:none !important}@media (min-width: 544px){.float-sm-left{float:left !important}.float-sm-right{float:right !important}.float-sm-none{float:none !important}}@media (min-width: 768px){.float-md-left{float:left !important}.float-md-right{float:right !important}.float-md-none{float:none !important}}@media (min-width: 1012px){.float-lg-left{float:left !important}.float-lg-right{float:right !important}.float-lg-none{float:none !important}}@media (min-width: 1280px){.float-xl-left{float:left !important}.float-xl-right{float:right !important}.float-xl-none{float:none !important}}.width-fit{max-width:100% !important}.width-full{width:100% !important}.height-fit{max-height:100% !important}.height-full{height:100% !important}.min-width-0{min-width:0 !important}.direction-rtl{direction:rtl !important}.direction-ltr{direction:ltr !important}@media (min-width: 544px){.direction-sm-rtl{direction:rtl !important}.direction-sm-ltr{direction:ltr !important}}@media (min-width: 768px){.direction-md-rtl{direction:rtl !important}.direction-md-ltr{direction:ltr !important}}@media (min-width: 1012px){.direction-lg-rtl{direction:rtl !important}.direction-lg-ltr{direction:ltr !important}}@media (min-width: 1280px){.direction-xl-rtl{direction:rtl !important}.direction-xl-ltr{direction:ltr !important}}.m-0{margin:0 !important}.mt-0{margin-top:0 !important}.mr-0{margin-right:0 !important}.mb-0{margin-bottom:0 !important}.ml-0{margin-left:0 !important}.mx-0{margin-right:0 !important;margin-left:0 !important}.my-0{margin-top:0 !important;margin-bottom:0 !important}.m-1{margin:4px !important}.mt-1{margin-top:4px !important}.mr-1{margin-right:4px !important}.mb-1{margin-bottom:4px !important}.ml-1{margin-left:4px !important}.mt-n1{margin-top:-4px !important}.mr-n1{margin-right:-4px !important}.mb-n1{margin-bottom:-4px !important}.ml-n1{margin-left:-4px !important}.mx-1{margin-right:4px !important;margin-left:4px !important}.my-1{margin-top:4px !important;margin-bottom:4px !important}.m-2{margin:8px !important}.mt-2{margin-top:8px !important}.mr-2{margin-right:8px !important}.mb-2{margin-bottom:8px !important}.ml-2{margin-left:8px !important}.mt-n2{margin-top:-8px !important}.mr-n2{margin-right:-8px !important}.mb-n2{margin-bottom:-8px !important}.ml-n2{margin-left:-8px !important}.mx-2{margin-right:8px !important;margin-left:8px !important}.my-2{margin-top:8px !important;margin-bottom:8px !important}.m-3{margin:16px !important}.mt-3{margin-top:16px !important}.mr-3{margin-right:16px !important}.mb-3{margin-bottom:16px !important}.ml-3{margin-left:16px !important}.mt-n3{margin-top:-16px !important}.mr-n3{margin-right:-16px !important}.mb-n3{margin-bottom:-16px !important}.ml-n3{margin-left:-16px !important}.mx-3{margin-right:16px !important;margin-left:16px !important}.my-3{margin-top:16px !important;margin-bottom:16px !important}.m-4{margin:24px !important}.mt-4{margin-top:24px !important}.mr-4{margin-right:24px !important}.mb-4{margin-bottom:24px !important}.ml-4{margin-left:24px !important}.mt-n4{margin-top:-24px 
!important}.mr-n4{margin-right:-24px !important}.mb-n4{margin-bottom:-24px !important}.ml-n4{margin-left:-24px !important}.mx-4{margin-right:24px !important;margin-left:24px !important}.my-4{margin-top:24px !important;margin-bottom:24px !important}.m-5{margin:32px !important}.mt-5{margin-top:32px !important}.mr-5{margin-right:32px !important}.mb-5{margin-bottom:32px !important}.ml-5{margin-left:32px !important}.mt-n5{margin-top:-32px !important}.mr-n5{margin-right:-32px !important}.mb-n5{margin-bottom:-32px !important}.ml-n5{margin-left:-32px !important}.mx-5{margin-right:32px !important;margin-left:32px !important}.my-5{margin-top:32px !important;margin-bottom:32px !important}.m-6{margin:40px !important}.mt-6{margin-top:40px !important}.mr-6{margin-right:40px !important}.mb-6{margin-bottom:40px !important}.ml-6{margin-left:40px !important}.mt-n6{margin-top:-40px !important}.mr-n6{margin-right:-40px !important}.mb-n6{margin-bottom:-40px !important}.ml-n6{margin-left:-40px !important}.mx-6{margin-right:40px !important;margin-left:40px !important}.my-6{margin-top:40px !important;margin-bottom:40px !important}.mx-auto{margin-right:auto !important;margin-left:auto !important}@media (min-width: 544px){.m-sm-0{margin:0 !important}.mt-sm-0{margin-top:0 !important}.mr-sm-0{margin-right:0 !important}.mb-sm-0{margin-bottom:0 !important}.ml-sm-0{margin-left:0 !important}.mx-sm-0{margin-right:0 !important;margin-left:0 !important}.my-sm-0{margin-top:0 !important;margin-bottom:0 !important}.m-sm-1{margin:4px !important}.mt-sm-1{margin-top:4px !important}.mr-sm-1{margin-right:4px !important}.mb-sm-1{margin-bottom:4px !important}.ml-sm-1{margin-left:4px !important}.mt-sm-n1{margin-top:-4px !important}.mr-sm-n1{margin-right:-4px !important}.mb-sm-n1{margin-bottom:-4px !important}.ml-sm-n1{margin-left:-4px !important}.mx-sm-1{margin-right:4px !important;margin-left:4px !important}.my-sm-1{margin-top:4px !important;margin-bottom:4px !important}.m-sm-2{margin:8px !important}.mt-sm-2{margin-top:8px !important}.mr-sm-2{margin-right:8px !important}.mb-sm-2{margin-bottom:8px !important}.ml-sm-2{margin-left:8px !important}.mt-sm-n2{margin-top:-8px !important}.mr-sm-n2{margin-right:-8px !important}.mb-sm-n2{margin-bottom:-8px !important}.ml-sm-n2{margin-left:-8px !important}.mx-sm-2{margin-right:8px !important;margin-left:8px !important}.my-sm-2{margin-top:8px !important;margin-bottom:8px !important}.m-sm-3{margin:16px !important}.mt-sm-3{margin-top:16px !important}.mr-sm-3{margin-right:16px !important}.mb-sm-3{margin-bottom:16px !important}.ml-sm-3{margin-left:16px !important}.mt-sm-n3{margin-top:-16px !important}.mr-sm-n3{margin-right:-16px !important}.mb-sm-n3{margin-bottom:-16px !important}.ml-sm-n3{margin-left:-16px !important}.mx-sm-3{margin-right:16px !important;margin-left:16px !important}.my-sm-3{margin-top:16px !important;margin-bottom:16px !important}.m-sm-4{margin:24px !important}.mt-sm-4{margin-top:24px !important}.mr-sm-4{margin-right:24px !important}.mb-sm-4{margin-bottom:24px !important}.ml-sm-4{margin-left:24px !important}.mt-sm-n4{margin-top:-24px !important}.mr-sm-n4{margin-right:-24px !important}.mb-sm-n4{margin-bottom:-24px !important}.ml-sm-n4{margin-left:-24px !important}.mx-sm-4{margin-right:24px !important;margin-left:24px !important}.my-sm-4{margin-top:24px !important;margin-bottom:24px !important}.m-sm-5{margin:32px !important}.mt-sm-5{margin-top:32px !important}.mr-sm-5{margin-right:32px !important}.mb-sm-5{margin-bottom:32px !important}.ml-sm-5{margin-left:32px 
!important}.mt-sm-n5{margin-top:-32px !important}.mr-sm-n5{margin-right:-32px !important}.mb-sm-n5{margin-bottom:-32px !important}.ml-sm-n5{margin-left:-32px !important}.mx-sm-5{margin-right:32px !important;margin-left:32px !important}.my-sm-5{margin-top:32px !important;margin-bottom:32px !important}.m-sm-6{margin:40px !important}.mt-sm-6{margin-top:40px !important}.mr-sm-6{margin-right:40px !important}.mb-sm-6{margin-bottom:40px !important}.ml-sm-6{margin-left:40px !important}.mt-sm-n6{margin-top:-40px !important}.mr-sm-n6{margin-right:-40px !important}.mb-sm-n6{margin-bottom:-40px !important}.ml-sm-n6{margin-left:-40px !important}.mx-sm-6{margin-right:40px !important;margin-left:40px !important}.my-sm-6{margin-top:40px !important;margin-bottom:40px !important}.mx-sm-auto{margin-right:auto !important;margin-left:auto !important}}@media (min-width: 768px){.m-md-0{margin:0 !important}.mt-md-0{margin-top:0 !important}.mr-md-0{margin-right:0 !important}.mb-md-0{margin-bottom:0 !important}.ml-md-0{margin-left:0 !important}.mx-md-0{margin-right:0 !important;margin-left:0 !important}.my-md-0{margin-top:0 !important;margin-bottom:0 !important}.m-md-1{margin:4px !important}.mt-md-1{margin-top:4px !important}.mr-md-1{margin-right:4px !important}.mb-md-1{margin-bottom:4px !important}.ml-md-1{margin-left:4px !important}.mt-md-n1{margin-top:-4px !important}.mr-md-n1{margin-right:-4px !important}.mb-md-n1{margin-bottom:-4px !important}.ml-md-n1{margin-left:-4px !important}.mx-md-1{margin-right:4px !important;margin-left:4px !important}.my-md-1{margin-top:4px !important;margin-bottom:4px !important}.m-md-2{margin:8px !important}.mt-md-2{margin-top:8px !important}.mr-md-2{margin-right:8px !important}.mb-md-2{margin-bottom:8px !important}.ml-md-2{margin-left:8px !important}.mt-md-n2{margin-top:-8px !important}.mr-md-n2{margin-right:-8px !important}.mb-md-n2{margin-bottom:-8px !important}.ml-md-n2{margin-left:-8px !important}.mx-md-2{margin-right:8px !important;margin-left:8px !important}.my-md-2{margin-top:8px !important;margin-bottom:8px !important}.m-md-3{margin:16px !important}.mt-md-3{margin-top:16px !important}.mr-md-3{margin-right:16px !important}.mb-md-3{margin-bottom:16px !important}.ml-md-3{margin-left:16px !important}.mt-md-n3{margin-top:-16px !important}.mr-md-n3{margin-right:-16px !important}.mb-md-n3{margin-bottom:-16px !important}.ml-md-n3{margin-left:-16px !important}.mx-md-3{margin-right:16px !important;margin-left:16px !important}.my-md-3{margin-top:16px !important;margin-bottom:16px !important}.m-md-4{margin:24px !important}.mt-md-4{margin-top:24px !important}.mr-md-4{margin-right:24px !important}.mb-md-4{margin-bottom:24px !important}.ml-md-4{margin-left:24px !important}.mt-md-n4{margin-top:-24px !important}.mr-md-n4{margin-right:-24px !important}.mb-md-n4{margin-bottom:-24px !important}.ml-md-n4{margin-left:-24px !important}.mx-md-4{margin-right:24px !important;margin-left:24px !important}.my-md-4{margin-top:24px !important;margin-bottom:24px !important}.m-md-5{margin:32px !important}.mt-md-5{margin-top:32px !important}.mr-md-5{margin-right:32px !important}.mb-md-5{margin-bottom:32px !important}.ml-md-5{margin-left:32px !important}.mt-md-n5{margin-top:-32px !important}.mr-md-n5{margin-right:-32px !important}.mb-md-n5{margin-bottom:-32px !important}.ml-md-n5{margin-left:-32px !important}.mx-md-5{margin-right:32px !important;margin-left:32px !important}.my-md-5{margin-top:32px !important;margin-bottom:32px !important}.m-md-6{margin:40px !important}.mt-md-6{margin-top:40px 
!important}.mr-md-6{margin-right:40px !important}.mb-md-6{margin-bottom:40px !important}.ml-md-6{margin-left:40px !important}.mt-md-n6{margin-top:-40px !important}.mr-md-n6{margin-right:-40px !important}.mb-md-n6{margin-bottom:-40px !important}.ml-md-n6{margin-left:-40px !important}.mx-md-6{margin-right:40px !important;margin-left:40px !important}.my-md-6{margin-top:40px !important;margin-bottom:40px !important}.mx-md-auto{margin-right:auto !important;margin-left:auto !important}}@media (min-width: 1012px){.m-lg-0{margin:0 !important}.mt-lg-0{margin-top:0 !important}.mr-lg-0{margin-right:0 !important}.mb-lg-0{margin-bottom:0 !important}.ml-lg-0{margin-left:0 !important}.mx-lg-0{margin-right:0 !important;margin-left:0 !important}.my-lg-0{margin-top:0 !important;margin-bottom:0 !important}.m-lg-1{margin:4px !important}.mt-lg-1{margin-top:4px !important}.mr-lg-1{margin-right:4px !important}.mb-lg-1{margin-bottom:4px !important}.ml-lg-1{margin-left:4px !important}.mt-lg-n1{margin-top:-4px !important}.mr-lg-n1{margin-right:-4px !important}.mb-lg-n1{margin-bottom:-4px !important}.ml-lg-n1{margin-left:-4px !important}.mx-lg-1{margin-right:4px !important;margin-left:4px !important}.my-lg-1{margin-top:4px !important;margin-bottom:4px !important}.m-lg-2{margin:8px !important}.mt-lg-2{margin-top:8px !important}.mr-lg-2{margin-right:8px !important}.mb-lg-2{margin-bottom:8px !important}.ml-lg-2{margin-left:8px !important}.mt-lg-n2{margin-top:-8px !important}.mr-lg-n2{margin-right:-8px !important}.mb-lg-n2{margin-bottom:-8px !important}.ml-lg-n2{margin-left:-8px !important}.mx-lg-2{margin-right:8px !important;margin-left:8px !important}.my-lg-2{margin-top:8px !important;margin-bottom:8px !important}.m-lg-3{margin:16px !important}.mt-lg-3{margin-top:16px !important}.mr-lg-3{margin-right:16px !important}.mb-lg-3{margin-bottom:16px !important}.ml-lg-3{margin-left:16px !important}.mt-lg-n3{margin-top:-16px !important}.mr-lg-n3{margin-right:-16px !important}.mb-lg-n3{margin-bottom:-16px !important}.ml-lg-n3{margin-left:-16px !important}.mx-lg-3{margin-right:16px !important;margin-left:16px !important}.my-lg-3{margin-top:16px !important;margin-bottom:16px !important}.m-lg-4{margin:24px !important}.mt-lg-4{margin-top:24px !important}.mr-lg-4{margin-right:24px !important}.mb-lg-4{margin-bottom:24px !important}.ml-lg-4{margin-left:24px !important}.mt-lg-n4{margin-top:-24px !important}.mr-lg-n4{margin-right:-24px !important}.mb-lg-n4{margin-bottom:-24px !important}.ml-lg-n4{margin-left:-24px !important}.mx-lg-4{margin-right:24px !important;margin-left:24px !important}.my-lg-4{margin-top:24px !important;margin-bottom:24px !important}.m-lg-5{margin:32px !important}.mt-lg-5{margin-top:32px !important}.mr-lg-5{margin-right:32px !important}.mb-lg-5{margin-bottom:32px !important}.ml-lg-5{margin-left:32px !important}.mt-lg-n5{margin-top:-32px !important}.mr-lg-n5{margin-right:-32px !important}.mb-lg-n5{margin-bottom:-32px !important}.ml-lg-n5{margin-left:-32px !important}.mx-lg-5{margin-right:32px !important;margin-left:32px !important}.my-lg-5{margin-top:32px !important;margin-bottom:32px !important}.m-lg-6{margin:40px !important}.mt-lg-6{margin-top:40px !important}.mr-lg-6{margin-right:40px !important}.mb-lg-6{margin-bottom:40px !important}.ml-lg-6{margin-left:40px !important}.mt-lg-n6{margin-top:-40px !important}.mr-lg-n6{margin-right:-40px !important}.mb-lg-n6{margin-bottom:-40px !important}.ml-lg-n6{margin-left:-40px !important}.mx-lg-6{margin-right:40px !important;margin-left:40px 
!important}.my-lg-6{margin-top:40px !important;margin-bottom:40px !important}.mx-lg-auto{margin-right:auto !important;margin-left:auto !important}}@media (min-width: 1280px){.m-xl-0{margin:0 !important}.mt-xl-0{margin-top:0 !important}.mr-xl-0{margin-right:0 !important}.mb-xl-0{margin-bottom:0 !important}.ml-xl-0{margin-left:0 !important}.mx-xl-0{margin-right:0 !important;margin-left:0 !important}.my-xl-0{margin-top:0 !important;margin-bottom:0 !important}.m-xl-1{margin:4px !important}.mt-xl-1{margin-top:4px !important}.mr-xl-1{margin-right:4px !important}.mb-xl-1{margin-bottom:4px !important}.ml-xl-1{margin-left:4px !important}.mt-xl-n1{margin-top:-4px !important}.mr-xl-n1{margin-right:-4px !important}.mb-xl-n1{margin-bottom:-4px !important}.ml-xl-n1{margin-left:-4px !important}.mx-xl-1{margin-right:4px !important;margin-left:4px !important}.my-xl-1{margin-top:4px !important;margin-bottom:4px !important}.m-xl-2{margin:8px !important}.mt-xl-2{margin-top:8px !important}.mr-xl-2{margin-right:8px !important}.mb-xl-2{margin-bottom:8px !important}.ml-xl-2{margin-left:8px !important}.mt-xl-n2{margin-top:-8px !important}.mr-xl-n2{margin-right:-8px !important}.mb-xl-n2{margin-bottom:-8px !important}.ml-xl-n2{margin-left:-8px !important}.mx-xl-2{margin-right:8px !important;margin-left:8px !important}.my-xl-2{margin-top:8px !important;margin-bottom:8px !important}.m-xl-3{margin:16px !important}.mt-xl-3{margin-top:16px !important}.mr-xl-3{margin-right:16px !important}.mb-xl-3{margin-bottom:16px !important}.ml-xl-3{margin-left:16px !important}.mt-xl-n3{margin-top:-16px !important}.mr-xl-n3{margin-right:-16px !important}.mb-xl-n3{margin-bottom:-16px !important}.ml-xl-n3{margin-left:-16px !important}.mx-xl-3{margin-right:16px !important;margin-left:16px !important}.my-xl-3{margin-top:16px !important;margin-bottom:16px !important}.m-xl-4{margin:24px !important}.mt-xl-4{margin-top:24px !important}.mr-xl-4{margin-right:24px !important}.mb-xl-4{margin-bottom:24px !important}.ml-xl-4{margin-left:24px !important}.mt-xl-n4{margin-top:-24px !important}.mr-xl-n4{margin-right:-24px !important}.mb-xl-n4{margin-bottom:-24px !important}.ml-xl-n4{margin-left:-24px !important}.mx-xl-4{margin-right:24px !important;margin-left:24px !important}.my-xl-4{margin-top:24px !important;margin-bottom:24px !important}.m-xl-5{margin:32px !important}.mt-xl-5{margin-top:32px !important}.mr-xl-5{margin-right:32px !important}.mb-xl-5{margin-bottom:32px !important}.ml-xl-5{margin-left:32px !important}.mt-xl-n5{margin-top:-32px !important}.mr-xl-n5{margin-right:-32px !important}.mb-xl-n5{margin-bottom:-32px !important}.ml-xl-n5{margin-left:-32px !important}.mx-xl-5{margin-right:32px !important;margin-left:32px !important}.my-xl-5{margin-top:32px !important;margin-bottom:32px !important}.m-xl-6{margin:40px !important}.mt-xl-6{margin-top:40px !important}.mr-xl-6{margin-right:40px !important}.mb-xl-6{margin-bottom:40px !important}.ml-xl-6{margin-left:40px !important}.mt-xl-n6{margin-top:-40px !important}.mr-xl-n6{margin-right:-40px !important}.mb-xl-n6{margin-bottom:-40px !important}.ml-xl-n6{margin-left:-40px !important}.mx-xl-6{margin-right:40px !important;margin-left:40px !important}.my-xl-6{margin-top:40px !important;margin-bottom:40px !important}.mx-xl-auto{margin-right:auto !important;margin-left:auto !important}}.p-0{padding:0 !important}.pt-0{padding-top:0 !important}.pr-0{padding-right:0 !important}.pb-0{padding-bottom:0 !important}.pl-0{padding-left:0 !important}.px-0{padding-right:0 !important;padding-left:0 
!important}.py-0{padding-top:0 !important;padding-bottom:0 !important}.p-1{padding:4px !important}.pt-1{padding-top:4px !important}.pr-1{padding-right:4px !important}.pb-1{padding-bottom:4px !important}.pl-1{padding-left:4px !important}.px-1{padding-right:4px !important;padding-left:4px !important}.py-1{padding-top:4px !important;padding-bottom:4px !important}.p-2{padding:8px !important}.pt-2{padding-top:8px !important}.pr-2{padding-right:8px !important}.pb-2{padding-bottom:8px !important}.pl-2{padding-left:8px !important}.px-2{padding-right:8px !important;padding-left:8px !important}.py-2{padding-top:8px !important;padding-bottom:8px !important}.p-3{padding:16px !important}.pt-3{padding-top:16px !important}.pr-3{padding-right:16px !important}.pb-3{padding-bottom:16px !important}.pl-3{padding-left:16px !important}.px-3{padding-right:16px !important;padding-left:16px !important}.py-3{padding-top:16px !important;padding-bottom:16px !important}.p-4{padding:24px !important}.pt-4{padding-top:24px !important}.pr-4{padding-right:24px !important}.pb-4{padding-bottom:24px !important}.pl-4{padding-left:24px !important}.px-4{padding-right:24px !important;padding-left:24px !important}.py-4{padding-top:24px !important;padding-bottom:24px !important}.p-5{padding:32px !important}.pt-5{padding-top:32px !important}.pr-5{padding-right:32px !important}.pb-5{padding-bottom:32px !important}.pl-5{padding-left:32px !important}.px-5{padding-right:32px !important;padding-left:32px !important}.py-5{padding-top:32px !important;padding-bottom:32px !important}.p-6{padding:40px !important}.pt-6{padding-top:40px !important}.pr-6{padding-right:40px !important}.pb-6{padding-bottom:40px !important}.pl-6{padding-left:40px !important}.px-6{padding-right:40px !important;padding-left:40px !important}.py-6{padding-top:40px !important;padding-bottom:40px !important}@media (min-width: 544px){.p-sm-0{padding:0 !important}.pt-sm-0{padding-top:0 !important}.pr-sm-0{padding-right:0 !important}.pb-sm-0{padding-bottom:0 !important}.pl-sm-0{padding-left:0 !important}.px-sm-0{padding-right:0 !important;padding-left:0 !important}.py-sm-0{padding-top:0 !important;padding-bottom:0 !important}.p-sm-1{padding:4px !important}.pt-sm-1{padding-top:4px !important}.pr-sm-1{padding-right:4px !important}.pb-sm-1{padding-bottom:4px !important}.pl-sm-1{padding-left:4px !important}.px-sm-1{padding-right:4px !important;padding-left:4px !important}.py-sm-1{padding-top:4px !important;padding-bottom:4px !important}.p-sm-2{padding:8px !important}.pt-sm-2{padding-top:8px !important}.pr-sm-2{padding-right:8px !important}.pb-sm-2{padding-bottom:8px !important}.pl-sm-2{padding-left:8px !important}.px-sm-2{padding-right:8px !important;padding-left:8px !important}.py-sm-2{padding-top:8px !important;padding-bottom:8px !important}.p-sm-3{padding:16px !important}.pt-sm-3{padding-top:16px !important}.pr-sm-3{padding-right:16px !important}.pb-sm-3{padding-bottom:16px !important}.pl-sm-3{padding-left:16px !important}.px-sm-3{padding-right:16px !important;padding-left:16px !important}.py-sm-3{padding-top:16px !important;padding-bottom:16px !important}.p-sm-4{padding:24px !important}.pt-sm-4{padding-top:24px !important}.pr-sm-4{padding-right:24px !important}.pb-sm-4{padding-bottom:24px !important}.pl-sm-4{padding-left:24px !important}.px-sm-4{padding-right:24px !important;padding-left:24px !important}.py-sm-4{padding-top:24px !important;padding-bottom:24px !important}.p-sm-5{padding:32px !important}.pt-sm-5{padding-top:32px !important}.pr-sm-5{padding-right:32px 
!important}.pb-sm-5{padding-bottom:32px !important}.pl-sm-5{padding-left:32px !important}.px-sm-5{padding-right:32px !important;padding-left:32px !important}.py-sm-5{padding-top:32px !important;padding-bottom:32px !important}.p-sm-6{padding:40px !important}.pt-sm-6{padding-top:40px !important}.pr-sm-6{padding-right:40px !important}.pb-sm-6{padding-bottom:40px !important}.pl-sm-6{padding-left:40px !important}.px-sm-6{padding-right:40px !important;padding-left:40px !important}.py-sm-6{padding-top:40px !important;padding-bottom:40px !important}}@media (min-width: 768px){.p-md-0{padding:0 !important}.pt-md-0{padding-top:0 !important}.pr-md-0{padding-right:0 !important}.pb-md-0{padding-bottom:0 !important}.pl-md-0{padding-left:0 !important}.px-md-0{padding-right:0 !important;padding-left:0 !important}.py-md-0{padding-top:0 !important;padding-bottom:0 !important}.p-md-1{padding:4px !important}.pt-md-1{padding-top:4px !important}.pr-md-1{padding-right:4px !important}.pb-md-1{padding-bottom:4px !important}.pl-md-1{padding-left:4px !important}.px-md-1{padding-right:4px !important;padding-left:4px !important}.py-md-1{padding-top:4px !important;padding-bottom:4px !important}.p-md-2{padding:8px !important}.pt-md-2{padding-top:8px !important}.pr-md-2{padding-right:8px !important}.pb-md-2{padding-bottom:8px !important}.pl-md-2{padding-left:8px !important}.px-md-2{padding-right:8px !important;padding-left:8px !important}.py-md-2{padding-top:8px !important;padding-bottom:8px !important}.p-md-3{padding:16px !important}.pt-md-3{padding-top:16px !important}.pr-md-3{padding-right:16px !important}.pb-md-3{padding-bottom:16px !important}.pl-md-3{padding-left:16px !important}.px-md-3{padding-right:16px !important;padding-left:16px !important}.py-md-3{padding-top:16px !important;padding-bottom:16px !important}.p-md-4{padding:24px !important}.pt-md-4{padding-top:24px !important}.pr-md-4{padding-right:24px !important}.pb-md-4{padding-bottom:24px !important}.pl-md-4{padding-left:24px !important}.px-md-4{padding-right:24px !important;padding-left:24px !important}.py-md-4{padding-top:24px !important;padding-bottom:24px !important}.p-md-5{padding:32px !important}.pt-md-5{padding-top:32px !important}.pr-md-5{padding-right:32px !important}.pb-md-5{padding-bottom:32px !important}.pl-md-5{padding-left:32px !important}.px-md-5{padding-right:32px !important;padding-left:32px !important}.py-md-5{padding-top:32px !important;padding-bottom:32px !important}.p-md-6{padding:40px !important}.pt-md-6{padding-top:40px !important}.pr-md-6{padding-right:40px !important}.pb-md-6{padding-bottom:40px !important}.pl-md-6{padding-left:40px !important}.px-md-6{padding-right:40px !important;padding-left:40px !important}.py-md-6{padding-top:40px !important;padding-bottom:40px !important}}@media (min-width: 1012px){.p-lg-0{padding:0 !important}.pt-lg-0{padding-top:0 !important}.pr-lg-0{padding-right:0 !important}.pb-lg-0{padding-bottom:0 !important}.pl-lg-0{padding-left:0 !important}.px-lg-0{padding-right:0 !important;padding-left:0 !important}.py-lg-0{padding-top:0 !important;padding-bottom:0 !important}.p-lg-1{padding:4px !important}.pt-lg-1{padding-top:4px !important}.pr-lg-1{padding-right:4px !important}.pb-lg-1{padding-bottom:4px !important}.pl-lg-1{padding-left:4px !important}.px-lg-1{padding-right:4px !important;padding-left:4px !important}.py-lg-1{padding-top:4px !important;padding-bottom:4px !important}.p-lg-2{padding:8px !important}.pt-lg-2{padding-top:8px !important}.pr-lg-2{padding-right:8px !important}.pb-lg-2{padding-bottom:8px 
!important}.pl-lg-2{padding-left:8px !important}.px-lg-2{padding-right:8px !important;padding-left:8px !important}.py-lg-2{padding-top:8px !important;padding-bottom:8px !important}.p-lg-3{padding:16px !important}.pt-lg-3{padding-top:16px !important}.pr-lg-3{padding-right:16px !important}.pb-lg-3{padding-bottom:16px !important}.pl-lg-3{padding-left:16px !important}.px-lg-3{padding-right:16px !important;padding-left:16px !important}.py-lg-3{padding-top:16px !important;padding-bottom:16px !important}.p-lg-4{padding:24px !important}.pt-lg-4{padding-top:24px !important}.pr-lg-4{padding-right:24px !important}.pb-lg-4{padding-bottom:24px !important}.pl-lg-4{padding-left:24px !important}.px-lg-4{padding-right:24px !important;padding-left:24px !important}.py-lg-4{padding-top:24px !important;padding-bottom:24px !important}.p-lg-5{padding:32px !important}.pt-lg-5{padding-top:32px !important}.pr-lg-5{padding-right:32px !important}.pb-lg-5{padding-bottom:32px !important}.pl-lg-5{padding-left:32px !important}.px-lg-5{padding-right:32px !important;padding-left:32px !important}.py-lg-5{padding-top:32px !important;padding-bottom:32px !important}.p-lg-6{padding:40px !important}.pt-lg-6{padding-top:40px !important}.pr-lg-6{padding-right:40px !important}.pb-lg-6{padding-bottom:40px !important}.pl-lg-6{padding-left:40px !important}.px-lg-6{padding-right:40px !important;padding-left:40px !important}.py-lg-6{padding-top:40px !important;padding-bottom:40px !important}}@media (min-width: 1280px){.p-xl-0{padding:0 !important}.pt-xl-0{padding-top:0 !important}.pr-xl-0{padding-right:0 !important}.pb-xl-0{padding-bottom:0 !important}.pl-xl-0{padding-left:0 !important}.px-xl-0{padding-right:0 !important;padding-left:0 !important}.py-xl-0{padding-top:0 !important;padding-bottom:0 !important}.p-xl-1{padding:4px !important}.pt-xl-1{padding-top:4px !important}.pr-xl-1{padding-right:4px !important}.pb-xl-1{padding-bottom:4px !important}.pl-xl-1{padding-left:4px !important}.px-xl-1{padding-right:4px !important;padding-left:4px !important}.py-xl-1{padding-top:4px !important;padding-bottom:4px !important}.p-xl-2{padding:8px !important}.pt-xl-2{padding-top:8px !important}.pr-xl-2{padding-right:8px !important}.pb-xl-2{padding-bottom:8px !important}.pl-xl-2{padding-left:8px !important}.px-xl-2{padding-right:8px !important;padding-left:8px !important}.py-xl-2{padding-top:8px !important;padding-bottom:8px !important}.p-xl-3{padding:16px !important}.pt-xl-3{padding-top:16px !important}.pr-xl-3{padding-right:16px !important}.pb-xl-3{padding-bottom:16px !important}.pl-xl-3{padding-left:16px !important}.px-xl-3{padding-right:16px !important;padding-left:16px !important}.py-xl-3{padding-top:16px !important;padding-bottom:16px !important}.p-xl-4{padding:24px !important}.pt-xl-4{padding-top:24px !important}.pr-xl-4{padding-right:24px !important}.pb-xl-4{padding-bottom:24px !important}.pl-xl-4{padding-left:24px !important}.px-xl-4{padding-right:24px !important;padding-left:24px !important}.py-xl-4{padding-top:24px !important;padding-bottom:24px !important}.p-xl-5{padding:32px !important}.pt-xl-5{padding-top:32px !important}.pr-xl-5{padding-right:32px !important}.pb-xl-5{padding-bottom:32px !important}.pl-xl-5{padding-left:32px !important}.px-xl-5{padding-right:32px !important;padding-left:32px !important}.py-xl-5{padding-top:32px !important;padding-bottom:32px !important}.p-xl-6{padding:40px !important}.pt-xl-6{padding-top:40px !important}.pr-xl-6{padding-right:40px !important}.pb-xl-6{padding-bottom:40px 
!important}.pl-xl-6{padding-left:40px !important}.px-xl-6{padding-right:40px !important;padding-left:40px !important}.py-xl-6{padding-top:40px !important;padding-bottom:40px !important}}.p-responsive{padding-right:16px !important;padding-left:16px !important}@media (min-width: 544px){.p-responsive{padding-right:40px !important;padding-left:40px !important}}@media (min-width: 1012px){.p-responsive{padding-right:16px !important;padding-left:16px !important}}.h1{font-size:26px !important}@media (min-width: 768px){.h1{font-size:32px !important}}.h2{font-size:22px !important}@media (min-width: 768px){.h2{font-size:24px !important}}.h3{font-size:18px !important}@media (min-width: 768px){.h3{font-size:20px !important}}.h4{font-size:16px !important}.h5{font-size:14px !important}.h6{font-size:12px !important}.h1,.h2,.h3,.h4,.h5,.h6{font-weight:600 !important}.f1{font-size:26px !important}@media (min-width: 768px){.f1{font-size:32px !important}}.f2{font-size:22px !important}@media (min-width: 768px){.f2{font-size:24px !important}}.f3{font-size:18px !important}@media (min-width: 768px){.f3{font-size:20px !important}}.f4{font-size:16px !important}@media (min-width: 768px){.f4{font-size:16px !important}}.f5{font-size:14px !important}.f6{font-size:12px !important}.f00-light{font-size:40px !important;font-weight:300 !important}@media (min-width: 768px){.f00-light{font-size:48px !important}}.f0-light{font-size:32px !important;font-weight:300 !important}@media (min-width: 768px){.f0-light{font-size:40px !important}}.f1-light{font-size:26px !important;font-weight:300 !important}@media (min-width: 768px){.f1-light{font-size:32px !important}}.f2-light{font-size:22px !important;font-weight:300 !important}@media (min-width: 768px){.f2-light{font-size:24px !important}}.f3-light{font-size:18px !important;font-weight:300 !important}@media (min-width: 768px){.f3-light{font-size:20px !important}}.text-small{font-size:12px !important}.lead{margin-bottom:30px;font-size:20px;font-weight:300;color:#586069}.lh-condensed-ultra{line-height:1 !important}.lh-condensed{line-height:1.25 !important}.lh-default{line-height:1.5 !important}.lh-0{line-height:0 !important}.text-right{text-align:right !important}.text-left{text-align:left !important}.text-center{text-align:center !important}@media (min-width: 544px){.text-sm-right{text-align:right !important}.text-sm-left{text-align:left !important}.text-sm-center{text-align:center !important}}@media (min-width: 768px){.text-md-right{text-align:right !important}.text-md-left{text-align:left !important}.text-md-center{text-align:center !important}}@media (min-width: 1012px){.text-lg-right{text-align:right !important}.text-lg-left{text-align:left !important}.text-lg-center{text-align:center !important}}@media (min-width: 1280px){.text-xl-right{text-align:right !important}.text-xl-left{text-align:left !important}.text-xl-center{text-align:center !important}}.text-normal{font-weight:400 !important}.text-bold{font-weight:600 !important}.text-italic{font-style:italic !important}.text-uppercase{text-transform:uppercase !important}.text-underline{text-decoration:underline !important}.no-underline{text-decoration:none !important}.no-wrap{white-space:nowrap !important}.ws-normal{white-space:normal !important}.wb-break-all{word-break:break-all !important}.text-emphasized{font-weight:600;color:#24292e}.list-style-none{list-style:none !important}.text-shadow-dark{text-shadow:0 1px 1px rgba(27,31,35,0.25),0 1px 25px rgba(27,31,35,0.75)}.text-shadow-light{text-shadow:0 1px 0 
rgba(255,255,255,0.5)}.text-mono{font-family:"SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace}.user-select-none{-webkit-user-select:none !important;-moz-user-select:none !important;-ms-user-select:none !important;user-select:none !important}.d-block{display:block !important}.d-flex{display:flex !important}.d-inline{display:inline !important}.d-inline-block{display:inline-block !important}.d-inline-flex{display:inline-flex !important}.d-none{display:none !important}.d-table{display:table !important}.d-table-cell{display:table-cell !important}@media (min-width: 544px){.d-sm-block{display:block !important}.d-sm-flex{display:flex !important}.d-sm-inline{display:inline !important}.d-sm-inline-block{display:inline-block !important}.d-sm-inline-flex{display:inline-flex !important}.d-sm-none{display:none !important}.d-sm-table{display:table !important}.d-sm-table-cell{display:table-cell !important}}@media (min-width: 768px){.d-md-block{display:block !important}.d-md-flex{display:flex !important}.d-md-inline{display:inline !important}.d-md-inline-block{display:inline-block !important}.d-md-inline-flex{display:inline-flex !important}.d-md-none{display:none !important}.d-md-table{display:table !important}.d-md-table-cell{display:table-cell !important}}@media (min-width: 1012px){.d-lg-block{display:block !important}.d-lg-flex{display:flex !important}.d-lg-inline{display:inline !important}.d-lg-inline-block{display:inline-block !important}.d-lg-inline-flex{display:inline-flex !important}.d-lg-none{display:none !important}.d-lg-table{display:table !important}.d-lg-table-cell{display:table-cell !important}}@media (min-width: 1280px){.d-xl-block{display:block !important}.d-xl-flex{display:flex !important}.d-xl-inline{display:inline !important}.d-xl-inline-block{display:inline-block !important}.d-xl-inline-flex{display:inline-flex !important}.d-xl-none{display:none !important}.d-xl-table{display:table !important}.d-xl-table-cell{display:table-cell !important}}.v-hidden{visibility:hidden !important}.v-visible{visibility:visible !important}@media (max-width: 544px){.hide-sm{display:none !important}}@media (min-width: 544px) and (max-width: 768px){.hide-md{display:none !important}}@media (min-width: 768px) and (max-width: 1012px){.hide-lg{display:none !important}}@media (min-width: 1012px){.hide-xl{display:none !important}}.table-fixed{table-layout:fixed !important}.sr-only{position:absolute;width:1px;height:1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);word-wrap:normal;border:0}.show-on-focus{position:absolute;width:1px;height:1px;margin:0;overflow:hidden;clip:rect(1px, 1px, 1px, 
1px)}.show-on-focus:focus{z-index:20;width:auto;height:auto;clip:auto}.container{width:980px;margin-right:auto;margin-left:auto}.container::before{display:table;content:""}.container::after{display:table;clear:both;content:""}.container-md{max-width:768px;margin-right:auto;margin-left:auto}.container-lg{max-width:1012px;margin-right:auto;margin-left:auto}.container-xl{max-width:1280px;margin-right:auto;margin-left:auto}.columns{margin-right:-10px;margin-left:-10px}.columns::before{display:table;content:""}.columns::after{display:table;clear:both;content:""}.column{float:left;padding-right:10px;padding-left:10px}.one-third{width:33.333333%}.two-thirds{width:66.666667%}.one-fourth{width:25%}.one-half{width:50%}.three-fourths{width:75%}.one-fifth{width:20%}.four-fifths{width:80%}.centered{display:block;float:none;margin-right:auto;margin-left:auto}.col-1{width:8.3333333333%}.col-2{width:16.6666666667%}.col-3{width:25%}.col-4{width:33.3333333333%}.col-5{width:41.6666666667%}.col-6{width:50%}.col-7{width:58.3333333333%}.col-8{width:66.6666666667%}.col-9{width:75%}.col-10{width:83.3333333333%}.col-11{width:91.6666666667%}.col-12{width:100%}@media (min-width: 544px){.col-sm-1{width:8.3333333333%}.col-sm-2{width:16.6666666667%}.col-sm-3{width:25%}.col-sm-4{width:33.3333333333%}.col-sm-5{width:41.6666666667%}.col-sm-6{width:50%}.col-sm-7{width:58.3333333333%}.col-sm-8{width:66.6666666667%}.col-sm-9{width:75%}.col-sm-10{width:83.3333333333%}.col-sm-11{width:91.6666666667%}.col-sm-12{width:100%}}@media (min-width: 768px){.col-md-1{width:8.3333333333%}.col-md-2{width:16.6666666667%}.col-md-3{width:25%}.col-md-4{width:33.3333333333%}.col-md-5{width:41.6666666667%}.col-md-6{width:50%}.col-md-7{width:58.3333333333%}.col-md-8{width:66.6666666667%}.col-md-9{width:75%}.col-md-10{width:83.3333333333%}.col-md-11{width:91.6666666667%}.col-md-12{width:100%}}@media (min-width: 1012px){.col-lg-1{width:8.3333333333%}.col-lg-2{width:16.6666666667%}.col-lg-3{width:25%}.col-lg-4{width:33.3333333333%}.col-lg-5{width:41.6666666667%}.col-lg-6{width:50%}.col-lg-7{width:58.3333333333%}.col-lg-8{width:66.6666666667%}.col-lg-9{width:75%}.col-lg-10{width:83.3333333333%}.col-lg-11{width:91.6666666667%}.col-lg-12{width:100%}}@media (min-width: 1280px){.col-xl-1{width:8.3333333333%}.col-xl-2{width:16.6666666667%}.col-xl-3{width:25%}.col-xl-4{width:33.3333333333%}.col-xl-5{width:41.6666666667%}.col-xl-6{width:50%}.col-xl-7{width:58.3333333333%}.col-xl-8{width:66.6666666667%}.col-xl-9{width:75%}.col-xl-10{width:83.3333333333%}.col-xl-11{width:91.6666666667%}.col-xl-12{width:100%}}.gutter{margin-right:-16px;margin-left:-16px}.gutter>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-condensed{margin-right:-8px;margin-left:-8px}.gutter-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-spacious{margin-right:-24px;margin-left:-24px}.gutter-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}@media (min-width: 544px){.gutter-sm{margin-right:-16px;margin-left:-16px}.gutter-sm>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-sm-condensed{margin-right:-8px;margin-left:-8px}.gutter-sm-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-sm-spacious{margin-right:-24px;margin-left:-24px}.gutter-sm-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}}@media (min-width: 
768px){.gutter-md{margin-right:-16px;margin-left:-16px}.gutter-md>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-md-condensed{margin-right:-8px;margin-left:-8px}.gutter-md-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-md-spacious{margin-right:-24px;margin-left:-24px}.gutter-md-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}}@media (min-width: 1012px){.gutter-lg{margin-right:-16px;margin-left:-16px}.gutter-lg>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-lg-condensed{margin-right:-8px;margin-left:-8px}.gutter-lg-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-lg-spacious{margin-right:-24px;margin-left:-24px}.gutter-lg-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}}@media (min-width: 1280px){.gutter-xl{margin-right:-16px;margin-left:-16px}.gutter-xl>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-xl-condensed{margin-right:-8px;margin-left:-8px}.gutter-xl-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-xl-spacious{margin-right:-24px;margin-left:-24px}.gutter-xl-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}}.offset-1{margin-left:8.3333333333% !important}.offset-2{margin-left:16.6666666667% !important}.offset-3{margin-left:25% !important}.offset-4{margin-left:33.3333333333% !important}.offset-5{margin-left:41.6666666667% !important}.offset-6{margin-left:50% !important}.offset-7{margin-left:58.3333333333% !important}.offset-8{margin-left:66.6666666667% !important}.offset-9{margin-left:75% !important}.offset-10{margin-left:83.3333333333% !important}.offset-11{margin-left:91.6666666667% !important}@media (min-width: 544px){.offset-sm-1{margin-left:8.3333333333% !important}.offset-sm-2{margin-left:16.6666666667% !important}.offset-sm-3{margin-left:25% !important}.offset-sm-4{margin-left:33.3333333333% !important}.offset-sm-5{margin-left:41.6666666667% !important}.offset-sm-6{margin-left:50% !important}.offset-sm-7{margin-left:58.3333333333% !important}.offset-sm-8{margin-left:66.6666666667% !important}.offset-sm-9{margin-left:75% !important}.offset-sm-10{margin-left:83.3333333333% !important}.offset-sm-11{margin-left:91.6666666667% !important}}@media (min-width: 768px){.offset-md-1{margin-left:8.3333333333% !important}.offset-md-2{margin-left:16.6666666667% !important}.offset-md-3{margin-left:25% !important}.offset-md-4{margin-left:33.3333333333% !important}.offset-md-5{margin-left:41.6666666667% !important}.offset-md-6{margin-left:50% !important}.offset-md-7{margin-left:58.3333333333% !important}.offset-md-8{margin-left:66.6666666667% !important}.offset-md-9{margin-left:75% !important}.offset-md-10{margin-left:83.3333333333% !important}.offset-md-11{margin-left:91.6666666667% !important}}@media (min-width: 1012px){.offset-lg-1{margin-left:8.3333333333% !important}.offset-lg-2{margin-left:16.6666666667% !important}.offset-lg-3{margin-left:25% !important}.offset-lg-4{margin-left:33.3333333333% !important}.offset-lg-5{margin-left:41.6666666667% !important}.offset-lg-6{margin-left:50% !important}.offset-lg-7{margin-left:58.3333333333% !important}.offset-lg-8{margin-left:66.6666666667% !important}.offset-lg-9{margin-left:75% !important}.offset-lg-10{margin-left:83.3333333333% !important}.offset-lg-11{margin-left:91.6666666667% !important}}@media 
(min-width: 1280px){.offset-xl-1{margin-left:8.3333333333% !important}.offset-xl-2{margin-left:16.6666666667% !important}.offset-xl-3{margin-left:25% !important}.offset-xl-4{margin-left:33.3333333333% !important}.offset-xl-5{margin-left:41.6666666667% !important}.offset-xl-6{margin-left:50% !important}.offset-xl-7{margin-left:58.3333333333% !important}.offset-xl-8{margin-left:66.6666666667% !important}.offset-xl-9{margin-left:75% !important}.offset-xl-10{margin-left:83.3333333333% !important}.offset-xl-11{margin-left:91.6666666667% !important}}.markdown-body{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";font-size:16px;line-height:1.5;word-wrap:break-word}.markdown-body::before{display:table;content:""}.markdown-body::after{display:table;clear:both;content:""}.markdown-body>*:first-child{margin-top:0 !important}.markdown-body>*:last-child{margin-bottom:0 !important}.markdown-body a:not([href]){color:inherit;text-decoration:none}.markdown-body .absent{color:#cb2431}.markdown-body .anchor{float:left;padding-right:4px;margin-left:-20px;line-height:1}.markdown-body .anchor:focus{outline:none}.markdown-body p,.markdown-body blockquote,.markdown-body ul,.markdown-body ol,.markdown-body dl,.markdown-body table,.markdown-body pre{margin-top:0;margin-bottom:16px}.markdown-body hr{height:.25em;padding:0;margin:24px 0;background-color:#e1e4e8;border:0}.markdown-body blockquote{padding:0 1em;color:#6a737d;border-left:0.25em solid #dfe2e5}.markdown-body blockquote>:first-child{margin-top:0}.markdown-body blockquote>:last-child{margin-bottom:0}.markdown-body kbd{display:inline-block;padding:3px 5px;font-size:11px;line-height:10px;color:#444d56;vertical-align:middle;background-color:#fafbfc;border:solid 1px #c6cbd1;border-bottom-color:#959da5;border-radius:3px;box-shadow:inset 0 -1px 0 #959da5}.markdown-body h1,.markdown-body h2,.markdown-body h3,.markdown-body h4,.markdown-body h5,.markdown-body h6{margin-top:24px;margin-bottom:16px;font-weight:600;line-height:1.25}.markdown-body h1 .octicon-link,.markdown-body h2 .octicon-link,.markdown-body h3 .octicon-link,.markdown-body h4 .octicon-link,.markdown-body h5 .octicon-link,.markdown-body h6 .octicon-link{color:#1b1f23;vertical-align:middle;visibility:hidden}.markdown-body h1:hover .anchor,.markdown-body h2:hover .anchor,.markdown-body h3:hover .anchor,.markdown-body h4:hover .anchor,.markdown-body h5:hover .anchor,.markdown-body h6:hover .anchor{text-decoration:none}.markdown-body h1:hover .anchor .octicon-link,.markdown-body h2:hover .anchor .octicon-link,.markdown-body h3:hover .anchor .octicon-link,.markdown-body h4:hover .anchor .octicon-link,.markdown-body h5:hover .anchor .octicon-link,.markdown-body h6:hover .anchor .octicon-link{visibility:visible}.markdown-body h1 tt,.markdown-body h1 code,.markdown-body h2 tt,.markdown-body h2 code,.markdown-body h3 tt,.markdown-body h3 code,.markdown-body h4 tt,.markdown-body h4 code,.markdown-body h5 tt,.markdown-body h5 code,.markdown-body h6 tt,.markdown-body h6 code{font-size:inherit}.markdown-body h1{padding-bottom:0.3em;font-size:2em;border-bottom:1px solid #eaecef}.markdown-body h2{padding-bottom:0.3em;font-size:1.5em;border-bottom:1px solid #eaecef}.markdown-body h3{font-size:1.25em}.markdown-body h4{font-size:1em}.markdown-body h5{font-size:0.875em}.markdown-body h6{font-size:0.85em;color:#6a737d}.markdown-body ul,.markdown-body ol{padding-left:2em}.markdown-body ul.no-list,.markdown-body 
ol.no-list{padding:0;list-style-type:none}.markdown-body ul ul,.markdown-body ul ol,.markdown-body ol ol,.markdown-body ol ul{margin-top:0;margin-bottom:0}.markdown-body li{word-wrap:break-all}.markdown-body li>p{margin-top:16px}.markdown-body li+li{margin-top:.25em}.markdown-body dl{padding:0}.markdown-body dl dt{padding:0;margin-top:16px;font-size:1em;font-style:italic;font-weight:600}.markdown-body dl dd{padding:0 16px;margin-bottom:16px}.markdown-body table{display:block;width:100%;overflow:auto}.markdown-body table th{font-weight:600}.markdown-body table th,.markdown-body table td{padding:6px 13px;border:1px solid #dfe2e5}.markdown-body table tr{background-color:#fff;border-top:1px solid #c6cbd1}.markdown-body table tr:nth-child(2n){background-color:#f6f8fa}.markdown-body table img{background-color:transparent}.markdown-body img{max-width:100%;box-sizing:content-box;background-color:#fff}.markdown-body img[align=right]{padding-left:20px}.markdown-body img[align=left]{padding-right:20px}.markdown-body .emoji{max-width:none;vertical-align:text-top;background-color:transparent}.markdown-body span.frame{display:block;overflow:hidden}.markdown-body span.frame>span{display:block;float:left;width:auto;padding:7px;margin:13px 0 0;overflow:hidden;border:1px solid #dfe2e5}.markdown-body span.frame span img{display:block;float:left}.markdown-body span.frame span span{display:block;padding:5px 0 0;clear:both;color:#24292e}.markdown-body span.align-center{display:block;overflow:hidden;clear:both}.markdown-body span.align-center>span{display:block;margin:13px auto 0;overflow:hidden;text-align:center}.markdown-body span.align-center span img{margin:0 auto;text-align:center}.markdown-body span.align-right{display:block;overflow:hidden;clear:both}.markdown-body span.align-right>span{display:block;margin:13px 0 0;overflow:hidden;text-align:right}.markdown-body span.align-right span img{margin:0;text-align:right}.markdown-body span.float-left{display:block;float:left;margin-right:13px;overflow:hidden}.markdown-body span.float-left span{margin:13px 0 0}.markdown-body span.float-right{display:block;float:right;margin-left:13px;overflow:hidden}.markdown-body span.float-right>span{display:block;margin:13px auto 0;overflow:hidden;text-align:right}.markdown-body code,.markdown-body tt{padding:0.2em 0.4em;margin:0;font-size:85%;background-color:rgba(27,31,35,0.05);border-radius:3px}.markdown-body code br,.markdown-body tt br{display:none}.markdown-body del code{text-decoration:inherit}.markdown-body pre{word-wrap:normal}.markdown-body pre>code{padding:0;margin:0;font-size:100%;word-break:normal;white-space:pre;background:transparent;border:0}.markdown-body .highlight{margin-bottom:16px}.markdown-body .highlight pre{margin-bottom:0;word-break:normal}.markdown-body .highlight pre,.markdown-body pre{padding:16px;overflow:auto;font-size:85%;line-height:1.45;background-color:#f6f8fa;border-radius:3px}.markdown-body pre code,.markdown-body pre tt{display:inline;max-width:auto;padding:0;margin:0;overflow:visible;line-height:inherit;word-wrap:normal;background-color:transparent;border:0}.markdown-body .csv-data td,.markdown-body .csv-data th{padding:5px;overflow:hidden;font-size:12px;line-height:1;text-align:left;white-space:nowrap}.markdown-body .csv-data .blob-num{padding:10px 8px 9px;text-align:right;background:#fff;border:0}.markdown-body .csv-data tr{border-top:0}.markdown-body .csv-data th{font-weight:600;background:#f6f8fa;border-top:0}.highlight table td{padding:5px}.highlight table pre{margin:0}.highlight 
.cm{color:#999988;font-style:italic}.highlight .cp{color:#999999;font-weight:bold}.highlight .c1{color:#999988;font-style:italic}.highlight .cs{color:#999999;font-weight:bold;font-style:italic}.highlight .c,.highlight .cd{color:#999988;font-style:italic}.highlight .err{color:#a61717;background-color:#e3d2d2}.highlight .gd{color:#000000;background-color:#ffdddd}.highlight .ge{color:#000000;font-style:italic}.highlight .gr{color:#aa0000}.highlight .gh{color:#999999}.highlight .gi{color:#000000;background-color:#ddffdd}.highlight .go{color:#888888}.highlight .gp{color:#555555}.highlight .gs{font-weight:bold}.highlight .gu{color:#aaaaaa}.highlight .gt{color:#aa0000}.highlight .kc{color:#000000;font-weight:bold}.highlight .kd{color:#000000;font-weight:bold}.highlight .kn{color:#000000;font-weight:bold}.highlight .kp{color:#000000;font-weight:bold}.highlight .kr{color:#000000;font-weight:bold}.highlight .kt{color:#445588;font-weight:bold}.highlight .k,.highlight .kv{color:#000000;font-weight:bold}.highlight .mf{color:#009999}.highlight .mh{color:#009999}.highlight .il{color:#009999}.highlight .mi{color:#009999}.highlight .mo{color:#009999}.highlight .m,.highlight .mb,.highlight .mx{color:#009999}.highlight .sb{color:#d14}.highlight .sc{color:#d14}.highlight .sd{color:#d14}.highlight .s2{color:#d14}.highlight .se{color:#d14}.highlight .sh{color:#d14}.highlight .si{color:#d14}.highlight .sx{color:#d14}.highlight .sr{color:#009926}.highlight .s1{color:#d14}.highlight .ss{color:#990073}.highlight .s{color:#d14}.highlight .na{color:#008080}.highlight .bp{color:#999999}.highlight .nb{color:#0086B3}.highlight .nc{color:#445588;font-weight:bold}.highlight .no{color:#008080}.highlight .nd{color:#3c5d5d;font-weight:bold}.highlight .ni{color:#800080}.highlight .ne{color:#990000;font-weight:bold}.highlight .nf{color:#990000;font-weight:bold}.highlight .nl{color:#990000;font-weight:bold}.highlight .nn{color:#555555}.highlight .nt{color:#000080}.highlight .vc{color:#008080}.highlight .vg{color:#008080}.highlight .vi{color:#008080}.highlight .nv{color:#008080}.highlight .ow{color:#000000;font-weight:bold}.highlight .o{color:#000000;font-weight:bold}.highlight .w{color:#bbbbbb}.highlight{background-color:#f8f8f8} diff --git a/assets/ecosystem-posters/ Bornstein-Pytorch Community Days Lightning Draft-v2.png b/assets/ecosystem-posters/ Bornstein-Pytorch Community Days Lightning Draft-v2.png new file mode 100644 index 000000000000..2946a8dd1651 Binary files /dev/null and b/assets/ecosystem-posters/ Bornstein-Pytorch Community Days Lightning Draft-v2.png differ diff --git a/assets/ecosystem-posters/f b/assets/ecosystem-posters/f new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/assets/ecosystem-posters/f @@ -0,0 +1 @@ + diff --git a/assets/external-links-new-tab.js b/assets/external-links-new-tab.js new file mode 100644 index 000000000000..1629c88394fe --- /dev/null +++ b/assets/external-links-new-tab.js @@ -0,0 +1,7 @@ +var links = document.links; + +for (var i = 0; i < links.length; i++) { + if (links[i].hostname != window.location.hostname) { + links[i].target = '_blank'; + } +} \ No newline at end of file diff --git a/assets/filter-hub-tags.js b/assets/filter-hub-tags.js new file mode 100644 index 000000000000..65e59f0339d0 --- /dev/null +++ b/assets/filter-hub-tags.js @@ -0,0 +1,103 @@ +var filterScript = $("script[src*=filter-hub-tags]"); +var listId = filterScript.attr("list-id"); +var displayCount = Number(filterScript.attr("display-count")); +var pagination = 
filterScript.attr("pagination"); + +var options = { + valueNames: ["github-stars-count-whole-number", { data: ["tags", "date-added", "title"] }], + page: displayCount +}; + +$(".next-news-item").on("click" , function(){ + $(".pagination").find(".active").next().trigger( "click" ); +}); + +$(".previous-news-item").on("click" , function(){ + $(".pagination").find(".active").prev().trigger( "click" ); +}); + +// Only the hub index page should have pagination + +if (pagination == "true") { + options.pagination = true; +} + +var hubList = new List(listId, options); + +function filterSelectedTags(cardTags, selectedTags) { + return cardTags.some(function(tag) { + return selectedTags.some(function(selectedTag) { + return selectedTag == tag; + }); + }); +} + +function updateList() { + var selectedTags = []; + + $(".selected").each(function() { + selectedTags.push($(this).data("tag")); + }); + + hubList.filter(function(item) { + var cardTags = item.values().tags.split(","); + + if (selectedTags.length == 0) { + return true; + } else { + return filterSelectedTags(cardTags, selectedTags); + } + }); +} + +$(".filter-btn").on("click", function() { + if ($(this).data("tag") == "all") { + $(this).addClass("all-tag-selected"); + $(".filter").removeClass("selected"); + } else { + $(this).toggleClass("selected"); + $("[data-tag='all']").removeClass("all-tag-selected"); + } + + // If no tags are selected then highlight the 'All' tag + + if (!$(".selected")[0]) { + $("[data-tag='all']").addClass("all-tag-selected"); + } + + updateList(); +}); + +//Scroll back to top of hub cards on click of next/previous page button + +$(document).on("click", ".page", function(e) { + e.preventDefault(); + $('html, body').animate( + {scrollTop: $("#pagination-scroll").position().top}, + 'slow' + ); +}); + +$("#sortLowLeft").on("click", function() { + hubList.sort("github-stars-count-whole-number", { order: "asc" }); +}); + +$("#sortHighLeft").on("click", function() { + hubList.sort("github-stars-count-whole-number", { order: "desc" }); +}); + +$("#sortDateNew").on("click", function() { + hubList.sort("date-added", { order: "desc" }); +}); + +$("#sortDateOld").on("click", function() { + hubList.sort("date-added", { order: "asc" }); +}); + +$("#sortTitleLow").on("click", function() { + hubList.sort("title", { order: "desc" }); +}); + +$("#sortTitleHigh").on("click", function() { + hubList.sort("title", { order: "asc" }); +}); diff --git a/assets/fonts/FreightSans/freight-sans-bold-italic.woff b/assets/fonts/FreightSans/freight-sans-bold-italic.woff new file mode 100755 index 000000000000..e317248423c7 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-bold-italic.woff differ diff --git a/assets/fonts/FreightSans/freight-sans-bold-italic.woff2 b/assets/fonts/FreightSans/freight-sans-bold-italic.woff2 new file mode 100755 index 000000000000..cec2dc94fbb5 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-bold-italic.woff2 differ diff --git a/assets/fonts/FreightSans/freight-sans-bold.woff b/assets/fonts/FreightSans/freight-sans-bold.woff new file mode 100755 index 000000000000..de46625edfc8 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-bold.woff differ diff --git a/assets/fonts/FreightSans/freight-sans-bold.woff2 b/assets/fonts/FreightSans/freight-sans-bold.woff2 new file mode 100755 index 000000000000..dc05cd82bc4d Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-bold.woff2 differ diff --git a/assets/fonts/FreightSans/freight-sans-book-italic.woff 
b/assets/fonts/FreightSans/freight-sans-book-italic.woff new file mode 100755 index 000000000000..a50e5038a405 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-book-italic.woff differ diff --git a/assets/fonts/FreightSans/freight-sans-book-italic.woff2 b/assets/fonts/FreightSans/freight-sans-book-italic.woff2 new file mode 100755 index 000000000000..fe284db6614a Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-book-italic.woff2 differ diff --git a/assets/fonts/FreightSans/freight-sans-book.woff b/assets/fonts/FreightSans/freight-sans-book.woff new file mode 100755 index 000000000000..6ab8775f00b1 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-book.woff differ diff --git a/assets/fonts/FreightSans/freight-sans-book.woff2 b/assets/fonts/FreightSans/freight-sans-book.woff2 new file mode 100755 index 000000000000..2688739f1f0b Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-book.woff2 differ diff --git a/assets/fonts/FreightSans/freight-sans-light-italic.woff b/assets/fonts/FreightSans/freight-sans-light-italic.woff new file mode 100755 index 000000000000..beda58d4e218 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-light-italic.woff differ diff --git a/assets/fonts/FreightSans/freight-sans-light-italic.woff2 b/assets/fonts/FreightSans/freight-sans-light-italic.woff2 new file mode 100755 index 000000000000..e2fa0134b1a5 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-light-italic.woff2 differ diff --git a/assets/fonts/FreightSans/freight-sans-light.woff b/assets/fonts/FreightSans/freight-sans-light.woff new file mode 100755 index 000000000000..226a0bf83583 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-light.woff differ diff --git a/assets/fonts/FreightSans/freight-sans-light.woff2 b/assets/fonts/FreightSans/freight-sans-light.woff2 new file mode 100755 index 000000000000..6d8ff2c045b0 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-light.woff2 differ diff --git a/assets/fonts/FreightSans/freight-sans-medium-italic.woff b/assets/fonts/FreightSans/freight-sans-medium-italic.woff new file mode 100644 index 000000000000..a42115d63b39 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-medium-italic.woff differ diff --git a/assets/fonts/FreightSans/freight-sans-medium-italic.woff2 b/assets/fonts/FreightSans/freight-sans-medium-italic.woff2 new file mode 100644 index 000000000000..16a7713a451a Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-medium-italic.woff2 differ diff --git a/assets/fonts/FreightSans/freight-sans-medium.woff b/assets/fonts/FreightSans/freight-sans-medium.woff new file mode 100755 index 000000000000..5ea34539c6f5 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-medium.woff differ diff --git a/assets/fonts/FreightSans/freight-sans-medium.woff2 b/assets/fonts/FreightSans/freight-sans-medium.woff2 new file mode 100755 index 000000000000..c58b6a528bb6 Binary files /dev/null and b/assets/fonts/FreightSans/freight-sans-medium.woff2 differ diff --git a/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff b/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff new file mode 100644 index 000000000000..cf37a5c50bdb Binary files /dev/null and b/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff differ diff --git a/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff2 b/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff2 new file mode 100644 index 000000000000..955a6eab5bb8 Binary files /dev/null and 
b/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff2 differ diff --git a/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff b/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff new file mode 100644 index 000000000000..fc65a679c226 Binary files /dev/null and b/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff differ diff --git a/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2 b/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2 new file mode 100644 index 000000000000..c352e40e34a3 Binary files /dev/null and b/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2 differ diff --git a/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff b/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff new file mode 100644 index 000000000000..7d63d89f24bc Binary files /dev/null and b/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff differ diff --git a/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2 b/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2 new file mode 100644 index 000000000000..d0d7ded90791 Binary files /dev/null and b/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2 differ diff --git a/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff b/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff new file mode 100644 index 000000000000..1da7753cf283 Binary files /dev/null and b/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff differ diff --git a/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2 b/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2 new file mode 100644 index 000000000000..79dffdb85f74 Binary files /dev/null and b/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2 differ diff --git a/assets/get-started-sidebar.js b/assets/get-started-sidebar.js new file mode 100644 index 000000000000..0e48cda801aa --- /dev/null +++ b/assets/get-started-sidebar.js @@ -0,0 +1,91 @@ +// Create the sidebar menus for each OS and Cloud Partner + +$([".macos", ".linux", ".windows"]).each(function(index, osClass) { + buildSidebarMenu(osClass, "#get-started-locally-sidebar-list"); +}); + +$([".alibaba", ".aws", ".microsoft-azure", ".google-cloud", ".lightning-studios"]).each(function(index, cloudPartner) { + buildSidebarMenu(cloudPartner, "#get-started-cloud-sidebar-list"); +}); + +$(["macos", "linux", "windows"]).each(function(index, osClass) { + $("#" + osClass).on("click", function() { + showSidebar(osClass, ".get-started-locally-sidebar li"); + }); +}); + +// Show cloud partner side nav on click or hide side nav if already open +$(["alibaba", "aws", "microsoft-azure", "google-cloud", "lightning-studios"]).each(function(index, sidebarClass) { + $("#" + sidebarClass).click(function() { + showSidebar(sidebarClass, ".get-started-cloud-sidebar li"); + // alibaba filter for centering cloud module + if (sidebarClass == "alibaba") { + $(".article-wrapper").parent().removeClass("col-md-8 offset-md-1").addClass("col-md-12"); + $(".cloud-nav").hide(); + } else { + $(".article-wrapper").parent().removeClass("col-md-12").addClass("col-md-8 offset-md-1"); + $(".cloud-nav").show(); + } + if ($("#" + sidebarClass).parent().hasClass("open")) { + $(".get-started-cloud-sidebar li").hide(); + $(".cloud-nav").hide(); + $(".article-wrapper").parent().removeClass("col-md-8 offset-md-1").addClass("col-md-12"); + } + }) +}); + +function buildSidebarMenu(menuClass, menuItem) { + $(menuClass + " > h2," + menuClass + " > h3").each(function(index, element) { + menuClass = menuClass.replace(".", ""); + + // If the menu item is an H3 tag then it should be indented + var indentMenuItem = $(element).get(0).tagName == "H3" ? 
"subitem" : ""; + + // Combine the menu item classes + var menuItemClasses = [menuClass, indentMenuItem].join(" "); + + $(menuItem).append( + "" + ); + }); +} + +function showSidebar(selectedClass, menuItem) { + // Hide all of the menu items at first + // Then filter for the selected OS/cloud partner + $(menuItem) + .hide() + .filter(function() { + return $(this) + .attr("class") + .includes(selectedClass); + }) + .show(); +} + +$(".get-started-locally-sidebar li").on("click", function() { + removeActiveClass(); + addActiveClass(this); +}); + +function removeActiveClass() { + $(".get-started-locally-sidebar li a").each(function() { + $(this).removeClass("active"); + }); +} + +function addActiveClass(element) { + $(element) + .find("a") + .addClass("active"); +} + +if ($("#get-started-locally-sidebar-list").text() == "") { + $("#get-started-shortcuts-menu").hide(); +} diff --git a/assets/github-stars.js b/assets/github-stars.js new file mode 100644 index 000000000000..8f2d994927f0 --- /dev/null +++ b/assets/github-stars.js @@ -0,0 +1,79 @@ +var githubStarsScript = $("script[src*=github-stars]"); +var starCountCallDate = githubStarsScript.attr("star-count-call-date"); +var starCountData = githubStarsScript.attr("star-count-data"); +var ecosystemStars = githubStarsScript.attr("ecosystem"); +var cloudfrontUrl = ""; + +if (ecosystemStars == "true") { + cloudfrontUrl = "https://d2ze5o8gurgoho.cloudfront.net/star-count"; +} +else { + cloudfrontUrl = "https://du4l4liqvfo92.cloudfront.net/star-count"; +} + +var today = new Date(); +var starCountCallDateParsed = new Date( + parseInt(localStorage.getItem(starCountCallDate), 10) +); + +if ( + Date.parse(today) > + starCountCallDateParsed.setDate(starCountCallDateParsed.getDate() + 7) || + localStorage.getItem(starCountCallDate) == null +) { + updateStarCount(); +} else { + useLocalStorageStarCount(); +} + +function updateStarCount() { + console.log("Updated star count fetched"); + $.getJSON(cloudfrontUrl, function (data) { + localStorage.setItem(starCountCallDate, Date.parse(today)); + localStorage.setItem(starCountData, JSON.stringify(data)); + + updateStarsOnPage(data); + }); +} + +function useLocalStorageStarCount() { + var data = JSON.parse(localStorage.getItem(starCountData)); + + updateStarsOnPage(data); +} + +// Loop through each card and add the star count +// Once each card has its star count then the pagination script is added + +function updateStarsOnPage(data) { + return new Promise(function (resolve, reject) { + for (var i = 0; i < data.length; i++) { + var starCount = data[i].stars; + if (starCount > 999) { + starCount = numeral(starCount).format("0.0a"); + } else if (starCount > 9999) { + starCount = numeral(starCount).format("0.00a"); + } + $("[data-id='" + data[i].id + "'] .github-stars-count-whole-number").html(data[i].stars); + $("[data-id='" + data[i].id + "'] .github-stars-count").html(starCount); + } + resolve( + $("#filter-script").html(addFilterScript()) + ); + }); +} + +function addFilterScript() { + var data = $("#filter-script").data(); + + var script = + ""; + + return script; +} diff --git a/assets/hub-buttons.js b/assets/hub-buttons.js new file mode 100644 index 000000000000..ce7de59432ee --- /dev/null +++ b/assets/hub-buttons.js @@ -0,0 +1,40 @@ +var numberOfCardsToShow = 3; + +$(".cards-left > .col-md-12, .cards-right > .col-md-12") + .filter(function() { + return $(this).attr("data-item-count") > numberOfCardsToShow; + }) + .hide(); + +$("#development-models").on("click", function() { + showCards(this, 
"#development-models-hide", ".cards-right > .col-md-12"); +}); + +$("#development-models-hide").on("click", function() { + hideCards(this, "#development-models", ".cards-right > .col-md-12"); +}); + +$("#research-models").on("click", function() { + showCards(this, "#research-models-hide", ".cards-left > .col-md-12"); +}); + +$("#research-models-hide").on("click", function() { + hideCards(this, "#research-models", ".cards-left > .col-md-12"); +}); + +function showCards(buttonToHide, buttonToShow, cardsWrapper) { + $(buttonToHide).hide(); + $(buttonToShow) + .add(cardsWrapper) + .show(); +} + +function hideCards(buttonToHide, buttonToShow, cardsWrapper) { + $(buttonToHide).hide(); + $(buttonToShow).show(); + $(cardsWrapper) + .filter(function() { + return $(this).attr("data-item-count") > numberOfCardsToShow; + }) + .hide(); +} diff --git a/assets/hub-detail.js b/assets/hub-detail.js new file mode 100644 index 000000000000..89496b5e1cb3 --- /dev/null +++ b/assets/hub-detail.js @@ -0,0 +1,7 @@ +// Hide broken images that appear on the hub detail page. + +$(".featured-image").each(function() { + if ($(this).data("image-name") == "no-image") { + $(this).hide(); + } +}); diff --git a/assets/hub-search-bar.js b/assets/hub-search-bar.js new file mode 100644 index 000000000000..5e9433eccd50 --- /dev/null +++ b/assets/hub-search-bar.js @@ -0,0 +1,49 @@ +docsearch({ + apiKey: "e3b73ac141dff0b0fd27bdae9055bc73", + indexName: "pytorch", + inputSelector: "#hub-search-input", + algoliaOptions: { facetFilters: ["tags:hub"] }, + debug: false // Set debug to true if you want to inspect the dropdown +}); + +$("#hub-search-icon").on("click", function() { + $(this).hide(); + $("#hub-icons").hide(); + $("#hub-close-search").fadeIn("slow"); + $(".hub-divider").addClass("active-hub-divider"); + $("#hub-search-input") + .show() + .css("background-color", "#CCCDD1") + .focus(); + $(".hub-search-wrapper, .hub-tags-container").addClass("active"); + $("#dropdown-filter-tags").hide(); +}); + +function hideHubSearch(searchIcon) { + $(searchIcon).hide(); + + $("#hub-search-icon, #dropdown-filter-tags").fadeIn("slow"); + $("#hub-icons").fadeIn("slow"); + $("#hub-search-input") + .fadeOut("slow") + .css("background-color", "#f3f4f7"); + $(".hub-divider").removeClass("active-hub-divider"); + $("#hub-search-input") + .removeClass("active-search-icon") + .val(""); + $(".hub-search-wrapper, .hub-tags-container").removeClass("active"); +} + +$("#hub-close-search").on("click", function() { + hideHubSearch(this); +}); + +$(document).click(function(event) { + $target = $(event.target); + if ( + !$target.closest(".hub-search-wrapper").length && + $(".hub-search-wrapper").is(":visible") + ) { + hideHubSearch("#hub-close-search"); + } +}); diff --git a/assets/hub-sort.js b/assets/hub-sort.js new file mode 100644 index 000000000000..e7e117a5a09c --- /dev/null +++ b/assets/hub-sort.js @@ -0,0 +1,31 @@ +var $wrapper = $(".cards-right"); +var $leftWrapper = $(".cards-left"); + +$("#sortLow").on("click", function() { + sorter("low", $wrapper); +}); + +$("#sortHigh").on("click", function() { + sorter("high", $wrapper); +}); + +$("#sortLowLeft").on("click", function() { + sorter("low", $leftWrapper); +}); + +$("#sortHighLeft").on("click", function() { + sorter("high", $leftWrapper); +}); + +function sorter(type, wrapper) { + wrapper + .find(".col-md-12") + .sort(function(a, b) { + if (type == "high") { + return b.dataset.count - a.dataset.count; + } else { + return a.dataset.count - b.dataset.count; + } + }) + .appendTo(wrapper); +} diff 
--git a/assets/hub/CODE_OF_CONDUCT.ipynb b/assets/hub/CODE_OF_CONDUCT.ipynb new file mode 100644 index 000000000000..363fcab7ed6e --- /dev/null +++ b/assets/hub/CODE_OF_CONDUCT.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/CONTRIBUTING.ipynb b/assets/hub/CONTRIBUTING.ipynb new file mode 100644 index 000000000000..363fcab7ed6e --- /dev/null +++ b/assets/hub/CONTRIBUTING.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/datvuthanh_hybridnets.ipynb b/assets/hub/datvuthanh_hybridnets.ipynb new file mode 100644 index 000000000000..8afb27b0135d --- /dev/null +++ b/assets/hub/datvuthanh_hybridnets.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bab3a91d", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# HybridNets\n", + "\n", + "*Author: Dat Vu Thanh*\n", + "\n", + "**HybridNets - End2End Perception Network**\n", + "\n", + "## Before You Start\n", + "\n", + "Start from a **Python>=3.7** environment with **PyTorch>=1.10** installed. To install PyTorch see [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/). To install HybridNets dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9be78eea", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install -qr https://raw.githubusercontent.com/datvuthanh/HybridNets/main/requirements.txt # install dependencies" + ] + }, + { + "cell_type": "markdown", + "id": "e218345e", + "metadata": {}, + "source": [ + "## Model Description\n", + " \n", + " \n", + "\n", + "HybridNets is an end2end perception network for multi-tasks. Our work focused on traffic object detection, drivable area segmentation and lane detection. 
HybridNets can run real-time on embedded systems, and obtains SOTA Object Detection, Lane Detection on BDD100K Dataset.\n", + "\n", + "### Results\n", + "\n", + "### Traffic Object Detection\n", + "\n", + "| Model | Recall (%) | mAP@0.5 (%) |\n", + "|:------------------:|:------------:|:---------------:|\n", + "| `MultiNet` | 81.3 | 60.2 |\n", + "| `DLT-Net` | 89.4 | 68.4 |\n", + "| `Faster R-CNN` | 77.2 | 55.6 |\n", + "| `YOLOv5s` | 86.8 | 77.2 |\n", + "| `YOLOP` | 89.2 | 76.5 |\n", + "| **`HybridNets`** | **92.8** | **77.3** |\n", + "\n", + "\n", + " \n", + "### Drivable Area Segmentation\n", + "\n", + "| Model | Drivable mIoU (%) |\n", + "|:----------------:|:-----------------:|\n", + "| `MultiNet` | 71.6 |\n", + "| `DLT-Net` | 71.3 |\n", + "| `PSPNet` | 89.6 |\n", + "| `YOLOP` | 91.5 |\n", + "| **`HybridNets`** | **90.5** |\n", + "\n", + "\n", + " \n", + "### Lane Line Detection\n", + "\n", + "| Model | Accuracy (%) | Lane Line IoU (%) |\n", + "|:----------------:|:------------:|:-----------------:|\n", + "| `Enet` | 34.12 | 14.64 |\n", + "| `SCNN` | 35.79 | 15.84 |\n", + "| `Enet-SAD` | 36.56 | 16.02 |\n", + "| `YOLOP` | 70.5 | 26.2 |\n", + "| **`HybridNets`** | **85.4** | **31.6** |\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "### Load From PyTorch Hub\n", + "\n", + "This example loads the pretrained **HybridNets** model and passes an image for inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b94d3f8", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# load model\n", + "model = torch.hub.load('datvuthanh/hybridnets', 'hybridnets', pretrained=True)\n", + "\n", + "#inference\n", + "img = torch.randn(1,3,640,384)\n", + "features, regression, classification, anchors, segmentation = model(img)" + ] + }, + { + "cell_type": "markdown", + "id": "1fdb0bae", + "metadata": {}, + "source": [ + "### Citation\n", + "\n", + "If you find our [paper](https://arxiv.org/abs/2203.09035) and [code](https://github.com/datvuthanh/HybridNets) useful for your research, please consider giving a star and citation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76552e35", + "metadata": { + "attributes": { + "classes": [ + "BibTeX" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@misc{vu2022hybridnets,\n", + " title={HybridNets: End-to-End Perception Network}, \n", + " author={Dat Vu and Bao Ngo and Hung Phan},\n", + " year={2022},\n", + " eprint={2203.09035},\n", + " archivePrefix={arXiv},\n", + " primaryClass={cs.CV}\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_WSL-Images_resnext.ipynb b/assets/hub/facebookresearch_WSL-Images_resnext.ipynb new file mode 100644 index 000000000000..874d8fe2b6c3 --- /dev/null +++ b/assets/hub/facebookresearch_WSL-Images_resnext.ipynb @@ -0,0 +1,128 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8ef1d490", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNext WSL\n", + "\n", + "*Author: Facebook AI*\n", + "\n", + "**ResNext models trained with billion scale weakly-supervised data.**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "4cd8dad2", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl')\n", + "# or\n", + "# model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x16d_wsl')\n", + "# or\n", + "# model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x32d_wsl')\n", + "# or\n", + "#model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x48d_wsl')\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "5a74a046", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96400b56", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a5a0f9c", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "print(torch.nn.functional.softmax(output[0], dim=0))\n" + ] + }, + { + "cell_type": "markdown", + "id": "1ab881a6", + "metadata": {}, + "source": [ + "### Model Description\n", + "The provided ResNeXt models are pre-trained in weakly-supervised fashion on **940 million** public images with 1.5K hashtags matching with 1000 ImageNet1K synsets, followed by fine-tuning on ImageNet1K dataset. Please refer to \"Exploring the Limits of Weakly Supervised Pretraining\" (https://arxiv.org/abs/1805.00932) presented at ECCV 2018 for the details of model training.\n", + "\n", + "We are providing 4 models with different capacities.\n", + "\n", + "| Model | #Parameters | FLOPS | Top-1 Acc. | Top-5 Acc. 
|\n", + "| ------------------ | :---------: | :---: | :--------: | :--------: |\n", + "| ResNeXt-101 32x8d | 88M | 16B | 82.2 | 96.4 |\n", + "| ResNeXt-101 32x16d | 193M | 36B | 84.2 | 97.2 |\n", + "| ResNeXt-101 32x32d | 466M | 87B | 85.1 | 97.5 |\n", + "| ResNeXt-101 32x48d | 829M | 153B | 85.4 | 97.6 |\n", + "\n", + "Our models significantly improve the training accuracy on ImageNet compared to training from scratch. **We achieve state-of-the-art accuracy of 85.4% on ImageNet with our ResNext-101 32x48d model.**\n", + "\n", + "### References\n", + "\n", + " - [Exploring the Limits of Weakly Supervised Pretraining](https://arxiv.org/abs/1805.00932)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_pytorch-gan-zoo_dcgan.ipynb b/assets/hub/facebookresearch_pytorch-gan-zoo_dcgan.ipynb new file mode 100644 index 000000000000..85ac76ff49f2 --- /dev/null +++ b/assets/hub/facebookresearch_pytorch-gan-zoo_dcgan.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1b258978", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# DCGAN on FashionGen\n", + "\n", + "*Author: FAIR HDGAN*\n", + "\n", + "**A simple generative image model for 64x64 images**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c32ce03d", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "use_gpu = True if torch.cuda.is_available() else False\n", + "\n", + "model = torch.hub.load('facebookresearch/pytorch_GAN_zoo:hub', 'DCGAN', pretrained=True, useGPU=use_gpu)" + ] + }, + { + "cell_type": "markdown", + "id": "262b4ccb", + "metadata": {}, + "source": [ + "The input to the model is a noise vector of shape `(N, 120)` where `N` is the number of images to be generated.\n", + "It can be constructed using the function `.buildNoiseData`.\n", + "The model has a `.test` function that takes in the noise vector and generates images." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6c00225", + "metadata": {}, + "outputs": [], + "source": [ + "num_images = 64\n", + "noise, _ = model.buildNoiseData(num_images)\n", + "with torch.no_grad():\n", + " generated_images = model.test(noise)\n", + "\n", + "# let's plot these images using torchvision and matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import torchvision\n", + "plt.imshow(torchvision.utils.make_grid(generated_images).permute(1, 2, 0).cpu().numpy())\n", + "# plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "12cb57e2", + "metadata": {}, + "source": [ + "You should see an image similar to the one on the left.\n", + "\n", + "If you want to train your own DCGAN and other GANs from scratch, have a look at [PyTorch GAN Zoo](https://github.com/facebookresearch/pytorch_GAN_zoo).\n", + "\n", + "### Model Description\n", + "\n", + "In computer vision, generative models are networks trained to create images from a given input. In our case, we consider a specific kind of generative networks: GANs (Generative Adversarial Networks) which learn to map a random vector with a realistic image generation.\n", + "\n", + "DCGAN is a model designed in 2015 by Radford et. al. 
in the paper [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434). It is a GAN architecture both very simple and efficient for low resolution image generation (up to 64x64).\n", + "\n", + "\n", + "\n", + "### Requirements\n", + "\n", + "- Currently only supports Python 3\n", + "\n", + "### References\n", + "\n", + "- [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_pytorch-gan-zoo_pgan.ipynb b/assets/hub/facebookresearch_pytorch-gan-zoo_pgan.ipynb new file mode 100644 index 000000000000..26c0c359e205 --- /dev/null +++ b/assets/hub/facebookresearch_pytorch-gan-zoo_pgan.ipynb @@ -0,0 +1,103 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "101b74b4", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Progressive Growing of GANs (PGAN)\n", + "\n", + "*Author: FAIR HDGAN*\n", + "\n", + "**High-quality image generation of fashion, celebrity faces**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/pgan_mix.jpg) | ![alt](https://pytorch.org/assets/images/pgan_celebaHQ.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e69f3757", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "use_gpu = True if torch.cuda.is_available() else False\n", + "\n", + "# trained on high-quality celebrity faces \"celebA\" dataset\n", + "# this model outputs 512 x 512 pixel images\n", + "model = torch.hub.load('facebookresearch/pytorch_GAN_zoo:hub',\n", + " 'PGAN', model_name='celebAHQ-512',\n", + " pretrained=True, useGPU=use_gpu)\n", + "# this model outputs 256 x 256 pixel images\n", + "# model = torch.hub.load('facebookresearch/pytorch_GAN_zoo:hub',\n", + "# 'PGAN', model_name='celebAHQ-256',\n", + "# pretrained=True, useGPU=use_gpu)" + ] + }, + { + "cell_type": "markdown", + "id": "5d21bcb3", + "metadata": {}, + "source": [ + "The input to the model is a noise vector of shape `(N, 512)` where `N` is the number of images to be generated.\n", + "It can be constructed using the function `.buildNoiseData`.\n", + "The model has a `.test` function that takes in the noise vector and generates images." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd6247e2", + "metadata": {}, + "outputs": [], + "source": [ + "num_images = 4\n", + "noise, _ = model.buildNoiseData(num_images)\n", + "with torch.no_grad():\n", + " generated_images = model.test(noise)\n", + "\n", + "# let's plot these images using torchvision and matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import torchvision\n", + "grid = torchvision.utils.make_grid(generated_images.clamp(min=-1, max=1), scale_each=True, normalize=True)\n", + "plt.imshow(grid.permute(1, 2, 0).cpu().numpy())\n", + "# plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "d38bb5f5", + "metadata": {}, + "source": [ + "You should see an image similar to the one on the left.\n", + "\n", + "If you want to train your own Progressive GAN and other GANs from scratch, have a look at [PyTorch GAN Zoo](https://github.com/facebookresearch/pytorch_GAN_zoo).\n", + "\n", + "### Model Description\n", + "\n", + "In computer vision, generative models are networks trained to create images from a given input. In our case, we consider a specific kind of generative networks: GANs (Generative Adversarial Networks) which learn to map a random vector with a realistic image generation.\n", + "\n", + "Progressive Growing of GANs is a method developed by Karras et. al. [1] in 2017 allowing generation of high resolution images. To do so, the generative network is trained slice by slice. At first the model is trained to build very low resolution images, once it converges, new layers are added and the output resolution doubles. The process continues until the desired resolution is reached.\n", + "\n", + "### Requirements\n", + "\n", + "- Currently only supports Python 3\n", + "\n", + "### References\n", + "\n", + "[1] Tero Karras et al, \"Progressive Growing of GANs for Improved Quality, Stability, and Variation\" https://arxiv.org/abs/1710.10196" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_pytorchvideo_resnet.ipynb b/assets/hub/facebookresearch_pytorchvideo_resnet.ipynb new file mode 100644 index 000000000000..4b4641722270 --- /dev/null +++ b/assets/hub/facebookresearch_pytorchvideo_resnet.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e4f99e2c", + "metadata": {}, + "source": [ + "# 3D ResNet\n", + "\n", + "*Author: FAIR PyTorchVideo*\n", + "\n", + "**Resnet Style Video classification networks pretrained on the Kinetics 400 dataset**\n", + "\n", + "\n", + "### Example Usage\n", + "\n", + "#### Imports\n", + "\n", + "Load the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96affaa2", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# Choose the `slow_r50` model \n", + "model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)" + ] + }, + { + "cell_type": "markdown", + "id": "1d0359d9", + "metadata": {}, + "source": [ + "Import remaining functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab84506f", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import urllib\n", + "from pytorchvideo.data.encoded_video import EncodedVideo\n", + "\n", + "from torchvision.transforms import Compose, Lambda\n", + "from torchvision.transforms._transforms_video import (\n", + " CenterCropVideo,\n", + " NormalizeVideo,\n", + ")\n", + "from pytorchvideo.transforms import (\n", + " ApplyTransformToKey,\n", + " ShortSideScale,\n", + " 
UniformTemporalSubsample\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "06976792", + "metadata": {}, + "source": [ + "#### Setup\n", + "\n", + "Set the model to eval mode and move to desired device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "680df0e7", + "metadata": { + "attributes": { + "classes": [ + "python " + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "# Set to GPU or CPU\n", + "device = \"cpu\"\n", + "model = model.eval()\n", + "model = model.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "68096afb", + "metadata": {}, + "source": [ + "Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c1eaa3c", + "metadata": {}, + "outputs": [], + "source": [ + "json_url = \"https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json\"\n", + "json_filename = \"kinetics_classnames.json\"\n", + "try: urllib.URLopener().retrieve(json_url, json_filename)\n", + "except: urllib.request.urlretrieve(json_url, json_filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "134f9719", + "metadata": {}, + "outputs": [], + "source": [ + "with open(json_filename, \"r\") as f:\n", + " kinetics_classnames = json.load(f)\n", + "\n", + "# Create an id to label name mapping\n", + "kinetics_id_to_classname = {}\n", + "for k, v in kinetics_classnames.items():\n", + " kinetics_id_to_classname[v] = str(k).replace('\"', \"\")" + ] + }, + { + "cell_type": "markdown", + "id": "b53cb1e8", + "metadata": {}, + "source": [ + "#### Define input transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f317a15", + "metadata": {}, + "outputs": [], + "source": [ + "side_size = 256\n", + "mean = [0.45, 0.45, 0.45]\n", + "std = [0.225, 0.225, 0.225]\n", + "crop_size = 256\n", + "num_frames = 8\n", + "sampling_rate = 8\n", + "frames_per_second = 30\n", + "\n", + "# Note that this transform is specific to the slow_R50 model.\n", + "transform = ApplyTransformToKey(\n", + " key=\"video\",\n", + " transform=Compose(\n", + " [\n", + " UniformTemporalSubsample(num_frames),\n", + " Lambda(lambda x: x/255.0),\n", + " NormalizeVideo(mean, std),\n", + " ShortSideScale(\n", + " size=side_size\n", + " ),\n", + " CenterCropVideo(crop_size=(crop_size, crop_size))\n", + " ]\n", + " ),\n", + ")\n", + "\n", + "# The duration of the input clip is also specific to the model.\n", + "clip_duration = (num_frames * sampling_rate)/frames_per_second" + ] + }, + { + "cell_type": "markdown", + "id": "2126afcc", + "metadata": {}, + "source": [ + "#### Run Inference\n", + "\n", + "Download an example video." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d22db1ed", + "metadata": {}, + "outputs": [], + "source": [ + "url_link = \"https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4\"\n", + "video_path = 'archery.mp4'\n", + "try: urllib.URLopener().retrieve(url_link, video_path)\n", + "except: urllib.request.urlretrieve(url_link, video_path)" + ] + }, + { + "cell_type": "markdown", + "id": "a51f110a", + "metadata": {}, + "source": [ + "Load the video and transform it to the input format required by the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29a3ea72", + "metadata": {}, + "outputs": [], + "source": [ + "# Select the duration of the clip to load by specifying the start and end duration\n", + "# The start_sec should correspond to where the action occurs in the video\n", + "start_sec = 0\n", + "end_sec = start_sec + clip_duration\n", + "\n", + "# Initialize an EncodedVideo helper class and load the video\n", + "video = EncodedVideo.from_path(video_path)\n", + "\n", + "# Load the desired clip\n", + "video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)\n", + "\n", + "# Apply a transform to normalize the video input\n", + "video_data = transform(video_data)\n", + "\n", + "# Move the inputs to the desired device\n", + "inputs = video_data[\"video\"]\n", + "inputs = inputs.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "5d6b6e2c", + "metadata": {}, + "source": [ + "#### Get Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4190298", + "metadata": {}, + "outputs": [], + "source": [ + "# Pass the input clip through the model\n", + "preds = model(inputs[None, ...])\n", + "\n", + "# Get the predicted classes\n", + "post_act = torch.nn.Softmax(dim=1)\n", + "preds = post_act(preds)\n", + "pred_classes = preds.topk(k=5).indices[0]\n", + "\n", + "# Map the predicted classes to the label names\n", + "pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]\n", + "print(\"Top 5 predicted labels: %s\" % \", \".join(pred_class_names))" + ] + }, + { + "cell_type": "markdown", + "id": "15cd8cf7", + "metadata": {}, + "source": [ + "### Model Description\n", + "The model architecture is based on [1] with pretrained weights using the 8x8 setting\n", + "on the Kinetics dataset. 
\n", + "\n", + "| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |\n", + "| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |\n", + "| Slow | R50 | 8x8 | 74.58 | 91.63 | 54.52 | 32.45 |\n", + "\n", + "\n", + "### References\n", + "[1] Christoph Feichtenhofer et al, \"SlowFast Networks for Video Recognition\"\n", + "https://arxiv.org/pdf/1812.03982.pdf" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_pytorchvideo_slowfast.ipynb b/assets/hub/facebookresearch_pytorchvideo_slowfast.ipynb new file mode 100644 index 000000000000..a95866528fae --- /dev/null +++ b/assets/hub/facebookresearch_pytorchvideo_slowfast.ipynb @@ -0,0 +1,308 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "43b62276", + "metadata": {}, + "source": [ + "# SlowFast\n", + "\n", + "*Author: FAIR PyTorchVideo*\n", + "\n", + "**SlowFast networks pretrained on the Kinetics 400 dataset**\n", + "\n", + "\n", + "### Example Usage\n", + "\n", + "#### Imports\n", + "\n", + "Load the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cad7ce41", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# Choose the `slowfast_r50` model \n", + "model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0105e28f", + "metadata": {}, + "source": [ + "Import remaining functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21ec21fc", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "import json\n", + "import urllib\n", + "from torchvision.transforms import Compose, Lambda\n", + "from torchvision.transforms._transforms_video import (\n", + " CenterCropVideo,\n", + " NormalizeVideo,\n", + ")\n", + "from pytorchvideo.data.encoded_video import EncodedVideo\n", + "from pytorchvideo.transforms import (\n", + " ApplyTransformToKey,\n", + " ShortSideScale,\n", + " UniformTemporalSubsample,\n", + " UniformCropVideo\n", + ") " + ] + }, + { + "cell_type": "markdown", + "id": "88fe7c95", + "metadata": {}, + "source": [ + "#### Setup\n", + "\n", + "Set the model to eval mode and move to desired device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e439a0f", + "metadata": { + "attributes": { + "classes": [ + "python " + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "# Set to GPU or CPU\n", + "device = \"cpu\"\n", + "model = model.eval()\n", + "model = model.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "c9126ed1", + "metadata": {}, + "source": [ + "Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a9f96e8", + "metadata": {}, + "outputs": [], + "source": [ + "json_url = \"https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json\"\n", + "json_filename = \"kinetics_classnames.json\"\n", + "try: urllib.URLopener().retrieve(json_url, json_filename)\n", + "except: urllib.request.urlretrieve(json_url, json_filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de3e0e3f", + "metadata": {}, + "outputs": [], + "source": [ + "with open(json_filename, \"r\") as f:\n", + " kinetics_classnames = json.load(f)\n", + "\n", + "# Create an id to label name mapping\n", + "kinetics_id_to_classname = {}\n", + "for k, v in kinetics_classnames.items():\n", + " kinetics_id_to_classname[v] = str(k).replace('\"', \"\")" + ] + }, + { + "cell_type": "markdown", + "id": "6866da20", + "metadata": {}, + "source": [ + "#### Define input transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8beb0a98", + "metadata": {}, + "outputs": [], + "source": [ + "side_size = 256\n", + "mean = [0.45, 0.45, 0.45]\n", + "std = [0.225, 0.225, 0.225]\n", + "crop_size = 256\n", + "num_frames = 32\n", + "sampling_rate = 2\n", + "frames_per_second = 30\n", + "slowfast_alpha = 4\n", + "num_clips = 10\n", + "num_crops = 3\n", + "\n", + "class PackPathway(torch.nn.Module):\n", + " \"\"\"\n", + " Transform for converting video frames as a list of tensors. \n", + " \"\"\"\n", + " def __init__(self):\n", + " super().__init__()\n", + " \n", + " def forward(self, frames: torch.Tensor):\n", + " fast_pathway = frames\n", + " # Perform temporal sampling from the fast pathway.\n", + " slow_pathway = torch.index_select(\n", + " frames,\n", + " 1,\n", + " torch.linspace(\n", + " 0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha\n", + " ).long(),\n", + " )\n", + " frame_list = [slow_pathway, fast_pathway]\n", + " return frame_list\n", + "\n", + "transform = ApplyTransformToKey(\n", + " key=\"video\",\n", + " transform=Compose(\n", + " [\n", + " UniformTemporalSubsample(num_frames),\n", + " Lambda(lambda x: x/255.0),\n", + " NormalizeVideo(mean, std),\n", + " ShortSideScale(\n", + " size=side_size\n", + " ),\n", + " CenterCropVideo(crop_size),\n", + " PackPathway()\n", + " ]\n", + " ),\n", + ")\n", + "\n", + "# The duration of the input clip is also specific to the model.\n", + "clip_duration = (num_frames * sampling_rate)/frames_per_second" + ] + }, + { + "cell_type": "markdown", + "id": "d7db0efb", + "metadata": {}, + "source": [ + "#### Run Inference\n", + "\n", + "Download an example video." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d215227", + "metadata": {}, + "outputs": [], + "source": [ + "url_link = \"https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4\"\n", + "video_path = 'archery.mp4'\n", + "try: urllib.URLopener().retrieve(url_link, video_path)\n", + "except: urllib.request.urlretrieve(url_link, video_path)" + ] + }, + { + "cell_type": "markdown", + "id": "fafecfaa", + "metadata": {}, + "source": [ + "Load the video and transform it to the input format required by the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2d91dfe", + "metadata": {}, + "outputs": [], + "source": [ + "# Select the duration of the clip to load by specifying the start and end duration\n", + "# The start_sec should correspond to where the action occurs in the video\n", + "start_sec = 0\n", + "end_sec = start_sec + clip_duration\n", + "\n", + "# Initialize an EncodedVideo helper class and load the video\n", + "video = EncodedVideo.from_path(video_path)\n", + "\n", + "# Load the desired clip\n", + "video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)\n", + "\n", + "# Apply a transform to normalize the video input\n", + "video_data = transform(video_data)\n", + "\n", + "# Move the inputs to the desired device\n", + "inputs = video_data[\"video\"]\n", + "inputs = [i.to(device)[None, ...] for i in inputs]" + ] + }, + { + "cell_type": "markdown", + "id": "f2387d0e", + "metadata": {}, + "source": [ + "#### Get Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55825ac2", + "metadata": {}, + "outputs": [], + "source": [ + "# Pass the input clip through the model\n", + "preds = model(inputs)\n", + "\n", + "# Get the predicted classes\n", + "post_act = torch.nn.Softmax(dim=1)\n", + "preds = post_act(preds)\n", + "pred_classes = preds.topk(k=5).indices[0]\n", + "\n", + "# Map the predicted classes to the label names\n", + "pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]\n", + "print(\"Top 5 predicted labels: %s\" % \", \".join(pred_class_names))" + ] + }, + { + "cell_type": "markdown", + "id": "5f95a42d", + "metadata": {}, + "source": [ + "### Model Description\n", + "SlowFast model architectures are based on [1] with pretrained weights using the 8x8 setting\n", + "on the Kinetics dataset. 
\n", + "\n", + "| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |\n", + "| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |\n", + "| SlowFast | R50 | 8x8 | 76.94 | 92.69 | 65.71 | 34.57 |\n", + "| SlowFast | R101 | 8x8 | 77.90 | 93.27 | 127.20 | 62.83 |\n", + "\n", + "\n", + "### References\n", + "[1] Christoph Feichtenhofer et al, \"SlowFast Networks for Video Recognition\"\n", + "https://arxiv.org/pdf/1812.03982.pdf" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_pytorchvideo_x3d.ipynb b/assets/hub/facebookresearch_pytorchvideo_x3d.ipynb new file mode 100644 index 000000000000..6f75fcbc1524 --- /dev/null +++ b/assets/hub/facebookresearch_pytorchvideo_x3d.ipynb @@ -0,0 +1,297 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "89d5af57", + "metadata": {}, + "source": [ + "# X3D\n", + "\n", + "*Author: FAIR PyTorchVideo*\n", + "\n", + "**X3D networks pretrained on the Kinetics 400 dataset**\n", + "\n", + "\n", + "### Example Usage\n", + "\n", + "#### Imports\n", + "\n", + "Load the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "daf69981", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# Choose the `x3d_s` model\n", + "model_name = 'x3d_s'\n", + "model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0f4f316f", + "metadata": {}, + "source": [ + "Import remaining functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42dbe99f", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import urllib\n", + "from pytorchvideo.data.encoded_video import EncodedVideo\n", + "\n", + "from torchvision.transforms import Compose, Lambda\n", + "from torchvision.transforms._transforms_video import (\n", + " CenterCropVideo,\n", + " NormalizeVideo,\n", + ")\n", + "from pytorchvideo.transforms import (\n", + " ApplyTransformToKey,\n", + " ShortSideScale,\n", + " UniformTemporalSubsample\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ab48f59a", + "metadata": {}, + "source": [ + "#### Setup\n", + "\n", + "Set the model to eval mode and move to desired device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18d25fa0", + "metadata": {}, + "outputs": [], + "source": [ + "# Set to GPU or CPU\n", + "device = \"cpu\"\n", + "model = model.eval()\n", + "model = model.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "bfed8b6b", + "metadata": {}, + "source": [ + "Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbf220ef", + "metadata": {}, + "outputs": [], + "source": [ + "json_url = \"https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json\"\n", + "json_filename = \"kinetics_classnames.json\"\n", + "try: urllib.URLopener().retrieve(json_url, json_filename)\n", + "except: urllib.request.urlretrieve(json_url, json_filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ccab195", + "metadata": {}, + "outputs": [], + "source": [ + "with open(json_filename, \"r\") as f:\n", + " kinetics_classnames = json.load(f)\n", + "\n", + "# Create an id to label name mapping\n", + "kinetics_id_to_classname = {}\n", + "for k, v in kinetics_classnames.items():\n", + " kinetics_id_to_classname[v] = str(k).replace('\"', \"\")" + ] + }, + { + "cell_type": "markdown", + "id": "f2ffd57e", + "metadata": {}, + "source": [ + "#### Define input transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b387fdb", + "metadata": {}, + "outputs": [], + "source": [ + "mean = [0.45, 0.45, 0.45]\n", + "std = [0.225, 0.225, 0.225]\n", + "frames_per_second = 30\n", + "model_transform_params = {\n", + " \"x3d_xs\": {\n", + " \"side_size\": 182,\n", + " \"crop_size\": 182,\n", + " \"num_frames\": 4,\n", + " \"sampling_rate\": 12,\n", + " },\n", + " \"x3d_s\": {\n", + " \"side_size\": 182,\n", + " \"crop_size\": 182,\n", + " \"num_frames\": 13,\n", + " \"sampling_rate\": 6,\n", + " },\n", + " \"x3d_m\": {\n", + " \"side_size\": 256,\n", + " \"crop_size\": 256,\n", + " \"num_frames\": 16,\n", + " \"sampling_rate\": 5,\n", + " }\n", + "}\n", + "\n", + "# Get transform parameters based on model\n", + "transform_params = model_transform_params[model_name]\n", + "\n", + "# Note that this transform is specific to the slow_R50 model.\n", + "transform = ApplyTransformToKey(\n", + " key=\"video\",\n", + " transform=Compose(\n", + " [\n", + " UniformTemporalSubsample(transform_params[\"num_frames\"]),\n", + " Lambda(lambda x: x/255.0),\n", + " NormalizeVideo(mean, std),\n", + " ShortSideScale(size=transform_params[\"side_size\"]),\n", + " CenterCropVideo(\n", + " crop_size=(transform_params[\"crop_size\"], transform_params[\"crop_size\"])\n", + " )\n", + " ]\n", + " ),\n", + ")\n", + "\n", + "# The duration of the input clip is also specific to the model.\n", + "clip_duration = (transform_params[\"num_frames\"] * transform_params[\"sampling_rate\"])/frames_per_second" + ] + }, + { + "cell_type": "markdown", + "id": "a5de0111", + "metadata": {}, + "source": [ + "#### Run Inference\n", + "\n", + "Download an example video." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd125847", + "metadata": {}, + "outputs": [], + "source": [ + "url_link = \"https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4\"\n", + "video_path = 'archery.mp4'\n", + "try: urllib.URLopener().retrieve(url_link, video_path)\n", + "except: urllib.request.urlretrieve(url_link, video_path)" + ] + }, + { + "cell_type": "markdown", + "id": "ceb379eb", + "metadata": {}, + "source": [ + "Load the video and transform it to the input format required by the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5147a11a", + "metadata": {}, + "outputs": [], + "source": [ + "# Select the duration of the clip to load by specifying the start and end duration\n", + "# The start_sec should correspond to where the action occurs in the video\n", + "start_sec = 0\n", + "end_sec = start_sec + clip_duration\n", + "\n", + "# Initialize an EncodedVideo helper class and load the video\n", + "video = EncodedVideo.from_path(video_path)\n", + "\n", + "# Load the desired clip\n", + "video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)\n", + "\n", + "# Apply a transform to normalize the video input\n", + "video_data = transform(video_data)\n", + "\n", + "# Move the inputs to the desired device\n", + "inputs = video_data[\"video\"]\n", + "inputs = inputs.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "fb9be637", + "metadata": {}, + "source": [ + "#### Get Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6079fb75", + "metadata": {}, + "outputs": [], + "source": [ + "# Pass the input clip through the model\n", + "preds = model(inputs[None, ...])\n", + "\n", + "# Get the predicted classes\n", + "post_act = torch.nn.Softmax(dim=1)\n", + "preds = post_act(preds)\n", + "pred_classes = preds.topk(k=5).indices[0]\n", + "\n", + "# Map the predicted classes to the label names\n", + "pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]\n", + "print(\"Top 5 predicted labels: %s\" % \", \".join(pred_class_names))" + ] + }, + { + "cell_type": "markdown", + "id": "a6e53a9a", + "metadata": {}, + "source": [ + "### Model Description\n", + "X3D model architectures are based on [1] pretrained on the Kinetics dataset.\n", + "\n", + "| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |\n", + "| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |\n", + "| X3D | XS | 4x12 | 69.12 | 88.63 | 0.91 | 3.79 |\n", + "| X3D | S | 13x6 | 73.33 | 91.27 | 2.96 | 3.79 |\n", + "| X3D | M | 16x5 | 75.94 | 92.72 | 6.72 | 3.79 |\n", + "\n", + "\n", + "### References\n", + "[1] Christoph Feichtenhofer, \"X3D: Expanding Architectures for\n", + " Efficient Video Recognition.\" https://arxiv.org/abs/2004.04730" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext.ipynb b/assets/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext.ipynb new file mode 100644 index 000000000000..64a285b7b6ff --- /dev/null +++ b/assets/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6c28f06b", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Semi-supervised and semi-weakly supervised ImageNet Models\n", + "\n", + "*Author: Facebook AI*\n", + "\n", + "**ResNet and ResNext models introduced in the \"Billion scale semi-supervised learning for image classification\" paper**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73f3e3f0", + "metadata": 
{}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# === SEMI-WEAKLY SUPERVISED MODELS PRETRAINED WITH 940 HASHTAGGED PUBLIC CONTENT ===\n", + "model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnet18_swsl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnet50_swsl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext50_32x4d_swsl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext101_32x4d_swsl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext101_32x8d_swsl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext101_32x16d_swsl')\n", + "# ================= SEMI-SUPERVISED MODELS PRETRAINED WITH YFCC100M ==================\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnet18_ssl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnet50_ssl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext50_32x4d_ssl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext101_32x4d_ssl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext101_32x8d_ssl')\n", + "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext101_32x16d_ssl')\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "a25ad51a", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3eec8b87", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08b15593", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n",
+ "print(torch.nn.functional.softmax(output[0], dim=0))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "77e2a4e3",
+ "metadata": {},
+ "source": [
+ "### Model Description\n",
+ "This project includes the semi-supervised and semi-weakly supervised ImageNet models introduced in \"Billion-scale Semi-Supervised Learning for Image Classification\".\n",
+ "\n",
+ "\"Semi-supervised\" (SSL) ImageNet models are pre-trained on a subset of the unlabeled YFCC100M public image dataset and fine-tuned with the ImageNet1K training dataset, as described by the semi-supervised training framework in the paper mentioned above. In this case, the high-capacity teacher model was trained only with labeled examples.\n",
+ "\n",
+ "\"Semi-weakly\" supervised (SWSL) ImageNet models are pre-trained on **940 million** public images with 1.5K hashtags matching 1000 ImageNet1K synsets, followed by fine-tuning on the ImageNet1K dataset. In this case, the associated hashtags are only used for building a better teacher model. During training of the student model, those hashtags are ignored and the student model is pretrained with a subset of 64M images selected by the teacher model from the same 940 million public image dataset.\n",
+ "\n",
+ "The semi-weakly supervised ResNet and ResNeXt models provided in the table below significantly improve the top-1 accuracy on the ImageNet validation set compared to training from scratch or other training mechanisms introduced in the literature as of September 2019. For example, **we achieve state-of-the-art accuracy of 81.2% on ImageNet for the widely used/adopted ResNet-50 model architecture**.\n",
+ "\n",
+ "\n",
+ "| Architecture | Supervision | #Parameters | FLOPS | Top-1 Acc. | Top-5 Acc. |\n",
+ "| ------------------ | :--------------:|:----------: | :---: | :--------: | :--------: |\n",
+ "| ResNet-18 | semi-supervised |14M | 2B | 72.8 | 91.5 |\n",
+ "| ResNet-50 | semi-supervised |25M | 4B | 79.3 | 94.9 |\n",
+ "| ResNeXt-50 32x4d | semi-supervised |25M | 4B | 80.3 | 95.4 |\n",
+ "| ResNeXt-101 32x4d | semi-supervised |42M | 8B | 81.0 | 95.7 |\n",
+ "| ResNeXt-101 32x8d | semi-supervised |88M | 16B | 81.7 | 96.1 |\n",
+ "| ResNeXt-101 32x16d | semi-supervised |193M | 36B | 81.9 | 96.2 |\n",
+ "| ResNet-18 | semi-weakly supervised |14M | 2B | **73.4** | 91.9 |\n",
+ "| ResNet-50 | semi-weakly supervised |25M | 4B | **81.2** | 96.0 |\n",
+ "| ResNeXt-50 32x4d | semi-weakly supervised |25M | 4B | **82.2** | 96.3 |\n",
+ "| ResNeXt-101 32x4d | semi-weakly supervised |42M | 8B | **83.4** | 96.8 |\n",
+ "| ResNeXt-101 32x8d | semi-weakly supervised |88M | 16B | **84.3** | 97.2 |\n",
+ "| ResNeXt-101 32x16d | semi-weakly supervised |193M | 36B | **84.8** | 97.4 |\n",
+ "\n",
+ "\n",
+ "## Citation\n",
+ "\n",
+ "If you use the models released in this repository, please cite the following publication (https://arxiv.org/abs/1905.00546)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "20db95b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@misc{yalniz2019billionscale,\n",
+ " title={Billion-scale semi-supervised learning for image classification},\n",
+ " author={I. 
Zeki Yalniz and Hervé Jégou and Kan Chen and Manohar Paluri and Dhruv Mahajan},\n", + " year={2019},\n", + " eprint={1905.00546},\n", + " archivePrefix={arXiv},\n", + " primaryClass={cs.CV}\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/huggingface_pytorch-pretrained-bert_bert.ipynb b/assets/hub/huggingface_pytorch-pretrained-bert_bert.ipynb new file mode 100644 index 000000000000..7a50107d9365 --- /dev/null +++ b/assets/hub/huggingface_pytorch-pretrained-bert_bert.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# BERT\n", + "\n", + "*Author: HuggingFace Team*\n", + "\n", + "**Bidirectional Encoder Representations from Transformers.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/bert1.png) | ![alt](https://pytorch.org/assets/images/bert2.png)\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "BERT was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin et al. The model is based on the Transformer architecture introduced in [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani et al and has led to significant improvements on a wide range of downstream tasks.\n", + "\n", + "Here are 8 models based on BERT with [Google's pre-trained models](https://github.com/google-research/bert) along with the associated Tokenizer.\n", + "It includes:\n", + "- `bertTokenizer`: perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization\n", + "- `bertModel`: raw BERT Transformer model (fully pre-trained)\n", + "- `bertForMaskedLM`: BERT Transformer with the pre-trained masked language modeling head on top (fully pre-trained)\n", + "- `bertForNextSentencePrediction`: BERT Transformer with the pre-trained next sentence prediction classifier on top (fully pre-trained)\n", + "- `bertForPreTraining`: BERT Transformer with masked language modeling head and next sentence prediction classifier on top (fully pre-trained)\n", + "- `bertForSequenceClassification`: BERT Transformer with a sequence classification head on top (BERT Transformer is pre-trained, the sequence classification head is only initialized and has to be trained)\n", + "- `bertForMultipleChoice`: BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is pre-trained, the multiple choice classification head is only initialized and has to be trained)\n", + "- `bertForTokenClassification`: BERT Transformer with a token classification head on top (BERT Transformer is pre-trained, the token classification head is only initialized and has to be trained)\n", + "- `bertForQuestionAnswering`: BERT Transformer with a token classification head on top (BERT Transformer is pre-trained, the token classification head is only initialized and has to be trained)\n", + "\n", + "### Requirements\n", + "\n", + "Unlike most other PyTorch Hub models, BERT requires a few additional Python packages to be installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install tqdm boto3 requests regex" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example\n", + "\n", + "Here is an example on how to tokenize the input text with `bertTokenizer`, and then get the hidden states computed by `bertModel` or predict masked tokens using `bertForMaskedLM`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### First, tokenize the input\n", + "import torch\n", + "tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n", + "\n", + "# Tokenized input\n", + "text = \"[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]\"\n", + "tokenized_text = tokenizer.tokenize(text)\n", + "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### Get the hidden states computed by `bertModel`\n", + "# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)\n", + "segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n", + "\n", + "# Convert inputs to PyTorch tensors\n", + "segments_tensors = torch.tensor([segments_ids])\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "\n", + "model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')\n", + "model.eval()\n", + "\n", + "with torch.no_grad():\n", + " encoded_layers, _ = model(tokens_tensor, segments_tensors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### Predict masked tokens using `bertForMaskedLM`\n", + "# Mask a token that we will try to predict back with `BertForMaskedLM`\n", + "masked_index = 8\n", + "tokenized_text[masked_index] = '[MASK]'\n", + "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "\n", + "maskedLM_model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')\n", + "maskedLM_model.eval()\n", + "\n", + "with torch.no_grad():\n", + " predictions = maskedLM_model(tokens_tensor, segments_tensors)\n", + "\n", + "# Get the predicted token\n", + "predicted_index = torch.argmax(predictions[0, masked_index]).item()\n", + "predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n", + "assert predicted_token == 'Jim'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Resources\n", + "\n", + " - Paper: [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)\n", + " - Initial repository (with detailed examples and documentation): [pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/hub/huggingface_pytorch-pretrained-bert_gpt.ipynb b/assets/hub/huggingface_pytorch-pretrained-bert_gpt.ipynb new file mode 100644 index 000000000000..718d4d8188c7 --- /dev/null +++ b/assets/hub/huggingface_pytorch-pretrained-bert_gpt.ipynb @@ -0,0 +1,111 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would 
like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n",
+ "\n",
+ "----------------------------------------------------------------------\n",
+ "\n",
+ "# GPT\n",
+ "\n",
+ "*Author: HuggingFace Team*\n",
+ "\n",
+ "**Generative Pre-Training (GPT) models for language understanding**\n",
+ "\n",
+ "\"alt\"\n",
+ "\n",
+ "\n",
+ "### Model Description\n",
+ "\n",
+ "GPT was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf) by Alec Radford et al. at OpenAI. It is a combination of two ideas: the Transformer architecture and large-scale unsupervised pre-training.\n",
+ "\n",
+ "Here are three models based on [OpenAI's pre-trained weights](https://github.com/openai/finetune-transformer-lm) along with the associated Tokenizer.\n",
+ "It includes:\n",
+ "- `openAIGPTModel`: raw OpenAI GPT Transformer model (fully pre-trained)\n",
+ "- `openAIGPTLMHeadModel`: OpenAI GPT Transformer with the tied language modeling head on top (fully pre-trained)\n",
+ "- `openAIGPTDoubleHeadsModel`: OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is pre-trained, the multiple choice classification head is only initialized and has to be trained)\n",
+ "\n",
+ "### Requirements\n",
+ "\n",
+ "Unlike most other PyTorch Hub models, GPT requires a few additional Python packages to be installed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "pip install tqdm boto3 requests regex"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Example\n",
+ "\n",
+ "Here is an example of how to tokenize the text with `openAIGPTTokenizer`, and then get the hidden states computed by `openAIGPTModel` or predict the next token using `openAIGPTLMHeadModel`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### First, tokenize the input\n",
+ "import torch\n",
+ "tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')\n",
+ "\n",
+ "# Prepare tokenized input\n",
+ "text = \"Who was Jim Henson ? 
Jim Henson was a puppeteer\"\n",
+ "tokenized_text = tokenizer.tokenize(text)\n",
+ "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
+ "tokens_tensor = torch.tensor([indexed_tokens])\n",
+ "\n",
+ "### Get the hidden states computed by `openAIGPTModel`\n",
+ "model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt')\n",
+ "model.eval()\n",
+ "\n",
+ "# Compute hidden states features for each layer\n",
+ "with torch.no_grad():\n",
+ "\thidden_states = model(tokens_tensor)\n",
+ "\n",
+ "### Predict the next token using `openAIGPTLMHeadModel`\n",
+ "lm_model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt')\n",
+ "lm_model.eval()\n",
+ "\n",
+ "# Predict all tokens\n",
+ "with torch.no_grad():\n",
+ "\tpredictions = lm_model(tokens_tensor)\n",
+ "\n",
+ "# Get the predicted next token\n",
+ "predicted_index = torch.argmax(predictions[0, -1, :]).item()\n",
+ "predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n",
+ "assert predicted_token == '.'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Requirement\n",
+ "The model only supports Python 3.\n",
+ "\n",
+ "### Resources\n",
+ "\n",
+ " - Paper: [Improving Language Understanding by Generative Pre-Training](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf)\n",
+ " - [Blogpost from OpenAI](https://openai.com/blog/language-unsupervised/)\n",
+ " - Initial repository (with detailed examples and documentation): [pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT)"
+ ]
+ }
+ ],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/assets/hub/huggingface_pytorch-transformers.ipynb b/assets/hub/huggingface_pytorch-transformers.ipynb
new file mode 100644
index 000000000000..fc6856f7f5ba
--- /dev/null
+++ b/assets/hub/huggingface_pytorch-transformers.ipynb
@@ -0,0 +1,452 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "aebf87f7",
+ "metadata": {},
+ "source": [
+ "### This notebook is optionally accelerated with a GPU runtime.\n",
+ "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n",
+ "\n",
+ "----------------------------------------------------------------------\n",
+ "\n",
+ "# PyTorch-Transformers\n",
+ "\n",
+ "*Author: HuggingFace Team*\n",
+ "\n",
+ "**PyTorch implementations of popular NLP Transformers**\n",
+ "\n",
+ "\n",
+ "# Model Description\n",
+ "\n",
+ "\n",
+ "PyTorch-Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).\n",
+ "\n",
+ "The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:\n",
+ "\n",
+ "1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.\n",
+ "2. 
**[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.\n", + "3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.\n", + "4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.\n", + "5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.\n", + "6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.\n", + "7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.\n", + "8. **[DistilBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5) by Victor Sanh, Lysandre Debut and Thomas Wolf.\n", + "\n", + "The components available here are based on the `AutoModel` and `AutoTokenizer` classes of the `pytorch-transformers` library.\n", + "\n", + "# Requirements\n", + "\n", + "Unlike most other PyTorch Hub models, BERT requires a few additional Python packages to be installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "569404ad", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install tqdm boto3 requests regex sentencepiece sacremoses" + ] + }, + { + "cell_type": "markdown", + "id": "dfccbc22", + "metadata": {}, + "source": [ + "# Usage\n", + "\n", + "The available methods are the following:\n", + "- `config`: returns a configuration item corresponding to the specified model or pth.\n", + "- `tokenizer`: returns a tokenizer corresponding to the specified model or path\n", + "- `model`: returns a model corresponding to the specified model or path\n", + "- `modelForCausalLM`: returns a model with a language modeling head corresponding to the specified model or path\n", + "- `modelForSequenceClassification`: returns a model with a sequence classifier corresponding to the specified model or path\n", + "- `modelForQuestionAnswering`: returns a model with a question answering head corresponding to the specified model or path\n", + "\n", + "All these methods share the following argument: `pretrained_model_or_path`, which is a string identifying a pre-trained model or path from which an instance will be returned. There are several checkpoints available for each model, which are detailed below:\n", + "\n", + "\n", + "\n", + "\n", + "The available models are listed on the [transformers documentation, models page](https://huggingface.co/models).\n", + "\n", + "# Documentation\n", + "\n", + "Here are a few examples detailing the usage of each available method.\n", + "\n", + "\n", + "## Tokenizer\n", + "\n", + "The tokenizer object allows the conversion from character strings to tokens understood by the different models. Each model has its own tokenizer, and some tokenizing methods are different across tokenizers. The complete documentation can be found [here](https://huggingface.co/docs/transformers/main_classes/tokenizer)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a52f187f", + "metadata": { + "attributes": { + "classes": [ + "py" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "import torch\n", + "tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache.\n", + "tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`" + ] + }, + { + "cell_type": "markdown", + "id": "2765418b", + "metadata": {}, + "source": [ + "## Models\n", + "\n", + "The model object is a model instance inheriting from a `nn.Module`. Each model is accompanied by their saving/loading methods, either from a local file or directory, or from a pre-trained configuration (see previously described `config`). Each model works differently, a complete overview of the different models can be found in the [documentation](https://huggingface.co/docs/transformers/main_classes/model)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b170367e", + "metadata": { + "attributes": { + "classes": [ + "py" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache.\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')`\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attentions=True) # Update configuration during loading\n", + "assert model.config.output_attentions == True\n", + "# Loading from a TF checkpoint file instead of a PyTorch model (slower)\n", + "config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)" + ] + }, + { + "cell_type": "markdown", + "id": "4d9e3b45", + "metadata": {}, + "source": [ + "## Models with a language modeling head\n", + "\n", + "Previously mentioned `model` instance with an additional language modeling head." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e64b72", + "metadata": { + "attributes": { + "classes": [ + "py" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'gpt2') # Download model and configuration from huggingface.co and cache.\n", + "model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', './test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`\n", + "model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'gpt2', output_attentions=True) # Update configuration during loading\n", + "assert model.config.output_attentions == True\n", + "# Loading from a TF checkpoint file instead of a PyTorch model (slower)\n", + "config = AutoConfig.from_pretrained('./tf_model/gpt_tf_model_config.json')\n", + "model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', './tf_model/gpt_tf_checkpoint.ckpt.index', from_tf=True, config=config)" + ] + }, + { + "cell_type": "markdown", + "id": "56838e82", + "metadata": {}, + "source": [ + "## Models with a sequence classification head\n", + "\n", + "Previously mentioned `model` instance with an additional sequence classification head." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fede52f", + "metadata": { + "attributes": { + "classes": [ + "py" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache.\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading\n", + "assert model.config.output_attention == True\n", + "# Loading from a TF checkpoint file instead of a PyTorch model (slower)\n", + "config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)" + ] + }, + { + "cell_type": "markdown", + "id": "a17e2167", + "metadata": {}, + "source": [ + "## Models with a question answering head\n", + "\n", + "Previously mentioned `model` instance with an additional question answering head." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a340191", + "metadata": { + "attributes": { + "classes": [ + "py" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache.\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading\n", + "assert model.config.output_attention == True\n", + "# Loading from a TF checkpoint file instead of a PyTorch model (slower)\n", + "config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)" + ] + }, + { + "cell_type": "markdown", + "id": "a347055f", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "The configuration is optional. The configuration object holds information concerning the model, such as the number of heads/layers, if the model should output attentions or hidden states, or if it should be adapted for TorchScript. Many parameters are available, some specific to each model. The complete documentation can be found [here](https://huggingface.co/docs/transformers/main_classes/configuration)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83bdbd7d", + "metadata": { + "attributes": { + "classes": [ + "py" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "import torch\n", + "config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.\n", + "config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')`\n", + "config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')\n", + "config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)\n", + "assert config.output_attention == True\n", + "config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)\n", + "assert config.output_attention == True\n", + "assert unused_kwargs == {'foo': False}\n", + "\n", + "# Using the configuration with a model\n", + "config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')\n", + "config.output_attentions = True\n", + "config.output_hidden_states = True\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', config=config)\n", + "# Model will now output attentions and hidden states as well\n" + ] + }, + { + "cell_type": "markdown", + "id": "4afcf83b", + "metadata": {}, + "source": [ + "# Example Usage\n", + "\n", + "Here is an example on how to tokenize the input text to be fed as input to a BERT model, and then get the hidden states computed by such a model or predict masked tokens using language modeling BERT model.\n", + "\n", + "## First, tokenize the input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91ab7b53", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased')\n", + "\n", + "text_1 = \"Who was Jim Henson ?\"\n", + "text_2 = \"Jim Henson was a puppeteer\"\n", + "\n", + "# Tokenized input with special tokens around it (for BERT: [CLS] at the beginning and [SEP] at the end)\n", + "indexed_tokens = tokenizer.encode(text_1, text_2, add_special_tokens=True)" + ] + }, + { + "cell_type": "markdown", + "id": "c057c229", + "metadata": {}, + "source": [ + "## Using `BertModel` to encode the input sentence in a sequence of last layer hidden-states" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95ac4662", + "metadata": {}, + "outputs": [], + "source": [ + "# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)\n", + "segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n", + "\n", + "# Convert inputs to PyTorch tensors\n", + "segments_tensors = torch.tensor([segments_ids])\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "\n", + "model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-cased')\n", + "\n", + "with torch.no_grad():\n", + " encoded_layers, _ = model(tokens_tensor, token_type_ids=segments_tensors)" + ] + }, + { + "cell_type": "markdown", + "id": "70f4fefd", + "metadata": {}, + "source": [ + "## Using `modelForMaskedLM` to predict a masked token with BERT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48bde1ca", + "metadata": {}, + "outputs": [], + "source": [ + "# Mask a token that we will try to predict back with `BertForMaskedLM`\n", + "masked_index = 8\n", + "indexed_tokens[masked_index] = tokenizer.mask_token_id\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "\n", + "masked_lm_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForMaskedLM', 'bert-base-cased')\n", + "\n", + "with torch.no_grad():\n", + " predictions = 
masked_lm_model(tokens_tensor, token_type_ids=segments_tensors)\n", + "\n", + "# Get the predicted token\n", + "predicted_index = torch.argmax(predictions[0][0], dim=1)[masked_index].item()\n", + "predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n", + "assert predicted_token == 'Jim'" + ] + }, + { + "cell_type": "markdown", + "id": "1b4a6bef", + "metadata": {}, + "source": [ + "## Using `modelForQuestionAnswering` to do question answering with BERT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6f37585", + "metadata": {}, + "outputs": [], + "source": [ + "question_answering_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-large-uncased-whole-word-masking-finetuned-squad')\n", + "question_answering_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-large-uncased-whole-word-masking-finetuned-squad')\n", + "\n", + "# The format is paragraph first and then question\n", + "text_1 = \"Jim Henson was a puppeteer\"\n", + "text_2 = \"Who was Jim Henson ?\"\n", + "indexed_tokens = question_answering_tokenizer.encode(text_1, text_2, add_special_tokens=True)\n", + "segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]\n", + "segments_tensors = torch.tensor([segments_ids])\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "\n", + "# Predict the start and end positions logits\n", + "with torch.no_grad():\n", + " out = question_answering_model(tokens_tensor, token_type_ids=segments_tensors)\n", + "\n", + "# get the highest prediction\n", + "answer = question_answering_tokenizer.decode(indexed_tokens[torch.argmax(out.start_logits):torch.argmax(out.end_logits)+1])\n", + "assert answer == \"puppeteer\"\n", + "\n", + "# Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions (set model to train mode before if used for training)\n", + "start_positions, end_positions = torch.tensor([12]), torch.tensor([14])\n", + "multiple_choice_loss = question_answering_model(tokens_tensor, token_type_ids=segments_tensors, start_positions=start_positions, end_positions=end_positions)" + ] + }, + { + "cell_type": "markdown", + "id": "6ee33213", + "metadata": {}, + "source": [ + "## Using `modelForSequenceClassification` to do paraphrase classification with BERT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9384a8b0", + "metadata": {}, + "outputs": [], + "source": [ + "sequence_classification_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-cased-finetuned-mrpc')\n", + "sequence_classification_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased-finetuned-mrpc')\n", + "\n", + "text_1 = \"Jim Henson was a puppeteer\"\n", + "text_2 = \"Who was Jim Henson ?\"\n", + "indexed_tokens = sequence_classification_tokenizer.encode(text_1, text_2, add_special_tokens=True)\n", + "segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n", + "segments_tensors = torch.tensor([segments_ids])\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "\n", + "# Predict the sequence classification logits\n", + "with torch.no_grad():\n", + " seq_classif_logits = sequence_classification_model(tokens_tensor, token_type_ids=segments_tensors)\n", + "\n", + "predicted_labels = torch.argmax(seq_classif_logits[0]).item()\n", + "\n", + "assert predicted_labels == 0 # In MRPC dataset this means the two sentences are not paraphrasing each 
other\n", + "\n", + "# Or get the sequence classification loss (set model to train mode before if used for training)\n", + "labels = torch.tensor([1])\n", + "seq_classif_loss = sequence_classification_model(tokens_tensor, token_type_ids=segments_tensors, labels=labels)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/hustvl_yolop.ipynb b/assets/hub/hustvl_yolop.ipynb new file mode 100644 index 000000000000..2c3496f534fc --- /dev/null +++ b/assets/hub/hustvl_yolop.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8ac5a855", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# YOLOP\n", + "\n", + "*Author: Hust Visual Learning Team*\n", + "\n", + "**YOLOP pretrained on the BDD100K dataset**\n", + "\n", + "## Before You Start\n", + "To install YOLOP dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16ed4d6d", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install -qr https://github.com/hustvl/YOLOP/blob/main/requirements.txt # install dependencies" + ] + }, + { + "cell_type": "markdown", + "id": "484a5e2b", + "metadata": {}, + "source": [ + "## YOLOP: You Only Look Once for Panoptic driving Perception\n", + "\n", + "### Model Description\n", + "\n", + "\"YOLOP\n", + " \n", + "\n", + "- YOLOP is an efficient multi-task network that can jointly handle three crucial tasks in autonomous driving: object detection, drivable area segmentation and lane detection. And it is also the first to reach real-time on embedded devices while maintaining state-of-the-art level performance on the **BDD100K** dataset.\n", + "\n", + "\n", + "### Results\n", + "\n", + "#### Traffic Object Detection Result\n", + "\n", + "| Model | Recall(%) | mAP50(%) | Speed(fps) |\n", + "| -------------- | --------- | -------- | ---------- |\n", + "| `Multinet` | 81.3 | 60.2 | 8.6 |\n", + "| `DLT-Net` | 89.4 | 68.4 | 9.3 |\n", + "| `Faster R-CNN` | 77.2 | 55.6 | 5.3 |\n", + "| `YOLOv5s` | 86.8 | 77.2 | 82 |\n", + "| `YOLOP(ours)` | 89.2 | 76.5 | 41 |\n", + "\n", + "#### Drivable Area Segmentation Result\n", + "\n", + "| Model | mIOU(%) | Speed(fps) |\n", + "| ------------- | ------- | ---------- |\n", + "| `Multinet` | 71.6 | 8.6 |\n", + "| `DLT-Net` | 71.3 | 9.3 |\n", + "| `PSPNet` | 89.6 | 11.1 |\n", + "| `YOLOP(ours)` | 91.5 | 41 |\n", + "\n", + "#### Lane Detection Result\n", + "\n", + "| Model | mIOU(%) | IOU(%) |\n", + "| ------------- | ------- | ------ |\n", + "| `ENet` | 34.12 | 14.64 |\n", + "| `SCNN` | 35.79 | 15.84 |\n", + "| `ENet-SAD` | 36.56 | 16.02 |\n", + "| `YOLOP(ours)` | 70.50 | 26.20 |\n", + "\n", + "#### Ablation Studies 1: End-to-end v.s. Step-by-step\n", + "\n", + "| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) |\n", + "| --------------- | --------- | ----- | ------- | ----------- | ------ |\n", + "| `ES-W` | 87.0 | 75.3 | 90.4 | 66.8 | 26.2 |\n", + "| `ED-W` | 87.3 | 76.0 | 91.6 | 71.2 | 26.1 |\n", + "| `ES-D-W` | 87.0 | 75.1 | 91.7 | 68.6 | 27.0 |\n", + "| `ED-S-W` | 87.5 | 76.1 | 91.6 | 68.0 | 26.8 |\n", + "| `End-to-end` | 89.2 | 76.5 | 91.5 | 70.5 | 26.2 |\n", + "\n", + "#### Ablation Studies 2: Multi-task v.s. 
Single task\n", + "\n", + "| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) | Speed(ms/frame) |\n", + "| --------------- | --------- | ----- | ------- | ----------- | ------ | --------------- |\n", + "| `Det(only)` | 88.2 | 76.9 | - | - | - | 15.7 |\n", + "| `Da-Seg(only)` | - | - | 92.0 | - | - | 14.8 |\n", + "| `Ll-Seg(only)` | - | - | - | 79.6 | 27.9 | 14.8 |\n", + "| `Multitask` | 89.2 | 76.5 | 91.5 | 70.5 | 26.2 | 24.4 |\n", + "\n", + "**Notes**:\n", + "\n", + "- In table 4, E, D, S and W refer to Encoder, Detect head, two Segment heads and whole network. So the Algorithm (First, we only train Encoder and Detect head. Then we freeze the Encoder and Detect head as well as train two Segmentation heads. Finally, the entire network is trained jointly for all three tasks.) can be marked as ED-S-W, and the same for others.\n", + "\n", + "### Visualization\n", + "\n", + "#### Traffic Object Detection Result\n", + "\n", + "\"Traffic\n", + " \n", + "\n", + "#### Drivable Area Segmentation Result\n", + "\n", + "\"Drivable\n", + " \n", + "\n", + "#### Lane Detection Result\n", + "\n", + "\"Lane\n", + " \n", + "\n", + "**Notes**:\n", + "\n", + "- The visualization of lane detection result has been post processed by quadratic fitting.\n", + "\n", + "### Deployment\n", + "\n", + "Our model can reason in real-time on **Jetson Tx2**, with **Zed Camera** to capture image. We use **TensorRT** tool for speeding up. We provide code for deployment and reasoning of model in [github code](https://github.com/hustvl/YOLOP/tree/main/toolkits/deploy).\n", + "\n", + "\n", + "### Load From PyTorch Hub\n", + "This example loads the pretrained **YOLOP** model and passes an image for inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a50d292a", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# load model\n", + "model = torch.hub.load('hustvl/yolop', 'yolop', pretrained=True)\n", + "\n", + "#inference\n", + "img = torch.randn(1,3,640,640)\n", + "det_out, da_seg_out,ll_seg_out = model(img)" + ] + }, + { + "cell_type": "markdown", + "id": "f07a9063", + "metadata": {}, + "source": [ + "### Citation\n", + "\n", + "See for more detail in [github code](https://github.com/hustvl/YOLOP) and [arxiv paper](https://arxiv.org/abs/2108.11250).\n", + "\n", + "If you find our paper and code useful for your research, please consider giving a star and citation:" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/intelisl_midas_v2.ipynb b/assets/hub/intelisl_midas_v2.ipynb new file mode 100644 index 000000000000..9c7a4480f581 --- /dev/null +++ b/assets/hub/intelisl_midas_v2.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0595d980", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# MiDaS\n", + "\n", + "*Author: Intel ISL*\n", + "\n", + "**MiDaS models for computing relative depth from a single image.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "[MiDaS](https://arxiv.org/abs/1907.01341) computes relative inverse depth from a single image. 
The repository provides multiple models that cover different use cases ranging from a small, high-speed model to a very large model that provide the highest accuracy. The models have been trained on 10 distinct datasets using\n", + "multi-objective optimization to ensure high quality on a wide range of inputs.\n", + "\n", + "### Dependencies\n", + "\n", + "MiDaS depends on [timm](https://github.com/rwightman/pytorch-image-models). Install with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db3fd908", + "metadata": { + "attributes": { + "classes": [ + "shell" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "pip install timm" + ] + }, + { + "cell_type": "markdown", + "id": "8892d100", + "metadata": {}, + "source": [ + "### Example Usage\n", + "\n", + "Download an image from the PyTorch homepage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "758e089f", + "metadata": {}, + "outputs": [], + "source": [ + "import cv2\n", + "import torch\n", + "import urllib.request\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "markdown", + "id": "3d5fb41f", + "metadata": {}, + "source": [ + "Load a model (see [https://github.com/intel-isl/MiDaS/#Accuracy](https://github.com/intel-isl/MiDaS/#Accuracy) for an overview)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49acb469", + "metadata": {}, + "outputs": [], + "source": [ + "model_type = \"DPT_Large\" # MiDaS v3 - Large (highest accuracy, slowest inference speed)\n", + "#model_type = \"DPT_Hybrid\" # MiDaS v3 - Hybrid (medium accuracy, medium inference speed)\n", + "#model_type = \"MiDaS_small\" # MiDaS v2.1 - Small (lowest accuracy, highest inference speed)\n", + "\n", + "midas = torch.hub.load(\"intel-isl/MiDaS\", model_type)" + ] + }, + { + "cell_type": "markdown", + "id": "d785d8c2", + "metadata": {}, + "source": [ + "Move model to GPU if available" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2aa0b2d", + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "midas.to(device)\n", + "midas.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "1d0a6f5b", + "metadata": {}, + "source": [ + "Load transforms to resize and normalize the image for large or small model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "763b447a", + "metadata": {}, + "outputs": [], + "source": [ + "midas_transforms = torch.hub.load(\"intel-isl/MiDaS\", \"transforms\")\n", + "\n", + "if model_type == \"DPT_Large\" or model_type == \"DPT_Hybrid\":\n", + " transform = midas_transforms.dpt_transform\n", + "else:\n", + " transform = midas_transforms.small_transform" + ] + }, + { + "cell_type": "markdown", + "id": "ce837f1c", + "metadata": {}, + "source": [ + "Load image and apply transforms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "412901d6", + "metadata": {}, + "outputs": [], + "source": [ + "img = cv2.imread(filename)\n", + "img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n", + "\n", + "input_batch = transform(img).to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "9d621088", + "metadata": {}, + "source": [ + "Predict and resize to original resolution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d0d2db5", + 
"metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " prediction = midas(input_batch)\n", + "\n", + " prediction = torch.nn.functional.interpolate(\n", + " prediction.unsqueeze(1),\n", + " size=img.shape[:2],\n", + " mode=\"bicubic\",\n", + " align_corners=False,\n", + " ).squeeze()\n", + "\n", + "output = prediction.cpu().numpy()" + ] + }, + { + "cell_type": "markdown", + "id": "991ee991", + "metadata": {}, + "source": [ + "Show result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c630ed12", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(output)\n", + "# plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f4c23d91", + "metadata": {}, + "source": [ + "### References\n", + "[Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer](https://arxiv.org/abs/1907.01341)\n", + "\n", + "[Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413)\n", + "\n", + "Please cite our papers if you use our models:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8248831c", + "metadata": { + "attributes": { + "classes": [ + "bibtex" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@article{Ranftl2020,\n", + "\tauthor = {Ren\\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun},\n", + "\ttitle = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer},\n", + "\tjournal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},\n", + "\tyear = {2020},\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2a1bd81", + "metadata": { + "attributes": { + "classes": [ + "bibtex" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@article{Ranftl2021,\n", + "\tauthor = {Ren\\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun},\n", + "\ttitle = {Vision Transformers for Dense Prediction},\n", + "\tjournal = {ArXiv preprint},\n", + "\tyear = {2021},\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/mateuszbuda_brain-segmentation-pytorch_unet.ipynb b/assets/hub/mateuszbuda_brain-segmentation-pytorch_unet.ipynb new file mode 100644 index 000000000000..f91614481594 --- /dev/null +++ b/assets/hub/mateuszbuda_brain-segmentation-pytorch_unet.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e3af3710", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# U-Net for brain MRI\n", + "\n", + "*Author: mateuszbuda*\n", + "\n", + "**U-Net with batch normalization for biomedical image segmentation with pretrained weights for abnormality segmentation in brain MRI**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76536d46", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('mateuszbuda/brain-segmentation-pytorch', 'unet',\n", + " in_channels=3, out_channels=1, init_features=32, pretrained=True)\n" + ] + }, + { + "cell_type": "markdown", + "id": "a28792eb", + "metadata": {}, + "source": [ + "Loads a U-Net model pre-trained for 
abnormality segmentation on a dataset of brain MRI volumes [kaggle.com/mateuszbuda/lgg-mri-segmentation](https://www.kaggle.com/mateuszbuda/lgg-mri-segmentation)\n", + "The pre-trained model requires 3 input channels, 1 output channel, and 32 features in the first layer.\n", + "\n", + "### Model Description\n", + "\n", + "This U-Net model comprises four levels of blocks containing two convolutional layers with batch normalization and ReLU activation function, and one max pooling layer in the encoding part and up-convolutional layers instead in the decoding part.\n", + "The number of convolutional filters in each block is 32, 64, 128, and 256.\n", + "The bottleneck layer has 512 convolutional filters.\n", + "From the encoding layers, skip connections are used to the corresponding layers in the decoding part.\n", + "Input image is a 3-channel brain MRI slice from pre-contrast, FLAIR, and post-contrast sequences, respectively.\n", + "Output is a one-channel probability map of abnormality regions with the same size as the input image.\n", + "It can be transformed to a binary segmentation mask by thresholding as shown in the example below.\n", + "\n", + "### Example\n", + "\n", + "Input images for pre-trained model should have 3 channels and be resized to 256x256 pixels and z-score normalized per volume." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edae5a92", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image\n", + "import urllib\n", + "url, filename = (\"https://github.com/mateuszbuda/brain-segmentation-pytorch/raw/master/assets/TCGA_CS_4944.png\", \"TCGA_CS_4944.png\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2900236", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "\n", + "input_image = Image.open(filename)\n", + "m, s = np.mean(input_image, axis=(0, 1)), np.std(input_image, axis=(0, 1))\n", + "preprocess = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=m, std=s),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0)\n", + "\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model = model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "\n", + "print(torch.round(output[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "b5cdbd4e", + "metadata": {}, + "source": [ + "### References\n", + "\n", + "- [Association of genomic subtypes of lower-grade gliomas with shape features automatically extracted by a deep learning algorithm](http://arxiv.org/abs/1906.03720)\n", + "- [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597)\n", + "- [Brain MRI segmentation dataset](https://www.kaggle.com/mateuszbuda/lgg-mri-segmentation)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nicolalandro_ntsnet-cub200_ntsnet.ipynb b/assets/hub/nicolalandro_ntsnet-cub200_ntsnet.ipynb new file mode 100644 index 000000000000..53fc1a9826b6 --- /dev/null +++ b/assets/hub/nicolalandro_ntsnet-cub200_ntsnet.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "20f62891", + "metadata": {}, + "source": [ + "### This notebook is optionally 
accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ntsnet\n", + "\n", + "*Author: Moreno Caraffini and Nicola Landro*\n", + "\n", + "**classify birds using this fine-grained image classifier**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6d78a29", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('nicolalandro/ntsnet-cub200', 'ntsnet', pretrained=True,\n", + " **{'topN': 6, 'device':'cpu', 'num_classes': 200})" + ] + }, + { + "cell_type": "markdown", + "id": "cfa847dd", + "metadata": {}, + "source": [ + "### Example Usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdde7974", + "metadata": {}, + "outputs": [], + "source": [ + "from torchvision import transforms\n", + "import torch\n", + "import urllib\n", + "from PIL import Image\n", + "\n", + "transform_test = transforms.Compose([\n", + " transforms.Resize((600, 600), Image.BILINEAR),\n", + " transforms.CenterCrop((448, 448)),\n", + " # transforms.RandomHorizontalFlip(), # only if train\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),\n", + "])\n", + "\n", + "\n", + "model = torch.hub.load('nicolalandro/ntsnet-cub200', 'ntsnet', pretrained=True, **{'topN': 6, 'device':'cpu', 'num_classes': 200})\n", + "model.eval()\n", + "\n", + "url = 'https://raw.githubusercontent.com/nicolalandro/ntsnet-cub200/master/images/nts-net.png'\n", + "img = Image.open(urllib.request.urlopen(url))\n", + "scaled_img = transform_test(img)\n", + "torch_images = scaled_img.unsqueeze(0)\n", + "\n", + "with torch.no_grad():\n", + " top_n_coordinates, concat_out, raw_logits, concat_logits, part_logits, top_n_index, top_n_prob = model(torch_images)\n", + "\n", + " _, predict = torch.max(concat_logits, 1)\n", + " pred_id = predict.item()\n", + " print('bird class:', model.bird_classes[pred_id])" + ] + }, + { + "cell_type": "markdown", + "id": "20fe5d0c", + "metadata": {}, + "source": [ + "### Model Description\n", + "This is an nts-net pretrained with CUB200 2011 dataset, which is a fine grained dataset of birds species.\n", + "\n", + "### References\n", + "You can read the full paper at this [link](http://artelab.dista.uninsubria.it/res/research/papers/2019/2019-IVCNZ-Nawaz-Birds.pdf)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74ed0a07", + "metadata": { + "attributes": { + "classes": [ + "bibtex" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@INPROCEEDINGS{Gallo:2019:IVCNZ,\n", + " author={Nawaz, Shah and Calefati, Alessandro and Caraffini, Moreno and Landro, Nicola and Gallo, Ignazio},\n", + " booktitle={2019 International Conference on Image and Vision Computing New Zealand (IVCNZ 2019)},\n", + " title={Are These Birds Similar: Learning Branched Networks for Fine-grained Representations},\n", + " year={2019},\n", + " month={Dec},\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_efficientnet.ipynb b/assets/hub/nvidia_deeplearningexamples_efficientnet.ipynb new file mode 100644 index 000000000000..04ec17f4104f --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_efficientnet.ipynb @@ -0,0 +1,204 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b913a656", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# EfficientNet\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**EfficientNets are a family of image classification models, which achieve state-of-the-art accuracy while being an order of magnitude smaller and faster. Trained with mixed precision using Tensor Cores.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "EfficientNet is an image classification model family. It was first described in [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946). This notebook allows you to load and test the EfficientNet-B0, EfficientNet-B4, EfficientNet-WideSE-B0, and EfficientNet-WideSE-B4 models.\n", + "\n", + "EfficientNet-WideSE models use Squeeze-and-Excitation layers wider than the original EfficientNet models; the width of the SE module is proportional to the width of the Depthwise Separable Convolutions instead of the block width.\n", + "\n", + "WideSE models are slightly more accurate than the original models.\n", + "\n", + "This model is trained with mixed precision using Tensor Cores on Volta and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results over 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.\n", + "\n", + "We use the [NHWC data layout](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) when training with mixed precision.\n", + "\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained ***EfficientNet*** model to perform inference on an image and present the result.\n", + "\n", + "To run the example, you need some extra Python packages installed. These are needed for preprocessing images and visualization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49342854", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17a365de", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "cc63c523", + "metadata": {}, + "source": [ + "Load the model pretrained on ImageNet dataset.\n", + "\n", + "You can choose among the following models:\n", + "\n", + "| TorchHub entrypoint | Description |\n", + "| :----- | :----- |\n", + "| `nvidia_efficientnet_b0` | baseline EfficientNet |\n", + "| `nvidia_efficientnet_b4` | scaled EfficientNet|\n", + "| `nvidia_efficientnet_widese_b0` | model with Squeeze-and-Excitation layers wider than baseline EfficientNet model |\n", + "| `nvidia_efficientnet_widese_b4` | model with Squeeze-and-Excitation layers wider than scaled EfficientNet model |\n", + "\n", + "There are also quantized version of the models, but they require nvidia container. See [quantized models](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet#quantization)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9434f5c7", + "metadata": {}, + "outputs": [], + "source": [ + "efficientnet = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_efficientnet_b0', pretrained=True)\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "efficientnet.eval().to(device)\n" + ] + }, + { + "cell_type": "markdown", + "id": "7303edb8", + "metadata": {}, + "source": [ + "Prepare sample input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "489f6768", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "21f12d5e", + "metadata": {}, + "source": [ + "Run inference. Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probable hypotheses according to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d997b6f5", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(efficientnet(batch), dim=1)\n", + " \n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "8b7a0638", + "metadata": {}, + "source": [ + "Display the result." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "401003b6", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.ANTIALIAS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "e4780b64", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet)\n", + "and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:efficientnet_for_pytorch)\n", + "\n", + "### References\n", + "\n", + " - [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946)\n", + " - [model on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:efficientnet_for_pytorch)\n", + " - [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet)\n", + " - [pretrained model on NGC (efficientnet-b0)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_b0_pyt_amp)\n", + " - [pretrained model on NGC (efficientnet-b4)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_b4_pyt_amp)\n", + " - [pretrained model on NGC (efficientnet-widese-b0)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_widese_b0_pyt_amp)\n", + " - [pretrained model on NGC (efficientnet-widese-b4)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_widese_b4_pyt_amp)\n", + " - [pretrained, quantized model on NGC (efficientnet-widese-b0)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_widese_b0_pyt_amp)\n", + " - [pretrained, quantized model on NGC (efficientnet-widese-b4)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_widese_b4_pyt_amp)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_fastpitch.ipynb b/assets/hub/nvidia_deeplearningexamples_fastpitch.ipynb new file mode 100644 index 000000000000..09ca3a568f57 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_fastpitch.ipynb @@ -0,0 +1,324 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b844056c", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# FastPitch 2\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**The FastPitch model for generating mel spectrograms from text**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "This notebook demonstrates a PyTorch implementation of the FastPitch model described in the [FastPitch](https://arxiv.org/abs/2006.06873) paper.\n", + "The FastPitch model generates mel-spectrograms and predicts a pitch contour from raw input text. In version 1.1, it does not need any pre-trained aligning model to bootstrap from. To get the audio waveform we need a second model that will produce it from the generated mel-spectrogram. 
In this notebook we use HiFi-GAN model for that second step.\n", + "\n", + "The FastPitch model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. The main differences between FastPitch vs FastSpeech are as follows:\n", + "* no dependence on external aligner (Transformer TTS, Tacotron 2); in version 1.1, FastPitch aligns audio to transcriptions by itself as in [One TTS Alignment To Rule Them All](https://arxiv.org/abs/2108.10447),\n", + "* FastPitch explicitly learns to predict the pitch contour,\n", + "* pitch conditioning removes harsh sounding artifacts and provides faster convergence,\n", + "* no need for distilling mel-spectrograms with a teacher model,\n", + "* capabilities to train a multi-speaker model.\n", + "\n", + "\n", + "#### Model architecture\n", + "\n", + "![FastPitch Architecture](https://raw.githubusercontent.com/NVIDIA/DeepLearningExamples/master/PyTorch/SpeechSynthesis/FastPitch/img/fastpitch_model.png)\n", + "\n", + "### Example\n", + "In the example below:\n", + "\n", + "- pretrained FastPitch and HiFiGAN models are loaded from torch.hub\n", + "- given tensor representation of an input text (\"Say this smoothly to prove you are not a robot.\"), FastPitch generates mel spectrogram\n", + "- HiFiGAN generates sound given the mel spectrogram\n", + "- the output sound is saved in an 'audio.wav' file\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing of text and audio, as well as for display and input/output handling. Finally, for better performance of FastPitch model, we download the CMU pronounciation dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05ac615a", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "apt-get update\n", + "apt-get install -y libsndfile1 wget\n", + "pip install numpy scipy librosa unidecode inflect librosa matplotlib==3.6.3\n", + "wget https://raw.githubusercontent.com/NVIDIA/NeMo/263a30be71e859cee330e5925332009da3e5efbc/scripts/tts_dataset_files/heteronyms-052722 -qO heteronyms\n", + "wget https://raw.githubusercontent.com/NVIDIA/NeMo/263a30be71e859cee330e5925332009da3e5efbc/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 -qO cmudict-0.7b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "848828d1", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import Audio\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "a8a93fec", + "metadata": {}, + "source": [ + "Download and setup FastPitch generator model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0de224eb", + "metadata": {}, + "outputs": [], + "source": [ + "fastpitch, generator_train_setup = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_fastpitch')" + ] + }, + { + "cell_type": "markdown", + "id": "4f160e82", + "metadata": {}, + "source": [ + "Download and setup vocoder and denoiser models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0655df7", + "metadata": {}, + "outputs": [], + "source": [ + "hifigan, vocoder_train_setup, denoiser = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_hifigan')" + ] + }, + { + "cell_type": "markdown", + "id": "9c8575d3", + "metadata": {}, + "source": [ + "Verify that generator and vocoder models agree on input parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2140a54", + "metadata": {}, + "outputs": [], + "source": [ + "CHECKPOINT_SPECIFIC_ARGS = [\n", + " 'sampling_rate', 'hop_length', 'win_length', 'p_arpabet', 'text_cleaners',\n", + " 'symbol_set', 'max_wav_value', 'prepend_space_to_text',\n", + " 'append_space_to_text']\n", + "\n", + "for k in CHECKPOINT_SPECIFIC_ARGS:\n", + "\n", + " v1 = generator_train_setup.get(k, None)\n", + " v2 = vocoder_train_setup.get(k, None)\n", + "\n", + " assert v1 is None or v2 is None or v1 == v2, \\\n", + " f'{k} mismatch in spectrogram generator and vocoder'" + ] + }, + { + "cell_type": "markdown", + "id": "e24e3c5d", + "metadata": {}, + "source": [ + "Put all models on available device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7383ab5", + "metadata": {}, + "outputs": [], + "source": [ + "fastpitch.to(device)\n", + "hifigan.to(device)\n", + "denoiser.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "dd803d24", + "metadata": {}, + "source": [ + "Load text processor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f512618", + "metadata": {}, + "outputs": [], + "source": [ + "tp = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_textprocessing_utils', cmudict_path=\"cmudict-0.7b\", heteronyms_path=\"heteronyms\")" + ] + }, + { + "cell_type": "markdown", + "id": "c3ee8163", + "metadata": {}, + "source": [ + "Set the text to be synthetized, prepare input and set additional generation parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fad7df55", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"Say this smoothly, to prove you are not a robot.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bca9235", + "metadata": {}, + "outputs": [], + "source": [ + "batches = tp.prepare_input_sequence([text], batch_size=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3993c431", + "metadata": {}, + "outputs": [], + "source": [ + "gen_kw = {'pace': 1.0,\n", + " 'speaker': 0,\n", + " 'pitch_tgt': None,\n", + " 'pitch_transform': None}\n", + "denoising_strength = 0.005" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d4b3ecd", + "metadata": {}, + "outputs": [], + "source": [ + "for batch in batches:\n", + " with torch.no_grad():\n", + " mel, mel_lens, *_ = fastpitch(batch['text'].to(device), **gen_kw)\n", + " audios = hifigan(mel).float()\n", + " audios = denoiser(audios.squeeze(1), denoising_strength)\n", + " audios = audios.squeeze(1) * vocoder_train_setup['max_wav_value']\n" + ] + }, + { + "cell_type": "markdown", + "id": "c48a0f58", + "metadata": {}, + "source": [ + "Plot the intermediate spectorgram." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "006163af", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,12))\n", + "res_mel = mel[0].detach().cpu().numpy()\n", + "plt.imshow(res_mel, origin='lower')\n", + "plt.xlabel('time')\n", + "plt.ylabel('frequency')\n", + "_=plt.title('Spectrogram')" + ] + }, + { + "cell_type": "markdown", + "id": "6629975b", + "metadata": {}, + "source": [ + "Synthesize audio." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "251ea5b9", + "metadata": {}, + "outputs": [], + "source": [ + "audio_numpy = audios[0].cpu().numpy()\n", + "Audio(audio_numpy, rate=22050)" + ] + }, + { + "cell_type": "markdown", + "id": "98a6104e", + "metadata": {}, + "source": [ + "Write audio to a wav file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c246eca4", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.io.wavfile import write\n", + "write(\"audio.wav\", vocoder_train_setup['sampling_rate'], audio_numpy)" + ] + }, + { + "cell_type": "markdown", + "id": "6a978c6c", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipes, inference and performance, visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/HiFiGAN) and/or [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/fastpitch_pyt)\n", + "\n", + "### References\n", + "\n", + " - [FastPitch paper](https://arxiv.org/abs/2006.06873)\n", + " - [FastPitch on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/fastpitch_pyt)\n", + " - [HiFi-GAN on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/hifigan_pyt)\n", + " - [FastPitch and HiFi-GAN on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/HiFiGAN)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_gpunet.ipynb b/assets/hub/nvidia_deeplearningexamples_gpunet.ipynb new file mode 100644 index 000000000000..248a21648804 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_gpunet.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f003828f", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# GPUNet\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**GPUNet is a new family of Convolutional Neural Networks designed to max out the performance of NVIDIA GPUs and TensorRT.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "GPUNets are a new family of deployment- and production-ready Convolutional Neural Networks from NVIDIA, auto-designed to max out the performance of NVIDIA GPUs and TensorRT.\n", + "\n", + "Crafted by NVIDIA AI using novel Neural Architecture Search (NAS) methods, GPUNet demonstrates state-of-the-art inference performance up to 2x faster than EfficientNet-X and FBNet-V3. This notebook allows you to load and test all of the GPUNet model implementations listed in our [CVPR-2022 paper](https://arxiv.org/pdf/2205.00841.pdf). 
You can use this notebook to quickly load each of the listed models and perform inference runs.\n", + "\n", + "### Example\n", + "In the example below, the pretrained ***GPUNet-0*** model is loaded by default to perform inference on an image and present the result. You can switch the default pre-trained model loading from GPUNet-0 to one of these: GPUNet-1, GPUNet-2, GPUNet-P0, GPUNet-P1, GPUNet-D1 or GPUNet-D2.\n", + "### Install prerequisites\n", + "To run the example, you need some extra Python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebb1a369", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib\n", + "!pip install timm==0.5.4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5747f0f9", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "\n", + "if torch.cuda.is_available():\n", + " device = torch.device(\"cuda\") \n", + " !nvidia-smi\n", + "else:\n", + " device = torch.device(\"cpu\")\n", + "\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "2f6e3438", + "metadata": {}, + "source": [ + "### Load Pretrained model\n", + "By default, this loads the NVIDIA GPUNet-0 model pre-trained on the ImageNet dataset. You can switch the default pre-trained model loading from GPUNet-0 to one of the following models listed below. \n", + "\n", + "The model architecture is printed as the output of the loaded model. For detailed architecture and latency info, please refer to the [architecture section](https://github.com/NVIDIA/DeepLearningExamples/tree/torchhub/PyTorch/Classification/GPUNet#model-architecture) in the original repo and Table [3](https://arxiv.org/pdf/2205.00841.pdf) in the CVPR-2022 paper, respectively. 
\n", + "\n", + "Please pick and choose one of the following pre-trained models:\n", + "\n", + "| TorchHub model | Description |\n", + "| :----- | :----- |\n", + "| `GPUNet-0` | GPUNet-0 has the fastest measured latency on GV100 |\n", + "| `GPUNet-1` | GPUNet-1 has improved accuracy with one additional layer on GPUNet-0|\n", + "| `GPUNet-2` | GPUNet-2 has higher accuracy with two additional layers on GPUNet-0 |\n", + "| `GPUNet-P0` | GPUNet-P0 is the distilled model with higher accuracy than GPUNet-0 but similar latency|\n", + "| `GPUNet-P1` | GPUNet-P1 is distilled model with even higher accuracy than GPUNet-1 but similar latency |\n", + "| `GPUNet-D1` | GPUNet-D1 has the second highest accuracy amongst all GPUNets|\n", + "| `GPUNet-D2` | GPUNet-D2 has the highest accuracy amongst all GPUNets |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a59a679", + "metadata": {}, + "outputs": [], + "source": [ + "model_type = \"GPUNet-0\" # select one from above\n", + "precision = \"fp32\" # select either fp32 of fp16 (for better performance on GPU)\n", + "\n", + "gpunet = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_gpunet', pretrained=True, model_type=model_type, model_math=precision)\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "gpunet.to(device)\n", + "gpunet.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "be086399", + "metadata": {}, + "source": [ + "### Prepare inference data\n", + "Prepare sample input data for inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34097ec0", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)\n", + "\n", + "if precision == \"fp16\":\n", + " batch = batch.half()\n", + " \n", + "print(\"Ready to run inference...\")" + ] + }, + { + "cell_type": "markdown", + "id": "2e1d3345", + "metadata": {}, + "source": [ + "### Run inference\n", + "Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probable hypotheses according to the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "763f63b6", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(gpunet(batch), dim=1)\n", + " \n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "af473b19", + "metadata": {}, + "source": [ + "### Display result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f9ecc93", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.ANTIALIAS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "bc98fef5", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/GPUNet)\n", + "\n", + "### References\n", + "\n", + " - [GPUNets: Searching Deployable Convolution Neural Networks for GPUs](https://arxiv.org/pdf/2205.00841.pdf)\n", + " - [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/GPUNet)\n", + " - [pretrained model on NGC (GPUNet-0)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_0_pyt_ckpt)\n", + " - [pretrained model on NGC (GPUNet-1)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_1_pyt_ckpt)\n", + " - [pretrained model on NGC (GPUNet-2)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_2_pyt_ckpt)\n", + " - [pretrained distilled model on NGC (GPUNet-P0)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_p0_pyt_ckpt)\n", + " - [pretrained, distilled model on NGC (GPUNet-P1)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_p1_pyt_ckpt)\n", + " - [pretrained, distilled model on NGC (GPUNet-D1)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_d1_pyt_ckpt)\n", + " - [pretrained, distilled model on NGC (GPUNet-D2)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_d2_pyt_ckpt)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_hifigan.ipynb b/assets/hub/nvidia_deeplearningexamples_hifigan.ipynb new file mode 100644 index 000000000000..da04aad4410c --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_hifigan.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ad9f8ba2", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# HiFi GAN\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**The HiFi GAN model for generating waveforms from mel spectrograms**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "This notebook demonstrates a PyTorch implementation of the HiFi-GAN model described in the paper: [HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646).\n", + "The HiFi-GAN model implements a spectrogram inversion model that allows to synthesize 
speech waveforms from mel-spectrograms. It follows the generative adversarial network (GAN) paradigm, and is composed of a generator and a discriminator. After training, the generator is used for synthesis, and the discriminator is discarded.\n", + "\n", + "Our implementation is based on the one [published by the authors of the paper](https://github.com/jik876/hifi-gan). We modify the original hyperparameters and provide an alternative training recipe, which enables training on larger batches and faster convergence. HiFi-GAN is trained on a publicly available [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). The samples demonstrate speech synthesized with our publicly available FastPitch and HiFi-GAN checkpoints.\n", + "\n", + "#### Model architecture\n", + "\n", + "![HiFiGAN Architecture](https://raw.githubusercontent.com/NVIDIA/DeepLearningExamples/master/PyTorch/SpeechSynthesis/HiFiGAN/img/hifigan_model.png)\n", + "\n", + "### Example\n", + "In the example below:\n", + "\n", + "- pretrained FastPitch and HiFiGAN models are loaded from torch.hub\n", + "- given tensor representation of an input text (\"Say this smoothly to prove you are not a robot.\"), FastPitch generates mel spectrogram \n", + "- HiFiGAN generates sound given the mel spectrogram\n", + "- the output sound is saved in an 'audio.wav' file\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing of text and audio, as well as for display and input/output handling. Finally, for better performance of FastPitch model, we download the CMU pronounciation dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2cf6412", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install numpy scipy librosa unidecode inflect librosa matplotlib==3.6.3\n", + "apt-get update\n", + "apt-get install -y libsndfile1 wget\n", + "wget https://raw.githubusercontent.com/NVIDIA/NeMo/263a30be71e859cee330e5925332009da3e5efbc/scripts/tts_dataset_files/heteronyms-052722 -qO heteronyms\n", + "wget https://raw.githubusercontent.com/NVIDIA/NeMo/263a30be71e859cee330e5925332009da3e5efbc/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 -qO cmudict-0.7b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97c0c357", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import Audio\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "c6b05df7", + "metadata": {}, + "source": [ + "Download and setup FastPitch generator model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac394a05", + "metadata": {}, + "outputs": [], + "source": [ + "fastpitch, generator_train_setup = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_fastpitch')" + ] + }, + { + "cell_type": "markdown", + "id": "930dfcb6", + "metadata": {}, + "source": [ + "Download and setup vocoder and denoiser models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2157457", + "metadata": {}, + "outputs": [], + "source": [ + "hifigan, vocoder_train_setup, denoiser = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_hifigan')" + ] + }, + { + "cell_type": "markdown", + "id": "334e163f", + "metadata": {}, + "source": [ + "Verify that generator and vocoder models agree on input parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b07030e5", + "metadata": {}, + "outputs": [], + "source": [ + "CHECKPOINT_SPECIFIC_ARGS = [\n", + " 'sampling_rate', 'hop_length', 'win_length', 'p_arpabet', 'text_cleaners',\n", + " 'symbol_set', 'max_wav_value', 'prepend_space_to_text',\n", + " 'append_space_to_text']\n", + "\n", + "for k in CHECKPOINT_SPECIFIC_ARGS:\n", + "\n", + " v1 = generator_train_setup.get(k, None)\n", + " v2 = vocoder_train_setup.get(k, None)\n", + "\n", + " assert v1 is None or v2 is None or v1 == v2, \\\n", + " f'{k} mismatch in spectrogram generator and vocoder'" + ] + }, + { + "cell_type": "markdown", + "id": "37d00c33", + "metadata": {}, + "source": [ + "Put all models on available device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78dea725", + "metadata": {}, + "outputs": [], + "source": [ + "fastpitch.to(device)\n", + "hifigan.to(device)\n", + "denoiser.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "ca87ee4b", + "metadata": {}, + "source": [ + "Load text processor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75ccfe9f", + "metadata": {}, + "outputs": [], + "source": [ + "tp = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_textprocessing_utils', cmudict_path=\"cmudict-0.7b\", heteronyms_path=\"heteronyms\")" + ] + }, + { + "cell_type": "markdown", + "id": "711e02f7", + "metadata": {}, + "source": [ + "Set the text to be synthetized, prepare input and set additional generation parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d465b2b8", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"Say this smoothly, to prove you are not a robot.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe72e111", + "metadata": {}, + "outputs": [], + "source": [ + "batches = tp.prepare_input_sequence([text], batch_size=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97ab345f", + "metadata": {}, + "outputs": [], + "source": [ + "gen_kw = {'pace': 1.0,\n", + " 'speaker': 0,\n", + " 'pitch_tgt': None,\n", + " 'pitch_transform': None}\n", + "denoising_strength = 0.005" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad88a994", + "metadata": {}, + "outputs": [], + "source": [ + "for batch in batches:\n", + " with torch.no_grad():\n", + " mel, mel_lens, *_ = fastpitch(batch['text'].to(device), **gen_kw)\n", + " audios = hifigan(mel).float()\n", + " audios = denoiser(audios.squeeze(1), denoising_strength)\n", + " audios = audios.squeeze(1) * vocoder_train_setup['max_wav_value']\n" + ] + }, + { + "cell_type": "markdown", + "id": "215ac622", + "metadata": {}, + "source": [ + "Plot the intermediate spectorgram." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1391d11a", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,12))\n", + "res_mel = mel[0].detach().cpu().numpy()\n", + "plt.imshow(res_mel, origin='lower')\n", + "plt.xlabel('time')\n", + "plt.ylabel('frequency')\n", + "_=plt.title('Spectrogram')" + ] + }, + { + "cell_type": "markdown", + "id": "2bc202bd", + "metadata": {}, + "source": [ + "Syntesize audio." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dff55e0a", + "metadata": {}, + "outputs": [], + "source": [ + "audio_numpy = audios[0].cpu().numpy()\n", + "Audio(audio_numpy, rate=22050)" + ] + }, + { + "cell_type": "markdown", + "id": "911663e6", + "metadata": {}, + "source": [ + "Write audio to wav file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e6bb116", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.io.wavfile import write\n", + "write(\"audio.wav\", vocoder_train_setup['sampling_rate'], audio_numpy)" + ] + }, + { + "cell_type": "markdown", + "id": "927c61db", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipies, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/HiFiGAN) and/or [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/hifigan_pyt)\n", + "\n", + "### References\n", + "\n", + " - [HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646)\n", + " - [Original implementation](https://github.com/jik876/hifi-gan)\n", + " - [FastPitch on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/fastpitch_pyt)\n", + " - [HiFi-GAN on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/hifigan_pyt)\n", + " - [FastPitch and HiFi-GAN on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/HiFi-GAN)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_resnet50.ipynb b/assets/hub/nvidia_deeplearningexamples_resnet50.ipynb new file mode 100644 index 000000000000..c1bac33dbf51 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_resnet50.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66c2720d", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNet50\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**ResNet50 model trained with mixed precision using Tensor Cores.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The **_ResNet50 v1.5_** model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).\n", + "\n", + "The difference between v1 and v1.5 is that, in the bottleneck blocks which requires\n", + "downsampling, v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution.\n", + "\n", + "This difference makes ResNet50 v1.5 slightly more accurate (\\~0.5% top1) than v1, but comes with a small performance drawback (\\~5% imgs/sec).\n", + "\n", + "The model is initialized as described in 
[Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)\n", + "\n", + "This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results over 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.\n", + "\n", + "Note that the ResNet50 v1.5 model can be deployed for inference on the [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) using TorchScript, ONNX Runtime or TensorRT as an execution backend. For details check [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnet_for_triton_from_pytorch)\n", + "\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained **_ResNet50 v1.5_** model to perform inference on **_image_** and present the result.\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78f246de", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "519f74f5", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "dee3c5cc", + "metadata": {}, + "source": [ + "Load the model pretrained on ImageNet dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6839294f", + "metadata": {}, + "outputs": [], + "source": [ + "resnet50 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "resnet50.eval().to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "71c9765d", + "metadata": {}, + "source": [ + "Prepare sample input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f3a02e3", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "c9e8c0b1", + "metadata": {}, + "source": [ + "Run inference. Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probably hypothesis according to the model." 
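+ ,
+ "\n",
+ "If you prefer to stay with plain PyTorch instead of the helper, the same top-5 predictions can be read off directly (a sketch; the NVIDIA helper additionally maps class indices to readable labels):\n",
+ "\n",
+ "```python\n",
+ "with torch.no_grad():\n",
+ "    probs = torch.nn.functional.softmax(resnet50(batch), dim=1)\n",
+ "top_prob, top_catid = torch.topk(probs, 5, dim=1)  # per-image top-5 probabilities and ImageNet class ids\n",
+ "```"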
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f4c40b9", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(resnet50(batch), dim=1)\n", + "\n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "5c6115ba", + "metadata": {}, + "source": [ + "Display the result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2d32382", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.LANCZOS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)\n" + ] + }, + { + "cell_type": "markdown", + "id": "20e348ab", + "metadata": {}, + "source": [ + "### Details\n", + "\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5)\n", + "and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnet_50_v1_5_for_pytorch)\n", + "\n", + "### References\n", + "\n", + "- [Original ResNet50 v1 paper](https://arxiv.org/abs/1512.03385)\n", + "- [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)\n", + "- [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5)\n", + "- [model on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnet_50_v1_5_for_pytorch)\n", + "- [pretrained model on NGC](https://ngc.nvidia.com/catalog/models/nvidia:resnet50_pyt_amp)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_resnext.ipynb b/assets/hub/nvidia_deeplearningexamples_resnext.ipynb new file mode 100644 index 000000000000..cc968b11d524 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_resnext.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f6bde560", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNeXt101\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**ResNet with bottleneck 3x3 Convolutions substituted by 3x3 Grouped Convolutions, trained with mixed precision using Tensor Cores.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/ResNeXtArch.png) | ![alt](https://pytorch.org/assets/images/classification.jpg)\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The ***ResNeXt101-32x4d*** is a model introduced in the [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf) paper.\n", + "\n", + "It is based on regular ResNet model, substituting 3x3 convolutions inside the bottleneck block for 3x3 grouped convolutions.\n", + "\n", + "This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. 
This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.\n", + "\n", + "We use [NHWC data layout](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) when training using Mixed Precision.\n", + "\n", + "Note that the ResNeXt101-32x4d model can be deployed for inference on the [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) using TorchScript, ONNX Runtime or TensorRT as an execution backend. For details check [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnext_for_triton_from_pytorch)\n", + "\n", + "#### Model architecture\n", + "\n", + "![ResNextArch](https://pytorch.org/assets/images/ResNeXtArch.png)\n", + "\n", + "_Image source: [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf)_\n", + "\n", + "Image shows difference between ResNet bottleneck block and ResNeXt bottleneck block.\n", + "\n", + "ResNeXt101-32x4d model's cardinality equals to 32 and bottleneck width equals to 4.\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained ***ResNeXt101-32x4d*** model to perform inference on images and present the result.\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6284d8fa", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dcf5bde", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "d36102e3", + "metadata": {}, + "source": [ + "Load the model pretrained on ImageNet dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a2cad8f", + "metadata": {}, + "outputs": [], + "source": [ + "resneXt = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resneXt')\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "resneXt.eval().to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "46a9ccd2", + "metadata": {}, + "source": [ + "Prepare sample input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9a32ac0", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "5eeef251", + "metadata": {}, + "source": [ + "Run inference. 
Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probably hypothesis according to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c2dacfe", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(resneXt(batch), dim=1)\n", + " \n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "60cb6124", + "metadata": {}, + "source": [ + "Display the result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c10f528a", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.ANTIALIAS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)\n" + ] + }, + { + "cell_type": "markdown", + "id": "653c5f00", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnext101-32x4d)\n", + "and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnext_for_pytorch)\n", + "\n", + "\n", + "### References\n", + "\n", + " - [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf)\n", + " - [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnext101-32x4d)\n", + " - [model on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnext_for_pytorch)\n", + " - [pretrained model on NGC](https://ngc.nvidia.com/catalog/models/nvidia:resnext101_32x4d_pyt_amp)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_se-resnext.ipynb b/assets/hub/nvidia_deeplearningexamples_se-resnext.ipynb new file mode 100644 index 000000000000..10268dd88a4a --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_se-resnext.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0d837faa", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# SE-ResNeXt101\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**ResNeXt with Squeeze-and-Excitation module added, trained with mixed precision using Tensor Cores.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/SEArch.png) | ![alt](https://pytorch.org/assets/images/classification.jpg)\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The ***SE-ResNeXt101-32x4d*** is a [ResNeXt101-32x4d](https://arxiv.org/pdf/1611.05431.pdf)\n", + "model with added Squeeze-and-Excitation module introduced\n", + "in the [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf) paper.\n", + "\n", + "This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. 
This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.\n", + "\n", + "We use [NHWC data layout](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) when training using Mixed Precision.\n", + "\n", + "#### Model architecture\n", + "\n", + "![SEArch](https://pytorch.org/assets/images/SEArch.png)\n", + "\n", + "_Image source: [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf)_\n", + "\n", + "Image shows the architecture of SE block and where is it placed in ResNet bottleneck block.\n", + "\n", + "\n", + "Note that the SE-ResNeXt101-32x4d model can be deployed for inference on the [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) using TorchScript, ONNX Runtime or TensorRT as an execution backend. For details check [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/se_resnext_for_triton_from_pytorch).\n", + "\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained ***SE-ResNeXt101-32x4d*** model to perform inference on images and present the result.\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8f575d1", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1f397f1", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "7501370a", + "metadata": {}, + "source": [ + "Load the model pretrained on ImageNet dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "642724b2", + "metadata": {}, + "outputs": [], + "source": [ + "resneXt = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_se_resnext101_32x4d')\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "resneXt.eval().to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "d60343f8", + "metadata": {}, + "source": [ + "Prepare sample input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "302c281e", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "6dd2775b", + "metadata": {}, + "source": [ + "Run inference. Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probable hypotheses according to the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc63ff1b", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(resneXt(batch), dim=1)\n", + " \n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "20296003", + "metadata": {}, + "source": [ + "Display the result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a8ef184", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.ANTIALIAS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)\n" + ] + }, + { + "cell_type": "markdown", + "id": "0e6c679f", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/se-resnext101-32x4d)\n", + "and/or [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/se_resnext_for_pytorch).\n", + "\n", + "\n", + "### References\n", + "\n", + " - [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf)\n", + " - [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/se-resnext101-32x4d)\n", + " - [model on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/se_resnext_for_pytorch)\n", + " - [pretrained model on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/seresnext101_32x4d_pyt_amp)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_ssd.ipynb b/assets/hub/nvidia_deeplearningexamples_ssd.ipynb new file mode 100644 index 000000000000..c2a8a01a4e6e --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_ssd.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "add05f5a", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# SSD\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**Single Shot MultiBox Detector model for object detection**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/ssd_diagram.png) | ![alt](https://pytorch.org/assets/images/ssd.png)\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "This SSD300 model is based on the\n", + "[SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325) paper, which\n", + "describes SSD as “a method for detecting objects in images using a single deep neural network\".\n", + "The input size is fixed to 300x300.\n", + "\n", + "The main difference between this model and the one described in the paper is in the backbone.\n", + "Specifically, the VGG model is obsolete and is replaced by the ResNet-50 model.\n", + "\n", + "From the\n", + "[Speed/accuracy trade-offs for modern convolutional object detectors](https://arxiv.org/abs/1611.10012)\n", + "paper, the following enhancements were made to the backbone:\n", + "* The conv5_x, avgpool, fc and softmax layers were removed from the original classification model.\n", + "* All strides in 
conv4_x are set to 1x1.\n", + "\n", + "The backbone is followed by 5 additional convolutional layers.\n", + "In addition to the convolutional layers, we attached 6 detection heads:\n", + "* The first detection head is attached to the last conv4_x layer.\n", + "* The other five detection heads are attached to the corresponding 5 additional layers.\n", + "\n", + "Detector heads are similar to the ones referenced in the paper, however,\n", + "they are enhanced by additional BatchNorm layers after each convolution.\n", + "\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained SSD model to detect objects in sample images and visualize the result.\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7799905", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install numpy scipy scikit-image matplotlib" + ] + }, + { + "cell_type": "markdown", + "id": "ee048b09", + "metadata": {}, + "source": [ + "Load an SSD model pretrained on COCO dataset, as well as a set of utility methods for convenient and comprehensive formatting of input and output of the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57eadd0d", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "ssd_model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd')\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd_processing_utils')" + ] + }, + { + "cell_type": "markdown", + "id": "f313bb4f", + "metadata": {}, + "source": [ + "Now, prepare the loaded model for inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc269b96", + "metadata": {}, + "outputs": [], + "source": [ + "ssd_model.to('cuda')\n", + "ssd_model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "f44b4baf", + "metadata": {}, + "source": [ + "Prepare input images for object detection.\n", + "(Example links below correspond to first few test images from the COCO dataset, but you can also specify paths to your local images here)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97de9048", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/val2017/000000397133.jpg',\n", + " 'http://images.cocodataset.org/val2017/000000037777.jpg',\n", + " 'http://images.cocodataset.org/val2017/000000252219.jpg'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "5c7a8563", + "metadata": {}, + "source": [ + "Format the images to comply with the network input and convert them to tensor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e6b7bce", + "metadata": {}, + "outputs": [], + "source": [ + "inputs = [utils.prepare_input(uri) for uri in uris]\n", + "tensor = utils.prepare_tensor(inputs)" + ] + }, + { + "cell_type": "markdown", + "id": "ba5ef064", + "metadata": {}, + "source": [ + "Run the SSD network to perform object detection." 
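+ ,
+ "\n",
+ "The raw output discussed below contains one prediction per default box; for reference, the 8732 default boxes of SSD300 come from the six detection heads (a sketch of the arithmetic, assuming the standard SSD300 anchor configuration):\n",
+ "\n",
+ "```python\n",
+ "feature_map_sizes = [38, 19, 10, 5, 3, 1]  # spatial size of each detection head's feature map\n",
+ "boxes_per_cell = [4, 6, 6, 6, 4, 4]        # default boxes per feature-map cell\n",
+ "print(sum(s * s * n for s, n in zip(feature_map_sizes, boxes_per_cell)))  # 8732\n",
+ "```"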
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7af0e311", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " detections_batch = ssd_model(tensor)" + ] + }, + { + "cell_type": "markdown", + "id": "2f7b0ce7", + "metadata": {}, + "source": [ + "By default, raw output from SSD network per input image contains\n", + "8732 boxes with localization and class probability distribution.\n", + "Let's filter this output to only get reasonable detections (confidence>40%) in a more comprehensive format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5e55a01", + "metadata": {}, + "outputs": [], + "source": [ + "results_per_input = utils.decode_results(detections_batch)\n", + "best_results_per_input = [utils.pick_best(results, 0.40) for results in results_per_input]" + ] + }, + { + "cell_type": "markdown", + "id": "21d6fcd1", + "metadata": {}, + "source": [ + "The model was trained on COCO dataset, which we need to access in order to translate class IDs into object names.\n", + "For the first time, downloading annotations may take a while." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0c5e835", + "metadata": {}, + "outputs": [], + "source": [ + "classes_to_labels = utils.get_coco_object_dictionary()" + ] + }, + { + "cell_type": "markdown", + "id": "8953e6d2", + "metadata": {}, + "source": [ + "Finally, let's visualize our detections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf2d9efe", + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib import pyplot as plt\n", + "import matplotlib.patches as patches\n", + "\n", + "for image_idx in range(len(best_results_per_input)):\n", + " fig, ax = plt.subplots(1)\n", + " # Show original, denormalized image...\n", + " image = inputs[image_idx] / 2 + 0.5\n", + " ax.imshow(image)\n", + " # ...with detections\n", + " bboxes, classes, confidences = best_results_per_input[image_idx]\n", + " for idx in range(len(bboxes)):\n", + " left, bot, right, top = bboxes[idx]\n", + " x, y, w, h = [val * 300 for val in [left, bot, right - left, top - bot]]\n", + " rect = patches.Rectangle((x, y), w, h, linewidth=1, edgecolor='r', facecolor='none')\n", + " ax.add_patch(rect)\n", + " ax.text(x, y, \"{} {:.0f}%\".format(classes_to_labels[classes[idx] - 1], confidences[idx]*100), bbox=dict(facecolor='white', alpha=0.5))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "d50074e7", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output,\n", + "training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD)\n", + "and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:ssd_for_pytorch)\n", + "\n", + "### References\n", + "\n", + " - [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325) paper\n", + " - [Speed/accuracy trade-offs for modern convolutional object detectors](https://arxiv.org/abs/1611.10012) paper\n", + " - [SSD on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:ssd_for_pytorch)\n", + " - [SSD on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_tacotron2.ipynb b/assets/hub/nvidia_deeplearningexamples_tacotron2.ipynb new file mode 100644 index 000000000000..ecd3b7c29c3e --- /dev/null +++ 
b/assets/hub/nvidia_deeplearningexamples_tacotron2.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d6a36e6b", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Tacotron 2\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**The Tacotron 2 model for generating mel spectrograms from text**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The Tacotron 2 and WaveGlow model form a text-to-speech system that enables user to synthesise a natural sounding speech from raw transcripts without any additional prosody information. The Tacotron 2 model produces mel spectrograms from input text using encoder-decoder architecture. WaveGlow (also available via torch.hub) is a flow-based model that consumes the mel spectrograms to generate speech.\n", + "\n", + "This implementation of Tacotron 2 model differs from the model described in the paper. Our implementation uses Dropout instead of Zoneout to regularize the LSTM layers.\n", + "\n", + "### Example\n", + "\n", + "In the example below:\n", + "- pretrained Tacotron2 and Waveglow models are loaded from torch.hub\n", + "- Given a tensor representation of the input text (\"Hello world, I missed you so much\"), Tacotron2 generates a Mel spectrogram as shown on the illustration\n", + "- Waveglow generates sound given the mel spectrogram\n", + "- the output sound is saved in an 'audio.wav' file\n", + "\n", + "To run the example you need some extra python packages installed.\n", + "These are needed for preprocessing the text and audio, as well as for display and input / output." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a384b737",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "pip install numpy scipy librosa unidecode inflect\n",
+ "apt-get update\n",
+ "apt-get install -y libsndfile1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2578bea8",
+ "metadata": {},
+ "source": [
+ "Load the Tacotron2 model pre-trained on the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/) and prepare it for inference:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d735c9f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')\n",
+ "tacotron2 = tacotron2.to('cuda')\n",
+ "tacotron2.eval()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "96353646",
+ "metadata": {},
+ "source": [
+ "Load the pretrained WaveGlow model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "726773b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')\n",
+ "waveglow = waveglow.remove_weightnorm(waveglow)\n",
+ "waveglow = waveglow.to('cuda')\n",
+ "waveglow.eval()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0055a4a6",
+ "metadata": {},
+ "source": [
+ "Now, let's make the model say:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c432a2a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = \"Hello world, I missed you so much.\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "09f3df08",
+ "metadata": {},
+ "source": [
+ "Format the input using utility methods"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1186aca4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')\n",
+ "sequences, lengths = utils.prepare_input_sequence([text])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52b62d50",
+ "metadata": {},
+ "source": [
+ "Run the chained models:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fe9a3235",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with torch.no_grad():\n",
+ "    mel, _, _ = tacotron2.infer(sequences, lengths)\n",
+ "    audio = waveglow.infer(mel)\n",
+ "audio_numpy = audio[0].data.cpu().numpy()\n",
+ "rate = 22050"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f981ac4",
+ "metadata": {},
+ "source": [
+ "You can write it to a file and listen to it"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4811ba40",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.io.wavfile import write\n",
+ "write(\"audio.wav\", rate, audio_numpy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8484e97",
+ "metadata": {},
+ "source": [
+ "Alternatively, play it right away in a notebook with IPython widgets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4aea3333",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.display import Audio\n",
+ "Audio(audio_numpy, rate=rate)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d8d9b03f",
+ "metadata": {},
+ "source": [
+ "### Details\n",
+ "For detailed information on model input and output, training recipes, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2) and/or
[NGC](https://ngc.nvidia.com/catalog/resources/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", + "\n", + "### References\n", + "\n", + " - [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)\n", + " - [WaveGlow: A Flow-based Generative Network for Speech Synthesis](https://arxiv.org/abs/1811.00002)\n", + " - [Tacotron2 and WaveGlow on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", + " - [Tacotron2 and Waveglow on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_waveglow.ipynb b/assets/hub/nvidia_deeplearningexamples_waveglow.ipynb new file mode 100644 index 000000000000..be4c1c4b8b72 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_waveglow.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1d26fab5", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# WaveGlow\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**WaveGlow model for generating speech from mel spectrograms (generated by Tacotron2)**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The Tacotron 2 and WaveGlow model form a text-to-speech system that enables user to synthesise a natural sounding speech from raw transcripts without any additional prosody information. The Tacotron 2 model (also available via torch.hub) produces mel spectrograms from input text using encoder-decoder architecture. WaveGlow is a flow-based model that consumes the mel spectrograms to generate speech.\n", + "\n", + "### Example\n", + "\n", + "In the example below:\n", + "- pretrained Tacotron2 and Waveglow models are loaded from torch.hub\n", + "- Given a tensor representation of the input text (\"Hello world, I missed you so much\"), Tacotron2 generates a Mel spectrogram as shown on the illustration\n", + "- Waveglow generates sound given the mel spectrogram\n", + "- the output sound is saved in an 'audio.wav' file\n", + "\n", + "To run the example you need some extra python packages installed.\n", + "These are needed for preprocessing the text and audio, as well as for display and input / output." 
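+ ,
+ "\n",
+ "The cells below move both models to the GPU, so make sure a CUDA device is visible before continuing (a minimal sketch):\n",
+ "\n",
+ "```python\n",
+ "import torch\n",
+ "assert torch.cuda.is_available(), 'This example expects a GPU runtime'\n",
+ "```"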
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "406508db", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install numpy scipy librosa unidecode inflect librosa\n", + "apt-get update\n", + "apt-get install -y libsndfile1" + ] + }, + { + "cell_type": "markdown", + "id": "942e77d1", + "metadata": {}, + "source": [ + "Load the WaveGlow model pre-trained on [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "537f1a63", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32')" + ] + }, + { + "cell_type": "markdown", + "id": "7a47b767", + "metadata": {}, + "source": [ + "Prepare the WaveGlow model for inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fbed70c", + "metadata": {}, + "outputs": [], + "source": [ + "waveglow = waveglow.remove_weightnorm(waveglow)\n", + "waveglow = waveglow.to('cuda')\n", + "waveglow.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "8b0dcbce", + "metadata": {}, + "source": [ + "Load a pretrained Tacotron2 model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e1c62ea", + "metadata": {}, + "outputs": [], + "source": [ + "tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp32')\n", + "tacotron2 = tacotron2.to('cuda')\n", + "tacotron2.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "df4cc284", + "metadata": {}, + "source": [ + "Now, let's make the model say:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa1ca779", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"hello world, I missed you so much\"" + ] + }, + { + "cell_type": "markdown", + "id": "4ad6ebad", + "metadata": {}, + "source": [ + "Format the input using utility methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b6dc4d1", + "metadata": {}, + "outputs": [], + "source": [ + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')\n", + "sequences, lengths = utils.prepare_input_sequence([text])" + ] + }, + { + "cell_type": "markdown", + "id": "2de62c22", + "metadata": {}, + "source": [ + "Run the chained models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "881b70b7", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " mel, _, _ = tacotron2.infer(sequences, lengths)\n", + " audio = waveglow.infer(mel)\n", + "audio_numpy = audio[0].data.cpu().numpy()\n", + "rate = 22050" + ] + }, + { + "cell_type": "markdown", + "id": "9471a982", + "metadata": {}, + "source": [ + "You can write it to a file and listen to it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87449085", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.io.wavfile import write\n", + "write(\"audio.wav\", rate, audio_numpy)" + ] + }, + { + "cell_type": "markdown", + "id": "b8555270", + "metadata": {}, + "source": [ + "Alternatively, play it right away in a notebook with IPython widgets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a54e376", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Audio\n", + "Audio(audio_numpy, rate=rate)" + ] + }, + { + "cell_type": "markdown", + "id": "461a1cf1", + "metadata": {}, + "source": [ + "### Details\n", + "For 
detailed information on model input and output, training recipies, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2) and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", + "\n", + "### References\n", + "\n", + " - [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)\n", + " - [WaveGlow: A Flow-based Generative Network for Speech Synthesis](https://arxiv.org/abs/1811.00002)\n", + " - [Tacotron2 and WaveGlow on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", + " - [Tacotron2 and Waveglow on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_fairseq_roberta.ipynb b/assets/hub/pytorch_fairseq_roberta.ipynb new file mode 100644 index 000000000000..a0b9e24a2743 --- /dev/null +++ b/assets/hub/pytorch_fairseq_roberta.ipynb @@ -0,0 +1,190 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a22ee80f", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# RoBERTa\n", + "\n", + "*Author: Facebook AI (fairseq Team)*\n", + "\n", + "**A Robustly Optimized BERT Pretraining Approach**\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "Bidirectional Encoder Representations from Transformers, or [BERT][1], is a\n", + "revolutionary self-supervised pretraining technique that learns to predict\n", + "intentionally hidden (masked) sections of text. Crucially, the representations\n", + "learned by BERT have been shown to generalize well to downstream tasks, and when\n", + "BERT was first released in 2018 it achieved state-of-the-art results on many NLP\n", + "benchmark datasets.\n", + "\n", + "[RoBERTa][2] builds on BERT's language masking strategy and modifies key\n", + "hyperparameters in BERT, including removing BERT's next-sentence pretraining\n", + "objective, and training with much larger mini-batches and learning rates.\n", + "RoBERTa was also trained on an order of magnitude more data than BERT, for a\n", + "longer amount of time. 
This allows RoBERTa representations to generalize even\n", + "better to downstream tasks compared to BERT.\n", + "\n", + "\n", + "### Requirements\n", + "\n", + "We require a few additional Python dependencies for preprocessing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31bf82e3", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install regex requests hydra-core omegaconf" + ] + }, + { + "cell_type": "markdown", + "id": "c661359f", + "metadata": {}, + "source": [ + "### Example\n", + "\n", + "##### Load RoBERTa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea6f6c39", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')\n", + "roberta.eval() # disable dropout (or leave in train mode to finetune)" + ] + }, + { + "cell_type": "markdown", + "id": "ec181a50", + "metadata": {}, + "source": [ + "##### Apply Byte-Pair Encoding (BPE) to input text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb01609c", + "metadata": {}, + "outputs": [], + "source": [ + "tokens = roberta.encode('Hello world!')\n", + "assert tokens.tolist() == [0, 31414, 232, 328, 2]\n", + "assert roberta.decode(tokens) == 'Hello world!'" + ] + }, + { + "cell_type": "markdown", + "id": "6903db0b", + "metadata": {}, + "source": [ + "##### Extract features from RoBERTa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "637c35e5", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract the last layer's features\n", + "last_layer_features = roberta.extract_features(tokens)\n", + "assert last_layer_features.size() == torch.Size([1, 5, 1024])\n", + "\n", + "# Extract all layer's features (layer 0 is the embedding layer)\n", + "all_layers = roberta.extract_features(tokens, return_all_hiddens=True)\n", + "assert len(all_layers) == 25\n", + "assert torch.all(all_layers[-1] == last_layer_features)" + ] + }, + { + "cell_type": "markdown", + "id": "db346d27", + "metadata": {}, + "source": [ + "##### Use RoBERTa for sentence-pair classification tasks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "898b46e2", + "metadata": {}, + "outputs": [], + "source": [ + "# Download RoBERTa already finetuned for MNLI\n", + "roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')\n", + "roberta.eval() # disable dropout for evaluation\n", + "\n", + "with torch.no_grad():\n", + " # Encode a pair of sentences and make a prediction\n", + " tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.')\n", + " prediction = roberta.predict('mnli', tokens).argmax().item()\n", + " assert prediction == 0 # contradiction\n", + "\n", + " # Encode another pair of sentences\n", + " tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.')\n", + " prediction = roberta.predict('mnli', tokens).argmax().item()\n", + " assert prediction == 2 # entailment" + ] + }, + { + "cell_type": "markdown", + "id": "6c234073", + "metadata": {}, + "source": [ + "##### Register a new (randomly initialized) classification head" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a89094b", + "metadata": {}, + "outputs": [], + "source": [ + "roberta.register_classification_head('new_task', num_classes=3)\n", + "logprobs = roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=)" + ] + }, + { + "cell_type": 
"markdown", + "id": "77b22901", + "metadata": {}, + "source": [ + "### References\n", + "\n", + "- [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding][1]\n", + "- [RoBERTa: A Robustly Optimized BERT Pretraining Approach][2]\n", + "\n", + "\n", + "[1]: https://arxiv.org/abs/1810.04805\n", + "[2]: https://arxiv.org/abs/1907.11692" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_fairseq_translation.ipynb b/assets/hub/pytorch_fairseq_translation.ipynb new file mode 100644 index 000000000000..1c54148e4b77 --- /dev/null +++ b/assets/hub/pytorch_fairseq_translation.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "00c45e90", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Transformer (NMT)\n", + "\n", + "*Author: Facebook AI (fairseq Team)*\n", + "\n", + "**Transformer models for English-French and English-German translation.**\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The Transformer, introduced in the paper [Attention Is All You Need][1], is a\n", + "powerful sequence-to-sequence modeling architecture capable of producing\n", + "state-of-the-art neural machine translation (NMT) systems.\n", + "\n", + "Recently, the fairseq team has explored large-scale semi-supervised training of\n", + "Transformers using back-translated data, further improving translation quality\n", + "over the original model. 
More details can be found in [this blog post][2].\n", + "\n", + "\n", + "### Requirements\n", + "\n", + "We require a few additional Python dependencies for preprocessing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc44211c", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install bitarray fastBPE hydra-core omegaconf regex requests sacremoses subword_nmt" + ] + }, + { + "cell_type": "markdown", + "id": "688cbbe5", + "metadata": {}, + "source": [ + "### English-to-French Translation\n", + "\n", + "To translate from English to French using the model from the paper [Scaling\n", + "Neural Machine Translation][3]:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36199fd8", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# Load an En-Fr Transformer model trained on WMT'14 data :\n", + "en2fr = torch.hub.load('pytorch/fairseq', 'transformer.wmt14.en-fr', tokenizer='moses', bpe='subword_nmt')\n", + "\n", + "# Use the GPU (optional):\n", + "en2fr.cuda()\n", + "\n", + "# Translate with beam search:\n", + "fr = en2fr.translate('Hello world!', beam=5)\n", + "assert fr == 'Bonjour à tous !'\n", + "\n", + "# Manually tokenize:\n", + "en_toks = en2fr.tokenize('Hello world!')\n", + "assert en_toks == 'Hello world !'\n", + "\n", + "# Manually apply BPE:\n", + "en_bpe = en2fr.apply_bpe(en_toks)\n", + "assert en_bpe == 'H@@ ello world !'\n", + "\n", + "# Manually binarize:\n", + "en_bin = en2fr.binarize(en_bpe)\n", + "assert en_bin.tolist() == [329, 14044, 682, 812, 2]\n", + "\n", + "# Generate five translations with top-k sampling:\n", + "fr_bin = en2fr.generate(en_bin, beam=5, sampling=True, sampling_topk=20)\n", + "assert len(fr_bin) == 5\n", + "\n", + "# Convert one of the samples to a string and detokenize\n", + "fr_sample = fr_bin[0]['tokens']\n", + "fr_bpe = en2fr.string(fr_sample)\n", + "fr_toks = en2fr.remove_bpe(fr_bpe)\n", + "fr = en2fr.detokenize(fr_toks)\n", + "assert fr == en2fr.decode(fr_sample)" + ] + }, + { + "cell_type": "markdown", + "id": "66b917b7", + "metadata": {}, + "source": [ + "### English-to-German Translation\n", + "\n", + "Semi-supervised training with back-translation is an effective way of improving\n", + "translation systems. In the paper [Understanding Back-Translation at Scale][4],\n", + "we back-translate over 200 million German sentences to use as additional\n", + "training data. An ensemble of five of these models was the winning submission to\n", + "the [WMT'18 English-German news translation competition][5].\n", + "\n", + "We can further improved this approach through [noisy-channel reranking][6]. More\n", + "details can be found in [this blog post][7]. 
An ensemble of models trained with\n", + "this technique was the winning submission to the [WMT'19 English-German news\n", + "translation competition][8].\n", + "\n", + "To translate from English to German using one of the models from the winning submission:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6ec05f7", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# Load an En-De Transformer model trained on WMT'19 data:\n", + "en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model', tokenizer='moses', bpe='fastbpe')\n", + "\n", + "# Access the underlying TransformerModel\n", + "assert isinstance(en2de.models[0], torch.nn.Module)\n", + "\n", + "# Translate from En-De\n", + "de = en2de.translate('PyTorch Hub is a pre-trained model repository designed to facilitate research reproducibility.')\n", + "assert de == 'PyTorch Hub ist ein vorgefertigtes Modell-Repository, das die Reproduzierbarkeit der Forschung erleichtern soll.'" + ] + }, + { + "cell_type": "markdown", + "id": "5633bdd6", + "metadata": {}, + "source": [ + "We can also do a round-trip translation to create a paraphrase:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c9ced10", + "metadata": {}, + "outputs": [], + "source": [ + "# Round-trip translations between English and German:\n", + "en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model', tokenizer='moses', bpe='fastbpe')\n", + "de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en.single_model', tokenizer='moses', bpe='fastbpe')\n", + "\n", + "paraphrase = de2en.translate(en2de.translate('PyTorch Hub is an awesome interface!'))\n", + "assert paraphrase == 'PyTorch Hub is a fantastic interface!'\n", + "\n", + "# Compare the results with English-Russian round-trip translation:\n", + "en2ru = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-ru.single_model', tokenizer='moses', bpe='fastbpe')\n", + "ru2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.ru-en.single_model', tokenizer='moses', bpe='fastbpe')\n", + "\n", + "paraphrase = ru2en.translate(en2ru.translate('PyTorch Hub is an awesome interface!'))\n", + "assert paraphrase == 'PyTorch is a great interface!'" + ] + }, + { + "cell_type": "markdown", + "id": "5e28c30c", + "metadata": {}, + "source": [ + "### References\n", + "\n", + "- [Attention Is All You Need][1]\n", + "- [Scaling Neural Machine Translation][3]\n", + "- [Understanding Back-Translation at Scale][4]\n", + "- [Facebook FAIR's WMT19 News Translation Task Submission][6]\n", + "\n", + "\n", + "[1]: https://arxiv.org/abs/1706.03762\n", + "[2]: https://code.fb.com/ai-research/scaling-neural-machine-translation-to-bigger-data-sets-with-faster-training-and-inference/\n", + "[3]: https://arxiv.org/abs/1806.00187\n", + "[4]: https://arxiv.org/abs/1808.09381\n", + "[5]: http://www.statmt.org/wmt18/translation-task.html\n", + "[6]: https://arxiv.org/abs/1907.06616\n", + "[7]: https://ai.facebook.com/blog/facebook-leads-wmt-translation-competition/\n", + "[8]: http://www.statmt.org/wmt19/translation-task.html" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_alexnet.ipynb b/assets/hub/pytorch_vision_alexnet.ipynb new file mode 100644 index 000000000000..9d657a3d0f8b --- /dev/null +++ b/assets/hub/pytorch_vision_alexnet.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a8c51646", + "metadata": {}, + "source": [ + "### This 
notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# AlexNet\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**The 2012 ImageNet winner achieved a top-5 error of 15.3%, more than 10.8 percentage points lower than that of the runner up.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/alexnet1.png) | ![alt](https://pytorch.org/assets/images/alexnet2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de851ed4", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'alexnet', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "c6c7ae8b", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e4e8088", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f52cf73f", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0d41084", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "813b3bab", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "2b0e3dac", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "AlexNet competed in the ImageNet Large Scale Visual Recognition Challenge on September 30, 2012. The network achieved a top-5 error of 15.3%, more than 10.8 percentage points lower than that of the runner up. The original paper's primary result was that the depth of the model was essential for its high performance, which was computationally expensive, but made feasible due to the utilization of graphics processing units (GPUs) during training.\n", + "\n", + "The 1-crop error rates on the ImageNet dataset with the pretrained model are listed below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| AlexNet | 43.45 | 20.91 |\n", + "\n", + "### References\n", + "\n", + "1. [One weird trick for parallelizing convolutional neural networks](https://arxiv.org/abs/1404.5997)." 
+ ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_deeplabv3_resnet101.ipynb b/assets/hub/pytorch_vision_deeplabv3_resnet101.ipynb new file mode 100644 index 000000000000..9908963f7726 --- /dev/null +++ b/assets/hub/pytorch_vision_deeplabv3_resnet101.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7e0c977e", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Deeplabv3\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**DeepLabV3 models with ResNet-50, ResNet-101 and MobileNet-V3 backbones**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/deeplab1.png) | ![alt](https://pytorch.org/assets/images/deeplab2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aee2b394", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_resnet50', pretrained=True)\n", + "# or any of these variants\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_resnet101', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_mobilenet_v3_large', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "ffb66f42", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(N, 3, H, W)`, where `N` is the number of images, `H` and `W` are expected to be at least `224` pixels.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "The model returns an `OrderedDict` with two Tensors that are of the same height and width as the input Tensor, but with 21 classes.\n", + "`output['out']` contains the semantic masks, and `output['aux']` contains the auxiliary loss values per-pixel. In inference mode, `output['aux']` is not useful.\n", + "So, `output['out']` is of shape `(N, 21, H, W)`. More documentation can be found [here](https://pytorch.org/vision/stable/models.html#semantic-segmentation)." 
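+ "\n",
+ "As a quick sanity check of the structure described above, here is a small sketch added for illustration (it assumes the `model` loaded in the first cell and uses a dummy, unnormalized input purely to inspect shapes):\n",
+ "\n",
+ "```python\n",
+ "import torch\n",
+ "\n",
+ "dummy = torch.rand(1, 3, 224, 224)   # dummy batch, only used to look at output shapes\n",
+ "with torch.no_grad():\n",
+ "    out = model(dummy)\n",
+ "print(list(out.keys()))     # ['out', 'aux']\n",
+ "print(out['out'].shape)     # torch.Size([1, 21, 224, 224])\n",
+ "print(out['aux'].shape)     # torch.Size([1, 21, 224, 224])\n",
+ "```"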
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eb1b292", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/deeplab1.png\", \"deeplab1.png\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88780d40", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "input_image = input_image.convert(\"RGB\")\n", + "preprocess = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)['out'][0]\n", + "output_predictions = output.argmax(0)" + ] + }, + { + "cell_type": "markdown", + "id": "162a301a", + "metadata": {}, + "source": [ + "The output here is of shape `(21, H, W)`, and at each location, there are unnormalized probabilities corresponding to the prediction of each class.\n", + "To get the maximum prediction of each class, and then use it for a downstream task, you can do `output_predictions = output.argmax(0)`.\n", + "\n", + "Here's a small snippet that plots the predictions, with each color being assigned to each class (see the visualized image on the left)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "967e4c05", + "metadata": {}, + "outputs": [], + "source": [ + "# create a color pallette, selecting a color for each class\n", + "palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])\n", + "colors = torch.as_tensor([i for i in range(21)])[:, None] * palette\n", + "colors = (colors % 255).numpy().astype(\"uint8\")\n", + "\n", + "# plot the semantic segmentation predictions of 21 classes in each color\n", + "r = Image.fromarray(output_predictions.byte().cpu().numpy()).resize(input_image.size)\n", + "r.putpalette(colors)\n", + "\n", + "import matplotlib.pyplot as plt\n", + "plt.imshow(r)\n", + "# plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "a766996f", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Deeplabv3-ResNet is constructed by a Deeplabv3 model using a ResNet-50 or ResNet-101 backbone.\n", + "Deeplabv3-MobileNetV3-Large is constructed by a Deeplabv3 model using the MobileNetV3 large backbone.\n", + "The pre-trained model has been trained on a subset of COCO train2017, on the 20 categories that are present in the Pascal VOC dataset.\n", + "\n", + "Their accuracies of the pre-trained models evaluated on COCO val2017 dataset are listed below.\n", + "\n", + "| Model structure | Mean IOU | Global Pixelwise Accuracy |\n", + "| ---------------------------- | ----------- | --------------------------|\n", + "| deeplabv3_resnet50 | 66.4 | 92.4 |\n", + "| deeplabv3_resnet101 | 67.4 | 92.4 |\n", + "| deeplabv3_mobilenet_v3_large | 60.3 | 91.2 |\n", + "\n", + "### Resources\n", + "\n", + " - [Rethinking Atrous Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1706.05587)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_densenet.ipynb b/assets/hub/pytorch_vision_densenet.ipynb new file mode 100644 index 000000000000..fc61d267a09d --- /dev/null +++ b/assets/hub/pytorch_vision_densenet.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7dac9025", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Densenet\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**Dense Convolutional Network (DenseNet), connects each layer to every other layer in a feed-forward fashion.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/densenet1.png) | ![alt](https://pytorch.org/assets/images/densenet2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9367c2e", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet121', pretrained=True)\n", + "# or any of these variants\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet169', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet201', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet161', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "16747e9d", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + 
"i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "578d3a1e", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e53747c", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a77ca7e1", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c37376b4", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "5cdbff63", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Dense Convolutional Network (DenseNet), connects each layer to every other layer in a feed-forward fashion. Whereas traditional convolutional networks with L layers have L connections - one between each layer and its subsequent layer - our network has L(L+1)/2 direct connections. For each layer, the feature-maps of all preceding layers are used as inputs, and its own feature-maps are used as inputs into all subsequent layers. 
DenseNets have several compelling advantages: they alleviate the vanishing-gradient problem, strengthen feature propagation, encourage feature reuse, and substantially reduce the number of parameters.\n", + "\n", + "The 1-crop error rates on the ImageNet dataset with the pretrained model are listed below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| densenet121 | 25.35 | 7.83 |\n", + "| densenet169 | 24.00 | 7.00 |\n", + "| densenet201 | 22.80 | 6.43 |\n", + "| densenet161 | 22.35 | 6.20 |\n", + "\n", + "### References\n", + "\n", + " - [Densely Connected Convolutional Networks](https://arxiv.org/abs/1608.06993)." + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_fcn_resnet101.ipynb b/assets/hub/pytorch_vision_fcn_resnet101.ipynb new file mode 100644 index 000000000000..880c4ad7b2d6 --- /dev/null +++ b/assets/hub/pytorch_vision_fcn_resnet101.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ad4eaa3f", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# FCN\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**Fully-Convolutional Network model with ResNet-50 and ResNet-101 backbones**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/deeplab1.png) | ![alt](https://pytorch.org/assets/images/fcn2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "596ae4bd", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'fcn_resnet50', pretrained=True)\n", + "# or\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'fcn_resnet101', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "2e46d2dc", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(N, 3, H, W)`, where `N` is the number of images, `H` and `W` are expected to be at least `224` pixels.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "The model returns an `OrderedDict` with two Tensors that are of the same height and width as the input Tensor, but with 21 classes.\n", + "`output['out']` contains the semantic masks, and `output['aux']` contains the auxiliary loss values per-pixel. In inference mode, `output['aux']` is not useful.\n", + "So, `output['out']` is of shape `(N, 21, H, W)`. More documentation can be found [here](https://pytorch.org/vision/stable/models.html#object-detection-instance-segmentation-and-person-keypoint-detection)."
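+ "\n",
+ "If per-pixel class probabilities are needed rather than raw scores, a softmax over the class dimension can be applied. The following is a minimal sketch added for illustration; `scores` is a hypothetical stand-in for the `output['out']` tensor described above:\n",
+ "\n",
+ "```python\n",
+ "import torch\n",
+ "\n",
+ "scores = torch.randn(1, 21, 224, 224)       # stand-in for output['out']\n",
+ "probs = torch.softmax(scores, dim=1)        # normalize across the 21 classes\n",
+ "per_pixel_class = probs.argmax(dim=1)       # (1, 224, 224) predicted class per pixel\n",
+ "print(probs.sum(dim=1).allclose(torch.ones(1, 224, 224)))  # True: probabilities sum to 1 per pixel\n",
+ "print(per_pixel_class.shape)\n",
+ "```"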
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b259707", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/deeplab1.png\", \"deeplab1.png\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1f6fa05", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "input_image = input_image.convert(\"RGB\")\n", + "preprocess = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)['out'][0]\n", + "output_predictions = output.argmax(0)" + ] + }, + { + "cell_type": "markdown", + "id": "4542090e", + "metadata": {}, + "source": [ + "The output here is of shape `(21, H, W)`, and at each location, there are unnormalized probabilities corresponding to the prediction of each class.\n", + "To get the maximum prediction of each class, and then use it for a downstream task, you can do `output_predictions = output.argmax(0)`.\n", + "\n", + "Here's a small snippet that plots the predictions, with each color being assigned to each class (see the visualized image on the left)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7277631", + "metadata": {}, + "outputs": [], + "source": [ + "# create a color pallette, selecting a color for each class\n", + "palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])\n", + "colors = torch.as_tensor([i for i in range(21)])[:, None] * palette\n", + "colors = (colors % 255).numpy().astype(\"uint8\")\n", + "\n", + "# plot the semantic segmentation predictions of 21 classes in each color\n", + "r = Image.fromarray(output_predictions.byte().cpu().numpy()).resize(input_image.size)\n", + "r.putpalette(colors)\n", + "\n", + "import matplotlib.pyplot as plt\n", + "plt.imshow(r)\n", + "# plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "3a5a585d", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "FCN-ResNet is constructed by a Fully-Convolutional Network model, using a ResNet-50 or a ResNet-101 backbone.\n", + "The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are present in the Pascal VOC dataset.\n", + "\n", + "Their accuracies of the pre-trained models evaluated on COCO val2017 dataset are listed below.\n", + "\n", + "| Model structure | Mean IOU | Global Pixelwise Accuracy |\n", + "| --------------- | ----------- | --------------------------|\n", + "| fcn_resnet50 | 60.5 | 91.4 |\n", + "| fcn_resnet101 | 63.7 | 91.9 |\n", + "\n", + "### Resources\n", + "\n", + " - [Fully Convolutional Networks for Semantic Segmentation](https://arxiv.org/abs/1605.06211)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_ghostnet.ipynb b/assets/hub/pytorch_vision_ghostnet.ipynb new file mode 100644 index 000000000000..9625aa6efbe1 --- /dev/null +++ b/assets/hub/pytorch_vision_ghostnet.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "28f1b67a", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# GhostNet\n", + "\n", + "*Author: Huawei Noah's Ark Lab*\n", + "\n", + "**Efficient networks by generating more features from cheap operations**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d3c51de", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('huawei-noah/ghostnet', 'ghostnet_1x', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "f950f2af", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58fbf55e", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab2d59bf", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2979ac25", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b59152d", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "1d889cf0", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "The GhostNet architecture is based on an Ghost module structure which generate more features from cheap operations. Based on a set of intrinsic feature maps, a series of cheap operations are applied to generate many ghost feature maps that could fully reveal information underlying intrinsic features. 
Experiments conducted on benchmarks demonstrate that the superiority of GhostNet in terms of speed and accuracy tradeoff.\n", + "\n", + "The corresponding accuracy on ImageNet dataset with pretrained model is listed below.\n", + "\n", + "| Model structure | FLOPs | Top-1 acc | Top-5 acc |\n", + "| --------------- | ----------- | ----------- | ----------- |\n", + "| GhostNet 1.0x | 142M | 73.98 | 91.46 |\n", + "\n", + "\n", + "### References\n", + "\n", + "You can read the full paper at this [link](https://arxiv.org/abs/1911.11907).\n", + "\n", + ">@inproceedings{han2019ghostnet,\n", + "> title={GhostNet: More Features from Cheap Operations},\n", + "> author={Kai Han and Yunhe Wang and Qi Tian and Jianyuan Guo and Chunjing Xu and Chang Xu},\n", + "> booktitle={CVPR},\n", + "> year={2020},\n", + ">}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_googlenet.ipynb b/assets/hub/pytorch_vision_googlenet.ipynb new file mode 100644 index 000000000000..a17e3e2097ae --- /dev/null +++ b/assets/hub/pytorch_vision_googlenet.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3de6cad3", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# GoogLeNet\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**GoogLeNet was based on a deep convolutional neural network architecture codenamed \"Inception\" which won ImageNet 2014.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/googlenet1.png) | ![alt](https://pytorch.org/assets/images/googlenet2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f47a584", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "535bcc73", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb150def", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b6cbf35", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59b9161f", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19a93651", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "84c98908", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "GoogLeNet was based on a deep convolutional neural network architecture codenamed \"Inception\", which was responsible for setting the new state of the art for classification and detection in the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC 2014). 
The 1-crop error rates on the ImageNet dataset with a pretrained model are list below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| googlenet | 30.22 | 10.47 |\n", + "\n", + "\n", + "\n", + "### References\n", + "\n", + " - [Going Deeper with Convolutions](https://arxiv.org/abs/1409.4842)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_hardnet.ipynb b/assets/hub/pytorch_vision_hardnet.ipynb new file mode 100644 index 000000000000..f362326c17e2 --- /dev/null +++ b/assets/hub/pytorch_vision_hardnet.ipynb @@ -0,0 +1,161 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7b71e157", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# HarDNet\n", + "\n", + "*Author: PingoLH*\n", + "\n", + "**Harmonic DenseNet pre-trained on ImageNet**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/hardnet.png) | ![alt](https://pytorch.org/assets/images/hardnet_blk.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3be2b63", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('PingoLH/Pytorch-HarDNet', 'hardnet68', pretrained=True)\n", + "# or any of these variants\n", + "# model = torch.hub.load('PingoLH/Pytorch-HarDNet', 'hardnet85', pretrained=True)\n", + "# model = torch.hub.load('PingoLH/Pytorch-HarDNet', 'hardnet68ds', pretrained=True)\n", + "# model = torch.hub.load('PingoLH/Pytorch-HarDNet', 'hardnet39ds', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "781d2cd7", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e95526f", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e7c2c3e", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a40b533b", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfeff952", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "b85e734e", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Harmonic DenseNet (HarDNet) is a low memory traffic CNN model, which is fast and efficient.\n", + "The basic concept is to minimize both computational cost and memory access cost at the same\n", + "time, such that the HarDNet models are 35% faster than ResNet running on GPU\n", + "comparing to models with the same accuracy (except the two DS models that\n", + "were designed for comparing with MobileNet).\n", + "\n", + "Here we have the 4 versions of hardnet models, which contains 39, 68, 85 layers\n", + "w/ or w/o Depthwise Separable Conv respectively.\n", + "Their 1-crop error rates on ImageNet dataset with pretrained models are listed below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| hardnet39ds | 27.92 | 9.57 |\n", + "| hardnet68ds | 25.71 | 8.13 |\n", + "| hardnet68 | 23.52 | 6.99 |\n", + "| hardnet85 | 21.96 | 6.11 |\n", + "\n", + "### References\n", + "\n", + " - [HarDNet: A Low Memory Traffic Network](https://arxiv.org/abs/1909.00948)" + ] + } + ], + 
"metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_ibnnet.ipynb b/assets/hub/pytorch_vision_ibnnet.ipynb new file mode 100644 index 000000000000..d5e9bbcc434e --- /dev/null +++ b/assets/hub/pytorch_vision_ibnnet.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "981d849a", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# IBN-Net\n", + "\n", + "*Author: Xingang Pan*\n", + "\n", + "**Networks with domain/appearance invariance**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "638d2324", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('XingangPan/IBN-Net', 'resnet50_ibn_a', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "dd36e6ab", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50499219", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81296a41", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8f1a0cf", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "805ecb76", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "85dcd9b3", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "IBN-Net is a CNN model with domain/appearance invariance.\n", + "Motivated by style transfer works, IBN-Net carefully unifies instance normalization and batch normalization in a single deep network.\n", + "It provides a simple way to increase both modeling and generalization capacities without adding model complexity.\n", + "IBN-Net is especially suitable for cross domain or person/vehicle re-identification tasks.\n", + "\n", + "The corresponding accuracies on ImageNet dataset with pretrained models are listed below.\n", + "\n", + "| Model name | Top-1 acc | Top-5 acc |\n", + "| --------------- | ----------- | ----------- |\n", + "| resnet50_ibn_a | 77.46 | 93.68 |\n", + "| resnet101_ibn_a | 78.61 | 94.41 |\n", + "| resnext101_ibn_a | 79.12 | 94.58 |\n", + "| se_resnet101_ibn_a | 78.75 | 94.49 |\n", + "\n", + "The rank1/mAP on two Re-ID benchmarks Market1501 and DukeMTMC-reID are listed below (from [michuanhaohao/reid-strong-baseline](https://github.com/michuanhaohao/reid-strong-baseline)).\n", + "\n", + "| Backbone | Market1501 | DukeMTMC-reID |\n", + "| --- | -- | -- |\n", + "| ResNet50 | 94.5 (85.9) | 86.4 (76.4) |\n", + "| ResNet101 | 94.5 (87.1) | 87.6 (77.6) |\n", + "| SeResNet50 | 94.4 (86.3) | 86.4 (76.5) |\n", + "| SeResNet101 | 94.6 (87.3) | 87.5 (78.0) |\n", + "| SeResNeXt50 | 94.9 (87.6) | 88.0 (78.3) |\n", + "| SeResNeXt101 | 95.0 (88.0) | 88.4 (79.0) |\n", + "| ResNet50-IBN-a | 95.0 (88.2) | 90.1 (79.1) |\n", + "\n", + "### References\n", + "\n", + " - [Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net](https://arxiv.org/abs/1807.09441)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_inception_v3.ipynb b/assets/hub/pytorch_vision_inception_v3.ipynb new file mode 100644 index 000000000000..087a6201fe8f --- /dev/null +++ b/assets/hub/pytorch_vision_inception_v3.ipynb @@ -0,0 +1,146 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6a0633dd", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Inception_v3\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**Also called GoogleNetv3, a famous ConvNet trained on ImageNet from 2015**\n", + "\n", + "\"alt\"" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "id": "b593a71f", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'inception_v3', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "24d6e73e", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `299`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1228762", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c187630", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(299),\n", + " transforms.CenterCrop(299),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4d1c366", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc090b7", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "e0199fa7", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Inception v3: Based on the exploration of ways to scale up networks in ways that aim at utilizing the added computation as efficiently as possible by suitably factorized convolutions and aggressive regularization. 
We benchmark our methods on the ILSVRC 2012 classification challenge validation set and demonstrate substantial gains over the state of the art: 21.2% top-1 and 5.6% top-5 error for single frame evaluation using a network with a computational cost of 5 billion multiply-adds per inference and with less than 25 million parameters. With an ensemble of 4 models and multi-crop evaluation, we report 3.5% top-5 error on the validation set (3.6% error on the test set) and 17.3% top-1 error on the validation set.\n", + "\n", + "The 1-crop error rates on the ImageNet dataset with the pretrained model are listed below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| inception_v3 | 22.55 | 6.44 |\n", + "\n", + "### References\n", + "\n", + " - [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567)." + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_meal_v2.ipynb b/assets/hub/pytorch_vision_meal_v2.ipynb new file mode 100644 index 000000000000..bea099f0d0cb --- /dev/null +++ b/assets/hub/pytorch_vision_meal_v2.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "87b6d7ba", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# MEAL_V2\n", + "\n", + "*Author: Carnegie Mellon University*\n", + "\n", + "**Boosting Tiny and Efficient Models using Knowledge Distillation.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/MEALV2_method.png) | ![alt](https://pytorch.org/assets/images/MEALV2_results.png)\n", + "\n", + "\n", + "We require one additional Python dependency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "041ba368", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install timm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d29f16dc", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# list of models: 'mealv1_resnest50', 'mealv2_resnest50', 'mealv2_resnest50_cutmix', 'mealv2_resnest50_380x380', 'mealv2_mobilenetv3_small_075', 'mealv2_mobilenetv3_small_100', 'mealv2_mobilenet_v3_large_100', 'mealv2_efficientnet_b0'\n", + "# load pretrained models, using \"mealv2_resnest50_cutmix\" as an example\n", + "model = torch.hub.load('szq0214/MEAL-V2','meal_v2', 'mealv2_resnest50_cutmix', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "99c27a3e", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aa4fa53", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9f18274", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ce4b3fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c392ed05", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "2de17ed5", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "MEAL V2 models are from the [MEAL V2: Boosting Vanilla ResNet-50 to 80%+ Top-1 Accuracy on ImageNet without Tricks](https://arxiv.org/pdf/2009.08453.pdf) paper.\n", + "\n", + "In this paper, we introduce a simple yet effective approach that can boost the vanilla ResNet-50 to 80%+ Top-1 accuracy on ImageNet without any tricks. Generally, our method is based on the recently proposed [MEAL](https://arxiv.org/abs/1812.02425), i.e., ensemble knowledge distillation via discriminators. We further simplify it through 1) adopting the similarity loss and discriminator only on the final outputs and 2) using the average of softmax probabilities from all teacher ensembles as the stronger supervision for distillation. One crucial perspective of our method is that the one-hot/hard label should not be used in the distillation process. 
We show that such a simple framework can achieve state-of-the-art results without involving any commonly-used tricks, such as 1) architecture modification; 2) outside training data beyond ImageNet; 3) autoaug/randaug; 4) cosine learning rate; 5) mixup/cutmix training; 6) label smoothing; etc.\n", + "\n", + "| Models | Resolution | #Parameters | Top-1/Top-5 | Link |\n", + "| :---: | :-: | :-: | :------: | :------: |\n", + "| [MEAL-V1 w/ ResNet50](https://arxiv.org/abs/1812.02425) | 224 | 25.6M | **78.21/94.01** | [GitHub](https://github.com/AaronHeee/MEAL#imagenet-model) |\n", + "| MEAL-V2 w/ ResNet50 | 224 | 25.6M | **80.67/95.09** | \n", + "| MEAL-V2 w/ ResNet50| 380 | 25.6M | **81.72/95.81** | \n", + "| MEAL-V2 + CutMix w/ ResNet50| 224 | 25.6M | **80.98/95.35** | \n", + "| MEAL-V2 w/ MobileNet V3-Small 0.75| 224 | 2.04M | **67.60/87.23** | \n", + "| MEAL-V2 w/ MobileNet V3-Small 1.0| 224 | 2.54M | **69.65/88.71** | \n", + "| MEAL-V2 w/ MobileNet V3-Large 1.0 | 224 | 5.48M | **76.92/93.32** | \n", + "| MEAL-V2 w/ EfficientNet-B0| 224 | 5.29M | **78.29/93.95** | \n", + "\n", + "### References\n", + "\n", + "Please refer to our papers [MEAL V2](https://arxiv.org/pdf/2009.08453.pdf), [MEAL](https://arxiv.org/pdf/1812.02425.pdf) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1966f9f3", + "metadata": {}, + "outputs": [], + "source": [ + "@article{shen2020mealv2,\n", + " title={MEAL V2: Boosting Vanilla ResNet-50 to 80%+ Top-1 Accuracy on ImageNet without Tricks},\n", + " author={Shen, Zhiqiang and Savvides, Marios},\n", + " journal={arXiv preprint arXiv:2009.08453},\n", + " year={2020}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "4fed91a2", + "metadata": {}, + "source": [ + "@inproceedings{shen2019MEAL,\n", + "\t\ttitle = {MEAL: Multi-Model Ensemble via Adversarial Learning},\n", + "\t\tauthor = {Shen, Zhiqiang and He, Zhankui and Xue, Xiangyang},\n", + "\t\tbooktitle = {AAAI},\n", + "\t\tyear = {2019}\n", + "\t}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_mobilenet_v2.ipynb b/assets/hub/pytorch_vision_mobilenet_v2.ipynb new file mode 100644 index 000000000000..b33561619ff5 --- /dev/null +++ b/assets/hub/pytorch_vision_mobilenet_v2.ipynb @@ -0,0 +1,147 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "eb5333f8", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# MobileNet v2\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**Efficient networks optimized for speed and memory, with residual blocks**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/mobilenet_v2_1.png) | ![alt](https://pytorch.org/assets/images/mobilenet_v2_2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9ec286f", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "efb840d6", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. 
mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82571048", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28f02763", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6380a714", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92b2e982", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "805611f2", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "The MobileNet v2 architecture is based on an inverted residual structure where the input and output of the residual block are thin bottleneck layers opposite to traditional residual models which use expanded representations in the input. MobileNet v2 uses lightweight depthwise convolutions to filter features in the intermediate expansion layer. 
Additionally, non-linearities in the narrow layers were removed in order to maintain representational power.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| mobilenet_v2 | 28.12 | 9.71 |\n", + "\n", + "\n", + "### References\n", + "\n", + " - [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_once_for_all.ipynb b/assets/hub/pytorch_vision_once_for_all.ipynb new file mode 100644 index 000000000000..0183543a65a3 --- /dev/null +++ b/assets/hub/pytorch_vision_once_for_all.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1a4b7f5", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Once-for-All\n", + "\n", + "*Author: MIT Han Lab*\n", + "\n", + "**Once-for-all (OFA) decouples training and search, and achieves efficient inference across various edge devices and resource constraints.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "\n", + "### Get supernet\n", + "\n", + "You can quickly load a supernet as following" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53f8de30", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "super_net_name = \"ofa_supernet_mbv3_w10\" \n", + "# other options: \n", + "# ofa_supernet_resnet50 / \n", + "# ofa_supernet_mbv3_w12 / \n", + "# ofa_supernet_proxyless\n", + "\n", + "super_net = torch.hub.load('mit-han-lab/once-for-all', super_net_name, pretrained=True).eval()" + ] + }, + { + "cell_type": "markdown", + "id": "1fd4088d", + "metadata": {}, + "source": [ + "| OFA Network | Design Space | Resolution | Width Multiplier | Depth | Expand Ratio | kernel Size | \n", + "|----------------------|----------|----------|---------|------------|---------|------------|\n", + "| ofa_resnet50 | ResNet50D | 128 - 224 | 0.65, 0.8, 1.0 | 0, 1, 2 | 0.2, 0.25, 0.35 | 3 |\n", + "| ofa_mbv3_d234_e346_k357_w1.0 | MobileNetV3 | 128 - 224 | 1.0 | 2, 3, 4 | 3, 4, 6 | 3, 5, 7 |\n", + "| ofa_mbv3_d234_e346_k357_w1.2 | MobileNetV3 | 160 - 224 | 1.2 | 2, 3, 4 | 3, 4, 6 | 3, 5, 7 |\n", + "| ofa_proxyless_d234_e346_k357_w1.3 | ProxylessNAS | 128 - 224 | 1.3 | 2, 3, 4 | 3, 4, 6 | 3, 5, 7 |\n", + "\n", + "\n", + "Below are the usage of sampling / selecting a subnet from the supernet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b33c44d", + "metadata": {}, + "outputs": [], + "source": [ + "# Randomly sample sub-networks from OFA network\n", + "super_net.sample_active_subnet()\n", + "random_subnet = super_net.get_active_subnet(preserve_weight=True)\n", + " \n", + "# Manually set the sub-network\n", + "super_net.set_active_subnet(ks=7, e=6, d=4)\n", + "manual_subnet = super_net.get_active_subnet(preserve_weight=True)" + ] + }, + { + "cell_type": "markdown", + "id": "dd512c03", + "metadata": {}, + "source": [ + "### Get Specialized Architecture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1d56c24", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# or load a architecture 
specialized for certain platform\n", + "net_config = \"resnet50D_MAC_4_1B\"\n", + "\n", + "specialized_net, image_size = torch.hub.load('mit-han-lab/once-for-all', net_config, pretrained=True)\n", + "specialized_net.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "157a77cd", + "metadata": {}, + "source": [ + "More models and configurations can be found in [once-for-all/model-zoo](https://github.com/mit-han-lab/once-for-all#evaluate-1)\n", + "and obtained through the following scripts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9985bdd4", + "metadata": {}, + "outputs": [], + "source": [ + "ofa_specialized_get = torch.hub.load('mit-han-lab/once-for-all', \"ofa_specialized_get\")\n", + "model, image_size = ofa_specialized_get(\"flops@595M_top1@80.0_finetune@75\", pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "b98bdea4", + "metadata": {}, + "source": [ + "The model's prediction can be evalutaed by" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d86ac1d", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: \n", + " urllib.URLopener().retrieve(url, filename)\n", + "except: \n", + " urllib.request.urlretrieve(url, filename)\n", + "\n", + "\n", + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)\n" + ] + }, + { + "cell_type": "markdown", + "id": "db6dd8fb", + "metadata": {}, + "source": [ + "### Model Description\n", + "Once-for-all models are from [Once for All: Train One Network and Specialize it for Efficient Deployment](https://arxiv.org/abs/1908.09791). Conventional approaches either manually design or use neural architecture search (NAS) to find a specialized neural network and train it from scratch for each case, which is computationally prohibitive (causing CO2 emission as much as 5 cars' lifetime) thus unscalable. In this work, we propose to train a once-for-all (OFA) network that supports diverse architectural settings by decoupling training and search. Across diverse edge devices, OFA consistently outperforms state-of-the-art (SOTA) NAS methods (up to 4.0% ImageNet top1 accuracy improvement over MobileNetV3, or same accuracy but 1.5x faster than MobileNetV3, 2.6x faster than EfficientNet w.r.t measured latency) while reducing many orders of magnitude GPU hours and CO2 emission. 
In particular, OFA achieves a new SOTA 80.0% ImageNet top-1 accuracy under the mobile setting (<600M MACs).\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### References" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fa6a8b2", + "metadata": {}, + "outputs": [], + "source": [ + "@inproceedings{\n", + " cai2020once,\n", + " title={Once for All: Train One Network and Specialize it for Efficient Deployment},\n", + " author={Han Cai and Chuang Gan and Tianzhe Wang and Zhekai Zhang and Song Han},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2020},\n", + " url={https://arxiv.org/pdf/1908.09791.pdf}\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_proxylessnas.ipynb b/assets/hub/pytorch_vision_proxylessnas.ipynb new file mode 100644 index 000000000000..0bf04652b2a2 --- /dev/null +++ b/assets/hub/pytorch_vision_proxylessnas.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "68fb6d6f", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ProxylessNAS\n", + "\n", + "*Author: MIT Han Lab*\n", + "\n", + "**Proxylessly specialize CNN architectures for different hardware platforms.**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2515655", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "target_platform = \"proxyless_cpu\"\n", + "# proxyless_gpu, proxyless_mobile, proxyless_mobile14 are also avaliable.\n", + "model = torch.hub.load('mit-han-lab/ProxylessNAS', target_platform, pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "57e3d1a0", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1366edb4", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55b37d3a", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8503aa7", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec59ca2", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "be80f865", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "ProxylessNAS models are from the [ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware](https://arxiv.org/abs/1812.00332) paper.\n", + "\n", + "Conventionally, people tend to design *one efficient model* for *all hardware platforms*. But different hardware has different properties, for example, CPU has higher frequency and GPU is better at parallization. Therefore, instead of generalizing, we need to **specialize** CNN architectures for different hardware platforms. 
As shown below, with similar accuracy, specialization offers a free yet significant performance boost on all three platforms.\n", + "\n", + "| Model structure | GPU Latency | CPU Latency | Mobile Latency |\n", + "| --------------- | ----------- | ----------- | ----------- |\n", + "| proxylessnas_gpu | **5.1ms** | 204.9ms | 124ms |\n", + "| proxylessnas_cpu | 7.4ms | **138.7ms** | 116ms |\n", + "| proxylessnas_mobile | 7.2ms | 164.1ms | **78ms** |\n", + "\n", + "The corresponding top-1 errors of the pretrained models are listed below.\n", + "\n", + "| Model structure | Top-1 error |\n", + "| --------------- | ----------- |\n", + "| proxylessnas_cpu | 24.7 |\n", + "| proxylessnas_gpu | 24.9 |\n", + "| proxylessnas_mobile | 25.4 |\n", + "| proxylessnas_mobile_14 | 23.3 |\n", + "\n", + "### References\n", + "\n", + " - [ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware](https://arxiv.org/abs/1812.00332)." + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_resnest.ipynb b/assets/hub/pytorch_vision_resnest.ipynb new file mode 100644 index 000000000000..d7641840d1f9 --- /dev/null +++ b/assets/hub/pytorch_vision_resnest.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8521b666", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNeSt\n", + "\n", + "*Author: Hang Zhang*\n", + "\n", + "**A new ResNet variant.**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68bf59b7", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# get list of models\n", + "torch.hub.list('zhanghang1989/ResNeSt', force_reload=True)\n", + "# load pretrained models, using ResNeSt-50 as an example\n", + "model = torch.hub.load('zhanghang1989/ResNeSt', 'resnest50', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "d8d356c0", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded into a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "196e3c9a", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96819a60", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57aa5766", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3189a592", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "cbaa3b10", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "ResNeSt models are from the [ResNeSt: Split-Attention Networks](https://arxiv.org/pdf/2004.08955.pdf) paper.\n", + "\n", + "While image classification models have recently continued to advance, most downstream applications such as object detection and semantic segmentation still employ ResNet variants as the backbone network due to their simple and modular structure. We present a simple and modular Split-Attention block that enables attention across feature-map groups. By stacking these Split-Attention blocks ResNet-style, we obtain a new ResNet variant which we call ResNeSt. Our network preserves the overall ResNet structure to be used in downstream tasks straightforwardly without introducing additional computational costs. 
ResNeSt models outperform other networks with similar model complexities, and also help downstream tasks including object detection, instance segmentation and semantic segmentation.\n", + "\n", + "| | crop size | PyTorch |\n", + "|-------------|-----------|---------|\n", + "| ResNeSt-50 | 224 | 81.03 |\n", + "| ResNeSt-101 | 256 | 82.83 |\n", + "| ResNeSt-200 | 320 | 83.84 |\n", + "| ResNeSt-269 | 416 | 84.54 |\n", + "\n", + "### References\n", + "\n", + " - [ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955)." + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_resnet.ipynb b/assets/hub/pytorch_vision_resnet.ipynb new file mode 100644 index 000000000000..2fda8ca3e66e --- /dev/null +++ b/assets/hub/pytorch_vision_resnet.ipynb @@ -0,0 +1,156 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b74d19a0", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNet\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**Deep residual networks pre-trained on ImageNet**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7de1aa61", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)\n", + "# or any of these variants\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet34', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet101', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet152', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "3ddf328e", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94991788", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86dcba1c", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ace2a087", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8448d407", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "70b4e1b9", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Resnet models were proposed in \"Deep Residual Learning for Image Recognition\".\n", + "Here we have the 5 versions of resnet models, which contains 18, 34, 50, 101, 152 layers respectively.\n", + "Detailed model architectures can be found in Table 1.\n", + "Their 1-crop error rates on ImageNet dataset with pretrained models are listed below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| resnet18 | 30.24 | 10.92 |\n", + "| resnet34 | 26.70 | 8.58 |\n", + "| resnet50 | 23.85 | 7.13 |\n", + "| resnet101 | 22.63 | 6.44 |\n", + "| resnet152 | 21.69 | 5.94 |\n", + "\n", + "### References\n", + "\n", + " - [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_resnext.ipynb b/assets/hub/pytorch_vision_resnext.ipynb new file mode 100644 index 000000000000..eb0f20bea282 --- /dev/null +++ b/assets/hub/pytorch_vision_resnext.ipynb @@ -0,0 
+1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f2256586", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNext\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**Next generation ResNets, more efficient and accurate**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ee13ed9", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'resnext50_32x4d', pretrained=True)\n", + "# or\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnext101_32x8d', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "3d73fa36", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd200719", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7753e3c1", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d81ccdf", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a71ac5d9", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "f7ddaedf", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Resnext models were proposed in [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431).\n", + "Here we have the 2 versions of resnext models, which contain 50 and 101 layers respectively.\n", + "A comparison of the model architectures of resnet50 and resnext50 can be found in Table 1.\n", + "Their 1-crop error rates on the ImageNet dataset with pretrained models are listed below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| ----------------- | ----------- | ----------- |\n", + "| resnext50_32x4d | 22.38 | 6.30 |\n", + "| resnext101_32x8d | 20.69 | 5.47 |\n", + "\n", + "### References\n", + "\n", + " - [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_shufflenet_v2.ipynb b/assets/hub/pytorch_vision_shufflenet_v2.ipynb new file mode 100644 index 000000000000..6af2a18507be --- /dev/null +++ b/assets/hub/pytorch_vision_shufflenet_v2.ipynb @@ -0,0 +1,147 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ceddae15", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ShuffleNet v2\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**An efficient ConvNet optimized for speed and memory, pre-trained on ImageNet**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/shufflenet_v2_1.png) | ![alt](https://pytorch.org/assets/images/shufflenet_v2_2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5e75733", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'shufflenet_v2_x1_0', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "68f3912c", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. 
mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b128da6", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70d5a956", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "416164dc", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a9adec0", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "d969a239", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Previously, neural network architecture design was mostly guided by the indirect metric of computation complexity, i.e., FLOPs. However, the direct metric, e.g., speed, also depends on the other factors such as memory access cost and platform characteristics. Based on a series of controlled experiments, this work derives several practical guidelines for efficient network design. Accordingly, a new architecture is presented, called ShuffleNet V2. 
Comprehensive ablation experiments verify that our model is the state of-the-art in terms of speed and accuracy tradeoff.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| shufflenet_v2 | 30.64 | 11.68 |\n", + "\n", + "\n", + "### References\n", + "\n", + " - [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_snnmlp.ipynb b/assets/hub/pytorch_vision_snnmlp.ipynb new file mode 100644 index 000000000000..0b2d3ec2d3bf --- /dev/null +++ b/assets/hub/pytorch_vision_snnmlp.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "679d03cf", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# SNNMLP\n", + "\n", + "*Author: Huawei Noah's Ark Lab*\n", + "\n", + "**Brain-inspired Multilayer Perceptron with Spiking Neurons**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4eb3d83", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('huawei-noah/Efficient-AI-Backbones', 'snnmlp_t', pretrained=True)\n", + "# or\n", + "# model = torch.hub.load('huawei-noah/Efficient-AI-Backbones', 'snnmlp_s', pretrained=True)\n", + "# or\n", + "# model = torch.hub.load('huawei-noah/Efficient-AI-Backbones', 'snnmlp_b', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "d2ac61fc", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32db137b", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15f2f96a", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "print(torch.nn.functional.softmax(output[0], dim=0))\n" + ] + }, + { + "cell_type": "markdown", + "id": "391bc7b8", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "SNNMLP incorporates the mechanism of LIF neurons into the MLP models, to achieve better accuracy without extra FLOPs. We propose a full-precision LIF operation to communicate between patches, including horizontal LIF and vertical LIF in different directions. We also propose to use group LIF to extract better local features. With LIF modules, our SNNMLP model achieves 81.9%, 83.3% and 83.6% top-1 accuracy on ImageNet dataset with only 4.4G, 8.5G and 15.2G FLOPs, respectively.\n", + "\n", + "The corresponding accuracy on ImageNet dataset with pretrained model is listed below.\n", + "\n", + "| Model structure | #Parameters | FLOPs | Top-1 acc |\n", + "| --------------- | ----------- | ----------- | ----------- |\n", + "| SNNMLP Tiny | 28M | 4.4G | 81.88 |\n", + "| SNNMLP Small | 50M | 8.5G | 83.30 |\n", + "| SNNMLP Base | 88M | 15.2G | 85.59 |\n", + "\n", + "\n", + "### References\n", + "\n", + "You can read the full paper [here](https://arxiv.org/abs/2203.14679)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3bee9bc", + "metadata": {}, + "outputs": [], + "source": [ + "@inproceedings{li2022brain,\n", + " title={Brain-inspired multilayer perceptron with spiking neurons},\n", + " author={Li, Wenshuo and Chen, Hanting and Guo, Jianyuan and Zhang, Ziyang and Wang, Yunhe},\n", + " booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},\n", + " pages={783--793},\n", + " year={2022}\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_squeezenet.ipynb b/assets/hub/pytorch_vision_squeezenet.ipynb new file mode 100644 index 000000000000..71af8b401bf9 --- /dev/null +++ b/assets/hub/pytorch_vision_squeezenet.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cd4df47c", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# SqueezeNet\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**Alexnet-level accuracy with 50x fewer parameters.**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b58effa", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_0', pretrained=True)\n", + "# or\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_1', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "fc0fbc27", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8740dd7", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "978191be", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e475d7e", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6f18701", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "066555b8", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Model `squeezenet1_0` is from the [SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size](https://arxiv.org/pdf/1602.07360.pdf) paper\n", + "\n", + "Model `squeezenet1_1` is from the [official squeezenet repo](https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1).\n", + "It has 2.4x less computation and slightly fewer parameters than `squeezenet1_0`, without sacrificing accuracy.\n", + "\n", + "Their 1-crop error rates on ImageNet dataset with pretrained models are listed below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| squeezenet1_0 | 41.90 | 19.58 |\n", + "| squeezenet1_1 | 41.81 | 19.38 |\n", + "\n", + "### References\n", + "\n", + " - [Squeezenet: Alexnet-level accuracy with 50x fewer parameters and <0.5MB model size](https://arxiv.org/pdf/1602.07360.pdf)." 
+ ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_vgg.ipynb b/assets/hub/pytorch_vision_vgg.ipynb new file mode 100644 index 000000000000..689966eb07c5 --- /dev/null +++ b/assets/hub/pytorch_vision_vgg.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "48c981c1", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# vgg-nets\n", + "\n", + "*Author: Pytorch Team*\n", + "\n", + "**Award winning ConvNets from 2014 ImageNet ILSVRC challenge**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "353975ab", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg11', pretrained=True)\n", + "# or any of these variants\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg11_bn', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg13', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg13_bn', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16_bn', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg19', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg19_bn', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "49b59512", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30b08430", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8e70afe", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c29c5f9e", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f0a2573", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "f02f5387", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Here we have implementations for the models proposed in [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556),\n", + "for each configuration and its batchnorm version.\n", + "\n", + "For example, configuration `A` presented in the paper is `vgg11`, configuration `B` is `vgg13`, configuration `D` is `vgg16`\n", + "and configuration `E` is `vgg19`. 
Their batchnorm version are suffixed with `_bn`.\n", + "\n", + "Their Top-1 error rates on ImageNet dataset with pretrained models are listed below.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error |\n", + "| --------------- | ----------- | ----------- |\n", + "| vgg11 | 30.98 | 11.37 |\n", + "| vgg11_bn | 26.70 | 8.58 |\n", + "| vgg13 | 30.07 | 10.75 |\n", + "| vgg13_bn | 28.45 | 9.63 |\n", + "| vgg16 | 28.41 | 9.62 |\n", + "| vgg16_bn | 26.63 | 8.50 |\n", + "| vgg19 | 27.62 | 9.12 |\n", + "| vgg19_bn | 25.76 | 8.15 |\n", + "\n", + "### References\n", + "\n", + "- [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556)." + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_wide_resnet.ipynb b/assets/hub/pytorch_vision_wide_resnet.ipynb new file mode 100644 index 000000000000..4d81cad7d879 --- /dev/null +++ b/assets/hub/pytorch_vision_wide_resnet.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a42a2c48", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Wide ResNet\n", + "\n", + "*Author: Sergey Zagoruyko*\n", + "\n", + "**Wide Residual Networks**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6367742", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# load WRN-50-2:\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'wide_resnet50_2', pretrained=True)\n", + "# or WRN-101-2\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'wide_resnet101_2', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "758f9e23", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "faf3a0f5", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc6a9980", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28122f5d", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d2f6fcb", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "a9b740da", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Wide Residual networks simply have increased number of channels compared to ResNet.\n", + "Otherwise the architecture is the same. Deeper ImageNet models with bottleneck\n", + "block have increased number of channels in the inner 3x3 convolution.\n", + "\n", + "The `wide_resnet50_2` and `wide_resnet101_2` models were trained in FP16 with\n", + "mixed precision training using SGD with warm restarts. 
Checkpoints have weights in\n", + "half precision (except batch norm) for smaller size, and can be used in FP32 models too.\n", + "\n", + "| Model structure | Top-1 error | Top-5 error | # parameters |\n", + "| ----------------- | :---------: | :---------: | :----------: |\n", + "| wide_resnet50_2 | 21.49 | 5.91 | 68.9M |\n", + "| wide_resnet101_2 | 21.16 | 5.72 | 126.9M |\n", + "\n", + "### References\n", + "\n", + " - [Wide Residual Networks](https://arxiv.org/abs/1605.07146)\n", + " - [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)\n", + " - [Mixed Precision Training](https://arxiv.org/abs/1710.03740)\n", + " - [SGDR: Stochastic Gradient Descent with Warm Restarts](https://arxiv.org/abs/1608.03983)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/sigsep_open-unmix-pytorch_umx.ipynb b/assets/hub/sigsep_open-unmix-pytorch_umx.ipynb new file mode 100644 index 000000000000..de8bc7d3b942 --- /dev/null +++ b/assets/hub/sigsep_open-unmix-pytorch_umx.ipynb @@ -0,0 +1,120 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "99fde666", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Open-Unmix\n", + "\n", + "*Author: Inria*\n", + "\n", + "**Reference implementation for music source separation**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1b6c35b", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "# assuming you have a PyTorch >=1.6.0 installed\n", + "pip install -q torchaudio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "869d0784", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# loading umxhq four target separator\n", + "separator = torch.hub.load('sigsep/open-unmix-pytorch', 'umxhq')\n", + "\n", + "# generate random audio\n", + "# ... with shape (nb_samples, nb_channels, nb_timesteps)\n", + "# ... and with the same sample rate as that of the separator\n", + "audio = torch.rand((1, 2, 100000))\n", + "original_sample_rate = separator.sample_rate\n", + "\n", + "# make sure to resample the audio to models' sample rate, separator.sample_rate, if the two are different\n", + "# resampler = torchaudio.transforms.Resample(original_sample_rate, separator.sample_rate)\n", + "# audio = resampler(audio)\n", + "\n", + "estimates = separator(audio)\n", + "# estimates.shape = (1, 4, 2, 100000)" + ] + }, + { + "cell_type": "markdown", + "id": "7e8fcf3c", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "__Open-Unmix__ provides ready-to-use models that allow users to separate pop music into four stems: __vocals__, __drums__, __bass__ and the remaining __other__ instruments. The models were pre-trained on the freely available [MUSDB18](https://sigsep.github.io/datasets/musdb.html) dataset.\n", + "\n", + "Each target model is based on a three-layer bidirectional deep LSTM. The model learns to predict the magnitude spectrogram of a target source, like vocals, from the magnitude spectrogram of a mixture input. Internally, the prediction is obtained by applying a mask on the input. 
The model is optimized in the magnitude domain using mean squared error.\n", + "\n", + "A `Separator` meta-model (as shown in the code example above) puts together multiple _Open-unmix_ spectrogram models for each desired target, and combines their output through a multichannel generalized Wiener filter, before application of inverse STFTs using `torchaudio`.\n", + "The filtering is a differentiable (but parameter-free) version of [norbert](https://github.com/sigsep/norbert).\n", + "\n", + "### Pre-trained `Separator` models\n", + "\n", + "* __`umxhq` (default)__ is trained on [MUSDB18-HQ](https://sigsep.github.io/datasets/musdb.html#uncompressed-wav), which comprises the same tracks as MUSDB18 but uncompressed, yielding a full bandwidth of 22050 Hz.\n", + "\n", + "* __`umx`__ is trained on the regular [MUSDB18](https://sigsep.github.io/datasets/musdb.html#compressed-stems), which is bandwidth limited to 16 kHz due to AAC compression. This model should be used for comparison with other (older) methods for evaluation in [SiSEC18](sisec18.unmix.app).\n", + "\n", + "Furthermore, we provide a model for speech enhancement trained by [Sony Corporation](link):\n", + "\n", + "* __`umxse`__ is a speech enhancement model trained on the 28-speaker version of the [Voicebank+DEMAND corpus](https://datashare.is.ed.ac.uk/handle/10283/1942?show=full).\n", + "\n", + "All three models are also available as spectrogram (core) models, which take magnitude spectrogram inputs and output separated spectrograms.\n", + "These models can be loaded using `umxhq_spec`, `umx_spec` and `umxse_spec`.\n", + "\n", + "### Details\n", + "\n", + "For additional examples and documentation, please visit [the github repo](https://github.com/sigsep/open-unmix-pytorch).\n", + "\n", + "Furthermore, the models and all utility functions to preprocess, read and save audio stems are available in a Python package that can be installed via" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ad88076", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install openunmix" + ] + }, + { + "cell_type": "markdown", + "id": "2f026e5d", + "metadata": {}, + "source": [ + "### References\n", + "\n", + "- [Open-Unmix - A Reference Implementation for Music Source Separation](https://doi.org/10.21105/joss.01667)\n", + "- [SigSep - Open Resources for Music Separation](https://sigsep.github.io/)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/simplenet.ipynb b/assets/hub/simplenet.ipynb new file mode 100644 index 000000000000..b9e57af0ee25 --- /dev/null +++ b/assets/hub/simplenet.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "72d50304", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# SimpleNet\n", + "\n", + "*Author: Seyyed Hossein Hasanpour*\n", + "\n", + "**Lets Keep it simple, Using simple architectures to outperform deeper and more complex architectures**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d02ef84", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = 
torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_5m_m1\", pretrained=True)\n", + "# or any of these variants\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_5m_m2\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_9m_m1\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_9m_m2\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_small_m1_05\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_small_m2_05\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_small_m1_075\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_small_m2_075\", pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "ec4ddcb6", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c655a2d", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43bb8ba8", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d89b13ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff946e58", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "dbd43f60", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "SimpleNet models were proposed in \"Lets Keep it simple, Using simple architectures to outperform deeper and more complex architectures\". \n", + "Here we have the 8 versions of SimpleNet models, which contain 1.5m, 3.2m, 5.7m and 9.5m parameters respectively (each size comes in an m1 and an m2 variant). \n", + "Detailed model architectures can be found in Table 1 and Table 2. \n", + "Their 1-crop errors on the ImageNet dataset with pretrained models are listed below. \n", + "\n", + "The m2 variants: \n", + "\n", + "| Model structure | Top-1 errors | Top-5 errors |\n", + "| :------------------------- | :-----------: | :-----------:|\n", + "| simplenetv1_small_m2_05 | 38.33 | 16.512 |\n", + "| simplenetv1_small_m2_075 | 31.494 | 11.85 |\n", + "| simplenetv1_5m_m2 | 27.97 | 9.676 |\n", + "| simplenetv1_9m_m2 | 25.77 | 8.252 |\n", + "\n", + "The m1 variants: \n", + "\n", + "| Model structure | Top-1 errors | Top-5 errors |\n", + "| :------------------------- | :-----------: | :-----------:|\n", + "| simplenetv1_small_m1_05 | 38.878 | 17.012 |\n", + "| simplenetv1_small_m1_075 | 32.216 | 12.282 |\n", + "| simplenetv1_5m_m1 | 28.452 | 10.06 |\n", + "| simplenetv1_9m_m1 | 26.208 | 8.514 |\n", + "\n", + "### References\n", + "\n", + " - [Lets Keep it simple, Using simple architectures to outperform deeper and more complex architectures](https://arxiv.org/abs/1608.06037)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/snakers4_silero-models_stt.ipynb b/assets/hub/snakers4_silero-models_stt.ipynb new file mode 100644 index 000000000000..df2cd221f606 --- /dev/null +++ b/assets/hub/snakers4_silero-models_stt.ipynb @@ -0,0 +1,106 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cadeb1df", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Silero Speech-To-Text Models\n", + "\n", + "*Author: Silero AI Team*\n", + "\n", + "**A set of compact enterprise-grade pre-trained STT Models for multiple languages.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/silero_stt_model.jpg) | ![alt](https://pytorch.org/assets/images/silero_imagenet_moment.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df36f984", + "metadata": {}, 
+ "outputs": [], + "source": [ + "%%bash\n", + "# this assumes that you have a proper version of PyTorch already installed\n", + "pip install -q torchaudio omegaconf soundfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ab515b9", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import zipfile\n", + "import torchaudio\n", + "from glob import glob\n", + "\n", + "device = torch.device('cpu') # gpu also works, but our models are fast enough for CPU\n", + "\n", + "model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',\n", + " model='silero_stt',\n", + " language='en', # also available 'de', 'es'\n", + " device=device)\n", + "(read_batch, split_into_batches,\n", + " read_audio, prepare_model_input) = utils # see function signature for details\n", + "\n", + "# download a single file, any format compatible with TorchAudio (soundfile backend)\n", + "torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',\n", + " dst ='speech_orig.wav', progress=True)\n", + "test_files = glob('speech_orig.wav')\n", + "batches = split_into_batches(test_files, batch_size=10)\n", + "input = prepare_model_input(read_batch(batches[0]),\n", + " device=device)\n", + "\n", + "output = model(input)\n", + "for example in output:\n", + " print(decoder(example.cpu()))" + ] + }, + { + "cell_type": "markdown", + "id": "84bebade", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Silero Speech-To-Text models provide enterprise grade STT in a compact form-factor for several commonly spoken languages. Unlike conventional ASR models our models are robust to a variety of dialects, codecs, domains, noises, lower sampling rates (for simplicity audio should be resampled to 16 kHz). The models consume a normalized audio in the form of samples (i.e. without any pre-processing except for normalization to -1 ... 1) and output frames with token probabilities. We provide a decoder utility for simplicity (we could include it into our model itself, but scripted modules had problems with storing model artifacts i.e. labels during certain export scenarios).\n", + "\n", + "We hope that our efforts with Open-STT and Silero Models will bring the ImageNet moment in speech closer.\n", + "\n", + "### Supported Languages and Formats\n", + "\n", + "As of this page update, the following languages are supported:\n", + "\n", + "- English\n", + "- German\n", + "- Spanish\n", + "\n", + "To see the always up-to-date language list, please visit our [repo](https://github.com/snakers4/silero-models) and see the `yml` [file](https://github.com/snakers4/silero-models/blob/master/models.yml) for all available checkpoints.\n", + "\n", + "### Additional Examples and Benchmarks\n", + "\n", + "For additional examples and other model formats please visit this [link](https://github.com/snakers4/silero-models). For quality and performance benchmarks please see the [wiki](https://github.com/snakers4/silero-models/wiki). 
These resources will be updated from time to time.\n", + "\n", + "### References\n", + "\n", + "- [Silero Models](https://github.com/snakers4/silero-models)\n", + "- [Alexander Veysov, \"Toward's an ImageNet Moment for Speech-to-Text\", The Gradient, 2020](https://thegradient.pub/towards-an-imagenet-moment-for-speech-to-text/)\n", + "- [Alexander Veysov, \"A Speech-To-Text Practitioner’s Criticisms of Industry and Academia\", The Gradient, 2020](https://thegradient.pub/a-speech-to-text-practitioners-criticisms-of-industry-and-academia/)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/snakers4_silero-models_tts.ipynb b/assets/hub/snakers4_silero-models_tts.ipynb new file mode 100644 index 000000000000..5a674397cd29 --- /dev/null +++ b/assets/hub/snakers4_silero-models_tts.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ff883b45", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Silero Text-To-Speech Models\n", + "\n", + "*Author: Silero AI Team*\n", + "\n", + "**A set of compact enterprise-grade pre-trained TTS Models for multiple languages**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ce245de", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "# this assumes that you have a proper version of PyTorch already installed\n", + "pip install -q torchaudio omegaconf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39b1ae7f", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "language = 'en'\n", + "speaker = 'lj_16khz'\n", + "device = torch.device('cpu')\n", + "model, symbols, sample_rate, example_text, apply_tts = torch.hub.load(repo_or_dir='snakers4/silero-models',\n", + " model='silero_tts',\n", + " language=language,\n", + " speaker=speaker)\n", + "model = model.to(device) # gpu or cpu\n", + "audio = apply_tts(texts=[example_text],\n", + " model=model,\n", + " sample_rate=sample_rate,\n", + " symbols=symbols,\n", + " device=device)" + ] + }, + { + "cell_type": "markdown", + "id": "352c834f", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Silero Text-To-Speech models provide enterprise grade TTS in a compact form-factor for several commonly spoken languages:\n", + "\n", + "- One-line usage\n", + "- Naturally sounding speech\n", + "- No GPU or training required\n", + "- Minimalism and lack of dependencies\n", + "- A library of voices in many languages\n", + "- Support for `16kHz` and `8kHz` out of the box\n", + "- High throughput on slow hardware. 
Decent performance on one CPU thread\n", + "\n", + "### Supported Languages and Formats\n", + "\n", + "As of this page update, the speakers of the following languages are supported both in 8 kHz and 16 kHz:\n", + "\n", + "- Russian (6 speakers)\n", + "- English (1 speaker)\n", + "- German (1 speaker)\n", + "- Spanish (1 speaker)\n", + "- French (1 speaker)\n", + "\n", + "To see the always up-to-date language list, please visit our [repo](https://github.com/snakers4/silero-models) and see the `yml` [file](https://github.com/snakers4/silero-models/blob/master/models.yml) for all available checkpoints.\n", + "\n", + "### Additional Examples and Benchmarks\n", + "\n", + "For additional examples and other model formats please visit this [link](https://github.com/snakers4/silero-models). For quality and performance benchmarks please see the [wiki](https://github.com/snakers4/silero-models/wiki). These resources will be updated from time to time.\n", + "\n", + "### References\n", + "\n", + "- [Silero Models](https://github.com/snakers4/silero-models)\n", + "- [High-Quality Speech-to-Text Made Accessible, Simple and Fast](https://habr.com/ru/post/549482/)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/snakers4_silero-vad_vad.ipynb b/assets/hub/snakers4_silero-vad_vad.ipynb new file mode 100644 index 000000000000..9cdf0c02f217 --- /dev/null +++ b/assets/hub/snakers4_silero-vad_vad.ipynb @@ -0,0 +1,95 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4e119581", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Silero Voice Activity Detector\n", + "\n", + "*Author: Silero AI Team*\n", + "\n", + "**Pre-trained Voice Activity Detector**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a9a1b01", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "# this assumes that you have a proper version of PyTorch already installed\n", + "pip install -q torchaudio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a638e514", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "torch.set_num_threads(1)\n", + "\n", + "from IPython.display import Audio\n", + "from pprint import pprint\n", + "# download example\n", + "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')\n", + "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_vad',\n", + " force_reload=True)\n", + "\n", + "(get_speech_timestamps,\n", + " _, read_audio,\n", + " *_) = utils\n", + "\n", + "sampling_rate = 16000 # also accepts 8000\n", + "wav = read_audio('en_example.wav', sampling_rate=sampling_rate)\n", + "# get speech timestamps from full audio file\n", + "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)\n", + "pprint(speech_timestamps)" + ] + }, + { + "cell_type": "markdown", + "id": "9c5dc9e9", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD). Enterprise-grade Speech Products made refreshingly simple (see our STT models). 
**Each model is published separately**.\n", + "\n", + "Currently, there are hardly any high quality / modern / free / public voice activity detectors except for WebRTC Voice Activity Detector (link). WebRTC though starts to show its age and it suffers from many false positives.\n", + "\n", + "**(!!!) Important Notice (!!!)** - the models are intended to run on CPU only and were optimized for performance on 1 CPU thread. Note that the model is quantized.\n", + "\n", + "\n", + "### Additional Examples and Benchmarks\n", + "\n", + "For additional examples and other model formats please visit this [link](https://github.com/snakers4/silero-vad) and please refer to the extensive examples in the Colab format (including the streaming examples).\n", + "\n", + "### References\n", + "\n", + "VAD model architectures are based on similar STT architectures.\n", + "\n", + "- [Silero VAD](https://github.com/snakers4/silero-vad)\n", + "- [Alexander Veysov, \"Toward's an ImageNet Moment for Speech-to-Text\", The Gradient, 2020](https://thegradient.pub/towards-an-imagenet-moment-for-speech-to-text/)\n", + "- [Alexander Veysov, \"A Speech-To-Text Practitioner’s Criticisms of Industry and Academia\", The Gradient, 2020](https://thegradient.pub/a-speech-to-text-practitioners-criticisms-of-industry-and-academia/)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/ultralytics_yolov5.ipynb b/assets/hub/ultralytics_yolov5.ipynb new file mode 100644 index 000000000000..1dacc0f1dc8d --- /dev/null +++ b/assets/hub/ultralytics_yolov5.ipynb @@ -0,0 +1,167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5c265ba5", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# YOLOv5\n", + "\n", + "*Author: Ultralytics*\n", + "\n", + "**Ultralytics YOLOv5 🚀 for object detection, instance segmentation and image classification.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/ultralytics_yolov5_img1.png) | ![alt](https://pytorch.org/assets/images/ultralytics_yolov5_img2.png)\n", + "\n", + "\n", + "## Before You Start\n", + "\n", + "Start from a **Python>=3.8** environment with **PyTorch>=1.7** installed. To install PyTorch see [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/). To install YOLOv5 dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6f3bfa8", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install -U ultralytics" + ] + }, + { + "cell_type": "markdown", + "id": "6f248585", + "metadata": {}, + "source": [ + "## Model Description\n", + "\n", + "\"YOLO\n", + "\n", + "Ultralytics YOLOv5 🚀 is a cutting-edge, state-of-the-art (SOTA) model that builds upon the success of previous YOLO versions and introduces new features and improvements to further boost performance and flexibility. YOLOv5 is designed to be fast, accurate, and easy to use, making it an excellent choice for a wide range of object detection, instance segmentation and image classification tasks.\n", + "\n", + "We hope that the resources here will help you get the most out of YOLOv5. 
Please browse the YOLOv5 [Docs](https://docs.ultralytics.com/yolov5) for details, raise an issue on [GitHub](https://github.com/ultralytics/yolov5/issues/new/choose) for support, and join our [Discord](https://discord.gg/n6cFeSPZdD) community for questions and discussions!\n",
+ "\n",
+ "| Model | size<br>(pixels) | mAPval<br>50-95 | mAPval<br>50 | Speed<br>CPU b1<br>(ms) | Speed<br>V100 b1<br>(ms) | Speed<br>V100 b32<br>(ms) | params<br>(M) | FLOPs<br>@640 (B) |\n",
+ "|---|---|---|---|---|---|---|---|---|\n",
+ "| [YOLOv5n](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n.pt) | 640 | 28.0 | 45.7 | **45** | **6.3** | **0.6** | **1.9** | **4.5** |\n",
+ "| [YOLOv5s](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt) | 640 | 37.4 | 56.8 | 98 | 6.4 | 0.9 | 7.2 | 16.5 |\n",
+ "| [YOLOv5m](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5m.pt) | 640 | 45.4 | 64.1 | 224 | 8.2 | 1.7 | 21.2 | 49.0 |\n",
+ "| [YOLOv5l](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5l.pt) | 640 | 49.0 | 67.3 | 430 | 10.1 | 2.7 | 46.5 | 109.1 |\n",
+ "| [YOLOv5x](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5x.pt) | 640 | 50.7 | 68.9 | 766 | 12.1 | 4.8 | 86.7 | 205.7 |\n",
+ "| | | | | | | | | |\n",
+ "| [YOLOv5n6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n6.pt) | 1280 | 36.0 | 54.4 | 153 | 8.1 | 2.1 | 3.2 | 4.6 |\n",
+ "| [YOLOv5s6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s6.pt) | 1280 | 44.8 | 63.7 | 385 | 8.2 | 3.6 | 12.6 | 16.8 |\n",
+ "| [YOLOv5m6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5m6.pt) | 1280 | 51.3 | 69.3 | 887 | 11.1 | 6.8 | 35.7 | 50.0 |\n",
+ "| [YOLOv5l6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5l6.pt) | 1280 | 53.7 | 71.3 | 1784 | 15.8 | 10.5 | 76.8 | 111.4 |\n",
+ "| [YOLOv5x6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5x6.pt)<br>+ [TTA] | 1280<br>1536 | 55.0<br>**55.8** | 72.7<br>**72.7** | 3136<br>- | 26.2<br>- | 19.4<br>- | 140.7<br>- | 209.8<br>- |\n",
+ "\n",
+ "**Table Notes**\n",
+ "\n",
+ "- All checkpoints are trained to 300 epochs with default settings. Nano and Small models use [hyp.scratch-low.yaml](https://github.com/ultralytics/yolov5/blob/master/data/hyps/hyp.scratch-low.yaml) hyps, all others use [hyp.scratch-high.yaml](https://github.com/ultralytics/yolov5/blob/master/data/hyps/hyp.scratch-high.yaml).\n",
+ "- **mAPval** values are for single-model single-scale on the [COCO val2017](http://cocodataset.org) dataset.<br>Reproduce by `python val.py --data coco.yaml --img 640 --conf 0.001 --iou 0.65`\n",
+ "- **Speed** averaged over COCO val images using an [AWS p3.2xlarge](https://aws.amazon.com/ec2/instance-types/p3/) instance. NMS times (~1 ms/img) not included.<br>Reproduce by `python val.py --data coco.yaml --img 640 --task speed --batch 1`\n",
+ "- **TTA** [Test Time Augmentation](https://docs.ultralytics.com/yolov5/tutorials/test_time_augmentation) includes reflection and scale augmentations.<br>Reproduce by `python val.py --data coco.yaml --img 1536 --iou 0.7 --augment`\n",
+ "\n",
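+ "The checkpoint names in the table above map directly onto `torch.hub` entrypoints. As a small illustrative sketch (using the Hub loading API shown in the next section; the variable name `model_p6` and the explicit `size=1280` argument are assumptions added here, not part of the original example), one of the larger 1280-pixel (P6) variants can be loaded and run at its training resolution:\n",
+ "\n",
+ "```python\n",
+ "import torch\n",
+ "\n",
+ "# Hypothetical sketch: load a larger P6 checkpoint listed in the table above\n",
+ "model_p6 = torch.hub.load('ultralytics/yolov5', 'yolov5x6', pretrained=True)\n",
+ "\n",
+ "# Run inference at 1280 px to match the resolution the P6 models were trained at\n",
+ "results = model_p6('https://ultralytics.com/images/zidane.jpg', size=1280)\n",
+ "results.print()\n",
+ "```\n",
+ "\n",
+ "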
\n", + "\n", + "## Load From PyTorch Hub\n", + "\n", + "This example loads a pretrained **YOLOv5s** model and passes an image for inference. YOLOv5 accepts **URL**, **Filename**, **PIL**, **OpenCV**, **Numpy** and **PyTorch** inputs, and returns detections in **torch**, **pandas**, and **JSON** output formats. See the [YOLOv5 PyTorch Hub Tutorial](https://docs.ultralytics.com/yolov5/tutorials/pytorch_hub_model_loading/) for details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a729163", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# Model\n", + "model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)\n", + "\n", + "# Images\n", + "imgs = ['https://ultralytics.com/images/zidane.jpg'] # batch of images\n", + "\n", + "# Inference\n", + "results = model(imgs)\n", + "\n", + "# Results\n", + "results.print()\n", + "results.save() # or .show()\n", + "\n", + "results.xyxy[0] # img1 predictions (tensor)\n", + "results.pandas().xyxy[0] # img1 predictions (pandas)\n", + "# xmin ymin xmax ymax confidence class name\n", + "# 0 749.50 43.50 1148.0 704.5 0.874023 0 person\n", + "# 1 433.50 433.50 517.5 714.5 0.687988 27 tie\n", + "# 2 114.75 195.75 1095.0 708.0 0.624512 0 person\n", + "# 3 986.00 304.00 1028.0 420.0 0.286865 27 tie" + ] + }, + { + "cell_type": "markdown", + "id": "6d4d3437", + "metadata": {}, + "source": [ + "## Citation\n", + "\n", + "If you use YOLOv5 or YOLOv5u in your research, please cite the Ultralytics YOLOv5 repository as follows:\n", + "\n", + "[![DOI](https://zenodo.org/badge/264818686.svg)](https://zenodo.org/badge/latestdoi/264818686)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8e175a9", + "metadata": { + "attributes": { + "classes": [ + "bibtex" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@software{yolov5,\n", + " title = {YOLOv5 by Ultralytics},\n", + " author = {Glenn Jocher},\n", + " year = {2020},\n", + " version = {7.0},\n", + " license = {AGPL-3.0},\n", + " url = {https://github.com/ultralytics/yolov5},\n", + " doi = {10.5281/zenodo.3908559},\n", + " orcid = {0000-0001-5950-6979}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5aca7c41", + "metadata": {}, + "source": [ + "## Contact\n", + "\n", + "For YOLOv5 bug reports and feature requests please visit [GitHub Issues](https://github.com/ultralytics/yolov5/issues), and join our [Discord](https://discord.gg/n6cFeSPZdD) community for questions and discussions!\n", + "\n", + " " + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/images/1738166706211.jpg b/assets/images/1738166706211.jpg new file mode 100644 index 000000000000..80766945d098 Binary files /dev/null and b/assets/images/1738166706211.jpg differ diff --git a/assets/images/175b_throught.png b/assets/images/175b_throught.png new file mode 100644 index 000000000000..7352b57fc912 Binary files /dev/null and b/assets/images/175b_throught.png differ diff --git a/assets/images/1t_thought.png b/assets/images/1t_thought.png new file mode 100644 index 000000000000..e85feb6874ab Binary files /dev/null and b/assets/images/1t_thought.png differ diff --git a/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-1.png b/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-1.png new file mode 100644 index 000000000000..c6d7260793dc Binary files /dev/null and 
b/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-1.png differ diff --git a/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-2.png b/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-2.png new file mode 100644 index 000000000000..88d47cc97700 Binary files /dev/null and b/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-2.png differ diff --git a/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-3.png b/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-3.png new file mode 100644 index 000000000000..5ecd74c9f576 Binary files /dev/null and b/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-3.png differ diff --git a/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-1.png b/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-1.png new file mode 100644 index 000000000000..cd798f3ff1f8 Binary files /dev/null and b/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-1.png differ diff --git a/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-2.png b/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-2.png new file mode 100644 index 000000000000..69efafe6abc1 Binary files /dev/null and b/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-2.png differ diff --git a/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-3.png b/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-3.png new file mode 100644 index 000000000000..04b075c55510 Binary files /dev/null and b/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-3.png differ diff --git a/assets/images/2022-7-15-introducing-the-playtorch-app-1.gif b/assets/images/2022-7-15-introducing-the-playtorch-app-1.gif new file mode 100644 index 000000000000..99f7343eaddc Binary files /dev/null and b/assets/images/2022-7-15-introducing-the-playtorch-app-1.gif differ diff --git a/assets/images/2022-7-15-introducing-the-playtorch-app-2.jpg b/assets/images/2022-7-15-introducing-the-playtorch-app-2.jpg new file mode 100644 index 000000000000..171cbcbd20a3 Binary files /dev/null and b/assets/images/2022-7-15-introducing-the-playtorch-app-2.jpg differ diff --git a/assets/images/2022-7-15-introducing-the-playtorch-app-3.jpg b/assets/images/2022-7-15-introducing-the-playtorch-app-3.jpg new file mode 100644 index 000000000000..61a0fd65e30c Binary files /dev/null and b/assets/images/2022-7-15-introducing-the-playtorch-app-3.jpg differ diff --git a/assets/images/2022-7-15-introducing-the-playtorch-app-4.gif b/assets/images/2022-7-15-introducing-the-playtorch-app-4.gif new file mode 100644 index 000000000000..6af337bf9338 Binary files /dev/null and b/assets/images/2022-7-15-introducing-the-playtorch-app-4.gif differ diff --git a/assets/images/2022-7-15-introducing-the-playtorch-app-5.jpg b/assets/images/2022-7-15-introducing-the-playtorch-app-5.jpg new file mode 100644 index 000000000000..d8f829a3f6b2 Binary files /dev/null and b/assets/images/2022-7-15-introducing-the-playtorch-app-5.jpg differ diff --git 
a/assets/images/2022-7-15-introducing-the-playtorch-app-6.jpg b/assets/images/2022-7-15-introducing-the-playtorch-app-6.jpg new file mode 100644 index 000000000000..3ba963eb9d9d Binary files /dev/null and b/assets/images/2022-7-15-introducing-the-playtorch-app-6.jpg differ diff --git a/assets/images/2022-7-15-introducing-the-playtorch-app-7.jpg b/assets/images/2022-7-15-introducing-the-playtorch-app-7.jpg new file mode 100644 index 000000000000..6e1e05ae18a2 Binary files /dev/null and b/assets/images/2022-7-15-introducing-the-playtorch-app-7.jpg differ diff --git a/assets/images/2023-02-14-democratizing-ai-with-pytorch-1.png b/assets/images/2023-02-14-democratizing-ai-with-pytorch-1.png new file mode 100644 index 000000000000..812d64c9b68d Binary files /dev/null and b/assets/images/2023-02-14-democratizing-ai-with-pytorch-1.png differ diff --git a/assets/images/2023-02-14-democratizing-ai-with-pytorch-2.png b/assets/images/2023-02-14-democratizing-ai-with-pytorch-2.png new file mode 100644 index 000000000000..64d12df48cdf Binary files /dev/null and b/assets/images/2023-02-14-democratizing-ai-with-pytorch-2.png differ diff --git a/assets/images/2023-03-22-batchsizescaling.svg b/assets/images/2023-03-22-batchsizescaling.svg new file mode 100644 index 000000000000..1fa09c7adff3 --- /dev/null +++ b/assets/images/2023-03-22-batchsizescaling.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/2023-03-22-inferencespeedup.svg b/assets/images/2023-03-22-inferencespeedup.svg new file mode 100644 index 000000000000..db16bdba2573 --- /dev/null +++ b/assets/images/2023-03-22-inferencespeedup.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/2023-03-22-torchbenchtraining.svg b/assets/images/2023-03-22-torchbenchtraining.svg new file mode 100644 index 000000000000..566999611bee --- /dev/null +++ b/assets/images/2023-03-22-torchbenchtraining.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/2023-03-22-trainingspeedup.svg b/assets/images/2023-03-22-trainingspeedup.svg new file mode 100644 index 000000000000..bc0873a04246 --- /dev/null +++ b/assets/images/2023-03-22-trainingspeedup.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/2023-04-11-accelerated-generative-diffusion-models1.png b/assets/images/2023-04-11-accelerated-generative-diffusion-models1.png new file mode 100644 index 000000000000..27f6ba1cd984 Binary files /dev/null and b/assets/images/2023-04-11-accelerated-generative-diffusion-models1.png differ diff --git a/assets/images/2023-04-11-accelerated-generative-diffusion-models2.png b/assets/images/2023-04-11-accelerated-generative-diffusion-models2.png new file mode 100644 index 000000000000..260fcfe31430 Binary files /dev/null and b/assets/images/2023-04-11-accelerated-generative-diffusion-models2.png differ diff --git a/assets/images/2023-04-11-accelerated-generative-diffusion-models3.png b/assets/images/2023-04-11-accelerated-generative-diffusion-models3.png new file mode 100644 index 000000000000..e2c056a4d4b5 Binary files /dev/null and b/assets/images/2023-04-11-accelerated-generative-diffusion-models3.png differ diff --git a/assets/images/2023-04-11-accelerated-generative-diffusion-models4.png b/assets/images/2023-04-11-accelerated-generative-diffusion-models4.png new file mode 100644 index 000000000000..7a3cfadc8028 Binary files /dev/null and b/assets/images/2023-04-11-accelerated-generative-diffusion-models4.png differ diff --git 
a/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Chart-1.png b/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Chart-1.png new file mode 100644 index 000000000000..6a7420180898 Binary files /dev/null and b/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Chart-1.png differ diff --git a/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Chart-2.png b/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Chart-2.png new file mode 100644 index 000000000000..85ef6c5f812e Binary files /dev/null and b/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Chart-2.png differ diff --git a/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Figure-1.png b/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Figure-1.png new file mode 100644 index 000000000000..38ad2962b539 Binary files /dev/null and b/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Figure-1.png differ diff --git a/assets/images/2023-04-18-accelerating-large-language-models/causal_attention_step_1.png b/assets/images/2023-04-18-accelerating-large-language-models/causal_attention_step_1.png new file mode 100644 index 000000000000..bf1e939717cd Binary files /dev/null and b/assets/images/2023-04-18-accelerating-large-language-models/causal_attention_step_1.png differ diff --git a/assets/images/2023-04-18-accelerating-large-language-models/causal_attention_step_2.png b/assets/images/2023-04-18-accelerating-large-language-models/causal_attention_step_2.png new file mode 100644 index 000000000000..315652cd929e Binary files /dev/null and b/assets/images/2023-04-18-accelerating-large-language-models/causal_attention_step_2.png differ diff --git a/assets/images/2023-04-18-accelerating-large-language-models/chart.png b/assets/images/2023-04-18-accelerating-large-language-models/chart.png new file mode 100644 index 000000000000..b585303bae20 Binary files /dev/null and b/assets/images/2023-04-18-accelerating-large-language-models/chart.png differ diff --git a/assets/images/2023-04-18-accelerating-large-language-models/tweet.png b/assets/images/2023-04-18-accelerating-large-language-models/tweet.png new file mode 100644 index 000000000000..7598441da0fb Binary files /dev/null and b/assets/images/2023-04-18-accelerating-large-language-models/tweet.png differ diff --git a/assets/images/2023-07-31-performant-distributed-checkpointing-1.png b/assets/images/2023-07-31-performant-distributed-checkpointing-1.png new file mode 100644 index 000000000000..68066636bc0d Binary files /dev/null and b/assets/images/2023-07-31-performant-distributed-checkpointing-1.png differ diff --git a/assets/images/2023-07-31-performant-distributed-checkpointing-2.png b/assets/images/2023-07-31-performant-distributed-checkpointing-2.png new file mode 100644 index 000000000000..97c362aebe07 Binary files /dev/null and b/assets/images/2023-07-31-performant-distributed-checkpointing-2.png differ diff --git a/assets/images/2023-4-27-hidet.png b/assets/images/2023-4-27-hidet.png new file mode 100644 index 000000000000..74ebb9456076 Binary files /dev/null and b/assets/images/2023-4-27-hidet.png differ diff --git a/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-1.png b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-1.png new file mode 
100644 index 000000000000..e76cd2486ed3 Binary files /dev/null and b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-1.png differ diff --git a/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-2.png b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-2.png new file mode 100644 index 000000000000..62e864d70d52 Binary files /dev/null and b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-2.png differ diff --git a/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-3.png b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-3.png new file mode 100644 index 000000000000..a21d8bb8b08c Binary files /dev/null and b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-3.png differ diff --git a/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-4.png b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-4.png new file mode 100644 index 000000000000..15de2d1a79fb Binary files /dev/null and b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-4.png differ diff --git a/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-5.png b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-5.png new file mode 100644 index 000000000000..2aaa9ccb08a6 Binary files /dev/null and b/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-5.png differ diff --git a/assets/images/2024-year-in-review/fg1.jpg b/assets/images/2024-year-in-review/fg1.jpg new file mode 100644 index 000000000000..e133c87a788f Binary files /dev/null and b/assets/images/2024-year-in-review/fg1.jpg differ diff --git a/assets/images/2024-year-in-review/fg10.jpg b/assets/images/2024-year-in-review/fg10.jpg new file mode 100644 index 000000000000..7735d773f4fe Binary files /dev/null and b/assets/images/2024-year-in-review/fg10.jpg differ diff --git a/assets/images/2024-year-in-review/fg2.jpg b/assets/images/2024-year-in-review/fg2.jpg new file mode 100644 index 000000000000..bacb08440f16 Binary files /dev/null and b/assets/images/2024-year-in-review/fg2.jpg differ diff --git a/assets/images/2024-year-in-review/fg3.jpg b/assets/images/2024-year-in-review/fg3.jpg new file mode 100644 index 000000000000..9a86d2c8837e Binary files /dev/null and b/assets/images/2024-year-in-review/fg3.jpg differ diff --git a/assets/images/2024-year-in-review/fg4.jpg b/assets/images/2024-year-in-review/fg4.jpg new file mode 100644 index 000000000000..0c91ef6e95cd Binary files /dev/null and b/assets/images/2024-year-in-review/fg4.jpg differ diff --git a/assets/images/2024-year-in-review/fg5.jpg b/assets/images/2024-year-in-review/fg5.jpg new file mode 100644 index 000000000000..0a3fe7a95aef Binary files /dev/null and b/assets/images/2024-year-in-review/fg5.jpg differ diff --git a/assets/images/2024-year-in-review/fg6.jpg b/assets/images/2024-year-in-review/fg6.jpg new file mode 100644 index 000000000000..38c094252db9 Binary files /dev/null and b/assets/images/2024-year-in-review/fg6.jpg differ diff --git a/assets/images/2024-year-in-review/fg7.jpg b/assets/images/2024-year-in-review/fg7.jpg new file mode 100644 index 000000000000..2a6a318b650e Binary files /dev/null and b/assets/images/2024-year-in-review/fg7.jpg differ diff --git a/assets/images/2024-year-in-review/fg8.jpg b/assets/images/2024-year-in-review/fg8.jpg new file mode 100644 index 000000000000..93f7afa0be36 Binary files /dev/null and b/assets/images/2024-year-in-review/fg8.jpg differ diff --git a/assets/images/2024-year-in-review/fg9.jpg b/assets/images/2024-year-in-review/fg9.jpg new 
file mode 100644 index 000000000000..87753d0f51d7 Binary files /dev/null and b/assets/images/2024-year-in-review/fg9.jpg differ diff --git a/assets/images/2281965-ROCm-development-radeon.jpg b/assets/images/2281965-ROCm-development-radeon.jpg new file mode 100644 index 000000000000..26afa4425155 Binary files /dev/null and b/assets/images/2281965-ROCm-development-radeon.jpg differ diff --git a/assets/images/3-16-accelerated-d/fig1-latest.png b/assets/images/3-16-accelerated-d/fig1-latest.png new file mode 100644 index 000000000000..2589f2203e23 Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig1-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig10-latest.png b/assets/images/3-16-accelerated-d/fig10-latest.png new file mode 100644 index 000000000000..217b7c89c98b Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig10-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig2-latest.png b/assets/images/3-16-accelerated-d/fig2-latest.png new file mode 100644 index 000000000000..529b9fa6c30e Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig2-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig3-latest.png b/assets/images/3-16-accelerated-d/fig3-latest.png new file mode 100644 index 000000000000..3875d00ff625 Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig3-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig4-latest.png b/assets/images/3-16-accelerated-d/fig4-latest.png new file mode 100644 index 000000000000..eb47d924056f Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig4-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig5-latest.png b/assets/images/3-16-accelerated-d/fig5-latest.png new file mode 100644 index 000000000000..638f33876d31 Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig5-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig6-latest.png b/assets/images/3-16-accelerated-d/fig6-latest.png new file mode 100644 index 000000000000..42abdeb909de Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig6-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig7-latest.png b/assets/images/3-16-accelerated-d/fig7-latest.png new file mode 100644 index 000000000000..5d35d2800051 Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig7-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig8-latest.png b/assets/images/3-16-accelerated-d/fig8-latest.png new file mode 100644 index 000000000000..9cee7d00aeee Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig8-latest.png differ diff --git a/assets/images/3-16-accelerated-d/fig9-latest.png b/assets/images/3-16-accelerated-d/fig9-latest.png new file mode 100644 index 000000000000..8f3370baba6a Binary files /dev/null and b/assets/images/3-16-accelerated-d/fig9-latest.png differ diff --git a/assets/images/404_sign.png b/assets/images/404_sign.png new file mode 100644 index 000000000000..2c2ae05fb9ef Binary files /dev/null and b/assets/images/404_sign.png differ diff --git a/assets/images/6x-faster-async-checkpointing/fg1.png b/assets/images/6x-faster-async-checkpointing/fg1.png new file mode 100644 index 000000000000..4c3d3bf50d02 Binary files /dev/null and b/assets/images/6x-faster-async-checkpointing/fg1.png differ diff --git a/assets/images/6x-faster-async-checkpointing/fg2.png b/assets/images/6x-faster-async-checkpointing/fg2.png new file mode 100644 index 000000000000..1eaddbc43e68 Binary files /dev/null and 
b/assets/images/6x-faster-async-checkpointing/fg2.png differ diff --git a/assets/images/6x-faster-async-checkpointing/fg3.png b/assets/images/6x-faster-async-checkpointing/fg3.png new file mode 100644 index 000000000000..4c3d3bf50d02 Binary files /dev/null and b/assets/images/6x-faster-async-checkpointing/fg3.png differ diff --git a/assets/images/AutoDP.png b/assets/images/AutoDP.png new file mode 100644 index 000000000000..3534dc73753b Binary files /dev/null and b/assets/images/AutoDP.png differ diff --git a/assets/images/AutoPipe_algorithm.png b/assets/images/AutoPipe_algorithm.png new file mode 100644 index 000000000000..784431d51d4f Binary files /dev/null and b/assets/images/AutoPipe_algorithm.png differ diff --git a/assets/images/Bert_HF.png b/assets/images/Bert_HF.png new file mode 100644 index 000000000000..e98d06f74f63 Binary files /dev/null and b/assets/images/Bert_HF.png differ diff --git a/assets/images/Captum 1.jpg b/assets/images/Captum 1.jpg new file mode 100644 index 000000000000..fec68a92eafd Binary files /dev/null and b/assets/images/Captum 1.jpg differ diff --git a/assets/images/Captum 2.png b/assets/images/Captum 2.png new file mode 100644 index 000000000000..9691bba50c44 Binary files /dev/null and b/assets/images/Captum 2.png differ diff --git a/assets/images/Caveats.jpg b/assets/images/Caveats.jpg new file mode 100644 index 000000000000..698424f3261b Binary files /dev/null and b/assets/images/Caveats.jpg differ diff --git a/assets/images/Cub200Dataset.png b/assets/images/Cub200Dataset.png new file mode 100644 index 000000000000..ead780b0d8ac Binary files /dev/null and b/assets/images/Cub200Dataset.png differ diff --git a/assets/images/Ecosystem1.png b/assets/images/Ecosystem1.png new file mode 100644 index 000000000000..1ea5257eef6a Binary files /dev/null and b/assets/images/Ecosystem1.png differ diff --git a/assets/images/Event-Webinar-PyTorch-a-foundation-for-open-source.png b/assets/images/Event-Webinar-PyTorch-a-foundation-for-open-source.png new file mode 100644 index 000000000000..08ec42c70f22 Binary files /dev/null and b/assets/images/Event-Webinar-PyTorch-a-foundation-for-open-source.png differ diff --git a/assets/images/Figure_1.png b/assets/images/Figure_1.png new file mode 100644 index 000000000000..f2b6a4767da3 Binary files /dev/null and b/assets/images/Figure_1.png differ diff --git a/assets/images/Figure_2.png b/assets/images/Figure_2.png new file mode 100644 index 000000000000..92444d3b4165 Binary files /dev/null and b/assets/images/Figure_2.png differ diff --git a/assets/images/Figure_3.jpg b/assets/images/Figure_3.jpg new file mode 100644 index 000000000000..f66052949e37 Binary files /dev/null and b/assets/images/Figure_3.jpg differ diff --git a/assets/images/Figure_4.png b/assets/images/Figure_4.png new file mode 100644 index 000000000000..e67781975927 Binary files /dev/null and b/assets/images/Figure_4.png differ diff --git a/assets/images/Figure_5.png b/assets/images/Figure_5.png new file mode 100644 index 000000000000..5cadf6656188 Binary files /dev/null and b/assets/images/Figure_5.png differ diff --git a/assets/images/GPT1.png b/assets/images/GPT1.png new file mode 100644 index 000000000000..425ea2e75963 Binary files /dev/null and b/assets/images/GPT1.png differ diff --git a/assets/images/Hackathon_Facebook_Cover.png b/assets/images/Hackathon_Facebook_Cover.png new file mode 100644 index 000000000000..e4d446207f9e Binary files /dev/null and b/assets/images/Hackathon_Facebook_Cover.png differ diff --git a/assets/images/Inference_regular_attn.gif 
b/assets/images/Inference_regular_attn.gif new file mode 100644 index 000000000000..b70fa38b6ba2 Binary files /dev/null and b/assets/images/Inference_regular_attn.gif differ diff --git a/assets/images/Jagged-Tensor-Figure-from-FBGEMM-section.png b/assets/images/Jagged-Tensor-Figure-from-FBGEMM-section.png new file mode 100644 index 000000000000..a3b97ebdfb63 Binary files /dev/null and b/assets/images/Jagged-Tensor-Figure-from-FBGEMM-section.png differ diff --git a/assets/images/Key optimizations for improving the mAP of SSD300 VGG16.png b/assets/images/Key optimizations for improving the mAP of SSD300 VGG16.png new file mode 100644 index 000000000000..33071d7edcd4 Binary files /dev/null and b/assets/images/Key optimizations for improving the mAP of SSD300 VGG16.png differ diff --git a/assets/images/MEALV2.png b/assets/images/MEALV2.png new file mode 100644 index 000000000000..b4e8b2088599 Binary files /dev/null and b/assets/images/MEALV2.png differ diff --git a/assets/images/MEALV2_method.png b/assets/images/MEALV2_method.png new file mode 100644 index 000000000000..02f7668d4a8c Binary files /dev/null and b/assets/images/MEALV2_method.png differ diff --git a/assets/images/MEALV2_results.png b/assets/images/MEALV2_results.png new file mode 100644 index 000000000000..947734e7044c Binary files /dev/null and b/assets/images/MEALV2_results.png differ diff --git a/assets/images/METAPT-002-BarGraph-02-static.png b/assets/images/METAPT-002-BarGraph-02-static.png new file mode 100644 index 000000000000..55492495bda6 Binary files /dev/null and b/assets/images/METAPT-002-BarGraph-02-static.png differ diff --git a/assets/images/METAPT-002-BarGraph-02.gif b/assets/images/METAPT-002-BarGraph-02.gif new file mode 100644 index 000000000000..4b4dc82b3668 Binary files /dev/null and b/assets/images/METAPT-002-BarGraph-02.gif differ diff --git a/assets/images/MOO-NAS-blog-img1-ax_scheduler_illustration.png b/assets/images/MOO-NAS-blog-img1-ax_scheduler_illustration.png new file mode 100644 index 000000000000..65e5a004a1b6 Binary files /dev/null and b/assets/images/MOO-NAS-blog-img1-ax_scheduler_illustration.png differ diff --git a/assets/images/MOO-NAS-blog-img2-pareto_frontier_plot.png b/assets/images/MOO-NAS-blog-img2-pareto_frontier_plot.png new file mode 100644 index 000000000000..7cbabb4415f3 Binary files /dev/null and b/assets/images/MOO-NAS-blog-img2-pareto_frontier_plot.png differ diff --git a/assets/images/MOO-NAS-blog-img3-cv_plot_val_acc.png b/assets/images/MOO-NAS-blog-img3-cv_plot_val_acc.png new file mode 100644 index 000000000000..f00a88205b0a Binary files /dev/null and b/assets/images/MOO-NAS-blog-img3-cv_plot_val_acc.png differ diff --git a/assets/images/MOO-NAS-blog-img4-cv_plot_num_params.png b/assets/images/MOO-NAS-blog-img4-cv_plot_num_params.png new file mode 100644 index 000000000000..567e3112cf17 Binary files /dev/null and b/assets/images/MOO-NAS-blog-img4-cv_plot_num_params.png differ diff --git a/assets/images/MS-Azure_logo.png b/assets/images/MS-Azure_logo.png new file mode 100644 index 000000000000..28a34a56067e Binary files /dev/null and b/assets/images/MS-Azure_logo.png differ diff --git a/assets/images/PTD2-social-asset.png b/assets/images/PTD2-social-asset.png new file mode 100644 index 000000000000..37ba3c990c9e Binary files /dev/null and b/assets/images/PTD2-social-asset.png differ diff --git a/assets/images/PTEDPostEventHeader.png b/assets/images/PTEDPostEventHeader.png new file mode 100644 index 000000000000..4e41c9ad3c4a Binary files /dev/null and 
b/assets/images/PTEDPostEventHeader.png differ diff --git a/assets/images/PTE_lockup_PRIMARY.svg b/assets/images/PTE_lockup_PRIMARY.svg new file mode 100644 index 000000000000..f992e9d506d5 --- /dev/null +++ b/assets/images/PTE_lockup_PRIMARY.svg @@ -0,0 +1 @@ + diff --git a/assets/images/PipeTransformer-Animation.gif b/assets/images/PipeTransformer-Animation.gif new file mode 100644 index 000000000000..0bd6e02b2fdd Binary files /dev/null and b/assets/images/PipeTransformer-Animation.gif differ diff --git a/assets/images/PipeTransformer.png b/assets/images/PipeTransformer.png new file mode 100644 index 000000000000..1576f114c103 Binary files /dev/null and b/assets/images/PipeTransformer.png differ diff --git a/assets/images/PyTorch_XLA Future Stack.svg b/assets/images/PyTorch_XLA Future Stack.svg new file mode 100644 index 000000000000..f573882ae0de --- /dev/null +++ b/assets/images/PyTorch_XLA Future Stack.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/Pytorch-Enterprise-Support-Img1.png b/assets/images/Pytorch-Enterprise-Support-Img1.png new file mode 100644 index 000000000000..c6b0b586fb39 Binary files /dev/null and b/assets/images/Pytorch-Enterprise-Support-Img1.png differ diff --git a/assets/images/Pytorch_2_0_Animation_AdobeExpress.gif b/assets/images/Pytorch_2_0_Animation_AdobeExpress.gif new file mode 100644 index 000000000000..cfb4df2d447f Binary files /dev/null and b/assets/images/Pytorch_2_0_Animation_AdobeExpress.gif differ diff --git a/assets/images/Q_AID_architecture.png b/assets/images/Q_AID_architecture.png new file mode 100644 index 000000000000..538b9df6a4ee Binary files /dev/null and b/assets/images/Q_AID_architecture.png differ diff --git a/assets/images/ResNeXtArch.png b/assets/images/ResNeXtArch.png new file mode 100644 index 000000000000..b75d41b64af5 Binary files /dev/null and b/assets/images/ResNeXtArch.png differ diff --git a/assets/images/SEArch.png b/assets/images/SEArch.png new file mode 100755 index 000000000000..a7fb8d047226 Binary files /dev/null and b/assets/images/SEArch.png differ diff --git a/assets/images/Summer_hackathon.png b/assets/images/Summer_hackathon.png new file mode 100644 index 000000000000..16f925f353d7 Binary files /dev/null and b/assets/images/Summer_hackathon.png differ diff --git a/assets/images/Transforms-v2-feature-image.png b/assets/images/Transforms-v2-feature-image.png new file mode 100644 index 000000000000..2529d86f1708 Binary files /dev/null and b/assets/images/Transforms-v2-feature-image.png differ diff --git a/assets/images/Tuning-flow-chart.png b/assets/images/Tuning-flow-chart.png new file mode 100644 index 000000000000..e81d6c532be4 Binary files /dev/null and b/assets/images/Tuning-flow-chart.png differ diff --git a/assets/images/Unlowered-Op.jpg b/assets/images/Unlowered-Op.jpg new file mode 100644 index 000000000000..3557e0ef1403 Binary files /dev/null and b/assets/images/Unlowered-Op.jpg differ diff --git a/assets/images/accelerated-cpu-inference/f1-pytorch-inference-speedup-ratio-trend-multi.png.rendition.intel.web.1648.927.png b/assets/images/accelerated-cpu-inference/f1-pytorch-inference-speedup-ratio-trend-multi.png.rendition.intel.web.1648.927.png new file mode 100644 index 000000000000..185a5b90e67d Binary files /dev/null and b/assets/images/accelerated-cpu-inference/f1-pytorch-inference-speedup-ratio-trend-multi.png.rendition.intel.web.1648.927.png differ diff --git a/assets/images/accelerated-cpu-inference/f2-torchbench-fp32-performance-multithread.png.rendition.intel.web.1648.927.png 
b/assets/images/accelerated-cpu-inference/f2-torchbench-fp32-performance-multithread.png.rendition.intel.web.1648.927.png new file mode 100644 index 000000000000..455d74d1aab5 Binary files /dev/null and b/assets/images/accelerated-cpu-inference/f2-torchbench-fp32-performance-multithread.png.rendition.intel.web.1648.927.png differ diff --git a/assets/images/accelerated-cpu-inference/f3-huggingface-fp32-performance-multithread.png.rendition.intel.web.1648.927.png b/assets/images/accelerated-cpu-inference/f3-huggingface-fp32-performance-multithread.png.rendition.intel.web.1648.927.png new file mode 100644 index 000000000000..eac86b7e343a Binary files /dev/null and b/assets/images/accelerated-cpu-inference/f3-huggingface-fp32-performance-multithread.png.rendition.intel.web.1648.927.png differ diff --git a/assets/images/accelerated-cpu-inference/f4-timm-fp32-performance-multithread.png.rendition.intel.web.1648.927.png b/assets/images/accelerated-cpu-inference/f4-timm-fp32-performance-multithread.png.rendition.intel.web.1648.927.png new file mode 100644 index 000000000000..b36b3176b672 Binary files /dev/null and b/assets/images/accelerated-cpu-inference/f4-timm-fp32-performance-multithread.png.rendition.intel.web.1648.927.png differ diff --git a/assets/images/accelerated-cpu-inference/f5-torchbench-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png b/assets/images/accelerated-cpu-inference/f5-torchbench-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png new file mode 100644 index 000000000000..b2de142fd028 Binary files /dev/null and b/assets/images/accelerated-cpu-inference/f5-torchbench-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png differ diff --git a/assets/images/accelerated-cpu-inference/f6-huggingface-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png b/assets/images/accelerated-cpu-inference/f6-huggingface-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png new file mode 100644 index 000000000000..6c0ccbe19b87 Binary files /dev/null and b/assets/images/accelerated-cpu-inference/f6-huggingface-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png differ diff --git a/assets/images/accelerated-cpu-inference/f7-timm-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png b/assets/images/accelerated-cpu-inference/f7-timm-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png new file mode 100644 index 000000000000..ab1b41f839bf Binary files /dev/null and b/assets/images/accelerated-cpu-inference/f7-timm-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png differ diff --git a/assets/images/accelerated-pytorch-inference/fg1.png b/assets/images/accelerated-pytorch-inference/fg1.png new file mode 100644 index 000000000000..7ebd37ff896f Binary files /dev/null and b/assets/images/accelerated-pytorch-inference/fg1.png differ diff --git a/assets/images/accelerated-pytorch-inference/fg2.png b/assets/images/accelerated-pytorch-inference/fg2.png new file mode 100644 index 000000000000..9e32282113f6 Binary files /dev/null and b/assets/images/accelerated-pytorch-inference/fg2.png differ diff --git a/assets/images/accelerated-pytorch-inference/fg3.png b/assets/images/accelerated-pytorch-inference/fg3.png new file mode 100644 index 000000000000..23c904287759 Binary files /dev/null and b/assets/images/accelerated-pytorch-inference/fg3.png differ diff --git a/assets/images/accelerating-gemms-triton/fg1.png b/assets/images/accelerating-gemms-triton/fg1.png new file mode 
100644 index 000000000000..037d3fdc3cfd Binary files /dev/null and b/assets/images/accelerating-gemms-triton/fg1.png differ diff --git a/assets/images/accelerating-gemms-triton/fg2.png b/assets/images/accelerating-gemms-triton/fg2.png new file mode 100644 index 000000000000..7b9fe867bf42 Binary files /dev/null and b/assets/images/accelerating-gemms-triton/fg2.png differ diff --git a/assets/images/accelerating-gemms-triton/fg3.png b/assets/images/accelerating-gemms-triton/fg3.png new file mode 100644 index 000000000000..c7a7d691e59d Binary files /dev/null and b/assets/images/accelerating-gemms-triton/fg3.png differ diff --git a/assets/images/accelerating-gemms-triton/fg4.png b/assets/images/accelerating-gemms-triton/fg4.png new file mode 100644 index 000000000000..2d3ea4c2ed04 Binary files /dev/null and b/assets/images/accelerating-gemms-triton/fg4.png differ diff --git a/assets/images/accelerating-gemms-triton/fg5.png b/assets/images/accelerating-gemms-triton/fg5.png new file mode 100644 index 000000000000..9eea160f4c97 Binary files /dev/null and b/assets/images/accelerating-gemms-triton/fg5.png differ diff --git a/assets/images/accelerating-generative-ai-2.jpg b/assets/images/accelerating-generative-ai-2.jpg new file mode 100644 index 000000000000..d2bddef62d8f Binary files /dev/null and b/assets/images/accelerating-generative-ai-2.jpg differ diff --git a/assets/images/accelerating-generative-ai-2/image1.png b/assets/images/accelerating-generative-ai-2/image1.png new file mode 100644 index 000000000000..ed8472afaac3 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image1.png differ diff --git a/assets/images/accelerating-generative-ai-2/image10.png b/assets/images/accelerating-generative-ai-2/image10.png new file mode 100644 index 000000000000..7ad7dfcd9dc4 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image10.png differ diff --git a/assets/images/accelerating-generative-ai-2/image11.png b/assets/images/accelerating-generative-ai-2/image11.png new file mode 100644 index 000000000000..1d3ace9fe770 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image11.png differ diff --git a/assets/images/accelerating-generative-ai-2/image12.png b/assets/images/accelerating-generative-ai-2/image12.png new file mode 100644 index 000000000000..cd5f470d6b81 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image12.png differ diff --git a/assets/images/accelerating-generative-ai-2/image13.png b/assets/images/accelerating-generative-ai-2/image13.png new file mode 100644 index 000000000000..22cae2e87c9e Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image13.png differ diff --git a/assets/images/accelerating-generative-ai-2/image14.png b/assets/images/accelerating-generative-ai-2/image14.png new file mode 100644 index 000000000000..ba815156465a Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image14.png differ diff --git a/assets/images/accelerating-generative-ai-2/image15.png b/assets/images/accelerating-generative-ai-2/image15.png new file mode 100644 index 000000000000..e2a8e2135c3a Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image15.png differ diff --git a/assets/images/accelerating-generative-ai-2/image16.png b/assets/images/accelerating-generative-ai-2/image16.png new file mode 100644 index 000000000000..c8c940023c85 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image16.png differ diff --git 
a/assets/images/accelerating-generative-ai-2/image17.png b/assets/images/accelerating-generative-ai-2/image17.png new file mode 100644 index 000000000000..fe18b0f914d7 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image17.png differ diff --git a/assets/images/accelerating-generative-ai-2/image18.png b/assets/images/accelerating-generative-ai-2/image18.png new file mode 100644 index 000000000000..b8fc63f3cb55 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image18.png differ diff --git a/assets/images/accelerating-generative-ai-2/image19.png b/assets/images/accelerating-generative-ai-2/image19.png new file mode 100644 index 000000000000..4e343e50e02c Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image19.png differ diff --git a/assets/images/accelerating-generative-ai-2/image2.png b/assets/images/accelerating-generative-ai-2/image2.png new file mode 100644 index 000000000000..5ae52a3331ad Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image2.png differ diff --git a/assets/images/accelerating-generative-ai-2/image20.png b/assets/images/accelerating-generative-ai-2/image20.png new file mode 100644 index 000000000000..4ac2b5584f77 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image20.png differ diff --git a/assets/images/accelerating-generative-ai-2/image21.png b/assets/images/accelerating-generative-ai-2/image21.png new file mode 100644 index 000000000000..5400be694490 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image21.png differ diff --git a/assets/images/accelerating-generative-ai-2/image22.png b/assets/images/accelerating-generative-ai-2/image22.png new file mode 100644 index 000000000000..62b39647b571 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image22.png differ diff --git a/assets/images/accelerating-generative-ai-2/image23.png b/assets/images/accelerating-generative-ai-2/image23.png new file mode 100644 index 000000000000..ce2e39195ac1 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image23.png differ diff --git a/assets/images/accelerating-generative-ai-2/image24.png b/assets/images/accelerating-generative-ai-2/image24.png new file mode 100644 index 000000000000..5ce1c2fb1c57 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image24.png differ diff --git a/assets/images/accelerating-generative-ai-2/image25.png b/assets/images/accelerating-generative-ai-2/image25.png new file mode 100644 index 000000000000..2abb4331ce33 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image25.png differ diff --git a/assets/images/accelerating-generative-ai-2/image26.png b/assets/images/accelerating-generative-ai-2/image26.png new file mode 100644 index 000000000000..2682b3d3eb58 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image26.png differ diff --git a/assets/images/accelerating-generative-ai-2/image27.png b/assets/images/accelerating-generative-ai-2/image27.png new file mode 100644 index 000000000000..6e66c79e66e1 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image27.png differ diff --git a/assets/images/accelerating-generative-ai-2/image28.png b/assets/images/accelerating-generative-ai-2/image28.png new file mode 100644 index 000000000000..43cfee7fd092 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image28.png differ diff --git 
a/assets/images/accelerating-generative-ai-2/image3.png b/assets/images/accelerating-generative-ai-2/image3.png new file mode 100644 index 000000000000..6af1abf96799 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image3.png differ diff --git a/assets/images/accelerating-generative-ai-2/image4.png b/assets/images/accelerating-generative-ai-2/image4.png new file mode 100644 index 000000000000..ee49fe3be2d4 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image4.png differ diff --git a/assets/images/accelerating-generative-ai-2/image5.png b/assets/images/accelerating-generative-ai-2/image5.png new file mode 100644 index 000000000000..b9d1f19803fc Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image5.png differ diff --git a/assets/images/accelerating-generative-ai-2/image6.png b/assets/images/accelerating-generative-ai-2/image6.png new file mode 100644 index 000000000000..23afaa969596 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image6.png differ diff --git a/assets/images/accelerating-generative-ai-2/image7.png b/assets/images/accelerating-generative-ai-2/image7.png new file mode 100644 index 000000000000..56089a2376ce Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image7.png differ diff --git a/assets/images/accelerating-generative-ai-2/image8.png b/assets/images/accelerating-generative-ai-2/image8.png new file mode 100644 index 000000000000..29d1924b7d6c Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image8.png differ diff --git a/assets/images/accelerating-generative-ai-2/image9.png b/assets/images/accelerating-generative-ai-2/image9.png new file mode 100644 index 000000000000..06ff871ca608 Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/image9.png differ diff --git a/assets/images/accelerating-generative-ai-2/screen-recording.gif b/assets/images/accelerating-generative-ai-2/screen-recording.gif new file mode 100644 index 000000000000..31d0176d863c Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/screen-recording.gif differ diff --git a/assets/images/accelerating-generative-ai-2/social-share.jpg b/assets/images/accelerating-generative-ai-2/social-share.jpg new file mode 100644 index 000000000000..36fbf73f579e Binary files /dev/null and b/assets/images/accelerating-generative-ai-2/social-share.jpg differ diff --git a/assets/images/accelerating-generative-ai-3/fg1.png b/assets/images/accelerating-generative-ai-3/fg1.png new file mode 100644 index 000000000000..3a3a701980e1 Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg1.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg10.png b/assets/images/accelerating-generative-ai-3/fg10.png new file mode 100644 index 000000000000..fd14cbcb9af8 Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg10.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg2.png b/assets/images/accelerating-generative-ai-3/fg2.png new file mode 100644 index 000000000000..6fd64638d937 Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg2.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg3.png b/assets/images/accelerating-generative-ai-3/fg3.png new file mode 100644 index 000000000000..4c5ab4c8c90c Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg3.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg4.png 
b/assets/images/accelerating-generative-ai-3/fg4.png new file mode 100644 index 000000000000..b7ff7ed3a610 Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg4.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg5.png b/assets/images/accelerating-generative-ai-3/fg5.png new file mode 100644 index 000000000000..004dc366a924 Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg5.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg5b.jpg b/assets/images/accelerating-generative-ai-3/fg5b.jpg new file mode 100644 index 000000000000..f915ef222bba Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg5b.jpg differ diff --git a/assets/images/accelerating-generative-ai-3/fg6.png b/assets/images/accelerating-generative-ai-3/fg6.png new file mode 100644 index 000000000000..47d23db033aa Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg6.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg7.png b/assets/images/accelerating-generative-ai-3/fg7.png new file mode 100644 index 000000000000..d2141729be38 Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg7.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg8.png b/assets/images/accelerating-generative-ai-3/fg8.png new file mode 100644 index 000000000000..a6d4615acf38 Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg8.png differ diff --git a/assets/images/accelerating-generative-ai-3/fg9.png b/assets/images/accelerating-generative-ai-3/fg9.png new file mode 100644 index 000000000000..628667befcea Binary files /dev/null and b/assets/images/accelerating-generative-ai-3/fg9.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg1.png b/assets/images/accelerating-generative-ai-4/fg1.png new file mode 100644 index 000000000000..4a6518cf45d5 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg1.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg10.png b/assets/images/accelerating-generative-ai-4/fg10.png new file mode 100644 index 000000000000..048c28683e0b Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg10.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg11.png b/assets/images/accelerating-generative-ai-4/fg11.png new file mode 100644 index 000000000000..7825f605acdb Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg11.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg12.png b/assets/images/accelerating-generative-ai-4/fg12.png new file mode 100644 index 000000000000..7a28772204a6 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg12.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg13.png b/assets/images/accelerating-generative-ai-4/fg13.png new file mode 100644 index 000000000000..8a0e4a9d9cba Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg13.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg14.png b/assets/images/accelerating-generative-ai-4/fg14.png new file mode 100644 index 000000000000..19df0c366688 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg14.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg15.png b/assets/images/accelerating-generative-ai-4/fg15.png new file mode 100644 index 000000000000..d04cc6438c66 Binary files /dev/null and 
b/assets/images/accelerating-generative-ai-4/fg15.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg16.png b/assets/images/accelerating-generative-ai-4/fg16.png new file mode 100644 index 000000000000..dc498b3a9dad Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg16.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg17.png b/assets/images/accelerating-generative-ai-4/fg17.png new file mode 100644 index 000000000000..83c1232c8045 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg17.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg2.png b/assets/images/accelerating-generative-ai-4/fg2.png new file mode 100644 index 000000000000..d06efd2139b7 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg2.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg3.png b/assets/images/accelerating-generative-ai-4/fg3.png new file mode 100644 index 000000000000..ae3bc8fa48b5 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg3.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg4.jpg b/assets/images/accelerating-generative-ai-4/fg4.jpg new file mode 100644 index 000000000000..62e182a0b46d Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg4.jpg differ diff --git a/assets/images/accelerating-generative-ai-4/fg5.jpg b/assets/images/accelerating-generative-ai-4/fg5.jpg new file mode 100644 index 000000000000..cf3a7fc0f66c Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg5.jpg differ diff --git a/assets/images/accelerating-generative-ai-4/fg6.png b/assets/images/accelerating-generative-ai-4/fg6.png new file mode 100644 index 000000000000..4773d50d57a7 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg6.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg7.png b/assets/images/accelerating-generative-ai-4/fg7.png new file mode 100644 index 000000000000..4ac2094e858f Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg7.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg8.png b/assets/images/accelerating-generative-ai-4/fg8.png new file mode 100644 index 000000000000..20f3d9518b90 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg8.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg8b.png b/assets/images/accelerating-generative-ai-4/fg8b.png new file mode 100644 index 000000000000..fcb314b89028 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg8b.png differ diff --git a/assets/images/accelerating-generative-ai-4/fg9.png b/assets/images/accelerating-generative-ai-4/fg9.png new file mode 100644 index 000000000000..ab6985bbbbc6 Binary files /dev/null and b/assets/images/accelerating-generative-ai-4/fg9.png differ diff --git a/assets/images/accelerating-generative-ai/bar_chart_0.png b/assets/images/accelerating-generative-ai/bar_chart_0.png new file mode 100644 index 000000000000..400c18cea075 Binary files /dev/null and b/assets/images/accelerating-generative-ai/bar_chart_0.png differ diff --git a/assets/images/accelerating-generative-ai/bar_chart_1.png b/assets/images/accelerating-generative-ai/bar_chart_1.png new file mode 100644 index 000000000000..6aeef462ba47 Binary files /dev/null and b/assets/images/accelerating-generative-ai/bar_chart_1.png differ diff --git a/assets/images/accelerating-generative-ai/bar_chart_2.png 
b/assets/images/accelerating-generative-ai/bar_chart_2.png new file mode 100644 index 000000000000..fc7b80a6d09a Binary files /dev/null and b/assets/images/accelerating-generative-ai/bar_chart_2.png differ diff --git a/assets/images/accelerating-generative-ai/bar_chart_3.png b/assets/images/accelerating-generative-ai/bar_chart_3.png new file mode 100644 index 000000000000..8a25b4ee8283 Binary files /dev/null and b/assets/images/accelerating-generative-ai/bar_chart_3.png differ diff --git a/assets/images/accelerating-generative-ai/bar_chart_4.png b/assets/images/accelerating-generative-ai/bar_chart_4.png new file mode 100644 index 000000000000..acfe281d28fd Binary files /dev/null and b/assets/images/accelerating-generative-ai/bar_chart_4.png differ diff --git a/assets/images/accelerating-generative-ai/bar_chart_5.png b/assets/images/accelerating-generative-ai/bar_chart_5.png new file mode 100644 index 000000000000..006de895b124 Binary files /dev/null and b/assets/images/accelerating-generative-ai/bar_chart_5.png differ diff --git a/assets/images/accelerating-generative-ai/bar_chart_6.png b/assets/images/accelerating-generative-ai/bar_chart_6.png new file mode 100644 index 000000000000..6c00107edcb4 Binary files /dev/null and b/assets/images/accelerating-generative-ai/bar_chart_6.png differ diff --git a/assets/images/accelerating-generative-ai/bar_chart_7.png b/assets/images/accelerating-generative-ai/bar_chart_7.png new file mode 100644 index 000000000000..b4a004dbd7c7 Binary files /dev/null and b/assets/images/accelerating-generative-ai/bar_chart_7.png differ diff --git a/assets/images/accelerating-generative-ai/baseline_trace.jpg b/assets/images/accelerating-generative-ai/baseline_trace.jpg new file mode 100644 index 000000000000..b8355a6eea2c Binary files /dev/null and b/assets/images/accelerating-generative-ai/baseline_trace.jpg differ diff --git a/assets/images/accelerating-generative-ai/bfloat16_kernels.jpg b/assets/images/accelerating-generative-ai/bfloat16_kernels.jpg new file mode 100644 index 000000000000..263208215423 Binary files /dev/null and b/assets/images/accelerating-generative-ai/bfloat16_kernels.jpg differ diff --git a/assets/images/accelerating-generative-ai/bfloat16_snippet.jpg b/assets/images/accelerating-generative-ai/bfloat16_snippet.jpg new file mode 100644 index 000000000000..f9e0c3fb741a Binary files /dev/null and b/assets/images/accelerating-generative-ai/bfloat16_snippet.jpg differ diff --git a/assets/images/accelerating-generative-ai/bfloat16_snippet2.jpg b/assets/images/accelerating-generative-ai/bfloat16_snippet2.jpg new file mode 100644 index 000000000000..d1821d6ede0b Binary files /dev/null and b/assets/images/accelerating-generative-ai/bfloat16_snippet2.jpg differ diff --git a/assets/images/accelerating-generative-ai/bfloat16_trace.jpg b/assets/images/accelerating-generative-ai/bfloat16_trace.jpg new file mode 100644 index 000000000000..0af5f5232275 Binary files /dev/null and b/assets/images/accelerating-generative-ai/bfloat16_trace.jpg differ diff --git a/assets/images/accelerating-generative-ai/code1.jpg b/assets/images/accelerating-generative-ai/code1.jpg new file mode 100644 index 000000000000..647891c59d39 Binary files /dev/null and b/assets/images/accelerating-generative-ai/code1.jpg differ diff --git a/assets/images/accelerating-generative-ai/compile_kernels.jpg b/assets/images/accelerating-generative-ai/compile_kernels.jpg new file mode 100644 index 000000000000..39d255c39503 Binary files /dev/null and 
b/assets/images/accelerating-generative-ai/compile_kernels.jpg differ diff --git a/assets/images/accelerating-generative-ai/compile_trace.jpg b/assets/images/accelerating-generative-ai/compile_trace.jpg new file mode 100644 index 000000000000..7c356564b568 Binary files /dev/null and b/assets/images/accelerating-generative-ai/compile_trace.jpg differ diff --git a/assets/images/accelerating-generative-ai/intro_image.jpg b/assets/images/accelerating-generative-ai/intro_image.jpg new file mode 100644 index 000000000000..f94af85ffa55 Binary files /dev/null and b/assets/images/accelerating-generative-ai/intro_image.jpg differ diff --git a/assets/images/accelerating-generative-ai/nt_kernel.jpg b/assets/images/accelerating-generative-ai/nt_kernel.jpg new file mode 100644 index 000000000000..c62aaf36ae36 Binary files /dev/null and b/assets/images/accelerating-generative-ai/nt_kernel.jpg differ diff --git a/assets/images/accelerating-generative-ai/sdpa_kernels.jpg b/assets/images/accelerating-generative-ai/sdpa_kernels.jpg new file mode 100644 index 000000000000..d8065fbbedb1 Binary files /dev/null and b/assets/images/accelerating-generative-ai/sdpa_kernels.jpg differ diff --git a/assets/images/accelerating-generative-ai/sdpa_snippet.jpg b/assets/images/accelerating-generative-ai/sdpa_snippet.jpg new file mode 100644 index 000000000000..bf0e8eadd2ce Binary files /dev/null and b/assets/images/accelerating-generative-ai/sdpa_snippet.jpg differ diff --git a/assets/images/accelerating-generative-ai/sparse_image.png b/assets/images/accelerating-generative-ai/sparse_image.png new file mode 100644 index 000000000000..e56e89df0a5e Binary files /dev/null and b/assets/images/accelerating-generative-ai/sparse_image.png differ diff --git a/assets/images/accelerating-generative-ai/trace1.jpg b/assets/images/accelerating-generative-ai/trace1.jpg new file mode 100644 index 000000000000..9810dc7fe37f Binary files /dev/null and b/assets/images/accelerating-generative-ai/trace1.jpg differ diff --git a/assets/images/accelerating-generative-ai/triton_kernels.jpg b/assets/images/accelerating-generative-ai/triton_kernels.jpg new file mode 100644 index 000000000000..9dd984c6c46e Binary files /dev/null and b/assets/images/accelerating-generative-ai/triton_kernels.jpg differ diff --git a/assets/images/accelerating-generative-ai/triton_snippet.jpg b/assets/images/accelerating-generative-ai/triton_snippet.jpg new file mode 100644 index 000000000000..11052cfd6894 Binary files /dev/null and b/assets/images/accelerating-generative-ai/triton_snippet.jpg differ diff --git a/assets/images/accelerating-generative-ai/triton_trace.png b/assets/images/accelerating-generative-ai/triton_trace.png new file mode 100644 index 000000000000..582b89aa718f Binary files /dev/null and b/assets/images/accelerating-generative-ai/triton_trace.png differ diff --git a/assets/images/accelerating-llama3/fig1.png b/assets/images/accelerating-llama3/fig1.png new file mode 100644 index 000000000000..a79c54e2b5b1 Binary files /dev/null and b/assets/images/accelerating-llama3/fig1.png differ diff --git a/assets/images/accelerating-llama3/fig10.png b/assets/images/accelerating-llama3/fig10.png new file mode 100644 index 000000000000..a6e8f42ba329 Binary files /dev/null and b/assets/images/accelerating-llama3/fig10.png differ diff --git a/assets/images/accelerating-llama3/fig11.png b/assets/images/accelerating-llama3/fig11.png new file mode 100644 index 000000000000..d6d67a3b6adf Binary files /dev/null and b/assets/images/accelerating-llama3/fig11.png differ 
diff --git a/assets/images/accelerating-llama3/fig12.png b/assets/images/accelerating-llama3/fig12.png new file mode 100644 index 000000000000..8a2e1cc83e83 Binary files /dev/null and b/assets/images/accelerating-llama3/fig12.png differ diff --git a/assets/images/accelerating-llama3/fig13.png b/assets/images/accelerating-llama3/fig13.png new file mode 100644 index 000000000000..aac9d2aa38ef Binary files /dev/null and b/assets/images/accelerating-llama3/fig13.png differ diff --git a/assets/images/accelerating-llama3/fig2.png b/assets/images/accelerating-llama3/fig2.png new file mode 100644 index 000000000000..fcaf92f9a262 Binary files /dev/null and b/assets/images/accelerating-llama3/fig2.png differ diff --git a/assets/images/accelerating-llama3/fig3.png b/assets/images/accelerating-llama3/fig3.png new file mode 100644 index 000000000000..15b8f3d0beea Binary files /dev/null and b/assets/images/accelerating-llama3/fig3.png differ diff --git a/assets/images/accelerating-llama3/fig4.png b/assets/images/accelerating-llama3/fig4.png new file mode 100644 index 000000000000..aea95093ed89 Binary files /dev/null and b/assets/images/accelerating-llama3/fig4.png differ diff --git a/assets/images/accelerating-llama3/fig5.png b/assets/images/accelerating-llama3/fig5.png new file mode 100644 index 000000000000..391efb243a9b Binary files /dev/null and b/assets/images/accelerating-llama3/fig5.png differ diff --git a/assets/images/accelerating-llama3/fig6.png b/assets/images/accelerating-llama3/fig6.png new file mode 100644 index 000000000000..54905659798c Binary files /dev/null and b/assets/images/accelerating-llama3/fig6.png differ diff --git a/assets/images/accelerating-llama3/fig7.png b/assets/images/accelerating-llama3/fig7.png new file mode 100644 index 000000000000..885178d51c0b Binary files /dev/null and b/assets/images/accelerating-llama3/fig7.png differ diff --git a/assets/images/accelerating-llama3/fig8.png b/assets/images/accelerating-llama3/fig8.png new file mode 100644 index 000000000000..732c88343523 Binary files /dev/null and b/assets/images/accelerating-llama3/fig8.png differ diff --git a/assets/images/accelerating-llama3/fig9.png b/assets/images/accelerating-llama3/fig9.png new file mode 100644 index 000000000000..30fc19a53c78 Binary files /dev/null and b/assets/images/accelerating-llama3/fig9.png differ diff --git a/assets/images/accelerating-llm-inference/fg1.png b/assets/images/accelerating-llm-inference/fg1.png new file mode 100644 index 000000000000..68e37dc442f1 Binary files /dev/null and b/assets/images/accelerating-llm-inference/fg1.png differ diff --git a/assets/images/accelerating-llm-inference/fg2.png b/assets/images/accelerating-llm-inference/fg2.png new file mode 100644 index 000000000000..aa33f57d2455 Binary files /dev/null and b/assets/images/accelerating-llm-inference/fg2.png differ diff --git a/assets/images/accelerating-llm-inference/fg3.png b/assets/images/accelerating-llm-inference/fg3.png new file mode 100644 index 000000000000..74192bab2d2c Binary files /dev/null and b/assets/images/accelerating-llm-inference/fg3.png differ diff --git a/assets/images/accelerating-llm-inference/fg4.png b/assets/images/accelerating-llm-inference/fg4.png new file mode 100644 index 000000000000..26ad62e67a21 Binary files /dev/null and b/assets/images/accelerating-llm-inference/fg4.png differ diff --git a/assets/images/accelerating-llm-inference/fg5.jpg b/assets/images/accelerating-llm-inference/fg5.jpg new file mode 100644 index 000000000000..b9fd9e589f25 Binary files /dev/null and 
b/assets/images/accelerating-llm-inference/fg5.jpg differ diff --git a/assets/images/accelerating-moe-model/fig-1.gif b/assets/images/accelerating-moe-model/fig-1.gif new file mode 100644 index 000000000000..631e5249fc5a Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-1.gif differ diff --git a/assets/images/accelerating-moe-model/fig-1.png b/assets/images/accelerating-moe-model/fig-1.png new file mode 100644 index 000000000000..d24fe5bb1e10 Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-1.png differ diff --git a/assets/images/accelerating-moe-model/fig-2.png b/assets/images/accelerating-moe-model/fig-2.png new file mode 100644 index 000000000000..5cecfaf769cd Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-2.png differ diff --git a/assets/images/accelerating-moe-model/fig-3.png b/assets/images/accelerating-moe-model/fig-3.png new file mode 100644 index 000000000000..d312839ee9cd Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-3.png differ diff --git a/assets/images/accelerating-moe-model/fig-4.png b/assets/images/accelerating-moe-model/fig-4.png new file mode 100644 index 000000000000..fd4749782dd0 Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-4.png differ diff --git a/assets/images/accelerating-moe-model/fig-5.png b/assets/images/accelerating-moe-model/fig-5.png new file mode 100644 index 000000000000..f5b0a91a1a4e Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-5.png differ diff --git a/assets/images/accelerating-moe-model/fig-6.png b/assets/images/accelerating-moe-model/fig-6.png new file mode 100644 index 000000000000..9f4abf580064 Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-6.png differ diff --git a/assets/images/accelerating-moe-model/fig-7.png b/assets/images/accelerating-moe-model/fig-7.png new file mode 100644 index 000000000000..da76f855d966 Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-7.png differ diff --git a/assets/images/accelerating-moe-model/fig-8.png b/assets/images/accelerating-moe-model/fig-8.png new file mode 100644 index 000000000000..58b736fe1e81 Binary files /dev/null and b/assets/images/accelerating-moe-model/fig-8.png differ diff --git a/assets/images/accelerating-moe-model/fig.gif b/assets/images/accelerating-moe-model/fig.gif new file mode 100644 index 000000000000..10f560fdd5ed Binary files /dev/null and b/assets/images/accelerating-moe-model/fig.gif differ diff --git a/assets/images/accelerating-moe-model/fig.png b/assets/images/accelerating-moe-model/fig.png new file mode 100644 index 000000000000..38cf28689c37 Binary files /dev/null and b/assets/images/accelerating-moe-model/fig.png differ diff --git a/assets/images/accelerating-neural-network-training/fg1.jpg b/assets/images/accelerating-neural-network-training/fg1.jpg new file mode 100644 index 000000000000..3e8acf897d28 Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg1.jpg differ diff --git a/assets/images/accelerating-neural-network-training/fg1.png b/assets/images/accelerating-neural-network-training/fg1.png new file mode 100644 index 000000000000..0e279b088d8e Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg1.png differ diff --git a/assets/images/accelerating-neural-network-training/fg10.png b/assets/images/accelerating-neural-network-training/fg10.png new file mode 100644 index 000000000000..c5d2126c089e Binary files /dev/null and 
b/assets/images/accelerating-neural-network-training/fg10.png differ diff --git a/assets/images/accelerating-neural-network-training/fg11.png b/assets/images/accelerating-neural-network-training/fg11.png new file mode 100644 index 000000000000..4102d07a466c Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg11.png differ diff --git a/assets/images/accelerating-neural-network-training/fg12.png b/assets/images/accelerating-neural-network-training/fg12.png new file mode 100644 index 000000000000..f14e2ab079ea Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg12.png differ diff --git a/assets/images/accelerating-neural-network-training/fg2.png b/assets/images/accelerating-neural-network-training/fg2.png new file mode 100644 index 000000000000..0fbf7b53ac1a Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg2.png differ diff --git a/assets/images/accelerating-neural-network-training/fg3.png b/assets/images/accelerating-neural-network-training/fg3.png new file mode 100644 index 000000000000..71886fc3213b Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg3.png differ diff --git a/assets/images/accelerating-neural-network-training/fg4.png b/assets/images/accelerating-neural-network-training/fg4.png new file mode 100644 index 000000000000..d0b012cfc157 Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg4.png differ diff --git a/assets/images/accelerating-neural-network-training/fg5.png b/assets/images/accelerating-neural-network-training/fg5.png new file mode 100644 index 000000000000..e570f548dcaf Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg5.png differ diff --git a/assets/images/accelerating-neural-network-training/fg6.png b/assets/images/accelerating-neural-network-training/fg6.png new file mode 100644 index 000000000000..553e56f9afe8 Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg6.png differ diff --git a/assets/images/accelerating-neural-network-training/fg7.png b/assets/images/accelerating-neural-network-training/fg7.png new file mode 100644 index 000000000000..186f4865ab87 Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg7.png differ diff --git a/assets/images/accelerating-neural-network-training/fg8.png b/assets/images/accelerating-neural-network-training/fg8.png new file mode 100644 index 000000000000..45eb0cdd1561 Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg8.png differ diff --git a/assets/images/accelerating-neural-network-training/fg9.png b/assets/images/accelerating-neural-network-training/fg9.png new file mode 100644 index 000000000000..825139ad4410 Binary files /dev/null and b/assets/images/accelerating-neural-network-training/fg9.png differ diff --git a/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-1.png b/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-1.png new file mode 100644 index 000000000000..a3093442c4a5 Binary files /dev/null and b/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-1.png differ diff --git a/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-2.png b/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-2.png new file mode 100644 index 000000000000..697f864c226c Binary files /dev/null and 
b/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-2.png differ diff --git a/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-3.png b/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-3.png new file mode 100644 index 000000000000..7c27caddfbed Binary files /dev/null and b/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-3.png differ diff --git a/assets/images/accelerating-training-float8-rowwise-crusoe/fg1.png b/assets/images/accelerating-training-float8-rowwise-crusoe/fg1.png new file mode 100644 index 000000000000..7dcf02db043e Binary files /dev/null and b/assets/images/accelerating-training-float8-rowwise-crusoe/fg1.png differ diff --git a/assets/images/accelerating-training-float8-rowwise-crusoe/fg2.png b/assets/images/accelerating-training-float8-rowwise-crusoe/fg2.png new file mode 100644 index 000000000000..2245f96c5fff Binary files /dev/null and b/assets/images/accelerating-training-float8-rowwise-crusoe/fg2.png differ diff --git a/assets/images/accelerating-training-float8-rowwise-crusoe/fg3.png b/assets/images/accelerating-training-float8-rowwise-crusoe/fg3.png new file mode 100644 index 000000000000..e5797aedd0ca Binary files /dev/null and b/assets/images/accelerating-training-float8-rowwise-crusoe/fg3.png differ diff --git a/assets/images/accelerating-training-float8-rowwise-crusoe/fg4.png b/assets/images/accelerating-training-float8-rowwise-crusoe/fg4.png new file mode 100644 index 000000000000..3adae3b02e6b Binary files /dev/null and b/assets/images/accelerating-training-float8-rowwise-crusoe/fg4.png differ diff --git a/assets/images/accelerating-training-float8-rowwise-crusoe/fg5.png b/assets/images/accelerating-training-float8-rowwise-crusoe/fg5.png new file mode 100644 index 000000000000..7dcf02db043e Binary files /dev/null and b/assets/images/accelerating-training-float8-rowwise-crusoe/fg5.png differ diff --git a/assets/images/accelerating-training-float8-rowwise-crusoe/fg6.png b/assets/images/accelerating-training-float8-rowwise-crusoe/fg6.png new file mode 100644 index 000000000000..9c77b71f5d4f Binary files /dev/null and b/assets/images/accelerating-training-float8-rowwise-crusoe/fg6.png differ diff --git a/assets/images/accelerating-training-float8-rowwise-crusoe/fg7.png b/assets/images/accelerating-training-float8-rowwise-crusoe/fg7.png new file mode 100644 index 000000000000..35695c3de6d0 Binary files /dev/null and b/assets/images/accelerating-training-float8-rowwise-crusoe/fg7.png differ diff --git a/assets/images/accelerating-triton/fg1.png b/assets/images/accelerating-triton/fg1.png new file mode 100644 index 000000000000..3467b2597749 Binary files /dev/null and b/assets/images/accelerating-triton/fg1.png differ diff --git a/assets/images/accelerating-triton/fg10.jpg b/assets/images/accelerating-triton/fg10.jpg new file mode 100644 index 000000000000..cff232b430ac Binary files /dev/null and b/assets/images/accelerating-triton/fg10.jpg differ diff --git a/assets/images/accelerating-triton/fg11.jpg b/assets/images/accelerating-triton/fg11.jpg new file mode 100644 index 000000000000..99d6b162280b Binary files /dev/null and b/assets/images/accelerating-triton/fg11.jpg differ diff --git a/assets/images/accelerating-triton/fg12.png b/assets/images/accelerating-triton/fg12.png new file mode 100644 index 000000000000..2b1b507ac4bf Binary files /dev/null and b/assets/images/accelerating-triton/fg12.png differ diff --git a/assets/images/accelerating-triton/fg13.png 
b/assets/images/accelerating-triton/fg13.png new file mode 100644 index 000000000000..90d2f1945eb6 Binary files /dev/null and b/assets/images/accelerating-triton/fg13.png differ diff --git a/assets/images/accelerating-triton/fg14.png b/assets/images/accelerating-triton/fg14.png new file mode 100644 index 000000000000..7c63bedcba59 Binary files /dev/null and b/assets/images/accelerating-triton/fg14.png differ diff --git a/assets/images/accelerating-triton/fg15.png b/assets/images/accelerating-triton/fg15.png new file mode 100644 index 000000000000..89bb782c1cc7 Binary files /dev/null and b/assets/images/accelerating-triton/fg15.png differ diff --git a/assets/images/accelerating-triton/fg16.jpg b/assets/images/accelerating-triton/fg16.jpg new file mode 100644 index 000000000000..e121b484c52b Binary files /dev/null and b/assets/images/accelerating-triton/fg16.jpg differ diff --git a/assets/images/accelerating-triton/fg17.png b/assets/images/accelerating-triton/fg17.png new file mode 100644 index 000000000000..00589f9516c0 Binary files /dev/null and b/assets/images/accelerating-triton/fg17.png differ diff --git a/assets/images/accelerating-triton/fg18.png b/assets/images/accelerating-triton/fg18.png new file mode 100644 index 000000000000..77cc5550a2f2 Binary files /dev/null and b/assets/images/accelerating-triton/fg18.png differ diff --git a/assets/images/accelerating-triton/fg19.png b/assets/images/accelerating-triton/fg19.png new file mode 100644 index 000000000000..8f16dc54fbc0 Binary files /dev/null and b/assets/images/accelerating-triton/fg19.png differ diff --git a/assets/images/accelerating-triton/fg2.png b/assets/images/accelerating-triton/fg2.png new file mode 100644 index 000000000000..239c89d7f422 Binary files /dev/null and b/assets/images/accelerating-triton/fg2.png differ diff --git a/assets/images/accelerating-triton/fg3.png b/assets/images/accelerating-triton/fg3.png new file mode 100644 index 000000000000..b311ebf8ae0f Binary files /dev/null and b/assets/images/accelerating-triton/fg3.png differ diff --git a/assets/images/accelerating-triton/fg4.jpg b/assets/images/accelerating-triton/fg4.jpg new file mode 100644 index 000000000000..c4451d710158 Binary files /dev/null and b/assets/images/accelerating-triton/fg4.jpg differ diff --git a/assets/images/accelerating-triton/fg5.png b/assets/images/accelerating-triton/fg5.png new file mode 100644 index 000000000000..e3aa4ad5c31e Binary files /dev/null and b/assets/images/accelerating-triton/fg5.png differ diff --git a/assets/images/accelerating-triton/fg6.png b/assets/images/accelerating-triton/fg6.png new file mode 100644 index 000000000000..f86d8c6581c9 Binary files /dev/null and b/assets/images/accelerating-triton/fg6.png differ diff --git a/assets/images/accelerating-triton/fg7.png b/assets/images/accelerating-triton/fg7.png new file mode 100644 index 000000000000..3c549fe0db16 Binary files /dev/null and b/assets/images/accelerating-triton/fg7.png differ diff --git a/assets/images/accelerating-triton/fg8.jpg b/assets/images/accelerating-triton/fg8.jpg new file mode 100644 index 000000000000..41fca8de1953 Binary files /dev/null and b/assets/images/accelerating-triton/fg8.jpg differ diff --git a/assets/images/accelerating-triton/fg9.png b/assets/images/accelerating-triton/fg9.png new file mode 100644 index 000000000000..f5d9eb616f34 Binary files /dev/null and b/assets/images/accelerating-triton/fg9.png differ diff --git a/assets/images/achieving-sustainability-goals.png b/assets/images/achieving-sustainability-goals.png new file 
mode 100644 index 000000000000..93e17ecb01f1 Binary files /dev/null and b/assets/images/achieving-sustainability-goals.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg1.png b/assets/images/activation-checkpointing-techniques/fg1.png new file mode 100644 index 000000000000..e4805cb40ea6 Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg1.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg10.png b/assets/images/activation-checkpointing-techniques/fg10.png new file mode 100644 index 000000000000..91bd1c909173 Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg10.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg11.png b/assets/images/activation-checkpointing-techniques/fg11.png new file mode 100644 index 000000000000..d4fa91fb677c Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg11.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg12.png b/assets/images/activation-checkpointing-techniques/fg12.png new file mode 100644 index 000000000000..e6c1679433dd Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg12.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg13.png b/assets/images/activation-checkpointing-techniques/fg13.png new file mode 100644 index 000000000000..ea5a5cbe0bf8 Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg13.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg14.png b/assets/images/activation-checkpointing-techniques/fg14.png new file mode 100644 index 000000000000..cc20d543962d Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg14.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg2.png b/assets/images/activation-checkpointing-techniques/fg2.png new file mode 100644 index 000000000000..00c20f76c09a Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg2.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg3.png b/assets/images/activation-checkpointing-techniques/fg3.png new file mode 100644 index 000000000000..412639ab92b8 Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg3.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg4.png b/assets/images/activation-checkpointing-techniques/fg4.png new file mode 100644 index 000000000000..5b4af130db49 Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg4.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg5.png b/assets/images/activation-checkpointing-techniques/fg5.png new file mode 100644 index 000000000000..d4cdc3202836 Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg5.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg6.png b/assets/images/activation-checkpointing-techniques/fg6.png new file mode 100644 index 000000000000..919609dbabce Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg6.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg7.png b/assets/images/activation-checkpointing-techniques/fg7.png new file mode 100644 index 000000000000..bbddbd9bf91a Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg7.png differ diff --git 
a/assets/images/activation-checkpointing-techniques/fg8.png b/assets/images/activation-checkpointing-techniques/fg8.png new file mode 100644 index 000000000000..42b413e2118f Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg8.png differ diff --git a/assets/images/activation-checkpointing-techniques/fg9.png b/assets/images/activation-checkpointing-techniques/fg9.png new file mode 100644 index 000000000000..a4b748ead8e9 Binary files /dev/null and b/assets/images/activation-checkpointing-techniques/fg9.png differ diff --git a/assets/images/ai-programming.png b/assets/images/ai-programming.png new file mode 100644 index 000000000000..b1a8093af9df Binary files /dev/null and b/assets/images/ai-programming.png differ diff --git a/assets/images/alexnet1.png b/assets/images/alexnet1.png new file mode 100644 index 000000000000..9a34bfe5d278 Binary files /dev/null and b/assets/images/alexnet1.png differ diff --git a/assets/images/alexnet2.png b/assets/images/alexnet2.png new file mode 100644 index 000000000000..8eb6b7465b39 Binary files /dev/null and b/assets/images/alexnet2.png differ diff --git a/assets/images/alibaba-logo.svg b/assets/images/alibaba-logo.svg new file mode 100644 index 000000000000..039e39c3a6e1 --- /dev/null +++ b/assets/images/alibaba-logo.svg @@ -0,0 +1,40 @@ + + + +Created by potrace 1.15, written by Peter Selinger 2001-2017 + + + + + + + diff --git a/assets/images/allennlp.png b/assets/images/allennlp.png new file mode 100644 index 000000000000..ee27ea66463b Binary files /dev/null and b/assets/images/allennlp.png differ diff --git a/assets/images/amazon-ads-case-study/amazon-advertising.png b/assets/images/amazon-ads-case-study/amazon-advertising.png new file mode 100644 index 000000000000..2a1cf0ace760 Binary files /dev/null and b/assets/images/amazon-ads-case-study/amazon-advertising.png differ diff --git a/assets/images/amazon-ads-case-study/data-parallel.png b/assets/images/amazon-ads-case-study/data-parallel.png new file mode 100644 index 000000000000..5b9ce6e6cf1c Binary files /dev/null and b/assets/images/amazon-ads-case-study/data-parallel.png differ diff --git a/assets/images/amazon-ads-case-study/model-parallel.png b/assets/images/amazon-ads-case-study/model-parallel.png new file mode 100644 index 000000000000..633fa487b630 Binary files /dev/null and b/assets/images/amazon-ads-case-study/model-parallel.png differ diff --git a/assets/images/amazon-ads-logo.png b/assets/images/amazon-ads-logo.png new file mode 100644 index 000000000000..b83b5c1267e3 Binary files /dev/null and b/assets/images/amazon-ads-logo.png differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg1.png b/assets/images/amazon-sagemaker-w-torchserve/fg1.png new file mode 100644 index 000000000000..26b563dcebbf Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg1.png differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg10.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg10.jpg new file mode 100644 index 000000000000..415d9dcb8933 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg10.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg11.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg11.jpg new file mode 100644 index 000000000000..03cd79d0d54c Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg11.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg12.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg12.jpg new file mode 100644 index 
000000000000..30b1a01a97f6 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg12.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg13.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg13.jpg new file mode 100644 index 000000000000..865240e694b2 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg13.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg14.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg14.jpg new file mode 100644 index 000000000000..e18a703995a6 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg14.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg15.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg15.jpg new file mode 100644 index 000000000000..73f3c233b8bc Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg15.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg16.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg16.jpg new file mode 100644 index 000000000000..9197ad6e57b7 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg16.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg2.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg2.jpg new file mode 100644 index 000000000000..a4fadf32c043 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg2.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg3.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg3.jpg new file mode 100644 index 000000000000..1779eeea3a45 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg3.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg4.png b/assets/images/amazon-sagemaker-w-torchserve/fg4.png new file mode 100644 index 000000000000..a406c6f3b6c3 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg4.png differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg5.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg5.jpg new file mode 100644 index 000000000000..700965c3a64a Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg5.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg6.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg6.jpg new file mode 100644 index 000000000000..24ef68f78101 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg6.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg7.png b/assets/images/amazon-sagemaker-w-torchserve/fg7.png new file mode 100644 index 000000000000..f5715e064310 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg7.png differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg8.png b/assets/images/amazon-sagemaker-w-torchserve/fg8.png new file mode 100644 index 000000000000..fa16670e8384 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg8.png differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg9.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg9.jpg new file mode 100644 index 000000000000..15837d47c57a Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg9.jpg differ diff --git a/assets/images/amazon-sagemaker-w-torchserve/fg9b.jpg b/assets/images/amazon-sagemaker-w-torchserve/fg9b.jpg new file mode 100644 index 000000000000..d9fa287b9fb0 Binary files /dev/null and b/assets/images/amazon-sagemaker-w-torchserve/fg9b.jpg differ diff --git 
a/assets/images/ambient_clinical_intel_fig2.png b/assets/images/ambient_clinical_intel_fig2.png new file mode 100644 index 000000000000..d02b09faf45e Binary files /dev/null and b/assets/images/ambient_clinical_intel_fig2.png differ diff --git a/assets/images/ambient_clinical_intel_fig3.png b/assets/images/ambient_clinical_intel_fig3.png new file mode 100644 index 000000000000..698e01c50721 Binary files /dev/null and b/assets/images/ambient_clinical_intel_fig3.png differ diff --git a/assets/images/amd_rocm_blog.png b/assets/images/amd_rocm_blog.png new file mode 100644 index 000000000000..49d58c4d4b58 Binary files /dev/null and b/assets/images/amd_rocm_blog.png differ diff --git a/assets/images/android-demo-app.png b/assets/images/android-demo-app.png new file mode 100644 index 000000000000..59aa5cd6a359 Binary files /dev/null and b/assets/images/android-demo-app.png differ diff --git a/assets/images/announcement-logo-amd.jpg b/assets/images/announcement-logo-amd.jpg new file mode 100644 index 000000000000..64263c26fda7 Binary files /dev/null and b/assets/images/announcement-logo-amd.jpg differ diff --git a/assets/images/announcement-logo-aws.jpg b/assets/images/announcement-logo-aws.jpg new file mode 100644 index 000000000000..dca5aaed9343 Binary files /dev/null and b/assets/images/announcement-logo-aws.jpg differ diff --git a/assets/images/announcement-logo-google.png b/assets/images/announcement-logo-google.png new file mode 100644 index 000000000000..a6e1c5a9b806 Binary files /dev/null and b/assets/images/announcement-logo-google.png differ diff --git a/assets/images/announcement-logo-meta.jpg b/assets/images/announcement-logo-meta.jpg new file mode 100644 index 000000000000..e2b500b770aa Binary files /dev/null and b/assets/images/announcement-logo-meta.jpg differ diff --git a/assets/images/announcement-logo-microsoft.jpg b/assets/images/announcement-logo-microsoft.jpg new file mode 100644 index 000000000000..1b6ac6bf35d2 Binary files /dev/null and b/assets/images/announcement-logo-microsoft.jpg differ diff --git a/assets/images/announcement-logo-nvidia.jpg b/assets/images/announcement-logo-nvidia.jpg new file mode 100644 index 000000000000..4910cea427bc Binary files /dev/null and b/assets/images/announcement-logo-nvidia.jpg differ diff --git a/assets/images/announcing-pytorch-conference-2022.gif b/assets/images/announcing-pytorch-conference-2022.gif new file mode 100644 index 000000000000..46a51bab37db Binary files /dev/null and b/assets/images/announcing-pytorch-conference-2022.gif differ diff --git a/assets/images/apple_m1_eval.png b/assets/images/apple_m1_eval.png new file mode 100644 index 000000000000..7cc1cf3fef28 Binary files /dev/null and b/assets/images/apple_m1_eval.png differ diff --git a/assets/images/arrow-right-with-tail-white.svg b/assets/images/arrow-right-with-tail-white.svg new file mode 100644 index 000000000000..be46c6c2ed0e --- /dev/null +++ b/assets/images/arrow-right-with-tail-white.svg @@ -0,0 +1,19 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + + + + + + diff --git a/assets/images/arrow-right-with-tail.svg b/assets/images/arrow-right-with-tail.svg new file mode 100644 index 000000000000..5843588fca6f --- /dev/null +++ b/assets/images/arrow-right-with-tail.svg @@ -0,0 +1,19 @@ + + + + Page 1 + Created with Sketch. 
+ + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/arrows-icon.svg b/assets/images/arrows-icon.svg new file mode 100644 index 000000000000..690eb9718fa1 --- /dev/null +++ b/assets/images/arrows-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/ascend-backend-w-torchtune.png b/assets/images/ascend-backend-w-torchtune.png new file mode 100644 index 000000000000..6fced8c87fd3 Binary files /dev/null and b/assets/images/ascend-backend-w-torchtune.png differ diff --git a/assets/images/audio-backbone-image-1.png b/assets/images/audio-backbone-image-1.png new file mode 100644 index 000000000000..b99fed74f2de Binary files /dev/null and b/assets/images/audio-backbone-image-1.png differ diff --git a/assets/images/augmented_computational_graph.png b/assets/images/augmented_computational_graph.png new file mode 100644 index 000000000000..5e17affd37a4 Binary files /dev/null and b/assets/images/augmented_computational_graph.png differ diff --git a/assets/images/autonomous-language-model-systems.png b/assets/images/autonomous-language-model-systems.png new file mode 100644 index 000000000000..06b75fe2c6be Binary files /dev/null and b/assets/images/autonomous-language-model-systems.png differ diff --git a/assets/images/aws-logo.svg b/assets/images/aws-logo.svg new file mode 100644 index 000000000000..cae8267897f6 --- /dev/null +++ b/assets/images/aws-logo.svg @@ -0,0 +1,51 @@ + + + + + + + + + diff --git a/assets/images/backward-grad-fig-1.png b/assets/images/backward-grad-fig-1.png new file mode 100644 index 000000000000..f5a62257a893 Binary files /dev/null and b/assets/images/backward-grad-fig-1.png differ diff --git a/assets/images/balancing_partition.png b/assets/images/balancing_partition.png new file mode 100644 index 000000000000..adc5c3fcd5e9 Binary files /dev/null and b/assets/images/balancing_partition.png differ diff --git a/assets/images/ben-consolvo.jpg b/assets/images/ben-consolvo.jpg new file mode 100644 index 000000000000..df5131135a67 Binary files /dev/null and b/assets/images/ben-consolvo.jpg differ diff --git a/assets/images/benchmark_readme_chart.png b/assets/images/benchmark_readme_chart.png new file mode 100644 index 000000000000..84002bd56a8e Binary files /dev/null and b/assets/images/benchmark_readme_chart.png differ diff --git a/assets/images/bert1.png b/assets/images/bert1.png new file mode 100644 index 000000000000..af404522aef4 Binary files /dev/null and b/assets/images/bert1.png differ diff --git a/assets/images/bert2.png b/assets/images/bert2.png new file mode 100644 index 000000000000..73d9ae425561 Binary files /dev/null and b/assets/images/bert2.png differ diff --git a/assets/images/blockfiltering.png b/assets/images/blockfiltering.png new file mode 100644 index 000000000000..6a656bb352a7 Binary files /dev/null and b/assets/images/blockfiltering.png differ diff --git a/assets/images/blog-2022-10-25-Pytorch-1.13-Release.png b/assets/images/blog-2022-10-25-Pytorch-1.13-Release.png new file mode 100644 index 000000000000..b3512c78ec08 Binary files /dev/null and b/assets/images/blog-2022-10-25-Pytorch-1.13-Release.png differ diff --git a/assets/images/blog-2022-3-10-setting-up-jetson-nano.png b/assets/images/blog-2022-3-10-setting-up-jetson-nano.png new file mode 100644 index 000000000000..a175ea58a52c Binary files /dev/null and b/assets/images/blog-2022-3-10-setting-up-jetson-nano.png differ diff --git a/assets/images/blog-2022-3-10-using-jetson-interface-1.jpeg 
b/assets/images/blog-2022-3-10-using-jetson-interface-1.jpeg new file mode 100644 index 000000000000..5efbfaedde6f Binary files /dev/null and b/assets/images/blog-2022-3-10-using-jetson-interface-1.jpeg differ diff --git a/assets/images/blog-2022-3-10-using-jetson-interface-2.jpeg b/assets/images/blog-2022-3-10-using-jetson-interface-2.jpeg new file mode 100644 index 000000000000..46bba419c1a8 Binary files /dev/null and b/assets/images/blog-2022-3-10-using-jetson-interface-2.jpeg differ diff --git a/assets/images/blog-2022-3-10-using-jetson-interface-3.jpeg b/assets/images/blog-2022-3-10-using-jetson-interface-3.jpeg new file mode 100644 index 000000000000..da0653b7d284 Binary files /dev/null and b/assets/images/blog-2022-3-10-using-jetson-interface-3.jpeg differ diff --git a/assets/images/blog-2022-3-10-using-jetson-interface-4.jpeg b/assets/images/blog-2022-3-10-using-jetson-interface-4.jpeg new file mode 100644 index 000000000000..cdd7b78157b6 Binary files /dev/null and b/assets/images/blog-2022-3-10-using-jetson-interface-4.jpeg differ diff --git a/assets/images/blog-2022-3-10-using-pytorch-1.png b/assets/images/blog-2022-3-10-using-pytorch-1.png new file mode 100644 index 000000000000..608b24128821 Binary files /dev/null and b/assets/images/blog-2022-3-10-using-pytorch-1.png differ diff --git a/assets/images/blog-2022-3-10-using-pytorch-2.jpeg b/assets/images/blog-2022-3-10-using-pytorch-2.jpeg new file mode 100644 index 000000000000..1fcb7e1f445d Binary files /dev/null and b/assets/images/blog-2022-3-10-using-pytorch-2.jpeg differ diff --git a/assets/images/blog-2022-3-10-using-pytorch-3.png b/assets/images/blog-2022-3-10-using-pytorch-3.png new file mode 100644 index 000000000000..fd9ef0231449 Binary files /dev/null and b/assets/images/blog-2022-3-10-using-pytorch-3.png differ diff --git a/assets/images/blog-2022-3-10-using-pytorch-4.png b/assets/images/blog-2022-3-10-using-pytorch-4.png new file mode 100644 index 000000000000..b9d9fc3e56b1 Binary files /dev/null and b/assets/images/blog-2022-3-10-using-pytorch-4.png differ diff --git a/assets/images/blog-2022-3-10-using-pytorch-5.png b/assets/images/blog-2022-3-10-using-pytorch-5.png new file mode 100644 index 000000000000..77017858e545 Binary files /dev/null and b/assets/images/blog-2022-3-10-using-pytorch-5.png differ diff --git a/assets/images/blog-2022-3-10-using-pytorch-6.png b/assets/images/blog-2022-3-10-using-pytorch-6.png new file mode 100644 index 000000000000..e17833807bf2 Binary files /dev/null and b/assets/images/blog-2022-3-10-using-pytorch-6.png differ diff --git a/assets/images/blog-2022-3-10-using-tensorrt.png b/assets/images/blog-2022-3-10-using-tensorrt.png new file mode 100644 index 000000000000..5bf62fb51dca Binary files /dev/null and b/assets/images/blog-2022-3-10-using-tensorrt.png differ diff --git a/assets/images/blog-background-2.jpg b/assets/images/blog-background-2.jpg new file mode 100644 index 000000000000..e74097ea5122 Binary files /dev/null and b/assets/images/blog-background-2.jpg differ diff --git a/assets/images/blog-background-3.jpg b/assets/images/blog-background-3.jpg new file mode 100644 index 000000000000..f76db8e68bb4 Binary files /dev/null and b/assets/images/blog-background-3.jpg differ diff --git a/assets/images/blog-background-4.jpg b/assets/images/blog-background-4.jpg new file mode 100644 index 000000000000..2ea3239e2aa8 Binary files /dev/null and b/assets/images/blog-background-4.jpg differ diff --git a/assets/images/blog-background-5.jpg b/assets/images/blog-background-5.jpg new file 
mode 100644 index 000000000000..cc2715563fb8 Binary files /dev/null and b/assets/images/blog-background-5.jpg differ diff --git a/assets/images/blog-background.jpg b/assets/images/blog-background.jpg new file mode 100644 index 000000000000..06f54cea592a Binary files /dev/null and b/assets/images/blog-background.jpg differ diff --git a/assets/images/blog1-fig-1.png b/assets/images/blog1-fig-1.png new file mode 100644 index 000000000000..5dcf83274a1e Binary files /dev/null and b/assets/images/blog1-fig-1.png differ diff --git a/assets/images/blog1-fig-2.png b/assets/images/blog1-fig-2.png new file mode 100644 index 000000000000..8fbfda799742 Binary files /dev/null and b/assets/images/blog1-fig-2.png differ diff --git a/assets/images/blog1-fig-3a.png b/assets/images/blog1-fig-3a.png new file mode 100644 index 000000000000..e606c8d86f4a Binary files /dev/null and b/assets/images/blog1-fig-3a.png differ diff --git a/assets/images/blog1-fig-3b.png b/assets/images/blog1-fig-3b.png new file mode 100644 index 000000000000..bc7d07dd3652 Binary files /dev/null and b/assets/images/blog1-fig-3b.png differ diff --git a/assets/images/blog1-fig-4a.png b/assets/images/blog1-fig-4a.png new file mode 100644 index 000000000000..809779d97e15 Binary files /dev/null and b/assets/images/blog1-fig-4a.png differ diff --git a/assets/images/blog1-fig-4b.png b/assets/images/blog1-fig-4b.png new file mode 100644 index 000000000000..38d975f24455 Binary files /dev/null and b/assets/images/blog1-fig-4b.png differ diff --git a/assets/images/blog1-fig-5a.png b/assets/images/blog1-fig-5a.png new file mode 100644 index 000000000000..96418001fc5b Binary files /dev/null and b/assets/images/blog1-fig-5a.png differ diff --git a/assets/images/blog1-fig-5b.png b/assets/images/blog1-fig-5b.png new file mode 100644 index 000000000000..87cba2c9e867 Binary files /dev/null and b/assets/images/blog1-fig-5b.png differ diff --git a/assets/images/blog1-fig-5c.png b/assets/images/blog1-fig-5c.png new file mode 100644 index 000000000000..90f320031486 Binary files /dev/null and b/assets/images/blog1-fig-5c.png differ diff --git a/assets/images/blog_combined_tutorials.png b/assets/images/blog_combined_tutorials.png new file mode 100644 index 000000000000..5ce46191563a Binary files /dev/null and b/assets/images/blog_combined_tutorials.png differ diff --git a/assets/images/bob-chesebrough.jpg b/assets/images/bob-chesebrough.jpg new file mode 100644 index 000000000000..9b945ae2c51f Binary files /dev/null and b/assets/images/bob-chesebrough.jpg differ diff --git a/assets/images/chain_rule_backward_differentiation.png b/assets/images/chain_rule_backward_differentiation.png new file mode 100644 index 000000000000..39b7cceac6a7 Binary files /dev/null and b/assets/images/chain_rule_backward_differentiation.png differ diff --git a/assets/images/chevron-down-black.svg b/assets/images/chevron-down-black.svg new file mode 100644 index 000000000000..9fffa77268c3 --- /dev/null +++ b/assets/images/chevron-down-black.svg @@ -0,0 +1,17 @@ + + + + Created with Sketch. + + + + + + + + + + + + + diff --git a/assets/images/chevron-down-orange.svg b/assets/images/chevron-down-orange.svg new file mode 100644 index 000000000000..8a3c27cc713d --- /dev/null +++ b/assets/images/chevron-down-orange.svg @@ -0,0 +1,17 @@ + + + + Created with Sketch. 
+ + + + + + + + + + + + + diff --git a/assets/images/chevron-down-white.svg b/assets/images/chevron-down-white.svg new file mode 100644 index 000000000000..e7bd84de3a86 --- /dev/null +++ b/assets/images/chevron-down-white.svg @@ -0,0 +1,17 @@ + + + + Created with Sketch. + + + + + + + + + + + + + diff --git a/assets/images/chevron-left-grey.svg b/assets/images/chevron-left-grey.svg new file mode 100644 index 000000000000..c3bc0130bb2e --- /dev/null +++ b/assets/images/chevron-left-grey.svg @@ -0,0 +1,14 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + \ No newline at end of file diff --git a/assets/images/chevron-left-orange.svg b/assets/images/chevron-left-orange.svg new file mode 100644 index 000000000000..f005a743b812 --- /dev/null +++ b/assets/images/chevron-left-orange.svg @@ -0,0 +1,11 @@ + + + + Group + Created with Sketch. + + + + + + \ No newline at end of file diff --git a/assets/images/chevron-right-grey.svg b/assets/images/chevron-right-grey.svg new file mode 100644 index 000000000000..16eca9899ccf --- /dev/null +++ b/assets/images/chevron-right-grey.svg @@ -0,0 +1,14 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + \ No newline at end of file diff --git a/assets/images/chevron-right-orange.svg b/assets/images/chevron-right-orange.svg new file mode 100644 index 000000000000..220a2c37fbb5 --- /dev/null +++ b/assets/images/chevron-right-orange.svg @@ -0,0 +1,17 @@ + + + + +Page 1 +Created with Sketch. + + + + + + + + + + diff --git a/assets/images/chevron-right-white.svg b/assets/images/chevron-right-white.svg new file mode 100644 index 000000000000..dd9e77f26165 --- /dev/null +++ b/assets/images/chevron-right-white.svg @@ -0,0 +1,17 @@ + + + + +Page 1 +Created with Sketch. + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/chip-icon.svg b/assets/images/chip-icon.svg new file mode 100644 index 000000000000..b46477ee3728 --- /dev/null +++ b/assets/images/chip-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/cholesky-decomposition.png b/assets/images/cholesky-decomposition.png new file mode 100644 index 000000000000..8bf88cb86b13 Binary files /dev/null and b/assets/images/cholesky-decomposition.png differ diff --git a/assets/images/clacheck.png b/assets/images/clacheck.png new file mode 100644 index 000000000000..c6076ebea949 Binary files /dev/null and b/assets/images/clacheck.png differ diff --git a/assets/images/clafb.png b/assets/images/clafb.png new file mode 100644 index 000000000000..1aa5a0126857 Binary files /dev/null and b/assets/images/clafb.png differ diff --git a/assets/images/classification.jpg b/assets/images/classification.jpg new file mode 100644 index 000000000000..eb1e20641c3c Binary files /dev/null and b/assets/images/classification.jpg differ diff --git a/assets/images/clipping-in-opacus/fg1.jpg b/assets/images/clipping-in-opacus/fg1.jpg new file mode 100644 index 000000000000..f9045f7a6f7c Binary files /dev/null and b/assets/images/clipping-in-opacus/fg1.jpg differ diff --git a/assets/images/clipping-in-opacus/fg2.png b/assets/images/clipping-in-opacus/fg2.png new file mode 100644 index 000000000000..42de0f570add Binary files /dev/null and b/assets/images/clipping-in-opacus/fg2.png differ diff --git a/assets/images/cloud-credits-people.jpg b/assets/images/cloud-credits-people.jpg new file mode 100644 index 000000000000..c77f15cdfa43 Binary files /dev/null and b/assets/images/cloud-credits-people.jpg differ diff --git a/assets/images/colab-logo.svg b/assets/images/colab-logo.svg new file 
mode 100644 index 000000000000..5f4cf28f78be --- /dev/null +++ b/assets/images/colab-logo.svg @@ -0,0 +1,37 @@ + + + + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/community-events-recap/fg1.jpg b/assets/images/community-events-recap/fg1.jpg new file mode 100644 index 000000000000..629c39a353c4 Binary files /dev/null and b/assets/images/community-events-recap/fg1.jpg differ diff --git a/assets/images/community-events-recap/fg2.jpeg b/assets/images/community-events-recap/fg2.jpeg new file mode 100644 index 000000000000..abba3e473b5f Binary files /dev/null and b/assets/images/community-events-recap/fg2.jpeg differ diff --git a/assets/images/community-events-recap/fg3.png b/assets/images/community-events-recap/fg3.png new file mode 100644 index 000000000000..7b3c8b8c9120 Binary files /dev/null and b/assets/images/community-events-recap/fg3.png differ diff --git a/assets/images/community-events-recap/fg4.jpg b/assets/images/community-events-recap/fg4.jpg new file mode 100644 index 000000000000..b99760295a51 Binary files /dev/null and b/assets/images/community-events-recap/fg4.jpg differ diff --git a/assets/images/community-events-recap/fg5.jpg b/assets/images/community-events-recap/fg5.jpg new file mode 100644 index 000000000000..ab9a95e1d16c Binary files /dev/null and b/assets/images/community-events-recap/fg5.jpg differ diff --git a/assets/images/community-events-recap/fg6.jpeg b/assets/images/community-events-recap/fg6.jpeg new file mode 100644 index 000000000000..c6d38c3a5eba Binary files /dev/null and b/assets/images/community-events-recap/fg6.jpeg differ diff --git a/assets/images/community-events-recap/fg7.jpeg b/assets/images/community-events-recap/fg7.jpeg new file mode 100644 index 000000000000..2a52e451e662 Binary files /dev/null and b/assets/images/community-events-recap/fg7.jpeg differ diff --git a/assets/images/community-events-recap/fg8.png b/assets/images/community-events-recap/fg8.png new file mode 100644 index 000000000000..edbd4fb2cdc4 Binary files /dev/null and b/assets/images/community-events-recap/fg8.png differ diff --git a/assets/images/community-events-recap/fg9.png b/assets/images/community-events-recap/fg9.png new file mode 100644 index 000000000000..3acee7a2bea5 Binary files /dev/null and b/assets/images/community-events-recap/fg9.png differ diff --git a/assets/images/compact-hub-icon-selected.svg b/assets/images/compact-hub-icon-selected.svg new file mode 100644 index 000000000000..4f2d4ef35192 --- /dev/null +++ b/assets/images/compact-hub-icon-selected.svg @@ -0,0 +1,61 @@ + + + + 071519_Airlift_PyTorchOrg_HubCompactTemplate_v2 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/compact-hub-icon.svg b/assets/images/compact-hub-icon.svg new file mode 100644 index 000000000000..af633ea67ea8 --- /dev/null +++ b/assets/images/compact-hub-icon.svg @@ -0,0 +1,64 @@ + + + + Group 449 + Created with Sketch. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/computation-animation-fig-5.gif b/assets/images/computation-animation-fig-5.gif new file mode 100644 index 000000000000..d6a29c5f6825 Binary files /dev/null and b/assets/images/computation-animation-fig-5.gif differ diff --git a/assets/images/computational_graph_backward_pass.png b/assets/images/computational_graph_backward_pass.png new file mode 100644 index 000000000000..bcd3ba5b4e08 Binary files /dev/null and b/assets/images/computational_graph_backward_pass.png differ diff --git a/assets/images/computational_graph_creation.gif b/assets/images/computational_graph_creation.gif new file mode 100644 index 000000000000..82ad8c3b66d4 Binary files /dev/null and b/assets/images/computational_graph_creation.gif differ diff --git a/assets/images/computational_graph_reverse_auto_differentiation.png b/assets/images/computational_graph_reverse_auto_differentiation.png new file mode 100644 index 000000000000..6d2fe43440df Binary files /dev/null and b/assets/images/computational_graph_reverse_auto_differentiation.png differ diff --git a/assets/images/contributor-awards-2023/pt-awardee.png b/assets/images/contributor-awards-2023/pt-awardee.png new file mode 100644 index 000000000000..de7203539746 Binary files /dev/null and b/assets/images/contributor-awards-2023/pt-awardee.png differ diff --git a/assets/images/contributor-awards-2023/pt-foundation-2023.png b/assets/images/contributor-awards-2023/pt-foundation-2023.png new file mode 100644 index 000000000000..352da546c927 Binary files /dev/null and b/assets/images/contributor-awards-2023/pt-foundation-2023.png differ diff --git a/assets/images/contributor-awards-2023/pt-nominee.png b/assets/images/contributor-awards-2023/pt-nominee.png new file mode 100644 index 000000000000..2a97cb80bcb9 Binary files /dev/null and b/assets/images/contributor-awards-2023/pt-nominee.png differ diff --git a/assets/images/contributor-awards-2024/pt-awardee.png b/assets/images/contributor-awards-2024/pt-awardee.png new file mode 100644 index 000000000000..973cb775215e Binary files /dev/null and b/assets/images/contributor-awards-2024/pt-awardee.png differ diff --git a/assets/images/contributor-awards-2024/pt-nominee.png b/assets/images/contributor-awards-2024/pt-nominee.png new file mode 100644 index 000000000000..c67cff4e08e3 Binary files /dev/null and b/assets/images/contributor-awards-2024/pt-nominee.png differ diff --git a/assets/images/cuda-image-2.png b/assets/images/cuda-image-2.png new file mode 100644 index 000000000000..5cc2bade9a43 Binary files /dev/null and b/assets/images/cuda-image-2.png differ diff --git a/assets/images/cuda-image-3.png b/assets/images/cuda-image-3.png new file mode 100644 index 000000000000..997b48fb4fdd Binary files /dev/null and b/assets/images/cuda-image-3.png differ diff --git a/assets/images/cuda-image-4.png b/assets/images/cuda-image-4.png new file mode 100644 index 000000000000..8c77a5aea0bc Binary files /dev/null and b/assets/images/cuda-image-4.png differ diff --git a/assets/images/cuda-image-5.png b/assets/images/cuda-image-5.png new file mode 100644 index 000000000000..4a854558e0b8 Binary files /dev/null and b/assets/images/cuda-image-5.png differ diff --git a/assets/images/cuda-image-6.png b/assets/images/cuda-image-6.png new file mode 100644 index 000000000000..bd59a5397552 Binary files /dev/null and b/assets/images/cuda-image-6.png differ diff --git a/assets/images/cuda-image-7.png 
b/assets/images/cuda-image-7.png new file mode 100644 index 000000000000..d9a1388a790f Binary files /dev/null and b/assets/images/cuda-image-7.png differ diff --git a/assets/images/cuda-image-8.png b/assets/images/cuda-image-8.png new file mode 100644 index 000000000000..41e09099c39c Binary files /dev/null and b/assets/images/cuda-image-8.png differ diff --git a/assets/images/cudagraphs-pytorch.png b/assets/images/cudagraphs-pytorch.png new file mode 100644 index 000000000000..60715eb6b120 Binary files /dev/null and b/assets/images/cudagraphs-pytorch.png differ diff --git a/assets/images/cursor-icon.svg b/assets/images/cursor-icon.svg new file mode 100644 index 000000000000..8a186d4ef7b3 --- /dev/null +++ b/assets/images/cursor-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/custom-rnn-chunk.png b/assets/images/custom-rnn-chunk.png new file mode 100644 index 000000000000..d1dda2893702 Binary files /dev/null and b/assets/images/custom-rnn-chunk.png differ diff --git a/assets/images/custom-rnn-improve.png b/assets/images/custom-rnn-improve.png new file mode 100644 index 000000000000..22b111ca4b3d Binary files /dev/null and b/assets/images/custom-rnn-improve.png differ diff --git a/assets/images/cutlass-ping-pong-gemm-kernel/fg1.png b/assets/images/cutlass-ping-pong-gemm-kernel/fg1.png new file mode 100644 index 000000000000..02dd4ba74548 Binary files /dev/null and b/assets/images/cutlass-ping-pong-gemm-kernel/fg1.png differ diff --git a/assets/images/cutlass-ping-pong-gemm-kernel/fg2.png b/assets/images/cutlass-ping-pong-gemm-kernel/fg2.png new file mode 100644 index 000000000000..f80756be1ca1 Binary files /dev/null and b/assets/images/cutlass-ping-pong-gemm-kernel/fg2.png differ diff --git a/assets/images/cutlass-ping-pong-gemm-kernel/fg3.png b/assets/images/cutlass-ping-pong-gemm-kernel/fg3.png new file mode 100644 index 000000000000..78496a573434 Binary files /dev/null and b/assets/images/cutlass-ping-pong-gemm-kernel/fg3.png differ diff --git a/assets/images/cutlass-ping-pong-gemm-kernel/fg4.png b/assets/images/cutlass-ping-pong-gemm-kernel/fg4.png new file mode 100644 index 000000000000..fcd1dfc84fc8 Binary files /dev/null and b/assets/images/cutlass-ping-pong-gemm-kernel/fg4.png differ diff --git a/assets/images/cutlass-ping-pong-gemm-kernel/fg5.png b/assets/images/cutlass-ping-pong-gemm-kernel/fg5.png new file mode 100644 index 000000000000..e3c91426a2b9 Binary files /dev/null and b/assets/images/cutlass-ping-pong-gemm-kernel/fg5.png differ diff --git a/assets/images/cutlass-ping-pong-gemm-kernel/fg6.png b/assets/images/cutlass-ping-pong-gemm-kernel/fg6.png new file mode 100644 index 000000000000..5aae93707e79 Binary files /dev/null and b/assets/images/cutlass-ping-pong-gemm-kernel/fg6.png differ diff --git a/assets/images/datathon-2025.png b/assets/images/datathon-2025.png new file mode 100644 index 000000000000..f2539d3a2692 Binary files /dev/null and b/assets/images/datathon-2025.png differ diff --git a/assets/images/dcgan_dtd.jpg b/assets/images/dcgan_dtd.jpg new file mode 100644 index 000000000000..1923a4e7aef9 Binary files /dev/null and b/assets/images/dcgan_dtd.jpg differ diff --git a/assets/images/dcgan_fashionGen.jpg b/assets/images/dcgan_fashionGen.jpg new file mode 100644 index 000000000000..ac852ae0e4dd Binary files /dev/null and b/assets/images/dcgan_fashionGen.jpg differ diff --git a/assets/images/decoding_codellama34b.png b/assets/images/decoding_codellama34b.png new file mode 100644 index 000000000000..ea1e1b50e88b Binary files /dev/null 
and b/assets/images/decoding_codellama34b.png differ diff --git a/assets/images/deep-learning-thank-you-background.jpg b/assets/images/deep-learning-thank-you-background.jpg new file mode 100644 index 000000000000..acc25e1f6d4a Binary files /dev/null and b/assets/images/deep-learning-thank-you-background.jpg differ diff --git a/assets/images/deep-learning-thumbnail.png b/assets/images/deep-learning-thumbnail.png new file mode 100644 index 000000000000..0ce580120bfa Binary files /dev/null and b/assets/images/deep-learning-thumbnail.png differ diff --git a/assets/images/deeplab1.png b/assets/images/deeplab1.png new file mode 100644 index 000000000000..740093882bbd Binary files /dev/null and b/assets/images/deeplab1.png differ diff --git a/assets/images/deeplab2.png b/assets/images/deeplab2.png new file mode 100644 index 000000000000..872b505eb7a7 Binary files /dev/null and b/assets/images/deeplab2.png differ diff --git a/assets/images/densenet1.png b/assets/images/densenet1.png new file mode 100644 index 000000000000..013598aa82c1 Binary files /dev/null and b/assets/images/densenet1.png differ diff --git a/assets/images/densenet2.png b/assets/images/densenet2.png new file mode 100644 index 000000000000..f6cd657a54ad Binary files /dev/null and b/assets/images/densenet2.png differ diff --git a/assets/images/deploying-llms-torchserve-vllm/fg1.png b/assets/images/deploying-llms-torchserve-vllm/fg1.png new file mode 100644 index 000000000000..129e02345290 Binary files /dev/null and b/assets/images/deploying-llms-torchserve-vllm/fg1.png differ diff --git a/assets/images/deploying-llms-torchserve-vllm/fg2.png b/assets/images/deploying-llms-torchserve-vllm/fg2.png new file mode 100644 index 000000000000..9dfbeee8a294 Binary files /dev/null and b/assets/images/deploying-llms-torchserve-vllm/fg2.png differ diff --git a/assets/images/deploying-llms-torchserve-vllm/fg3.png b/assets/images/deploying-llms-torchserve-vllm/fg3.png new file mode 100644 index 000000000000..44b0d45e240e Binary files /dev/null and b/assets/images/deploying-llms-torchserve-vllm/fg3.png differ diff --git a/assets/images/depyf-screenshot.png b/assets/images/depyf-screenshot.png new file mode 100644 index 000000000000..23ecde6f04da Binary files /dev/null and b/assets/images/depyf-screenshot.png differ diff --git a/assets/images/depyf.png b/assets/images/depyf.png new file mode 100644 index 000000000000..d9104cf8f829 Binary files /dev/null and b/assets/images/depyf.png differ diff --git a/assets/images/detection.png b/assets/images/detection.png new file mode 100644 index 000000000000..cc6934c29ff8 Binary files /dev/null and b/assets/images/detection.png differ diff --git a/assets/images/disney_media_logo(old).jpg b/assets/images/disney_media_logo(old).jpg new file mode 100644 index 000000000000..303b38c147ee Binary files /dev/null and b/assets/images/disney_media_logo(old).jpg differ diff --git a/assets/images/disney_media_logo.jpg b/assets/images/disney_media_logo.jpg new file mode 100644 index 000000000000..b472142d1429 Binary files /dev/null and b/assets/images/disney_media_logo.jpg differ diff --git a/assets/images/doc-logos.jpg b/assets/images/doc-logos.jpg new file mode 100644 index 000000000000..2ccaeacec265 Binary files /dev/null and b/assets/images/doc-logos.jpg differ diff --git a/assets/images/docathon-2024.png b/assets/images/docathon-2024.png new file mode 100644 index 000000000000..beb1592250a2 Binary files /dev/null and b/assets/images/docathon-2024.png differ diff --git a/assets/images/docathon-2025.png 
b/assets/images/docathon-2025.png new file mode 100644 index 000000000000..aad9c70d1f36 Binary files /dev/null and b/assets/images/docathon-2025.png differ diff --git a/assets/images/docathon-cover.jpg b/assets/images/docathon-cover.jpg new file mode 100644 index 000000000000..c9311fbb5439 Binary files /dev/null and b/assets/images/docathon-cover.jpg differ diff --git a/assets/images/doctr-joins-pytorch-ecosystem/fg1.png b/assets/images/doctr-joins-pytorch-ecosystem/fg1.png new file mode 100644 index 000000000000..615c0dfc30d4 Binary files /dev/null and b/assets/images/doctr-joins-pytorch-ecosystem/fg1.png differ diff --git a/assets/images/doctr-joins-pytorch-ecosystem/fg2.jpg b/assets/images/doctr-joins-pytorch-ecosystem/fg2.jpg new file mode 100644 index 000000000000..d552ac819349 Binary files /dev/null and b/assets/images/doctr-joins-pytorch-ecosystem/fg2.jpg differ diff --git a/assets/images/doctr-joins-pytorch-ecosystem/fg3.jpg b/assets/images/doctr-joins-pytorch-ecosystem/fg3.jpg new file mode 100644 index 000000000000..63d589f9292d Binary files /dev/null and b/assets/images/doctr-joins-pytorch-ecosystem/fg3.jpg differ diff --git a/assets/images/doctr-joins-pytorch-ecosystem/fg4.png b/assets/images/doctr-joins-pytorch-ecosystem/fg4.png new file mode 100644 index 000000000000..5bc36c855800 Binary files /dev/null and b/assets/images/doctr-joins-pytorch-ecosystem/fg4.png differ diff --git a/assets/images/doctr-joins-pytorch-ecosystem/fg5.png b/assets/images/doctr-joins-pytorch-ecosystem/fg5.png new file mode 100644 index 000000000000..07fd52c835be Binary files /dev/null and b/assets/images/doctr-joins-pytorch-ecosystem/fg5.png differ diff --git a/assets/images/doctr-joins-pytorch-ecosystem/fg6.png b/assets/images/doctr-joins-pytorch-ecosystem/fg6.png new file mode 100644 index 000000000000..d8286b8d835d Binary files /dev/null and b/assets/images/doctr-joins-pytorch-ecosystem/fg6.png differ diff --git a/assets/images/dog.jpg b/assets/images/dog.jpg new file mode 100644 index 000000000000..12f0e0dd1162 Binary files /dev/null and b/assets/images/dog.jpg differ diff --git a/assets/images/dyno_hta.png b/assets/images/dyno_hta.png new file mode 100644 index 000000000000..e43990622e05 Binary files /dev/null and b/assets/images/dyno_hta.png differ diff --git a/assets/images/easily-list-and-initialize-models-with-new-apis-in-torchvision-1.png b/assets/images/easily-list-and-initialize-models-with-new-apis-in-torchvision-1.png new file mode 100644 index 000000000000..56ea6a476d15 Binary files /dev/null and b/assets/images/easily-list-and-initialize-models-with-new-apis-in-torchvision-1.png differ diff --git a/assets/images/easily-list-and-initialize-models-with-new-apis-in-torchvision.gif b/assets/images/easily-list-and-initialize-models-with-new-apis-in-torchvision.gif new file mode 100644 index 000000000000..7459efac937a Binary files /dev/null and b/assets/images/easily-list-and-initialize-models-with-new-apis-in-torchvision.gif differ diff --git a/assets/images/ecosystem-background.jpg b/assets/images/ecosystem-background.jpg new file mode 100644 index 000000000000..94a18a291639 Binary files /dev/null and b/assets/images/ecosystem-background.jpg differ diff --git a/assets/images/ecosystem-day-thank-you.png b/assets/images/ecosystem-day-thank-you.png new file mode 100644 index 000000000000..56126a7d6b41 Binary files /dev/null and b/assets/images/ecosystem-day-thank-you.png differ diff --git a/assets/images/ecosystem-detail-background.jpg b/assets/images/ecosystem-detail-background.jpg new file 
mode 100644 index 000000000000..6f8a4d2de6df Binary files /dev/null and b/assets/images/ecosystem-detail-background.jpg differ diff --git a/assets/images/ecosystem_day.png b/assets/images/ecosystem_day.png new file mode 100644 index 000000000000..1132d5b03955 Binary files /dev/null and b/assets/images/ecosystem_day.png differ diff --git a/assets/images/ecosystem_day2021.jpg b/assets/images/ecosystem_day2021.jpg new file mode 100644 index 000000000000..3a93c8fe21a0 Binary files /dev/null and b/assets/images/ecosystem_day2021.jpg differ diff --git a/assets/images/ecosystem_day_2021.png b/assets/images/ecosystem_day_2021.png new file mode 100644 index 000000000000..2dac24a21b9a Binary files /dev/null and b/assets/images/ecosystem_day_2021.png differ diff --git a/assets/images/efficient_large_scale_training_2.png b/assets/images/efficient_large_scale_training_2.png new file mode 100644 index 000000000000..2219d89813be Binary files /dev/null and b/assets/images/efficient_large_scale_training_2.png differ diff --git a/assets/images/empowering-models-performance/fig1.jpg b/assets/images/empowering-models-performance/fig1.jpg new file mode 100644 index 000000000000..eb49eb15c05c Binary files /dev/null and b/assets/images/empowering-models-performance/fig1.jpg differ diff --git a/assets/images/empowering-models-performance/fig2.jpg b/assets/images/empowering-models-performance/fig2.jpg new file mode 100644 index 000000000000..b0476b2972dc Binary files /dev/null and b/assets/images/empowering-models-performance/fig2.jpg differ diff --git a/assets/images/empowering-models-performance/fig3.jpg b/assets/images/empowering-models-performance/fig3.jpg new file mode 100644 index 000000000000..c5fe3ea59ba8 Binary files /dev/null and b/assets/images/empowering-models-performance/fig3.jpg differ diff --git a/assets/images/empowering-models-performance/fig4.jpg b/assets/images/empowering-models-performance/fig4.jpg new file mode 100644 index 000000000000..075c7f4679f0 Binary files /dev/null and b/assets/images/empowering-models-performance/fig4.jpg differ diff --git a/assets/images/empowering-models-performance/fig5.jpg b/assets/images/empowering-models-performance/fig5.jpg new file mode 100644 index 000000000000..063eb584bc1b Binary files /dev/null and b/assets/images/empowering-models-performance/fig5.jpg differ diff --git a/assets/images/empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16.png b/assets/images/empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16.png new file mode 100644 index 000000000000..688d117520f3 Binary files /dev/null and b/assets/images/empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16.png differ diff --git a/assets/images/executiontimegraph.jpg b/assets/images/executiontimegraph.jpg new file mode 100644 index 000000000000..21e96f178f40 Binary files /dev/null and b/assets/images/executiontimegraph.jpg differ diff --git a/assets/images/executorch-arrows.svg b/assets/images/executorch-arrows.svg new file mode 100644 index 000000000000..2febe67c7db0 --- /dev/null +++ b/assets/images/executorch-arrows.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/executorch-chip-logo.svg b/assets/images/executorch-chip-logo.svg new file mode 100644 index 000000000000..11e5ed60956b --- /dev/null +++ b/assets/images/executorch-chip-logo.svg @@ -0,0 +1,205 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/experiment_autocache.png 
b/assets/images/experiment_autocache.png new file mode 100644 index 000000000000..4a88e99ae74a Binary files /dev/null and b/assets/images/experiment_autocache.png differ diff --git a/assets/images/experiments_optimal_k.png b/assets/images/experiments_optimal_k.png new file mode 100644 index 000000000000..34c3a0f2e628 Binary files /dev/null and b/assets/images/experiments_optimal_k.png differ diff --git a/assets/images/experiments_table1.png b/assets/images/experiments_table1.png new file mode 100644 index 000000000000..ce2c03b3a890 Binary files /dev/null and b/assets/images/experiments_table1.png differ diff --git a/assets/images/experiments_throughput.png b/assets/images/experiments_throughput.png new file mode 100644 index 000000000000..c5df49e9060e Binary files /dev/null and b/assets/images/experiments_throughput.png differ diff --git a/assets/images/experiments_tuning_alpha.png b/assets/images/experiments_tuning_alpha.png new file mode 100644 index 000000000000..ea33d6bd9034 Binary files /dev/null and b/assets/images/experiments_tuning_alpha.png differ diff --git a/assets/images/extended_computational_graph.png b/assets/images/extended_computational_graph.png new file mode 100644 index 000000000000..4b20da3169c2 Binary files /dev/null and b/assets/images/extended_computational_graph.png differ diff --git a/assets/images/external-link-icon.svg b/assets/images/external-link-icon.svg new file mode 100644 index 000000000000..d0f7a53b5526 --- /dev/null +++ b/assets/images/external-link-icon.svg @@ -0,0 +1,27 @@ + + + + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/f1-4th-gen-xeon-aws-instances.png b/assets/images/f1-4th-gen-xeon-aws-instances.png new file mode 100644 index 000000000000..74f79c5961fe Binary files /dev/null and b/assets/images/f1-4th-gen-xeon-aws-instances.png differ diff --git a/assets/images/f1-inference-pipeline-language-identification.png b/assets/images/f1-inference-pipeline-language-identification.png new file mode 100644 index 000000000000..3aae731ebf83 Binary files /dev/null and b/assets/images/f1-inference-pipeline-language-identification.png differ diff --git a/assets/images/f1-onednn-graph-api-code-snippet.png b/assets/images/f1-onednn-graph-api-code-snippet.png new file mode 100644 index 000000000000..53ff56523fa6 Binary files /dev/null and b/assets/images/f1-onednn-graph-api-code-snippet.png differ diff --git a/assets/images/f1-self-attention-vs-masked.png b/assets/images/f1-self-attention-vs-masked.png new file mode 100644 index 000000000000..7cb9bf72c301 Binary files /dev/null and b/assets/images/f1-self-attention-vs-masked.png differ diff --git a/assets/images/f2-4th-gen-xeon-googlecloud-instances.png b/assets/images/f2-4th-gen-xeon-googlecloud-instances.png new file mode 100644 index 000000000000..6f085ff70eec Binary files /dev/null and b/assets/images/f2-4th-gen-xeon-googlecloud-instances.png differ diff --git a/assets/images/f2-dialogpt-article.png b/assets/images/f2-dialogpt-article.png new file mode 100644 index 000000000000..7b8bdef51627 Binary files /dev/null and b/assets/images/f2-dialogpt-article.png differ diff --git a/assets/images/f2-inference-speedup-with-onednn-graph.png b/assets/images/f2-inference-speedup-with-onednn-graph.png new file mode 100644 index 000000000000..035fa5065b34 Binary files /dev/null and b/assets/images/f2-inference-speedup-with-onednn-graph.png differ diff --git a/assets/images/f2-timestamps-delivered-from-crdnn-model.png 
b/assets/images/f2-timestamps-delivered-from-crdnn-model.png new file mode 100644 index 000000000000..7c140129b2ec Binary files /dev/null and b/assets/images/f2-timestamps-delivered-from-crdnn-model.png differ diff --git a/assets/images/f3-dialogpt-interaction.png b/assets/images/f3-dialogpt-interaction.png new file mode 100644 index 000000000000..f4bf437085dc Binary files /dev/null and b/assets/images/f3-dialogpt-interaction.png differ diff --git a/assets/images/f3-moscow-satellite-image-dataset.png b/assets/images/f3-moscow-satellite-image-dataset.png new file mode 100644 index 000000000000..a1cd4b89f18e Binary files /dev/null and b/assets/images/f3-moscow-satellite-image-dataset.png differ diff --git a/assets/images/f4-moscow-satellite-image-complete.png b/assets/images/f4-moscow-satellite-image-complete.png new file mode 100644 index 000000000000..bde4058d7748 Binary files /dev/null and b/assets/images/f4-moscow-satellite-image-complete.png differ diff --git a/assets/images/f_x_y_graph.png b/assets/images/f_x_y_graph.png new file mode 100644 index 000000000000..246deb244fc6 Binary files /dev/null and b/assets/images/f_x_y_graph.png differ diff --git a/assets/images/fairseq_logo.png b/assets/images/fairseq_logo.png new file mode 100644 index 000000000000..75472cbb5ff7 Binary files /dev/null and b/assets/images/fairseq_logo.png differ diff --git a/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-1.jpeg b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-1.jpeg new file mode 100644 index 000000000000..3e4c8007b936 Binary files /dev/null and b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-1.jpeg differ diff --git a/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-2.jpeg b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-2.jpeg new file mode 100644 index 000000000000..d71be41bfed9 Binary files /dev/null and b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-2.jpeg differ diff --git a/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-3.jpeg b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-3.jpeg new file mode 100644 index 000000000000..55e1f3da6b99 Binary files /dev/null and b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-3.jpeg differ diff --git a/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-4.jpeg b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-4.jpeg new file mode 100644 index 000000000000..c9e9d8842f15 Binary files /dev/null and b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-4.jpeg differ diff --git a/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-5.jpeg b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-5.jpeg new file mode 100644 index 000000000000..da35f5dabc9d Binary files /dev/null and b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-5.jpeg differ diff --git a/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-6.png b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-6.png new file mode 100644 index 000000000000..d1766e2ea485 
Binary files /dev/null and b/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-6.png differ diff --git a/assets/images/fastpitch_model.png b/assets/images/fastpitch_model.png new file mode 100644 index 000000000000..f828877edfdd Binary files /dev/null and b/assets/images/fastpitch_model.png differ diff --git a/assets/images/fcn2.png b/assets/images/fcn2.png new file mode 100644 index 000000000000..f4f1de9ac55b Binary files /dev/null and b/assets/images/fcn2.png differ diff --git a/assets/images/feature extractor.png b/assets/images/feature extractor.png new file mode 100644 index 000000000000..09a4aac923f0 Binary files /dev/null and b/assets/images/feature extractor.png differ diff --git a/assets/images/featured-img-pytorch-2.png b/assets/images/featured-img-pytorch-2.png new file mode 100644 index 000000000000..e8f356789bf7 Binary files /dev/null and b/assets/images/featured-img-pytorch-2.png differ diff --git a/assets/images/features-background.jpg b/assets/images/features-background.jpg new file mode 100644 index 000000000000..645edc9f1ee9 Binary files /dev/null and b/assets/images/features-background.jpg differ diff --git a/assets/images/features-background2.jpg b/assets/images/features-background2.jpg new file mode 100644 index 000000000000..767ed41942fa Binary files /dev/null and b/assets/images/features-background2.jpg differ diff --git a/assets/images/feedback-flag.svg b/assets/images/feedback-flag.svg new file mode 100644 index 000000000000..ad1a9ffdbfbc --- /dev/null +++ b/assets/images/feedback-flag.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/assets/images/filter-arrow.svg b/assets/images/filter-arrow.svg new file mode 100644 index 000000000000..5c4ac42d2ce0 --- /dev/null +++ b/assets/images/filter-arrow.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/assets/images/finetune-llms/fg1.png b/assets/images/finetune-llms/fg1.png new file mode 100644 index 000000000000..ef457a5e2f69 Binary files /dev/null and b/assets/images/finetune-llms/fg1.png differ diff --git a/assets/images/finetune-llms/fg2.gif b/assets/images/finetune-llms/fg2.gif new file mode 100644 index 000000000000..902ce8e57dab Binary files /dev/null and b/assets/images/finetune-llms/fg2.gif differ diff --git a/assets/images/finetune-llms/fg3.png b/assets/images/finetune-llms/fg3.png new file mode 100644 index 000000000000..84b0de28550e Binary files /dev/null and b/assets/images/finetune-llms/fg3.png differ diff --git a/assets/images/finetune-llms/fg4.png b/assets/images/finetune-llms/fg4.png new file mode 100644 index 000000000000..c97dd4c60b33 Binary files /dev/null and b/assets/images/finetune-llms/fg4.png differ diff --git a/assets/images/finetune-llms/fg5.png b/assets/images/finetune-llms/fg5.png new file mode 100644 index 000000000000..4d011412de78 Binary files /dev/null and b/assets/images/finetune-llms/fg5.png differ diff --git a/assets/images/finetune-llms/fg6.png b/assets/images/finetune-llms/fg6.png new file mode 100644 index 000000000000..a63934fb250d Binary files /dev/null and b/assets/images/finetune-llms/fg6.png differ diff --git a/assets/images/finetune-llms/fg7.png b/assets/images/finetune-llms/fg7.png new file mode 100644 index 000000000000..11e9ebf604e7 Binary files /dev/null and b/assets/images/finetune-llms/fg7.png differ diff --git a/assets/images/finetune-llms/fg8.png b/assets/images/finetune-llms/fg8.png new file mode 100644 index 000000000000..4c8f42f11f84 Binary files /dev/null and b/assets/images/finetune-llms/fg8.png differ diff 
--git a/assets/images/flash_attention_tflops.png b/assets/images/flash_attention_tflops.png new file mode 100644 index 000000000000..92e44c76844f Binary files /dev/null and b/assets/images/flash_attention_tflops.png differ diff --git a/assets/images/flashattention-3/fg1.png b/assets/images/flashattention-3/fg1.png new file mode 100644 index 000000000000..3e73398cced9 Binary files /dev/null and b/assets/images/flashattention-3/fg1.png differ diff --git a/assets/images/flashattention-3/fg2.png b/assets/images/flashattention-3/fg2.png new file mode 100644 index 000000000000..6b3b1da13ecc Binary files /dev/null and b/assets/images/flashattention-3/fg2.png differ diff --git a/assets/images/flashattention-3/fg3.png b/assets/images/flashattention-3/fg3.png new file mode 100644 index 000000000000..5d95157d333f Binary files /dev/null and b/assets/images/flashattention-3/fg3.png differ diff --git a/assets/images/flashattention-3/fg4.png b/assets/images/flashattention-3/fg4.png new file mode 100644 index 000000000000..bbaba22ed289 Binary files /dev/null and b/assets/images/flashattention-3/fg4.png differ diff --git a/assets/images/flashattention-3/fg5.png b/assets/images/flashattention-3/fg5.png new file mode 100644 index 000000000000..a5378413d717 Binary files /dev/null and b/assets/images/flashattention-3/fg5.png differ diff --git a/assets/images/flashattention-3/fg6.png b/assets/images/flashattention-3/fg6.png new file mode 100644 index 000000000000..65a105bf32cd Binary files /dev/null and b/assets/images/flashattention-3/fg6.png differ diff --git a/assets/images/flashattention-3/fg6a.png b/assets/images/flashattention-3/fg6a.png new file mode 100644 index 000000000000..de5659c83b8b Binary files /dev/null and b/assets/images/flashattention-3/fg6a.png differ diff --git a/assets/images/flashattention-3/fg7.png b/assets/images/flashattention-3/fg7.png new file mode 100644 index 000000000000..9ea9e6b929b9 Binary files /dev/null and b/assets/images/flashattention-3/fg7.png differ diff --git a/assets/images/flashattention-3/fg8.png b/assets/images/flashattention-3/fg8.png new file mode 100644 index 000000000000..d757bd0f7bba Binary files /dev/null and b/assets/images/flashattention-3/fg8.png differ diff --git a/assets/images/flashattention-3/fg9.png b/assets/images/flashattention-3/fg9.png new file mode 100644 index 000000000000..f7ef51912cbb Binary files /dev/null and b/assets/images/flashattention-3/fg9.png differ diff --git a/assets/images/flexattention-for-inference/fg1.png b/assets/images/flexattention-for-inference/fg1.png new file mode 100644 index 000000000000..c42a3bf5717f Binary files /dev/null and b/assets/images/flexattention-for-inference/fg1.png differ diff --git a/assets/images/flexattention-for-inference/fg10.png b/assets/images/flexattention-for-inference/fg10.png new file mode 100644 index 000000000000..70d9e441b97c Binary files /dev/null and b/assets/images/flexattention-for-inference/fg10.png differ diff --git a/assets/images/flexattention-for-inference/fg11.png b/assets/images/flexattention-for-inference/fg11.png new file mode 100644 index 000000000000..94697c426b7e Binary files /dev/null and b/assets/images/flexattention-for-inference/fg11.png differ diff --git a/assets/images/flexattention-for-inference/fg2.png b/assets/images/flexattention-for-inference/fg2.png new file mode 100644 index 000000000000..47ae6ab99d26 Binary files /dev/null and b/assets/images/flexattention-for-inference/fg2.png differ diff --git a/assets/images/flexattention-for-inference/fg3.png 
b/assets/images/flexattention-for-inference/fg3.png new file mode 100644 index 000000000000..06bc61656d47 Binary files /dev/null and b/assets/images/flexattention-for-inference/fg3.png differ diff --git a/assets/images/flexattention-for-inference/fg4.png b/assets/images/flexattention-for-inference/fg4.png new file mode 100644 index 000000000000..b78a15172977 Binary files /dev/null and b/assets/images/flexattention-for-inference/fg4.png differ diff --git a/assets/images/flexattention-for-inference/fg5.png b/assets/images/flexattention-for-inference/fg5.png new file mode 100644 index 000000000000..dbb7081efe98 Binary files /dev/null and b/assets/images/flexattention-for-inference/fg5.png differ diff --git a/assets/images/flexattention-for-inference/fg6.png b/assets/images/flexattention-for-inference/fg6.png new file mode 100644 index 000000000000..d2221e66d982 Binary files /dev/null and b/assets/images/flexattention-for-inference/fg6.png differ diff --git a/assets/images/flexattention-for-inference/fg7.png b/assets/images/flexattention-for-inference/fg7.png new file mode 100644 index 000000000000..6ec36ad490c5 Binary files /dev/null and b/assets/images/flexattention-for-inference/fg7.png differ diff --git a/assets/images/flexattention-for-inference/fg8.png b/assets/images/flexattention-for-inference/fg8.png new file mode 100644 index 000000000000..a6c6a5227db8 Binary files /dev/null and b/assets/images/flexattention-for-inference/fg8.png differ diff --git a/assets/images/flexattention-for-inference/fg9.png b/assets/images/flexattention-for-inference/fg9.png new file mode 100644 index 000000000000..8187641ba4b5 Binary files /dev/null and b/assets/images/flexattention-for-inference/fg9.png differ diff --git a/assets/images/flexattention/fg1.jpg b/assets/images/flexattention/fg1.jpg new file mode 100644 index 000000000000..bab72ba56793 Binary files /dev/null and b/assets/images/flexattention/fg1.jpg differ diff --git a/assets/images/flexattention/fg10.png b/assets/images/flexattention/fg10.png new file mode 100644 index 000000000000..452cf78b07ed Binary files /dev/null and b/assets/images/flexattention/fg10.png differ diff --git a/assets/images/flexattention/fg11.png b/assets/images/flexattention/fg11.png new file mode 100644 index 000000000000..3ab03b4a9c62 Binary files /dev/null and b/assets/images/flexattention/fg11.png differ diff --git a/assets/images/flexattention/fg12.png b/assets/images/flexattention/fg12.png new file mode 100644 index 000000000000..6d5fbdf23fb5 Binary files /dev/null and b/assets/images/flexattention/fg12.png differ diff --git a/assets/images/flexattention/fg13.png b/assets/images/flexattention/fg13.png new file mode 100644 index 000000000000..a04be444d6e7 Binary files /dev/null and b/assets/images/flexattention/fg13.png differ diff --git a/assets/images/flexattention/fg14.png b/assets/images/flexattention/fg14.png new file mode 100644 index 000000000000..be26d7e1aa28 Binary files /dev/null and b/assets/images/flexattention/fg14.png differ diff --git a/assets/images/flexattention/fg15.png b/assets/images/flexattention/fg15.png new file mode 100644 index 000000000000..2ece70f8b062 Binary files /dev/null and b/assets/images/flexattention/fg15.png differ diff --git a/assets/images/flexattention/fg16.png b/assets/images/flexattention/fg16.png new file mode 100644 index 000000000000..5d52ccd43446 Binary files /dev/null and b/assets/images/flexattention/fg16.png differ diff --git a/assets/images/flexattention/fg2.jpg b/assets/images/flexattention/fg2.jpg new file mode 100644 
index 000000000000..354347ec4c28 Binary files /dev/null and b/assets/images/flexattention/fg2.jpg differ diff --git a/assets/images/flexattention/fg3.png b/assets/images/flexattention/fg3.png new file mode 100644 index 000000000000..95924c69e598 Binary files /dev/null and b/assets/images/flexattention/fg3.png differ diff --git a/assets/images/flexattention/fg4.png b/assets/images/flexattention/fg4.png new file mode 100644 index 000000000000..0e5c51c6347e Binary files /dev/null and b/assets/images/flexattention/fg4.png differ diff --git a/assets/images/flexattention/fg5.png b/assets/images/flexattention/fg5.png new file mode 100644 index 000000000000..9056c68e6bb4 Binary files /dev/null and b/assets/images/flexattention/fg5.png differ diff --git a/assets/images/flexattention/fg6.png b/assets/images/flexattention/fg6.png new file mode 100644 index 000000000000..1966540f2761 Binary files /dev/null and b/assets/images/flexattention/fg6.png differ diff --git a/assets/images/flexattention/fg7.png b/assets/images/flexattention/fg7.png new file mode 100644 index 000000000000..a47754c5381d Binary files /dev/null and b/assets/images/flexattention/fg7.png differ diff --git a/assets/images/flexattention/fg8.png b/assets/images/flexattention/fg8.png new file mode 100644 index 000000000000..382c383a3b22 Binary files /dev/null and b/assets/images/flexattention/fg8.png differ diff --git a/assets/images/flexattention/fg9.png b/assets/images/flexattention/fg9.png new file mode 100644 index 000000000000..3604c5beea91 Binary files /dev/null and b/assets/images/flexattention/fg9.png differ diff --git a/assets/images/forward-backward-function-fig-4.png b/assets/images/forward-backward-function-fig-4.png new file mode 100644 index 000000000000..6ba05d481d3d Binary files /dev/null and b/assets/images/forward-backward-function-fig-4.png differ diff --git a/assets/images/freeze_training.png b/assets/images/freeze_training.png new file mode 100644 index 000000000000..0e170a3be3a0 Binary files /dev/null and b/assets/images/freeze_training.png differ diff --git a/assets/images/fsdp_workflow.png b/assets/images/fsdp_workflow.png new file mode 100644 index 000000000000..1a8df0e44b62 Binary files /dev/null and b/assets/images/fsdp_workflow.png differ diff --git a/assets/images/full-architecture-image-1.png b/assets/images/full-architecture-image-1.png new file mode 100644 index 000000000000..a213e04457f3 Binary files /dev/null and b/assets/images/full-architecture-image-1.png differ diff --git a/assets/images/full-hub-icon-selected.svg b/assets/images/full-hub-icon-selected.svg new file mode 100644 index 000000000000..d47d9f43968c --- /dev/null +++ b/assets/images/full-hub-icon-selected.svg @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/assets/images/full-hub-icon.svg b/assets/images/full-hub-icon.svg new file mode 100644 index 000000000000..249fcf2763bc --- /dev/null +++ b/assets/images/full-hub-icon.svg @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/assets/images/fx-image1.png b/assets/images/fx-image1.png new file mode 100644 index 000000000000..070e6c4fdc4d Binary files /dev/null and b/assets/images/fx-image1.png differ diff --git a/assets/images/fx-image2.png b/assets/images/fx-image2.png new file mode 100644 index 000000000000..cf16212b2f2e Binary files /dev/null and b/assets/images/fx-image2.png differ diff --git a/assets/images/fx-image3.png b/assets/images/fx-image3.png new file mode 100644 index 000000000000..9cc60bf11e6d Binary files /dev/null and 
b/assets/images/fx-image3.png differ diff --git a/assets/images/fx-image4.png b/assets/images/fx-image4.png new file mode 100644 index 000000000000..1f323ae72503 Binary files /dev/null and b/assets/images/fx-image4.png differ diff --git a/assets/images/fx-image5.png b/assets/images/fx-image5.png new file mode 100644 index 000000000000..6ff73a1c204e Binary files /dev/null and b/assets/images/fx-image5.png differ diff --git a/assets/images/fx-image6.png b/assets/images/fx-image6.png new file mode 100644 index 000000000000..92f39cded081 Binary files /dev/null and b/assets/images/fx-image6.png differ diff --git a/assets/images/genai-acceleration-intel-xeon/fg1.png b/assets/images/genai-acceleration-intel-xeon/fg1.png new file mode 100644 index 000000000000..3051ab529378 Binary files /dev/null and b/assets/images/genai-acceleration-intel-xeon/fg1.png differ diff --git a/assets/images/genai-acceleration-intel-xeon/fg2.png b/assets/images/genai-acceleration-intel-xeon/fg2.png new file mode 100644 index 000000000000..b609113058a6 Binary files /dev/null and b/assets/images/genai-acceleration-intel-xeon/fg2.png differ diff --git a/assets/images/genai-acceleration-intel-xeon/fg3.png b/assets/images/genai-acceleration-intel-xeon/fg3.png new file mode 100644 index 000000000000..0c56d92ed6c8 Binary files /dev/null and b/assets/images/genai-acceleration-intel-xeon/fg3.png differ diff --git a/assets/images/genai-acceleration-intel-xeon/fg4.png b/assets/images/genai-acceleration-intel-xeon/fg4.png new file mode 100644 index 000000000000..f2ab404335f7 Binary files /dev/null and b/assets/images/genai-acceleration-intel-xeon/fg4.png differ diff --git a/assets/images/genai-acceleration-intel-xeon/fg5.png b/assets/images/genai-acceleration-intel-xeon/fg5.png new file mode 100644 index 000000000000..384813c633c4 Binary files /dev/null and b/assets/images/genai-acceleration-intel-xeon/fg5.png differ diff --git a/assets/images/genai-acceleration-intel-xeon/fg6.png b/assets/images/genai-acceleration-intel-xeon/fg6.png new file mode 100644 index 000000000000..c6a25f8de23c Binary files /dev/null and b/assets/images/genai-acceleration-intel-xeon/fg6.png differ diff --git a/assets/images/geomloss.jpg b/assets/images/geomloss.jpg new file mode 100644 index 000000000000..1b0506b3604e Binary files /dev/null and b/assets/images/geomloss.jpg differ diff --git a/assets/images/get-started-background.jpg b/assets/images/get-started-background.jpg new file mode 100644 index 000000000000..dda2e6eeb077 Binary files /dev/null and b/assets/images/get-started-background.jpg differ diff --git a/assets/images/ghostnet.png b/assets/images/ghostnet.png new file mode 100644 index 000000000000..b91337e2aea3 Binary files /dev/null and b/assets/images/ghostnet.png differ diff --git a/assets/images/github-star.svg b/assets/images/github-star.svg new file mode 100644 index 000000000000..6a262594f037 --- /dev/null +++ b/assets/images/github-star.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/assets/images/github_logo_32.png b/assets/images/github_logo_32.png new file mode 100644 index 000000000000..ddbe39cce359 Binary files /dev/null and b/assets/images/github_logo_32.png differ diff --git a/assets/images/google-cloud-logo.svg b/assets/images/google-cloud-logo.svg new file mode 100644 index 000000000000..760eac778bb7 --- /dev/null +++ b/assets/images/google-cloud-logo.svg @@ -0,0 +1,40 @@ + + + Created with Sketch. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/google_MLPMixer_architecture.png b/assets/images/google_MLPMixer_architecture.png new file mode 100644 index 000000000000..2afb485faf1c Binary files /dev/null and b/assets/images/google_MLPMixer_architecture.png differ diff --git a/assets/images/googlenet1.png b/assets/images/googlenet1.png new file mode 100644 index 000000000000..fa50a220c258 Binary files /dev/null and b/assets/images/googlenet1.png differ diff --git a/assets/images/googlenet2.png b/assets/images/googlenet2.png new file mode 100644 index 000000000000..ae52c1f34d74 Binary files /dev/null and b/assets/images/googlenet2.png differ diff --git a/assets/images/governance.png b/assets/images/governance.png new file mode 100644 index 000000000000..a23d38c21eec Binary files /dev/null and b/assets/images/governance.png differ diff --git a/assets/images/governing-board/Aparna-Ramani.jpg b/assets/images/governing-board/Aparna-Ramani.jpg new file mode 100644 index 000000000000..33d0098bd6ac Binary files /dev/null and b/assets/images/governing-board/Aparna-Ramani.jpg differ diff --git a/assets/images/governing-board/Brian-Granger.jpg b/assets/images/governing-board/Brian-Granger.jpg new file mode 100644 index 000000000000..f42ac299d5d1 Binary files /dev/null and b/assets/images/governing-board/Brian-Granger.jpg differ diff --git a/assets/images/governing-board/Duncan-Poole.jpg b/assets/images/governing-board/Duncan-Poole.jpg new file mode 100644 index 000000000000..c426efe1ca85 Binary files /dev/null and b/assets/images/governing-board/Duncan-Poole.jpg differ diff --git a/assets/images/governing-board/Fred-Li.jpg b/assets/images/governing-board/Fred-Li.jpg new file mode 100644 index 000000000000..7300600445fd Binary files /dev/null and b/assets/images/governing-board/Fred-Li.jpg differ diff --git a/assets/images/governing-board/Luca-Antiga.jpg b/assets/images/governing-board/Luca-Antiga.jpg new file mode 100644 index 000000000000..f7e989177633 Binary files /dev/null and b/assets/images/governing-board/Luca-Antiga.jpg differ diff --git a/assets/images/governing-board/Lysandre-Debut.jpg b/assets/images/governing-board/Lysandre-Debut.jpg new file mode 100644 index 000000000000..40f34aea0b5b Binary files /dev/null and b/assets/images/governing-board/Lysandre-Debut.jpg differ diff --git a/assets/images/governing-board/Niles-Burbank.jpg b/assets/images/governing-board/Niles-Burbank.jpg new file mode 100644 index 000000000000..116b5e309450 Binary files /dev/null and b/assets/images/governing-board/Niles-Burbank.jpg differ diff --git a/assets/images/governing-board/Raghu-Ganti.jpg b/assets/images/governing-board/Raghu-Ganti.jpg new file mode 100644 index 000000000000..93995deff265 Binary files /dev/null and b/assets/images/governing-board/Raghu-Ganti.jpg differ diff --git a/assets/images/governing-board/Steve-Wan.jpg b/assets/images/governing-board/Steve-Wan.jpg new file mode 100644 index 000000000000..c720ceb8e446 Binary files /dev/null and b/assets/images/governing-board/Steve-Wan.jpg differ diff --git a/assets/images/governing-board/Wei-Li.jpg b/assets/images/governing-board/Wei-Li.jpg new file mode 100644 index 000000000000..f477073e9186 Binary files /dev/null and b/assets/images/governing-board/Wei-Li.jpg differ diff --git a/assets/images/governing-board/alex-spinelli.jpeg b/assets/images/governing-board/alex-spinelli.jpeg new file mode 100644 index 000000000000..b810247d1558 Binary files /dev/null and 
b/assets/images/governing-board/alex-spinelli.jpeg differ diff --git a/assets/images/governing-board/andrew-wafaa.jpg b/assets/images/governing-board/andrew-wafaa.jpg new file mode 100644 index 000000000000..17557e768d94 Binary files /dev/null and b/assets/images/governing-board/andrew-wafaa.jpg differ diff --git a/assets/images/governing-board/ankit-patel.jpg b/assets/images/governing-board/ankit-patel.jpg new file mode 100644 index 000000000000..91a44213cdf7 Binary files /dev/null and b/assets/images/governing-board/ankit-patel.jpg differ diff --git a/assets/images/governing-board/damien-sereni.jpeg b/assets/images/governing-board/damien-sereni.jpeg new file mode 100644 index 000000000000..d208dde122ef Binary files /dev/null and b/assets/images/governing-board/damien-sereni.jpeg differ diff --git a/assets/images/governing-board/dwarakrajagopal2.jpg b/assets/images/governing-board/dwarakrajagopal2.jpg new file mode 100644 index 000000000000..9036b956605d Binary files /dev/null and b/assets/images/governing-board/dwarakrajagopal2.jpg differ diff --git a/assets/images/governing-board/joe-spisak.jpg b/assets/images/governing-board/joe-spisak.jpg new file mode 100644 index 000000000000..9a96bc3157a7 Binary files /dev/null and b/assets/images/governing-board/joe-spisak.jpg differ diff --git a/assets/images/governing-board/kismet-singh.jpg b/assets/images/governing-board/kismet-singh.jpg new file mode 100644 index 000000000000..79f09c4ea9a7 Binary files /dev/null and b/assets/images/governing-board/kismet-singh.jpg differ diff --git a/assets/images/governing-board/ricardo-aravena.jpg b/assets/images/governing-board/ricardo-aravena.jpg new file mode 100644 index 000000000000..4c76381a73cf Binary files /dev/null and b/assets/images/governing-board/ricardo-aravena.jpg differ diff --git a/assets/images/governing-board/shauheen-zahirazami.jpg b/assets/images/governing-board/shauheen-zahirazami.jpg new file mode 100644 index 000000000000..ffcc5761bd81 Binary files /dev/null and b/assets/images/governing-board/shauheen-zahirazami.jpg differ diff --git a/assets/images/granite_llama_throughput.png b/assets/images/granite_llama_throughput.png new file mode 100644 index 000000000000..96bc18cd684e Binary files /dev/null and b/assets/images/granite_llama_throughput.png differ diff --git a/assets/images/graphcore-logo.jpg b/assets/images/graphcore-logo.jpg new file mode 100644 index 000000000000..dc888f3897df Binary files /dev/null and b/assets/images/graphcore-logo.jpg differ diff --git a/assets/images/hadacore/fg1.png b/assets/images/hadacore/fg1.png new file mode 100644 index 000000000000..02321d88e4d8 Binary files /dev/null and b/assets/images/hadacore/fg1.png differ diff --git a/assets/images/hadacore/fg2.png b/assets/images/hadacore/fg2.png new file mode 100644 index 000000000000..d8602a882c9e Binary files /dev/null and b/assets/images/hadacore/fg2.png differ diff --git a/assets/images/hadacore/fg3.png b/assets/images/hadacore/fg3.png new file mode 100644 index 000000000000..3a2047f7927c Binary files /dev/null and b/assets/images/hadacore/fg3.png differ diff --git a/assets/images/hadacore/fg4.png b/assets/images/hadacore/fg4.png new file mode 100644 index 000000000000..02321d88e4d8 Binary files /dev/null and b/assets/images/hadacore/fg4.png differ diff --git a/assets/images/hadacore/fg5.png b/assets/images/hadacore/fg5.png new file mode 100644 index 000000000000..c2c0e31f2e91 Binary files /dev/null and b/assets/images/hadacore/fg5.png differ diff --git a/assets/images/hadacore/fg6.png 
b/assets/images/hadacore/fg6.png new file mode 100644 index 000000000000..3e11ef326cee Binary files /dev/null and b/assets/images/hadacore/fg6.png differ diff --git a/assets/images/hadacore/fg7.png b/assets/images/hadacore/fg7.png new file mode 100644 index 000000000000..f36380de2aa4 Binary files /dev/null and b/assets/images/hadacore/fg7.png differ diff --git a/assets/images/hardnet.png b/assets/images/hardnet.png new file mode 100644 index 000000000000..33e118fac6eb Binary files /dev/null and b/assets/images/hardnet.png differ diff --git a/assets/images/hardnet_blk.png b/assets/images/hardnet_blk.png new file mode 100644 index 000000000000..0aad0a8bcb95 Binary files /dev/null and b/assets/images/hardnet_blk.png differ diff --git a/assets/images/hi-po-low-bit.png b/assets/images/hi-po-low-bit.png new file mode 100644 index 000000000000..52d387ab062a Binary files /dev/null and b/assets/images/hi-po-low-bit.png differ diff --git a/assets/images/hifigan_model.png b/assets/images/hifigan_model.png new file mode 100644 index 000000000000..9ba92bb6a5e8 Binary files /dev/null and b/assets/images/hifigan_model.png differ diff --git a/assets/images/high-performance-llama-2/fig1.jpg b/assets/images/high-performance-llama-2/fig1.jpg new file mode 100644 index 000000000000..9e77716c4915 Binary files /dev/null and b/assets/images/high-performance-llama-2/fig1.jpg differ diff --git a/assets/images/high-performance-llama-2/fig2.jpg b/assets/images/high-performance-llama-2/fig2.jpg new file mode 100644 index 000000000000..0240c6da318d Binary files /dev/null and b/assets/images/high-performance-llama-2/fig2.jpg differ diff --git a/assets/images/high-performance-llama-2/fig3.jpg b/assets/images/high-performance-llama-2/fig3.jpg new file mode 100644 index 000000000000..98e3974f3e2e Binary files /dev/null and b/assets/images/high-performance-llama-2/fig3.jpg differ diff --git a/assets/images/high-performance-llama-2/fig4.jpg b/assets/images/high-performance-llama-2/fig4.jpg new file mode 100644 index 000000000000..59edee300066 Binary files /dev/null and b/assets/images/high-performance-llama-2/fig4.jpg differ diff --git a/assets/images/high-performance-llama-2/fig5.jpg b/assets/images/high-performance-llama-2/fig5.jpg new file mode 100644 index 000000000000..86fffd346cb5 Binary files /dev/null and b/assets/images/high-performance-llama-2/fig5.jpg differ diff --git a/assets/images/high-performance-llama-2/fig6.jpg b/assets/images/high-performance-llama-2/fig6.jpg new file mode 100644 index 000000000000..e3d3601d0b3c Binary files /dev/null and b/assets/images/high-performance-llama-2/fig6.jpg differ diff --git a/assets/images/high-performance-llama/cost_vs_output_token_length_7b_13b.jpg b/assets/images/high-performance-llama/cost_vs_output_token_length_7b_13b.jpg new file mode 100644 index 000000000000..5d796522bbfa Binary files /dev/null and b/assets/images/high-performance-llama/cost_vs_output_token_length_7b_13b.jpg differ diff --git a/assets/images/high-performance-llama/cost_vs_output_token_length_xl_48xl.jpg b/assets/images/high-performance-llama/cost_vs_output_token_length_xl_48xl.jpg new file mode 100644 index 000000000000..8e84b95eba5f Binary files /dev/null and b/assets/images/high-performance-llama/cost_vs_output_token_length_xl_48xl.jpg differ diff --git a/assets/images/high-performance-llama/latency_vs_input_token_length.jpg b/assets/images/high-performance-llama/latency_vs_input_token_length.jpg new file mode 100644 index 000000000000..37de19c0f7c8 Binary files /dev/null and 
b/assets/images/high-performance-llama/latency_vs_input_token_length.jpg differ diff --git a/assets/images/high-performance-llama/latency_vs_output_token_length.png b/assets/images/high-performance-llama/latency_vs_output_token_length.png new file mode 100644 index 000000000000..0b20737c44d1 Binary files /dev/null and b/assets/images/high-performance-llama/latency_vs_output_token_length.png differ diff --git a/assets/images/high-performance-llama/latency_vs_tp.jpg b/assets/images/high-performance-llama/latency_vs_tp.jpg new file mode 100644 index 000000000000..79a37627e429 Binary files /dev/null and b/assets/images/high-performance-llama/latency_vs_tp.jpg differ diff --git a/assets/images/high-performance-llama/software_stack_inf2.jpg b/assets/images/high-performance-llama/software_stack_inf2.jpg new file mode 100644 index 000000000000..e4115b69caa5 Binary files /dev/null and b/assets/images/high-performance-llama/software_stack_inf2.jpg differ diff --git a/assets/images/high-performance-llama/throughput_vs_output_token_length.jpg b/assets/images/high-performance-llama/throughput_vs_output_token_length.jpg new file mode 100644 index 000000000000..a9340db137fe Binary files /dev/null and b/assets/images/high-performance-llama/throughput_vs_output_token_length.jpg differ diff --git a/assets/images/hitchhikers-guide-speculative-decoding/fig1.gif b/assets/images/hitchhikers-guide-speculative-decoding/fig1.gif new file mode 100644 index 000000000000..e4d7a3cd023a Binary files /dev/null and b/assets/images/hitchhikers-guide-speculative-decoding/fig1.gif differ diff --git a/assets/images/hitchhikers-guide-speculative-decoding/fig2.png b/assets/images/hitchhikers-guide-speculative-decoding/fig2.png new file mode 100644 index 000000000000..795ce06154e2 Binary files /dev/null and b/assets/images/hitchhikers-guide-speculative-decoding/fig2.png differ diff --git a/assets/images/hitchhikers-guide-speculative-decoding/fig3.png b/assets/images/hitchhikers-guide-speculative-decoding/fig3.png new file mode 100644 index 000000000000..37081ddd0332 Binary files /dev/null and b/assets/images/hitchhikers-guide-speculative-decoding/fig3.png differ diff --git a/assets/images/hitchhikers-guide-speculative-decoding/fig4.png b/assets/images/hitchhikers-guide-speculative-decoding/fig4.png new file mode 100644 index 000000000000..77ab606033d3 Binary files /dev/null and b/assets/images/hitchhikers-guide-speculative-decoding/fig4.png differ diff --git a/assets/images/hitchhikers-guide-speculative-decoding/fig5.jpg b/assets/images/hitchhikers-guide-speculative-decoding/fig5.jpg new file mode 100644 index 000000000000..1bbe2b49dc9b Binary files /dev/null and b/assets/images/hitchhikers-guide-speculative-decoding/fig5.jpg differ diff --git a/assets/images/home-background.jpg b/assets/images/home-background.jpg new file mode 100644 index 000000000000..3f020015d2d6 Binary files /dev/null and b/assets/images/home-background.jpg differ diff --git a/assets/images/home-footer-background.jpg b/assets/images/home-footer-background.jpg new file mode 100644 index 000000000000..c541fca8e436 Binary files /dev/null and b/assets/images/home-footer-background.jpg differ diff --git a/assets/images/hopper-tma-unit/fg1.png b/assets/images/hopper-tma-unit/fg1.png new file mode 100644 index 000000000000..3088fda21663 Binary files /dev/null and b/assets/images/hopper-tma-unit/fg1.png differ diff --git a/assets/images/hopper-tma-unit/fg10.png b/assets/images/hopper-tma-unit/fg10.png new file mode 100644 index 000000000000..277f0bc2a0b5 Binary 
files /dev/null and b/assets/images/hopper-tma-unit/fg10.png differ diff --git a/assets/images/hopper-tma-unit/fg2.png b/assets/images/hopper-tma-unit/fg2.png new file mode 100644 index 000000000000..bb1548a3cbae Binary files /dev/null and b/assets/images/hopper-tma-unit/fg2.png differ diff --git a/assets/images/hopper-tma-unit/fg3.png b/assets/images/hopper-tma-unit/fg3.png new file mode 100644 index 000000000000..9aa769802507 Binary files /dev/null and b/assets/images/hopper-tma-unit/fg3.png differ diff --git a/assets/images/hopper-tma-unit/fg4.png b/assets/images/hopper-tma-unit/fg4.png new file mode 100644 index 000000000000..9f741178c57c Binary files /dev/null and b/assets/images/hopper-tma-unit/fg4.png differ diff --git a/assets/images/hopper-tma-unit/fg5.png b/assets/images/hopper-tma-unit/fg5.png new file mode 100644 index 000000000000..6fcd3d83a373 Binary files /dev/null and b/assets/images/hopper-tma-unit/fg5.png differ diff --git a/assets/images/hopper-tma-unit/fg6.png b/assets/images/hopper-tma-unit/fg6.png new file mode 100644 index 000000000000..51bff336086d Binary files /dev/null and b/assets/images/hopper-tma-unit/fg6.png differ diff --git a/assets/images/hopper-tma-unit/fg7.png b/assets/images/hopper-tma-unit/fg7.png new file mode 100644 index 000000000000..5a082da815d5 Binary files /dev/null and b/assets/images/hopper-tma-unit/fg7.png differ diff --git a/assets/images/hopper-tma-unit/fg8.png b/assets/images/hopper-tma-unit/fg8.png new file mode 100644 index 000000000000..bd7f16036932 Binary files /dev/null and b/assets/images/hopper-tma-unit/fg8.png differ diff --git a/assets/images/hopper-tma-unit/fg9.png b/assets/images/hopper-tma-unit/fg9.png new file mode 100644 index 000000000000..2ebb1b56a43a Binary files /dev/null and b/assets/images/hopper-tma-unit/fg9.png differ diff --git a/assets/images/horse2zebra.gif b/assets/images/horse2zebra.gif new file mode 100644 index 000000000000..b9b5f627e97d Binary files /dev/null and b/assets/images/horse2zebra.gif differ diff --git a/assets/images/how-disney-improved-social-small.jpg b/assets/images/how-disney-improved-social-small.jpg new file mode 100644 index 000000000000..964398fe9f45 Binary files /dev/null and b/assets/images/how-disney-improved-social-small.jpg differ diff --git a/assets/images/how-disney-improved-social.jpg b/assets/images/how-disney-improved-social.jpg new file mode 100644 index 000000000000..bd1eeb096ca5 Binary files /dev/null and b/assets/images/how-disney-improved-social.jpg differ diff --git a/assets/images/how-ibm-uses-pt-terratorch/fg1.png b/assets/images/how-ibm-uses-pt-terratorch/fg1.png new file mode 100644 index 000000000000..140186a272cf Binary files /dev/null and b/assets/images/how-ibm-uses-pt-terratorch/fg1.png differ diff --git a/assets/images/how-ibm-uses-pt-terratorch/fg2.png b/assets/images/how-ibm-uses-pt-terratorch/fg2.png new file mode 100644 index 000000000000..7a37b893773d Binary files /dev/null and b/assets/images/how-ibm-uses-pt-terratorch/fg2.png differ diff --git a/assets/images/how-ibm-uses-pt-terratorch/fg3.png b/assets/images/how-ibm-uses-pt-terratorch/fg3.png new file mode 100644 index 000000000000..bcbe77ea9eca Binary files /dev/null and b/assets/images/how-ibm-uses-pt-terratorch/fg3.png differ diff --git a/assets/images/how-ibm-uses-pt-terratorch/fg4.png b/assets/images/how-ibm-uses-pt-terratorch/fg4.png new file mode 100644 index 000000000000..798947a41f20 Binary files /dev/null and b/assets/images/how-ibm-uses-pt-terratorch/fg4.png differ diff --git 
a/assets/images/how-ibm-uses-pt-terratorch/fg5.png b/assets/images/how-ibm-uses-pt-terratorch/fg5.png new file mode 100644 index 000000000000..a8306bf3ed84 Binary files /dev/null and b/assets/images/how-ibm-uses-pt-terratorch/fg5.png differ diff --git a/assets/images/how-to-accelerate/f1-pyg-message-passing-paradigm.png b/assets/images/how-to-accelerate/f1-pyg-message-passing-paradigm.png new file mode 100644 index 000000000000..24e1d43f80e0 Binary files /dev/null and b/assets/images/how-to-accelerate/f1-pyg-message-passing-paradigm.png differ diff --git a/assets/images/how-to-accelerate/f2-scatter-reduce-scheme.png b/assets/images/how-to-accelerate/f2-scatter-reduce-scheme.png new file mode 100644 index 000000000000..b736a6ed8ecf Binary files /dev/null and b/assets/images/how-to-accelerate/f2-scatter-reduce-scheme.png differ diff --git a/assets/images/how-to-accelerate/f3-spmm-optimization-scheme.png b/assets/images/how-to-accelerate/f3-spmm-optimization-scheme.png new file mode 100644 index 000000000000..6cc210a6a72d Binary files /dev/null and b/assets/images/how-to-accelerate/f3-spmm-optimization-scheme.png differ diff --git a/assets/images/how-to-accelerate/f4-torch-compile-performance-speedup.png b/assets/images/how-to-accelerate/f4-torch-compile-performance-speedup.png new file mode 100644 index 000000000000..f76bb50e99ca Binary files /dev/null and b/assets/images/how-to-accelerate/f4-torch-compile-performance-speedup.png differ diff --git a/assets/images/hub-blog-header-1.png b/assets/images/hub-blog-header-1.png new file mode 100644 index 000000000000..e894e61818ce Binary files /dev/null and b/assets/images/hub-blog-header-1.png differ diff --git a/assets/images/hub-blog-pwc.png b/assets/images/hub-blog-pwc.png new file mode 100644 index 000000000000..c35a2c0b47d1 Binary files /dev/null and b/assets/images/hub-blog-pwc.png differ diff --git a/assets/images/hugging_face_transformers.jpg b/assets/images/hugging_face_transformers.jpg new file mode 100644 index 000000000000..3669acc4a029 Binary files /dev/null and b/assets/images/hugging_face_transformers.jpg differ diff --git a/assets/images/hugging_face_transformers.svg b/assets/images/hugging_face_transformers.svg new file mode 100644 index 000000000000..091946ae5cc0 --- /dev/null +++ b/assets/images/hugging_face_transformers.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/huggingface-joins-1.jpg b/assets/images/huggingface-joins-1.jpg new file mode 100644 index 000000000000..a74db1d4efa6 Binary files /dev/null and b/assets/images/huggingface-joins-1.jpg differ diff --git a/assets/images/huggingface-joins-2.jpg b/assets/images/huggingface-joins-2.jpg new file mode 100644 index 000000000000..bd5fa784138f Binary files /dev/null and b/assets/images/huggingface-joins-2.jpg differ diff --git a/assets/images/huggingface-logo.png b/assets/images/huggingface-logo.png new file mode 100644 index 000000000000..cbd884e307a8 Binary files /dev/null and b/assets/images/huggingface-logo.png differ diff --git a/assets/images/hunting-dinosaurs-with-intel-ai-fig1.jpeg b/assets/images/hunting-dinosaurs-with-intel-ai-fig1.jpeg new file mode 100644 index 000000000000..a6d94c8ac156 Binary files /dev/null and b/assets/images/hunting-dinosaurs-with-intel-ai-fig1.jpeg differ diff --git a/assets/images/hybridnets.jpg b/assets/images/hybridnets.jpg new file mode 100644 index 000000000000..ee053ce4f549 Binary files /dev/null and b/assets/images/hybridnets.jpg differ diff --git a/assets/images/ibnnet.png b/assets/images/ibnnet.png new 
file mode 100644 index 000000000000..d6c0ce6006da Binary files /dev/null and b/assets/images/ibnnet.png differ diff --git a/assets/images/icon-close.svg b/assets/images/icon-close.svg new file mode 100644 index 000000000000..348964e79f7f --- /dev/null +++ b/assets/images/icon-close.svg @@ -0,0 +1,21 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/icon-cloud1.svg b/assets/images/icon-cloud1.svg new file mode 100644 index 000000000000..aaff8db1086b --- /dev/null +++ b/assets/images/icon-cloud1.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/icon-examine.svg b/assets/images/icon-examine.svg new file mode 100644 index 000000000000..5db6b85f93c5 --- /dev/null +++ b/assets/images/icon-examine.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/icon-menu-dots-dark.svg b/assets/images/icon-menu-dots-dark.svg new file mode 100644 index 000000000000..fa2ad044b3f6 --- /dev/null +++ b/assets/images/icon-menu-dots-dark.svg @@ -0,0 +1,42 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/icon-menu-dots.svg b/assets/images/icon-menu-dots.svg new file mode 100644 index 000000000000..fc0318e62639 --- /dev/null +++ b/assets/images/icon-menu-dots.svg @@ -0,0 +1,44 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/icon-process.svg b/assets/images/icon-process.svg new file mode 100644 index 000000000000..a08f4909540b --- /dev/null +++ b/assets/images/icon-process.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/image-classification.png b/assets/images/image-classification.png new file mode 100644 index 000000000000..38d089275b30 Binary files /dev/null and b/assets/images/image-classification.png differ diff --git a/assets/images/image-opacus.png b/assets/images/image-opacus.png new file mode 100644 index 000000000000..6e5c8696a409 Binary files /dev/null and b/assets/images/image-opacus.png differ diff --git a/assets/images/improve-rag-performance.png b/assets/images/improve-rag-performance.png new file mode 100644 index 000000000000..4c25526ecc5e Binary files /dev/null and b/assets/images/improve-rag-performance.png differ diff --git a/assets/images/improve-rag-performance2.jpg b/assets/images/improve-rag-performance2.jpg new file mode 100644 index 000000000000..7a48fa7343fc Binary files /dev/null and b/assets/images/improve-rag-performance2.jpg differ diff --git a/assets/images/inception_v3.png b/assets/images/inception_v3.png new file mode 100644 index 000000000000..c1bc7af71b7e Binary files /dev/null and b/assets/images/inception_v3.png differ diff --git a/assets/images/inference_splitkv.gif b/assets/images/inference_splitkv.gif new file mode 100644 index 000000000000..0e1eb10bc7eb Binary files /dev/null and b/assets/images/inference_splitkv.gif differ diff --git a/assets/images/inplace_abn.png b/assets/images/inplace_abn.png new file mode 100644 index 000000000000..1f281b2d1f38 Binary files /dev/null and b/assets/images/inplace_abn.png differ diff --git a/assets/images/inside-the-matrix/attn_cover1_crop.jpg b/assets/images/inside-the-matrix/attn_cover1_crop.jpg new file mode 100644 index 000000000000..105741e8e6a6 Binary files /dev/null and b/assets/images/inside-the-matrix/attn_cover1_crop.jpg differ diff --git 
a/assets/images/inside-the-matrix/attnlayer2.jpg b/assets/images/inside-the-matrix/attnlayer2.jpg new file mode 100644 index 000000000000..345dcea9ce0e Binary files /dev/null and b/assets/images/inside-the-matrix/attnlayer2.jpg differ diff --git a/assets/images/inside-the-matrix/attnlayer_ffnsplitk.jpg b/assets/images/inside-the-matrix/attnlayer_ffnsplitk.jpg new file mode 100644 index 000000000000..fd33216eb864 Binary files /dev/null and b/assets/images/inside-the-matrix/attnlayer_ffnsplitk.jpg differ diff --git a/assets/images/inside-the-matrix/binary4.jpg b/assets/images/inside-the-matrix/binary4.jpg new file mode 100644 index 000000000000..7849519f8367 Binary files /dev/null and b/assets/images/inside-the-matrix/binary4.jpg differ diff --git a/assets/images/inside-the-matrix/binary_part.jpg b/assets/images/inside-the-matrix/binary_part.jpg new file mode 100644 index 000000000000..a382a61f31e7 Binary files /dev/null and b/assets/images/inside-the-matrix/binary_part.jpg differ diff --git a/assets/images/inside-the-matrix/bottleneck_part.jpg b/assets/images/inside-the-matrix/bottleneck_part.jpg new file mode 100644 index 000000000000..85698be42430 Binary files /dev/null and b/assets/images/inside-the-matrix/bottleneck_part.jpg differ diff --git a/assets/images/inside-the-matrix/bptlayer.jpg b/assets/images/inside-the-matrix/bptlayer.jpg new file mode 100644 index 000000000000..f9128f0ead26 Binary files /dev/null and b/assets/images/inside-the-matrix/bptlayer.jpg differ diff --git a/assets/images/inside-the-matrix/bptlayer_ffnsplitk.jpg b/assets/images/inside-the-matrix/bptlayer_ffnsplitk.jpg new file mode 100644 index 000000000000..06dfe5b125f0 Binary files /dev/null and b/assets/images/inside-the-matrix/bptlayer_ffnsplitk.jpg differ diff --git a/assets/images/inside-the-matrix/decoding.jpg b/assets/images/inside-the-matrix/decoding.jpg new file mode 100644 index 000000000000..c2fa9f4da992 Binary files /dev/null and b/assets/images/inside-the-matrix/decoding.jpg differ diff --git a/assets/images/inside-the-matrix/gpt2_0_2c.jpg b/assets/images/inside-the-matrix/gpt2_0_2c.jpg new file mode 100644 index 000000000000..ddf17ff60613 Binary files /dev/null and b/assets/images/inside-the-matrix/gpt2_0_2c.jpg differ diff --git a/assets/images/inside-the-matrix/gpt2_decode2.jpg b/assets/images/inside-the-matrix/gpt2_decode2.jpg new file mode 100644 index 000000000000..96fe1d5d03bd Binary files /dev/null and b/assets/images/inside-the-matrix/gpt2_decode2.jpg differ diff --git a/assets/images/inside-the-matrix/gpt2_ik.jpg b/assets/images/inside-the-matrix/gpt2_ik.jpg new file mode 100644 index 000000000000..8feb1a230f99 Binary files /dev/null and b/assets/images/inside-the-matrix/gpt2_ik.jpg differ diff --git a/assets/images/inside-the-matrix/gpt2_parti.jpg b/assets/images/inside-the-matrix/gpt2_parti.jpg new file mode 100644 index 000000000000..770872606603 Binary files /dev/null and b/assets/images/inside-the-matrix/gpt2_parti.jpg differ diff --git a/assets/images/inside-the-matrix/initial.jpg b/assets/images/inside-the-matrix/initial.jpg new file mode 100644 index 000000000000..d8543a31ba9d Binary files /dev/null and b/assets/images/inside-the-matrix/initial.jpg differ diff --git a/assets/images/inside-the-matrix/la2still.jpg b/assets/images/inside-the-matrix/la2still.jpg new file mode 100644 index 000000000000..9faf5be03be8 Binary files /dev/null and b/assets/images/inside-the-matrix/la2still.jpg differ diff --git a/assets/images/inside-the-matrix/lacontract.jpg 
b/assets/images/inside-the-matrix/lacontract.jpg new file mode 100644 index 000000000000..d5700b95bacd Binary files /dev/null and b/assets/images/inside-the-matrix/lacontract.jpg differ diff --git a/assets/images/inside-the-matrix/lora_single.jpg b/assets/images/inside-the-matrix/lora_single.jpg new file mode 100644 index 000000000000..0527a7184e2c Binary files /dev/null and b/assets/images/inside-the-matrix/lora_single.jpg differ diff --git a/assets/images/inside-the-matrix/matmul3.jpg b/assets/images/inside-the-matrix/matmul3.jpg new file mode 100644 index 000000000000..23c13406e860 Binary files /dev/null and b/assets/images/inside-the-matrix/matmul3.jpg differ diff --git a/assets/images/inside-the-matrix/mha1.jpg b/assets/images/inside-the-matrix/mha1.jpg new file mode 100644 index 000000000000..e32821ff6001 Binary files /dev/null and b/assets/images/inside-the-matrix/mha1.jpg differ diff --git a/assets/images/inside-the-matrix/nlayerbottleneck.jpg b/assets/images/inside-the-matrix/nlayerbottleneck.jpg new file mode 100644 index 000000000000..b20f522b5f2e Binary files /dev/null and b/assets/images/inside-the-matrix/nlayerbottleneck.jpg differ diff --git a/assets/images/inside-the-matrix/raffn.jpg b/assets/images/inside-the-matrix/raffn.jpg new file mode 100644 index 000000000000..c2af7fb6d3ae Binary files /dev/null and b/assets/images/inside-the-matrix/raffn.jpg differ diff --git a/assets/images/install-matrix.png b/assets/images/install-matrix.png new file mode 100644 index 000000000000..3313d6421617 Binary files /dev/null and b/assets/images/install-matrix.png differ diff --git a/assets/images/int4-decoding/eq.jpg b/assets/images/int4-decoding/eq.jpg new file mode 100644 index 000000000000..2abbe7a6032e Binary files /dev/null and b/assets/images/int4-decoding/eq.jpg differ diff --git a/assets/images/int4-decoding/fg1.png b/assets/images/int4-decoding/fg1.png new file mode 100644 index 000000000000..abd3db7cd48f Binary files /dev/null and b/assets/images/int4-decoding/fg1.png differ diff --git a/assets/images/int4-decoding/fg10.jpg b/assets/images/int4-decoding/fg10.jpg new file mode 100644 index 000000000000..c72609ddb823 Binary files /dev/null and b/assets/images/int4-decoding/fg10.jpg differ diff --git a/assets/images/int4-decoding/fg11.jpg b/assets/images/int4-decoding/fg11.jpg new file mode 100644 index 000000000000..4805aa2e9670 Binary files /dev/null and b/assets/images/int4-decoding/fg11.jpg differ diff --git a/assets/images/int4-decoding/fg12.png b/assets/images/int4-decoding/fg12.png new file mode 100644 index 000000000000..aa9da5572e44 Binary files /dev/null and b/assets/images/int4-decoding/fg12.png differ diff --git a/assets/images/int4-decoding/fg13.jpg b/assets/images/int4-decoding/fg13.jpg new file mode 100644 index 000000000000..16bd56e900e7 Binary files /dev/null and b/assets/images/int4-decoding/fg13.jpg differ diff --git a/assets/images/int4-decoding/fg14.jpg b/assets/images/int4-decoding/fg14.jpg new file mode 100644 index 000000000000..96867c6e5ad4 Binary files /dev/null and b/assets/images/int4-decoding/fg14.jpg differ diff --git a/assets/images/int4-decoding/fg15.jpg b/assets/images/int4-decoding/fg15.jpg new file mode 100644 index 000000000000..e126533c01ba Binary files /dev/null and b/assets/images/int4-decoding/fg15.jpg differ diff --git a/assets/images/int4-decoding/fg16.jpg b/assets/images/int4-decoding/fg16.jpg new file mode 100644 index 000000000000..2911d3dff92e Binary files /dev/null and b/assets/images/int4-decoding/fg16.jpg differ diff --git 
a/assets/images/int4-decoding/fg17.jpg b/assets/images/int4-decoding/fg17.jpg new file mode 100644 index 000000000000..6d58034eb4c2 Binary files /dev/null and b/assets/images/int4-decoding/fg17.jpg differ diff --git a/assets/images/int4-decoding/fg18.png b/assets/images/int4-decoding/fg18.png new file mode 100644 index 000000000000..8c1dfe6075db Binary files /dev/null and b/assets/images/int4-decoding/fg18.png differ diff --git a/assets/images/int4-decoding/fg19.jpg b/assets/images/int4-decoding/fg19.jpg new file mode 100644 index 000000000000..0bee515adc12 Binary files /dev/null and b/assets/images/int4-decoding/fg19.jpg differ diff --git a/assets/images/int4-decoding/fg2.png b/assets/images/int4-decoding/fg2.png new file mode 100644 index 000000000000..2ef0739b0e04 Binary files /dev/null and b/assets/images/int4-decoding/fg2.png differ diff --git a/assets/images/int4-decoding/fg20.jpg b/assets/images/int4-decoding/fg20.jpg new file mode 100644 index 000000000000..09043b86a193 Binary files /dev/null and b/assets/images/int4-decoding/fg20.jpg differ diff --git a/assets/images/int4-decoding/fg21.png b/assets/images/int4-decoding/fg21.png new file mode 100644 index 000000000000..ff01e0144116 Binary files /dev/null and b/assets/images/int4-decoding/fg21.png differ diff --git a/assets/images/int4-decoding/fg3.png b/assets/images/int4-decoding/fg3.png new file mode 100644 index 000000000000..1b1d6845008d Binary files /dev/null and b/assets/images/int4-decoding/fg3.png differ diff --git a/assets/images/int4-decoding/fg4.jpg b/assets/images/int4-decoding/fg4.jpg new file mode 100644 index 000000000000..5a7023789490 Binary files /dev/null and b/assets/images/int4-decoding/fg4.jpg differ diff --git a/assets/images/int4-decoding/fg5.png b/assets/images/int4-decoding/fg5.png new file mode 100644 index 000000000000..40cdd59ed28d Binary files /dev/null and b/assets/images/int4-decoding/fg5.png differ diff --git a/assets/images/int4-decoding/fg6.png b/assets/images/int4-decoding/fg6.png new file mode 100644 index 000000000000..bf8c1ed26c65 Binary files /dev/null and b/assets/images/int4-decoding/fg6.png differ diff --git a/assets/images/int4-decoding/fg7.png b/assets/images/int4-decoding/fg7.png new file mode 100644 index 000000000000..4cdc167ce70f Binary files /dev/null and b/assets/images/int4-decoding/fg7.png differ diff --git a/assets/images/int4-decoding/fg8.png b/assets/images/int4-decoding/fg8.png new file mode 100644 index 000000000000..692b7995c489 Binary files /dev/null and b/assets/images/int4-decoding/fg8.png differ diff --git a/assets/images/int4-decoding/fg9.png b/assets/images/int4-decoding/fg9.png new file mode 100644 index 000000000000..b501010baed8 Binary files /dev/null and b/assets/images/int4-decoding/fg9.png differ diff --git a/assets/images/int8/pytorch_quant_x86_1.jpg b/assets/images/int8/pytorch_quant_x86_1.jpg new file mode 100644 index 000000000000..ac506ee7e09f Binary files /dev/null and b/assets/images/int8/pytorch_quant_x86_1.jpg differ diff --git a/assets/images/int8/pytorch_quant_x86_2.jpg b/assets/images/int8/pytorch_quant_x86_2.jpg new file mode 100644 index 000000000000..0c2ab43cee57 Binary files /dev/null and b/assets/images/int8/pytorch_quant_x86_2.jpg differ diff --git a/assets/images/int8/pytorch_quant_x86_3.jpg b/assets/images/int8/pytorch_quant_x86_3.jpg new file mode 100644 index 000000000000..bb3d9efa671f Binary files /dev/null and b/assets/images/int8/pytorch_quant_x86_3.jpg differ diff --git a/assets/images/intel-case-study/fg1.png 
b/assets/images/intel-case-study/fg1.png new file mode 100644 index 000000000000..b2a94fae07fc Binary files /dev/null and b/assets/images/intel-case-study/fg1.png differ diff --git a/assets/images/intel-case-study/fg2.png b/assets/images/intel-case-study/fg2.png new file mode 100644 index 000000000000..26ba47e31e71 Binary files /dev/null and b/assets/images/intel-case-study/fg2.png differ diff --git a/assets/images/intel-gpus-pytorch-2-4.jpg b/assets/images/intel-gpus-pytorch-2-4.jpg new file mode 100644 index 000000000000..a1264401c38f Binary files /dev/null and b/assets/images/intel-gpus-pytorch-2-4.jpg differ diff --git a/assets/images/intel-logo.png b/assets/images/intel-logo.png new file mode 100644 index 000000000000..2d022a97c15a Binary files /dev/null and b/assets/images/intel-logo.png differ diff --git a/assets/images/intel-new-logo.svg b/assets/images/intel-new-logo.svg new file mode 100644 index 000000000000..5133faa15233 --- /dev/null +++ b/assets/images/intel-new-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/intro-graphic-accelerated-pytorch-training-revised.png b/assets/images/intro-graphic-accelerated-pytorch-training-revised.png new file mode 100644 index 000000000000..8cefb64c4aca Binary files /dev/null and b/assets/images/intro-graphic-accelerated-pytorch-training-revised.png differ diff --git a/assets/images/intro-graphic-accelerated-pytorch-training.jpg b/assets/images/intro-graphic-accelerated-pytorch-training.jpg new file mode 100644 index 000000000000..4db67bb10d79 Binary files /dev/null and b/assets/images/intro-graphic-accelerated-pytorch-training.jpg differ diff --git a/assets/images/intro-graphic-accelerated-pytorch-training.png b/assets/images/intro-graphic-accelerated-pytorch-training.png new file mode 100644 index 000000000000..69aab8f75767 Binary files /dev/null and b/assets/images/intro-graphic-accelerated-pytorch-training.png differ diff --git a/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-1.png b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-1.png new file mode 100644 index 000000000000..a60f08d3ceea Binary files /dev/null and b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-1.png differ diff --git a/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-2.png b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-2.png new file mode 100644 index 000000000000..887fe9b3d8cd Binary files /dev/null and b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-2.png differ diff --git a/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-3.png b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-3.png new file mode 100644 index 000000000000..00dec5156d6c Binary files /dev/null and b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-3.png differ diff --git a/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-4.png b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-4.png new file mode 100644 index 000000000000..4d638e1ee0dd Binary files /dev/null and b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-4.png differ diff --git a/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-5.png b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-5.png new file mode 100644 index 000000000000..f05b334feb25 Binary files /dev/null and 
b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-5.png differ diff --git a/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-6.png b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-6.png new file mode 100644 index 000000000000..5e33cd41d501 Binary files /dev/null and b/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-6.png differ diff --git a/assets/images/introducing-torchrec/torchrec_lockup.png b/assets/images/introducing-torchrec/torchrec_lockup.png new file mode 100644 index 000000000000..5e9d1a751c65 Binary files /dev/null and b/assets/images/introducing-torchrec/torchrec_lockup.png differ diff --git a/assets/images/introducing-torchrec/torchrec_social.png b/assets/images/introducing-torchrec/torchrec_social.png new file mode 100644 index 000000000000..048437c08290 Binary files /dev/null and b/assets/images/introducing-torchrec/torchrec_social.png differ diff --git a/assets/images/join/boxes.svg b/assets/images/join/boxes.svg new file mode 100644 index 000000000000..974683314e93 --- /dev/null +++ b/assets/images/join/boxes.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/join/bulb.svg b/assets/images/join/bulb.svg new file mode 100644 index 000000000000..b5944e74b2e2 --- /dev/null +++ b/assets/images/join/bulb.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/join/check.svg b/assets/images/join/check.svg new file mode 100644 index 000000000000..e04b4406cecb --- /dev/null +++ b/assets/images/join/check.svg @@ -0,0 +1,4 @@ + + + + diff --git a/assets/images/join/eye.svg b/assets/images/join/eye.svg new file mode 100644 index 000000000000..f4fe09b13fc8 --- /dev/null +++ b/assets/images/join/eye.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/join/man.jpg b/assets/images/join/man.jpg new file mode 100644 index 000000000000..2b0a1770f86d Binary files /dev/null and b/assets/images/join/man.jpg differ diff --git a/assets/images/join/mic.svg b/assets/images/join/mic.svg new file mode 100644 index 000000000000..437412c2aad2 --- /dev/null +++ b/assets/images/join/mic.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/join/star.svg b/assets/images/join/star.svg new file mode 100644 index 000000000000..d7a2a92029ca --- /dev/null +++ b/assets/images/join/star.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/key-iterations-for-improving-the-accuracyof-mobilenetV3.png b/assets/images/key-iterations-for-improving-the-accuracyof-mobilenetV3.png new file mode 100644 index 000000000000..d3fe63aef5d1 Binary files /dev/null and b/assets/images/key-iterations-for-improving-the-accuracyof-mobilenetV3.png differ diff --git a/assets/images/landscape.jpg b/assets/images/landscape.jpg new file mode 100644 index 000000000000..b9702fdb895f Binary files /dev/null and b/assets/images/landscape.jpg differ diff --git a/assets/images/largeblob_index_4.gif b/assets/images/largeblob_index_4.gif new file mode 100644 index 000000000000..4c0647fff600 Binary files /dev/null and b/assets/images/largeblob_index_4.gif differ diff --git a/assets/images/largeblog_index_1.png b/assets/images/largeblog_index_1.png new file mode 100644 index 000000000000..263d51482663 Binary files /dev/null and b/assets/images/largeblog_index_1.png differ diff --git a/assets/images/largeblog_index_10.png b/assets/images/largeblog_index_10.png new file mode 100644 index 000000000000..63c71005395b Binary files /dev/null and 
b/assets/images/largeblog_index_10.png differ diff --git a/assets/images/largeblog_index_10a.png b/assets/images/largeblog_index_10a.png new file mode 100644 index 000000000000..4adf1125ab6f Binary files /dev/null and b/assets/images/largeblog_index_10a.png differ diff --git a/assets/images/largeblog_index_11.png b/assets/images/largeblog_index_11.png new file mode 100644 index 000000000000..57880138ecb0 Binary files /dev/null and b/assets/images/largeblog_index_11.png differ diff --git a/assets/images/largeblog_index_12.png b/assets/images/largeblog_index_12.png new file mode 100644 index 000000000000..9aa386aaadbc Binary files /dev/null and b/assets/images/largeblog_index_12.png differ diff --git a/assets/images/largeblog_index_13.png b/assets/images/largeblog_index_13.png new file mode 100644 index 000000000000..96a37f808573 Binary files /dev/null and b/assets/images/largeblog_index_13.png differ diff --git a/assets/images/largeblog_index_2.png b/assets/images/largeblog_index_2.png new file mode 100644 index 000000000000..2219d89813be Binary files /dev/null and b/assets/images/largeblog_index_2.png differ diff --git a/assets/images/largeblog_index_3.png b/assets/images/largeblog_index_3.png new file mode 100644 index 000000000000..1b15bf8651d3 Binary files /dev/null and b/assets/images/largeblog_index_3.png differ diff --git a/assets/images/largeblog_index_5.gif b/assets/images/largeblog_index_5.gif new file mode 100644 index 000000000000..4b363c2b1e1a Binary files /dev/null and b/assets/images/largeblog_index_5.gif differ diff --git a/assets/images/largeblog_index_6.5.gif b/assets/images/largeblog_index_6.5.gif new file mode 100644 index 000000000000..cfdf3f671644 Binary files /dev/null and b/assets/images/largeblog_index_6.5.gif differ diff --git a/assets/images/largeblog_index_6.gif b/assets/images/largeblog_index_6.gif new file mode 100644 index 000000000000..244aa980e0bd Binary files /dev/null and b/assets/images/largeblog_index_6.gif differ diff --git a/assets/images/largeblog_index_7.gif b/assets/images/largeblog_index_7.gif new file mode 100644 index 000000000000..4c0647fff600 Binary files /dev/null and b/assets/images/largeblog_index_7.gif differ diff --git a/assets/images/largeblog_index_8.png b/assets/images/largeblog_index_8.png new file mode 100644 index 000000000000..60186e75fa1b Binary files /dev/null and b/assets/images/largeblog_index_8.png differ diff --git a/assets/images/largeblog_index_9.png b/assets/images/largeblog_index_9.png new file mode 100644 index 000000000000..d49bf233fcb8 Binary files /dev/null and b/assets/images/largeblog_index_9.png differ diff --git a/assets/images/lightning-studios-logo.svg b/assets/images/lightning-studios-logo.svg new file mode 100644 index 000000000000..27a1b356a773 --- /dev/null +++ b/assets/images/lightning-studios-logo.svg @@ -0,0 +1,3 @@ + + + diff --git a/assets/images/linux+pytorch.png b/assets/images/linux+pytorch.png new file mode 100644 index 000000000000..0ed04b6d7329 Binary files /dev/null and b/assets/images/linux+pytorch.png differ diff --git a/assets/images/llama-into-torchtune/fg1.png b/assets/images/llama-into-torchtune/fg1.png new file mode 100644 index 000000000000..803b0d06cb40 Binary files /dev/null and b/assets/images/llama-into-torchtune/fg1.png differ diff --git a/assets/images/llama-into-torchtune/fg2.png b/assets/images/llama-into-torchtune/fg2.png new file mode 100644 index 000000000000..5e6e6693c795 Binary files /dev/null and b/assets/images/llama-into-torchtune/fg2.png differ diff --git 
a/assets/images/llama-into-torchtune/fg3.png b/assets/images/llama-into-torchtune/fg3.png new file mode 100644 index 000000000000..bca2b5c56f94 Binary files /dev/null and b/assets/images/llama-into-torchtune/fg3.png differ diff --git a/assets/images/llama-into-torchtune/fg4.png b/assets/images/llama-into-torchtune/fg4.png new file mode 100644 index 000000000000..178153c4fc82 Binary files /dev/null and b/assets/images/llama-into-torchtune/fg4.png differ diff --git a/assets/images/logo-dark.svg b/assets/images/logo-dark.svg new file mode 100644 index 000000000000..6da5e93f9421 --- /dev/null +++ b/assets/images/logo-dark.svg @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/logo-detectron.svg b/assets/images/logo-detectron.svg new file mode 100644 index 000000000000..19f88bc81afb --- /dev/null +++ b/assets/images/logo-detectron.svg @@ -0,0 +1,16 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/logo-elf.svg b/assets/images/logo-elf.svg new file mode 100644 index 000000000000..fb96846391ec --- /dev/null +++ b/assets/images/logo-elf.svg @@ -0,0 +1,16 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/logo-facebook-dark.svg b/assets/images/logo-facebook-dark.svg new file mode 100644 index 000000000000..e3a069d87967 --- /dev/null +++ b/assets/images/logo-facebook-dark.svg @@ -0,0 +1,8 @@ + + + + + + diff --git a/assets/images/logo-github.svg b/assets/images/logo-github.svg new file mode 100644 index 000000000000..2a38d6f6fb47 --- /dev/null +++ b/assets/images/logo-github.svg @@ -0,0 +1,12 @@ + + + + + + diff --git a/assets/images/logo-icon.svg b/assets/images/logo-icon.svg new file mode 100644 index 000000000000..9dcafc39af21 --- /dev/null +++ b/assets/images/logo-icon.svg @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/assets/images/logo-parlai.svg b/assets/images/logo-parlai.svg new file mode 100644 index 000000000000..4c46bce6563b --- /dev/null +++ b/assets/images/logo-parlai.svg @@ -0,0 +1,16 @@ + + + + Page 1 + Created with Sketch. + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/logo-slack.svg b/assets/images/logo-slack.svg new file mode 100644 index 000000000000..4b02a4fdf45e --- /dev/null +++ b/assets/images/logo-slack.svg @@ -0,0 +1,16 @@ + + + + slack + Created with Sketch. + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/logo-twitter-dark.svg b/assets/images/logo-twitter-dark.svg new file mode 100644 index 000000000000..b19f5ff1f721 --- /dev/null +++ b/assets/images/logo-twitter-dark.svg @@ -0,0 +1,16 @@ + + + + + + + + diff --git a/assets/images/logo-twitter-grey.svg b/assets/images/logo-twitter-grey.svg new file mode 100644 index 000000000000..b64b8409c0d3 --- /dev/null +++ b/assets/images/logo-twitter-grey.svg @@ -0,0 +1,16 @@ + + + + + + + + diff --git a/assets/images/logo-wav2letter.svg b/assets/images/logo-wav2letter.svg new file mode 100644 index 000000000000..9ad1e5124a21 --- /dev/null +++ b/assets/images/logo-wav2letter.svg @@ -0,0 +1,12 @@ + + + + Page 1 + Created with Sketch. 
+ + + + + + + \ No newline at end of file diff --git a/assets/images/logo-white.svg b/assets/images/logo-white.svg new file mode 100644 index 000000000000..6aef1f225891 --- /dev/null +++ b/assets/images/logo-white.svg @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/logo-youtube-dark.svg b/assets/images/logo-youtube-dark.svg new file mode 100644 index 000000000000..e3cfedd79d14 --- /dev/null +++ b/assets/images/logo-youtube-dark.svg @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/logo.svg b/assets/images/logo.svg new file mode 100644 index 000000000000..c1baf78ee022 --- /dev/null +++ b/assets/images/logo.svg @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/low-latency/im1.svg b/assets/images/low-latency/im1.svg new file mode 100644 index 000000000000..b0ec12ac7908 --- /dev/null +++ b/assets/images/low-latency/im1.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/low-latency/im2.svg b/assets/images/low-latency/im2.svg new file mode 100644 index 000000000000..518dda4291b1 --- /dev/null +++ b/assets/images/low-latency/im2.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/low-latency/im3.svg b/assets/images/low-latency/im3.svg new file mode 100644 index 000000000000..be6838016e59 --- /dev/null +++ b/assets/images/low-latency/im3.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/low-latency/im4.svg b/assets/images/low-latency/im4.svg new file mode 100644 index 000000000000..6e7cdc083678 --- /dev/null +++ b/assets/images/low-latency/im4.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/low-latency/im5.svg b/assets/images/low-latency/im5.svg new file mode 100644 index 000000000000..45aad8761553 --- /dev/null +++ b/assets/images/low-latency/im5.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/low-latency/im6.svg b/assets/images/low-latency/im6.svg new file mode 100644 index 000000000000..6323efda1c2b --- /dev/null +++ b/assets/images/low-latency/im6.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/mAP-of-SSD320-MobileNetV3-Large.png b/assets/images/mAP-of-SSD320-MobileNetV3-Large.png new file mode 100644 index 000000000000..d52ee4e99a29 Binary files /dev/null and b/assets/images/mAP-of-SSD320-MobileNetV3-Large.png differ diff --git a/assets/images/matt-white.jpg b/assets/images/matt-white.jpg new file mode 100644 index 000000000000..c9168218b553 Binary files /dev/null and b/assets/images/matt-white.jpg differ diff --git a/assets/images/max-training-chart.jpg b/assets/images/max-training-chart.jpg new file mode 100644 index 000000000000..cdd68c3a4940 Binary files /dev/null and b/assets/images/max-training-chart.jpg differ diff --git a/assets/images/max-training-table.png b/assets/images/max-training-table.png new file mode 100644 index 000000000000..722cfdf37d6c Binary files /dev/null and b/assets/images/max-training-table.png differ diff --git a/assets/images/maximizing-training/loss_curve.png b/assets/images/maximizing-training/loss_curve.png new file mode 100644 index 000000000000..51b88c45bca7 Binary files /dev/null and b/assets/images/maximizing-training/loss_curve.png differ diff --git a/assets/images/maximizing-training/overlap_zoomed_in.png b/assets/images/maximizing-training/overlap_zoomed_in.png new file mode 100644 index 000000000000..f7e134ef2582 Binary files /dev/null and b/assets/images/maximizing-training/overlap_zoomed_in.png 
differ diff --git a/assets/images/maximizing-training/overlap_zoomed_out.png b/assets/images/maximizing-training/overlap_zoomed_out.png new file mode 100644 index 000000000000..26246629f5a1 Binary files /dev/null and b/assets/images/maximizing-training/overlap_zoomed_out.png differ diff --git a/assets/images/members/amd-logo.svg b/assets/images/members/amd-logo.svg new file mode 100644 index 000000000000..e4174ec8367a --- /dev/null +++ b/assets/images/members/amd-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/arm-logo.svg b/assets/images/members/arm-logo.svg new file mode 100644 index 000000000000..acd3a586c593 --- /dev/null +++ b/assets/images/members/arm-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/aws-logo.svg b/assets/images/members/aws-logo.svg new file mode 100644 index 000000000000..686e92fc8dba --- /dev/null +++ b/assets/images/members/aws-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/baai-logo.svg b/assets/images/members/baai-logo.svg new file mode 100644 index 000000000000..a7c5f04e524d --- /dev/null +++ b/assets/images/members/baai-logo.svg @@ -0,0 +1 @@ +baai logo \ No newline at end of file diff --git a/assets/images/members/bayero-logo.svg b/assets/images/members/bayero-logo.svg new file mode 100644 index 000000000000..94ce006a9204 --- /dev/null +++ b/assets/images/members/bayero-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/columbia-university-logo.svg b/assets/images/members/columbia-university-logo.svg new file mode 100644 index 000000000000..3a0ab3455d51 --- /dev/null +++ b/assets/images/members/columbia-university-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/common-crawl-logo.svg b/assets/images/members/common-crawl-logo.svg new file mode 100644 index 000000000000..2a9efcd9ef62 --- /dev/null +++ b/assets/images/members/common-crawl-logo.svg @@ -0,0 +1,52 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/members/dodlf-logo.jpg b/assets/images/members/dodlf-logo.jpg new file mode 100644 index 000000000000..c153de2adb11 Binary files /dev/null and b/assets/images/members/dodlf-logo.jpg differ diff --git a/assets/images/members/google-cloud-logo.svg b/assets/images/members/google-cloud-logo.svg new file mode 100644 index 000000000000..601d63857be6 --- /dev/null +++ b/assets/images/members/google-cloud-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/graphcore-logo.svg b/assets/images/members/graphcore-logo.svg new file mode 100644 index 000000000000..8de5b6e523ff --- /dev/null +++ b/assets/images/members/graphcore-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/hf-logo.svg b/assets/images/members/hf-logo.svg new file mode 100644 index 000000000000..0055b87be10c --- /dev/null +++ b/assets/images/members/hf-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/huawei-logo.svg b/assets/images/members/huawei-logo.svg new file mode 100644 index 000000000000..1c48cb5d13b1 --- /dev/null +++ b/assets/images/members/huawei-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/iabfu-logo.svg b/assets/images/members/iabfu-logo.svg new file mode 100644 index 000000000000..ac630fa9079e --- /dev/null +++ b/assets/images/members/iabfu-logo.svg @@ -0,0 +1,265 @@ + + + + 
+LOGO-01 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/members/ibm-logo.svg b/assets/images/members/ibm-logo.svg new file mode 100644 index 000000000000..277ee398bb90 --- /dev/null +++ b/assets/images/members/ibm-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/iit-logo.png b/assets/images/members/iit-logo.png new file mode 100644 index 000000000000..1cdf841f2aa2 Binary files /dev/null and b/assets/images/members/iit-logo.png differ diff --git a/assets/images/members/lightning-logo.png b/assets/images/members/lightning-logo.png new file mode 100644 index 000000000000..57708c344cf9 Binary files /dev/null and b/assets/images/members/lightning-logo.png differ diff --git a/assets/images/members/meta-logo.svg b/assets/images/members/meta-logo.svg new file mode 100644 index 000000000000..d21824612296 --- /dev/null +++ b/assets/images/members/meta-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/microsoft-azure-logo.svg b/assets/images/members/microsoft-azure-logo.svg new file mode 100644 index 000000000000..33c6602517e1 --- /dev/null +++ b/assets/images/members/microsoft-azure-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/nvidia-logo.svg b/assets/images/members/nvidia-logo.svg new file mode 100644 index 000000000000..ed382e3f9cc0 --- /dev/null +++ b/assets/images/members/nvidia-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/onefact-logo.svg b/assets/images/members/onefact-logo.svg new file mode 100644 index 000000000000..e7c63505aa1a --- /dev/null +++ b/assets/images/members/onefact-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/rebellions-logo.svg b/assets/images/members/rebellions-logo.svg new file mode 100644 index 000000000000..cdf6397a6606 --- /dev/null +++ b/assets/images/members/rebellions-logo.svg @@ -0,0 +1,74 @@ + + + + + + + + + + + + + + + + + + diff --git a/assets/images/members/rensselaer-logo.png b/assets/images/members/rensselaer-logo.png new file mode 100644 index 000000000000..cc30e72e6df4 Binary files /dev/null and b/assets/images/members/rensselaer-logo.png differ diff --git a/assets/images/members/snowflake-logo.svg b/assets/images/members/snowflake-logo.svg new file mode 100644 index 000000000000..0ee6819585d5 --- /dev/null +++ b/assets/images/members/snowflake-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/members/texas-am-logo.svg b/assets/images/members/texas-am-logo.svg new file mode 100644 index 000000000000..403077b7049e --- /dev/null +++ b/assets/images/members/texas-am-logo.svg @@ -0,0 +1,115 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/members/university-california-logo.svg b/assets/images/members/university-california-logo.svg new file mode 100644 index 000000000000..3e0fdc355a6e --- /dev/null +++ b/assets/images/members/university-california-logo.svg @@ -0,0 +1 @@ + diff --git a/assets/images/members/wedf-logo.png b/assets/images/members/wedf-logo.png new file mode 100644 index 000000000000..72ca57b2477f Binary files /dev/null and b/assets/images/members/wedf-logo.png differ diff --git a/assets/images/members/william-carey-logo.svg b/assets/images/members/william-carey-logo.svg new file mode 100644 index 
000000000000..994990c2ae9f --- /dev/null +++ b/assets/images/members/william-carey-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/memory_reduction.png b/assets/images/memory_reduction.png new file mode 100644 index 000000000000..f50b50734c30 Binary files /dev/null and b/assets/images/memory_reduction.png differ diff --git a/assets/images/microsoft-azure-logo.svg b/assets/images/microsoft-azure-logo.svg new file mode 100644 index 000000000000..3dddb6683f40 --- /dev/null +++ b/assets/images/microsoft-azure-logo.svg @@ -0,0 +1,57 @@ + + + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/microsoft-azure.png b/assets/images/microsoft-azure.png new file mode 100644 index 000000000000..4ebe6fc2fe55 Binary files /dev/null and b/assets/images/microsoft-azure.png differ diff --git a/assets/images/midas_samples.png b/assets/images/midas_samples.png new file mode 100644 index 000000000000..921e290edbae Binary files /dev/null and b/assets/images/midas_samples.png differ diff --git a/assets/images/mixed-precision-training-figure1.png b/assets/images/mixed-precision-training-figure1.png new file mode 100644 index 000000000000..76bd09df3251 Binary files /dev/null and b/assets/images/mixed-precision-training-figure1.png differ diff --git a/assets/images/mixed-precision-training-figure2.png b/assets/images/mixed-precision-training-figure2.png new file mode 100644 index 000000000000..4b1f6ba8dda8 Binary files /dev/null and b/assets/images/mixed-precision-training-figure2.png differ diff --git a/assets/images/mixed-precision-training-figure3.png b/assets/images/mixed-precision-training-figure3.png new file mode 100644 index 000000000000..238da8adf714 Binary files /dev/null and b/assets/images/mixed-precision-training-figure3.png differ diff --git a/assets/images/mixed-precision-training-figure5.png b/assets/images/mixed-precision-training-figure5.png new file mode 100644 index 000000000000..18d28b9ead52 Binary files /dev/null and b/assets/images/mixed-precision-training-figure5.png differ diff --git a/assets/images/ml-model-server-resource-saving/fg1-1.jpg b/assets/images/ml-model-server-resource-saving/fg1-1.jpg new file mode 100644 index 000000000000..f09d037b47b9 Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg1-1.jpg differ diff --git a/assets/images/ml-model-server-resource-saving/fg1.jpg b/assets/images/ml-model-server-resource-saving/fg1.jpg new file mode 100644 index 000000000000..548eef50480d Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg1.jpg differ diff --git a/assets/images/ml-model-server-resource-saving/fg2.jpg b/assets/images/ml-model-server-resource-saving/fg2.jpg new file mode 100644 index 000000000000..1c950f6a30a0 Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg2.jpg differ diff --git a/assets/images/ml-model-server-resource-saving/fg3.jpg b/assets/images/ml-model-server-resource-saving/fg3.jpg new file mode 100644 index 000000000000..11b05df040c3 Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg3.jpg differ diff --git a/assets/images/ml-model-server-resource-saving/fg4.jpg b/assets/images/ml-model-server-resource-saving/fg4.jpg new file mode 100644 index 000000000000..6c74f358f078 Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg4.jpg differ diff --git 
a/assets/images/ml-model-server-resource-saving/fg5.jpg b/assets/images/ml-model-server-resource-saving/fg5.jpg new file mode 100644 index 000000000000..502e3e265435 Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg5.jpg differ diff --git a/assets/images/ml-model-server-resource-saving/fg6.jpg b/assets/images/ml-model-server-resource-saving/fg6.jpg new file mode 100644 index 000000000000..87d971c71e2e Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg6.jpg differ diff --git a/assets/images/ml-model-server-resource-saving/fg7.jpg b/assets/images/ml-model-server-resource-saving/fg7.jpg new file mode 100644 index 000000000000..3d9f31d6ae22 Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg7.jpg differ diff --git a/assets/images/ml-model-server-resource-saving/fg8.jpg b/assets/images/ml-model-server-resource-saving/fg8.jpg new file mode 100644 index 000000000000..393ea3c76af3 Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg8.jpg differ diff --git a/assets/images/ml-model-server-resource-saving/fg9.jpg b/assets/images/ml-model-server-resource-saving/fg9.jpg new file mode 100644 index 000000000000..8c6a73131a4d Binary files /dev/null and b/assets/images/ml-model-server-resource-saving/fg9.jpg differ diff --git a/assets/images/mlops-workflow/fg1.png b/assets/images/mlops-workflow/fg1.png new file mode 100644 index 000000000000..6236ea784a46 Binary files /dev/null and b/assets/images/mlops-workflow/fg1.png differ diff --git a/assets/images/mlops-workflow/fg2.png b/assets/images/mlops-workflow/fg2.png new file mode 100644 index 000000000000..70e91976e3e4 Binary files /dev/null and b/assets/images/mlops-workflow/fg2.png differ diff --git a/assets/images/mlops-workflow/fg3.png b/assets/images/mlops-workflow/fg3.png new file mode 100644 index 000000000000..cbbeb433bba6 Binary files /dev/null and b/assets/images/mlops-workflow/fg3.png differ diff --git a/assets/images/mobile-icon.svg b/assets/images/mobile-icon.svg new file mode 100644 index 000000000000..ba5cbebfbe8b --- /dev/null +++ b/assets/images/mobile-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/mobile_app_code.png b/assets/images/mobile_app_code.png new file mode 100644 index 000000000000..2ffbfe79a64f Binary files /dev/null and b/assets/images/mobile_app_code.png differ diff --git a/assets/images/mobilenet-v3-block.png b/assets/images/mobilenet-v3-block.png new file mode 100644 index 000000000000..6a49a84cc0c7 Binary files /dev/null and b/assets/images/mobilenet-v3-block.png differ diff --git a/assets/images/mobilenet_v2_1.png b/assets/images/mobilenet_v2_1.png new file mode 100644 index 000000000000..55fd2a18bc9d Binary files /dev/null and b/assets/images/mobilenet_v2_1.png differ diff --git a/assets/images/mobilenet_v2_2.png b/assets/images/mobilenet_v2_2.png new file mode 100644 index 000000000000..bc184fa04573 Binary files /dev/null and b/assets/images/mobilenet_v2_2.png differ diff --git a/assets/images/model_size.png b/assets/images/model_size.png new file mode 100644 index 000000000000..29722e23da43 Binary files /dev/null and b/assets/images/model_size.png differ diff --git a/assets/images/multi-modal-dl-frame.png b/assets/images/multi-modal-dl-frame.png new file mode 100644 index 000000000000..9bb6b68c60c6 Binary files /dev/null and b/assets/images/multi-modal-dl-frame.png differ diff --git a/assets/images/multi_derivative_graph.png b/assets/images/multi_derivative_graph.png new file mode 
100644 index 000000000000..38005d016d57 Binary files /dev/null and b/assets/images/multi_derivative_graph.png differ diff --git a/assets/images/multidevice-integration/fg1.png b/assets/images/multidevice-integration/fg1.png new file mode 100644 index 000000000000..93139968e15c Binary files /dev/null and b/assets/images/multidevice-integration/fg1.png differ diff --git a/assets/images/multidevice-integration/fg2.png b/assets/images/multidevice-integration/fg2.png new file mode 100644 index 000000000000..b6ac4b6a9260 Binary files /dev/null and b/assets/images/multidevice-integration/fg2.png differ diff --git a/assets/images/ncf_diagram.png b/assets/images/ncf_diagram.png new file mode 100644 index 000000000000..ff27ccbfd1b9 Binary files /dev/null and b/assets/images/ncf_diagram.png differ diff --git a/assets/images/netlify.png b/assets/images/netlify.png new file mode 100644 index 000000000000..513fcd2b9c4e Binary files /dev/null and b/assets/images/netlify.png differ diff --git a/assets/images/new-library-updates-in-pytorch-1.13-2.jpg b/assets/images/new-library-updates-in-pytorch-1.13-2.jpg new file mode 100644 index 000000000000..956acc0eb38a Binary files /dev/null and b/assets/images/new-library-updates-in-pytorch-1.13-2.jpg differ diff --git a/assets/images/new-library-updates-in-pytorch-1.13.png b/assets/images/new-library-updates-in-pytorch-1.13.png new file mode 100644 index 000000000000..b3512c78ec08 Binary files /dev/null and b/assets/images/new-library-updates-in-pytorch-1.13.png differ diff --git a/.gitignore b/assets/images/no-image similarity index 100% rename from .gitignore rename to assets/images/no-image diff --git a/assets/images/node-fig-3.png b/assets/images/node-fig-3.png new file mode 100644 index 000000000000..81f8e5770b86 Binary files /dev/null and b/assets/images/node-fig-3.png differ diff --git a/assets/images/nswapytorch2.jpg b/assets/images/nswapytorch2.jpg new file mode 100644 index 000000000000..491a10e18d85 Binary files /dev/null and b/assets/images/nswapytorch2.jpg differ diff --git a/assets/images/nswapytorch6.png b/assets/images/nswapytorch6.png new file mode 100644 index 000000000000..e7483dae66c1 Binary files /dev/null and b/assets/images/nswapytorch6.png differ diff --git a/assets/images/nswapytorch8.png b/assets/images/nswapytorch8.png new file mode 100644 index 000000000000..3b1ba9e8bc32 Binary files /dev/null and b/assets/images/nswapytorch8.png differ diff --git a/assets/images/nsys_trace_cuda.png b/assets/images/nsys_trace_cuda.png new file mode 100644 index 000000000000..f24e9d6930aa Binary files /dev/null and b/assets/images/nsys_trace_cuda.png differ diff --git a/assets/images/nsys_trace_triton.png b/assets/images/nsys_trace_triton.png new file mode 100644 index 000000000000..37c7abf7af82 Binary files /dev/null and b/assets/images/nsys_trace_triton.png differ diff --git a/assets/images/nts-net.png b/assets/images/nts-net.png new file mode 100644 index 000000000000..b7bd97b1ec70 Binary files /dev/null and b/assets/images/nts-net.png differ diff --git a/assets/images/nuance-dragon-ambient-experience.png b/assets/images/nuance-dragon-ambient-experience.png new file mode 100644 index 000000000000..d3f0ebb79bfd Binary files /dev/null and b/assets/images/nuance-dragon-ambient-experience.png differ diff --git a/assets/images/nvidia-logo.png b/assets/images/nvidia-logo.png new file mode 100644 index 000000000000..194e1bad895a Binary files /dev/null and b/assets/images/nvidia-logo.png differ diff --git a/assets/images/nvidia_logo.png 
b/assets/images/nvidia_logo.png new file mode 100644 index 000000000000..41caa39c75d0 Binary files /dev/null and b/assets/images/nvidia_logo.png differ diff --git a/assets/images/nvidiafp16onv100.png b/assets/images/nvidiafp16onv100.png new file mode 100644 index 000000000000..46d29522a8d9 Binary files /dev/null and b/assets/images/nvidiafp16onv100.png differ diff --git a/assets/images/nvidiafp32onv100.jpg b/assets/images/nvidiafp32onv100.jpg new file mode 100644 index 000000000000..b3e0d03ed287 Binary files /dev/null and b/assets/images/nvidiafp32onv100.jpg differ diff --git a/assets/images/ofa_imagenet_results.png b/assets/images/ofa_imagenet_results.png new file mode 100644 index 000000000000..46ceae12c0c5 Binary files /dev/null and b/assets/images/ofa_imagenet_results.png differ diff --git a/assets/images/once_for_all_overview.png b/assets/images/once_for_all_overview.png new file mode 100644 index 000000000000..555bf30cc5e1 Binary files /dev/null and b/assets/images/once_for_all_overview.png differ diff --git a/assets/images/openmined-pytorch.png b/assets/images/openmined-pytorch.png new file mode 100644 index 000000000000..610799477dd1 Binary files /dev/null and b/assets/images/openmined-pytorch.png differ diff --git a/assets/images/openreg.png b/assets/images/openreg.png new file mode 100644 index 000000000000..71fab0973309 Binary files /dev/null and b/assets/images/openreg.png differ diff --git a/assets/images/optimize-llms.png b/assets/images/optimize-llms.png new file mode 100644 index 000000000000..ba6e73cf4899 Binary files /dev/null and b/assets/images/optimize-llms.png differ diff --git a/assets/images/optimized/im1.png b/assets/images/optimized/im1.png new file mode 100644 index 000000000000..0a8ade7b5c18 Binary files /dev/null and b/assets/images/optimized/im1.png differ diff --git a/assets/images/optimized/im2.png b/assets/images/optimized/im2.png new file mode 100644 index 000000000000..7b0bb40f9298 Binary files /dev/null and b/assets/images/optimized/im2.png differ diff --git a/assets/images/optimized/im3.png b/assets/images/optimized/im3.png new file mode 100644 index 000000000000..e72ce248af9b Binary files /dev/null and b/assets/images/optimized/im3.png differ diff --git a/assets/images/optimized/im4.png b/assets/images/optimized/im4.png new file mode 100644 index 000000000000..986c807199be Binary files /dev/null and b/assets/images/optimized/im4.png differ diff --git a/assets/images/optimized/im5.png b/assets/images/optimized/im5.png new file mode 100644 index 000000000000..e6e98771b812 Binary files /dev/null and b/assets/images/optimized/im5.png differ diff --git a/assets/images/optimized/im6.png b/assets/images/optimized/im6.png new file mode 100644 index 000000000000..40c5e3c33d51 Binary files /dev/null and b/assets/images/optimized/im6.png differ diff --git a/assets/images/optimized/im7.png b/assets/images/optimized/im7.png new file mode 100644 index 000000000000..b9760fd289bd Binary files /dev/null and b/assets/images/optimized/im7.png differ diff --git a/assets/images/optimizing-libtorch/im1.jpg b/assets/images/optimizing-libtorch/im1.jpg new file mode 100644 index 000000000000..320f52df9aa9 Binary files /dev/null and b/assets/images/optimizing-libtorch/im1.jpg differ diff --git a/assets/images/optimizing-libtorch/im2.jpg b/assets/images/optimizing-libtorch/im2.jpg new file mode 100644 index 000000000000..4bf6f1d1ab13 Binary files /dev/null and b/assets/images/optimizing-libtorch/im2.jpg differ diff --git a/assets/images/optimizing-libtorch/im3.jpg 
b/assets/images/optimizing-libtorch/im3.jpg new file mode 100644 index 000000000000..1f6ea5f82154 Binary files /dev/null and b/assets/images/optimizing-libtorch/im3.jpg differ diff --git a/assets/images/optimizing-libtorch/im4.jpg b/assets/images/optimizing-libtorch/im4.jpg new file mode 100644 index 000000000000..3eff5382ee48 Binary files /dev/null and b/assets/images/optimizing-libtorch/im4.jpg differ diff --git a/assets/images/optimizing-libtorch/im5.jpg b/assets/images/optimizing-libtorch/im5.jpg new file mode 100644 index 000000000000..203e39ceb343 Binary files /dev/null and b/assets/images/optimizing-libtorch/im5.jpg differ diff --git a/assets/images/out-of-the-box/Fig1.jpg b/assets/images/out-of-the-box/Fig1.jpg new file mode 100644 index 000000000000..b447352825ae Binary files /dev/null and b/assets/images/out-of-the-box/Fig1.jpg differ diff --git a/assets/images/out-of-the-box/Fig10.jpg b/assets/images/out-of-the-box/Fig10.jpg new file mode 100644 index 000000000000..c74c3a114d7b Binary files /dev/null and b/assets/images/out-of-the-box/Fig10.jpg differ diff --git a/assets/images/out-of-the-box/Fig2.jpg b/assets/images/out-of-the-box/Fig2.jpg new file mode 100644 index 000000000000..3de1f1b40e6f Binary files /dev/null and b/assets/images/out-of-the-box/Fig2.jpg differ diff --git a/assets/images/out-of-the-box/Fig3.jpg b/assets/images/out-of-the-box/Fig3.jpg new file mode 100644 index 000000000000..27ef7cb512fd Binary files /dev/null and b/assets/images/out-of-the-box/Fig3.jpg differ diff --git a/assets/images/out-of-the-box/Fig4.jpg b/assets/images/out-of-the-box/Fig4.jpg new file mode 100644 index 000000000000..baaf0c739b7d Binary files /dev/null and b/assets/images/out-of-the-box/Fig4.jpg differ diff --git a/assets/images/out-of-the-box/Fig5.jpg b/assets/images/out-of-the-box/Fig5.jpg new file mode 100644 index 000000000000..ff156b778982 Binary files /dev/null and b/assets/images/out-of-the-box/Fig5.jpg differ diff --git a/assets/images/out-of-the-box/Fig6.jpg b/assets/images/out-of-the-box/Fig6.jpg new file mode 100644 index 000000000000..0d5087323798 Binary files /dev/null and b/assets/images/out-of-the-box/Fig6.jpg differ diff --git a/assets/images/out-of-the-box/Fig7.jpg b/assets/images/out-of-the-box/Fig7.jpg new file mode 100644 index 000000000000..ae51f498a563 Binary files /dev/null and b/assets/images/out-of-the-box/Fig7.jpg differ diff --git a/assets/images/out-of-the-box/Fig8.jpg b/assets/images/out-of-the-box/Fig8.jpg new file mode 100644 index 000000000000..2262cb35060b Binary files /dev/null and b/assets/images/out-of-the-box/Fig8.jpg differ diff --git a/assets/images/out-of-the-box/Fig9.jpg b/assets/images/out-of-the-box/Fig9.jpg new file mode 100644 index 000000000000..140daec046ba Binary files /dev/null and b/assets/images/out-of-the-box/Fig9.jpg differ diff --git a/assets/images/packed_sequence.png b/assets/images/packed_sequence.png new file mode 100644 index 000000000000..b32a8953ef01 Binary files /dev/null and b/assets/images/packed_sequence.png differ diff --git a/assets/images/paris-tech-logo.png b/assets/images/paris-tech-logo.png new file mode 100644 index 000000000000..c2c77bf3b0bb Binary files /dev/null and b/assets/images/paris-tech-logo.png differ diff --git a/assets/images/peak-performance-minimized-memory/fg1.png b/assets/images/peak-performance-minimized-memory/fg1.png new file mode 100644 index 000000000000..175eadfbe04d Binary files /dev/null and b/assets/images/peak-performance-minimized-memory/fg1.png differ diff --git 
a/assets/images/peak-performance-minimized-memory/fg2.png b/assets/images/peak-performance-minimized-memory/fg2.png new file mode 100644 index 000000000000..365dfa313c7d Binary files /dev/null and b/assets/images/peak-performance-minimized-memory/fg2.png differ diff --git a/assets/images/peak-performance-minimized-memory/fg3.png b/assets/images/peak-performance-minimized-memory/fg3.png new file mode 100644 index 000000000000..6d28237582f5 Binary files /dev/null and b/assets/images/peak-performance-minimized-memory/fg3.png differ diff --git a/assets/images/peak-performance-minimized-memory/fg4.png b/assets/images/peak-performance-minimized-memory/fg4.png new file mode 100644 index 000000000000..3685c1c81f98 Binary files /dev/null and b/assets/images/peak-performance-minimized-memory/fg4.png differ diff --git a/assets/images/performance-boost-windows/fg1.png b/assets/images/performance-boost-windows/fg1.png new file mode 100644 index 000000000000..f9594aadb647 Binary files /dev/null and b/assets/images/performance-boost-windows/fg1.png differ diff --git a/assets/images/performance-boost-windows/fg2.png b/assets/images/performance-boost-windows/fg2.png new file mode 100644 index 000000000000..0bb6ef15622f Binary files /dev/null and b/assets/images/performance-boost-windows/fg2.png differ diff --git a/assets/images/performance-boost-windows/fg3.png b/assets/images/performance-boost-windows/fg3.png new file mode 100644 index 000000000000..2366489b045c Binary files /dev/null and b/assets/images/performance-boost-windows/fg3.png differ diff --git a/assets/images/performance-boost-windows/fg4.png b/assets/images/performance-boost-windows/fg4.png new file mode 100644 index 000000000000..61b0efadbbb2 Binary files /dev/null and b/assets/images/performance-boost-windows/fg4.png differ diff --git a/assets/images/performance-boost-windows/fg5.png b/assets/images/performance-boost-windows/fg5.png new file mode 100644 index 000000000000..57eb63f3ddc7 Binary files /dev/null and b/assets/images/performance-boost-windows/fg5.png differ diff --git a/assets/images/performance-boost-windows/fg6.png b/assets/images/performance-boost-windows/fg6.png new file mode 100644 index 000000000000..fe975644112a Binary files /dev/null and b/assets/images/performance-boost-windows/fg6.png differ diff --git a/assets/images/performance-debugging-of-production-pytorch-models-at-meta-1.png b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-1.png new file mode 100644 index 000000000000..da43f344062b Binary files /dev/null and b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-1.png differ diff --git a/assets/images/performance-debugging-of-production-pytorch-models-at-meta-2.png b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-2.png new file mode 100644 index 000000000000..dd26a5aa5fdd Binary files /dev/null and b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-2.png differ diff --git a/assets/images/performance-debugging-of-production-pytorch-models-at-meta-3.png b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-3.png new file mode 100644 index 000000000000..8a110f0dde7a Binary files /dev/null and b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-3.png differ diff --git a/assets/images/performance-debugging-of-production-pytorch-models-at-meta-4.png b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-4.png new file mode 100644 index 000000000000..ad926125f175 
Binary files /dev/null and b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-4.png differ diff --git a/assets/images/performance-debugging-of-production-pytorch-models-at-meta-5.png b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-5.png new file mode 100644 index 000000000000..81dc5dce9ed7 Binary files /dev/null and b/assets/images/performance-debugging-of-production-pytorch-models-at-meta-5.png differ diff --git a/assets/images/performance-gains-over-fp32-eager-2.png b/assets/images/performance-gains-over-fp32-eager-2.png new file mode 100644 index 000000000000..769b74fd9980 Binary files /dev/null and b/assets/images/performance-gains-over-fp32-eager-2.png differ diff --git a/assets/images/performance-gains-over-fp32-eager.png b/assets/images/performance-gains-over-fp32-eager.png new file mode 100644 index 000000000000..d8d9c32f3edc Binary files /dev/null and b/assets/images/performance-gains-over-fp32-eager.png differ diff --git a/assets/images/pgan_celebaHQ.jpg b/assets/images/pgan_celebaHQ.jpg new file mode 100644 index 000000000000..9fbcc4291db7 Binary files /dev/null and b/assets/images/pgan_celebaHQ.jpg differ diff --git a/assets/images/pgan_mix.jpg b/assets/images/pgan_mix.jpg new file mode 100644 index 000000000000..91959af4a578 Binary files /dev/null and b/assets/images/pgan_mix.jpg differ diff --git a/assets/images/pganlogo.png b/assets/images/pganlogo.png new file mode 100644 index 000000000000..1b623f792131 Binary files /dev/null and b/assets/images/pganlogo.png differ diff --git a/assets/images/pipe_buble.png b/assets/images/pipe_buble.png new file mode 100644 index 000000000000..3e700c3cfef7 Binary files /dev/null and b/assets/images/pipe_buble.png differ diff --git a/assets/images/pipetransformer_image_1.png b/assets/images/pipetransformer_image_1.png new file mode 100644 index 000000000000..0a36f9776b44 Binary files /dev/null and b/assets/images/pipetransformer_image_1.png differ diff --git a/assets/images/pipetransformer_image_2.gif b/assets/images/pipetransformer_image_2.gif new file mode 100644 index 000000000000..0bd6e02b2fdd Binary files /dev/null and b/assets/images/pipetransformer_image_2.gif differ diff --git a/assets/images/pipetransformer_image_3.png b/assets/images/pipetransformer_image_3.png new file mode 100644 index 000000000000..851df1c3216f Binary files /dev/null and b/assets/images/pipetransformer_image_3.png differ diff --git a/assets/images/pipetransformer_overview.png b/assets/images/pipetransformer_overview.png new file mode 100644 index 000000000000..480393f2277d Binary files /dev/null and b/assets/images/pipetransformer_overview.png differ diff --git a/assets/images/port-numbers.png b/assets/images/port-numbers.png new file mode 100644 index 000000000000..164fa6669dbe Binary files /dev/null and b/assets/images/port-numbers.png differ diff --git a/assets/images/prediction examples.png b/assets/images/prediction examples.png new file mode 100644 index 000000000000..d8ce029c2363 Binary files /dev/null and b/assets/images/prediction examples.png differ diff --git a/assets/images/probpackages.png b/assets/images/probpackages.png new file mode 100644 index 000000000000..82b3ec115afd Binary files /dev/null and b/assets/images/probpackages.png differ diff --git a/assets/images/profiler_1.9_image1.png b/assets/images/profiler_1.9_image1.png new file mode 100644 index 000000000000..6af994252f7d Binary files /dev/null and b/assets/images/profiler_1.9_image1.png differ diff --git 
a/assets/images/profiler_1.9_image10.png b/assets/images/profiler_1.9_image10.png new file mode 100644 index 000000000000..6a768d1727a7 Binary files /dev/null and b/assets/images/profiler_1.9_image10.png differ diff --git a/assets/images/profiler_1.9_image11.png b/assets/images/profiler_1.9_image11.png new file mode 100644 index 000000000000..0fab87af9deb Binary files /dev/null and b/assets/images/profiler_1.9_image11.png differ diff --git a/assets/images/profiler_1.9_image12.png b/assets/images/profiler_1.9_image12.png new file mode 100644 index 000000000000..f4b36c83b21b Binary files /dev/null and b/assets/images/profiler_1.9_image12.png differ diff --git a/assets/images/profiler_1.9_image13.png b/assets/images/profiler_1.9_image13.png new file mode 100644 index 000000000000..ae818a1135d0 Binary files /dev/null and b/assets/images/profiler_1.9_image13.png differ diff --git a/assets/images/profiler_1.9_image2.png b/assets/images/profiler_1.9_image2.png new file mode 100644 index 000000000000..9a8b4824b227 Binary files /dev/null and b/assets/images/profiler_1.9_image2.png differ diff --git a/assets/images/profiler_1.9_image4.png b/assets/images/profiler_1.9_image4.png new file mode 100644 index 000000000000..af3beff94f37 Binary files /dev/null and b/assets/images/profiler_1.9_image4.png differ diff --git a/assets/images/profiler_1.9_image5.png b/assets/images/profiler_1.9_image5.png new file mode 100644 index 000000000000..d5f9b2513d85 Binary files /dev/null and b/assets/images/profiler_1.9_image5.png differ diff --git a/assets/images/profiler_1.9_image6.png b/assets/images/profiler_1.9_image6.png new file mode 100644 index 000000000000..aef8a92535ca Binary files /dev/null and b/assets/images/profiler_1.9_image6.png differ diff --git a/assets/images/profiler_1.9_image7.png b/assets/images/profiler_1.9_image7.png new file mode 100644 index 000000000000..4023899b46c7 Binary files /dev/null and b/assets/images/profiler_1.9_image7.png differ diff --git a/assets/images/profiler_1.9_image8.png b/assets/images/profiler_1.9_image8.png new file mode 100644 index 000000000000..0b35eab21bc7 Binary files /dev/null and b/assets/images/profiler_1.9_image8.png differ diff --git a/assets/images/profiler_1.9_image9.png b/assets/images/profiler_1.9_image9.png new file mode 100644 index 000000000000..754e4334dff5 Binary files /dev/null and b/assets/images/profiler_1.9_image9.png differ diff --git a/assets/images/proxylessnas.png b/assets/images/proxylessnas.png new file mode 100644 index 000000000000..50fb554a589e Binary files /dev/null and b/assets/images/proxylessnas.png differ diff --git a/assets/images/pt-26-live-q-a.png b/assets/images/pt-26-live-q-a.png new file mode 100644 index 000000000000..d0b059cf8511 Binary files /dev/null and b/assets/images/pt-26-live-q-a.png differ diff --git a/assets/images/pt-conference.jpg b/assets/images/pt-conference.jpg new file mode 100644 index 000000000000..636c5626ea22 Binary files /dev/null and b/assets/images/pt-conference.jpg differ diff --git a/assets/images/pt-day-cfp.png b/assets/images/pt-day-cfp.png new file mode 100644 index 000000000000..f8f6a849f3ab Binary files /dev/null and b/assets/images/pt-day-cfp.png differ diff --git a/assets/images/pt-day-china-2025-cfp.jpg b/assets/images/pt-day-china-2025-cfp.jpg new file mode 100644 index 000000000000..d42c377175a5 Binary files /dev/null and b/assets/images/pt-day-china-2025-cfp.jpg differ diff --git a/assets/images/pt-dinov2-multi-label-plant-species-classification.png 
b/assets/images/pt-dinov2-multi-label-plant-species-classification.png new file mode 100644 index 000000000000..c544f914043a Binary files /dev/null and b/assets/images/pt-dinov2-multi-label-plant-species-classification.png differ diff --git a/assets/images/pt-executorch-ethos-u85/fg1.jpg b/assets/images/pt-executorch-ethos-u85/fg1.jpg new file mode 100644 index 000000000000..27c0ea1b6913 Binary files /dev/null and b/assets/images/pt-executorch-ethos-u85/fg1.jpg differ diff --git a/assets/images/pt-executorch-ethos-u85/fg2.jpg b/assets/images/pt-executorch-ethos-u85/fg2.jpg new file mode 100644 index 000000000000..95f70ba21995 Binary files /dev/null and b/assets/images/pt-executorch-ethos-u85/fg2.jpg differ diff --git a/assets/images/pt-executorch-ethos-u85/fg3.png b/assets/images/pt-executorch-ethos-u85/fg3.png new file mode 100644 index 000000000000..01beb91e7846 Binary files /dev/null and b/assets/images/pt-executorch-ethos-u85/fg3.png differ diff --git a/assets/images/pt-fedora-os-communities/fg1.jpg b/assets/images/pt-fedora-os-communities/fg1.jpg new file mode 100644 index 000000000000..e9c0de7b24ef Binary files /dev/null and b/assets/images/pt-fedora-os-communities/fg1.jpg differ diff --git a/assets/images/pt-fedora-os-communities/fg2.jpg b/assets/images/pt-fedora-os-communities/fg2.jpg new file mode 100644 index 000000000000..1aa340f71de9 Binary files /dev/null and b/assets/images/pt-fedora-os-communities/fg2.jpg differ diff --git a/assets/images/pt-fedora-os-communities/fg3.jpg b/assets/images/pt-fedora-os-communities/fg3.jpg new file mode 100644 index 000000000000..11ff09aaff08 Binary files /dev/null and b/assets/images/pt-fedora-os-communities/fg3.jpg differ diff --git a/assets/images/pt-fedora-os-communities/fg4.jpg b/assets/images/pt-fedora-os-communities/fg4.jpg new file mode 100644 index 000000000000..008d80e99dd4 Binary files /dev/null and b/assets/images/pt-fedora-os-communities/fg4.jpg differ diff --git a/assets/images/pt-fedora-os-communities/fg5.jpg b/assets/images/pt-fedora-os-communities/fg5.jpg new file mode 100644 index 000000000000..8761774d551b Binary files /dev/null and b/assets/images/pt-fedora-os-communities/fg5.jpg differ diff --git a/assets/images/pt-fedora-os-communities/fg6.jpg b/assets/images/pt-fedora-os-communities/fg6.jpg new file mode 100644 index 000000000000..9d06bd98d994 Binary files /dev/null and b/assets/images/pt-fedora-os-communities/fg6.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg1.jpg b/assets/images/pt-korea-user-group-recap/fg1.jpg new file mode 100644 index 000000000000..dbd408d6baf8 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg1.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg1.png b/assets/images/pt-korea-user-group-recap/fg1.png new file mode 100644 index 000000000000..02caf444eb0b Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg1.png differ diff --git a/assets/images/pt-korea-user-group-recap/fg2.jpg b/assets/images/pt-korea-user-group-recap/fg2.jpg new file mode 100644 index 000000000000..924780bb6a3c Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg2.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg2.png b/assets/images/pt-korea-user-group-recap/fg2.png new file mode 100644 index 000000000000..3153c75f603c Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg2.png differ diff --git a/assets/images/pt-korea-user-group-recap/fg3.jpg b/assets/images/pt-korea-user-group-recap/fg3.jpg new file 
mode 100644 index 000000000000..b49e18e82f9d Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg3.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg3.png b/assets/images/pt-korea-user-group-recap/fg3.png new file mode 100644 index 000000000000..ed9e8ccadb3e Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg3.png differ diff --git a/assets/images/pt-korea-user-group-recap/fg4.jpg b/assets/images/pt-korea-user-group-recap/fg4.jpg new file mode 100644 index 000000000000..b89e21c71d78 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg4.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg4.png b/assets/images/pt-korea-user-group-recap/fg4.png new file mode 100644 index 000000000000..4831610f703d Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg4.png differ diff --git a/assets/images/pt-korea-user-group-recap/fg5.jpg b/assets/images/pt-korea-user-group-recap/fg5.jpg new file mode 100644 index 000000000000..5e41bbd65b32 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg5.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg5.png b/assets/images/pt-korea-user-group-recap/fg5.png new file mode 100644 index 000000000000..02e5b01ac852 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg5.png differ diff --git a/assets/images/pt-korea-user-group-recap/fg6.jpg b/assets/images/pt-korea-user-group-recap/fg6.jpg new file mode 100644 index 000000000000..ce7c789c2385 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg6.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg6.png b/assets/images/pt-korea-user-group-recap/fg6.png new file mode 100644 index 000000000000..5e5650ab16cf Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg6.png differ diff --git a/assets/images/pt-korea-user-group-recap/fg7.jpg b/assets/images/pt-korea-user-group-recap/fg7.jpg new file mode 100644 index 000000000000..e0b7f94f2a3b Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg7.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg7.png b/assets/images/pt-korea-user-group-recap/fg7.png new file mode 100644 index 000000000000..3371ea3c6073 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg7.png differ diff --git a/assets/images/pt-korea-user-group-recap/fg8.jpg b/assets/images/pt-korea-user-group-recap/fg8.jpg new file mode 100644 index 000000000000..8d57c2a30bc4 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg8.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg8.png b/assets/images/pt-korea-user-group-recap/fg8.png new file mode 100644 index 000000000000..4ba3b87290b3 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg8.png differ diff --git a/assets/images/pt-korea-user-group-recap/fg9.jpg b/assets/images/pt-korea-user-group-recap/fg9.jpg new file mode 100644 index 000000000000..e631c2a29545 Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg9.jpg differ diff --git a/assets/images/pt-korea-user-group-recap/fg9.png b/assets/images/pt-korea-user-group-recap/fg9.png new file mode 100644 index 000000000000..02d9224b8c7d Binary files /dev/null and b/assets/images/pt-korea-user-group-recap/fg9.png differ diff --git a/assets/images/pt27qa.png b/assets/images/pt27qa.png new file mode 100644 index 000000000000..dbc60c8fcd0e Binary files /dev/null and b/assets/images/pt27qa.png differ diff 
--git a/assets/images/ptc2022/A01-thumb.png b/assets/images/ptc2022/A01-thumb.png new file mode 100644 index 000000000000..95a18fd71be7 Binary files /dev/null and b/assets/images/ptc2022/A01-thumb.png differ diff --git a/assets/images/ptc2022/B01-thumb.png b/assets/images/ptc2022/B01-thumb.png new file mode 100644 index 000000000000..fab9d8d08061 Binary files /dev/null and b/assets/images/ptc2022/B01-thumb.png differ diff --git a/assets/images/ptc2022/B02-thumb.png b/assets/images/ptc2022/B02-thumb.png new file mode 100644 index 000000000000..e41bbcf23538 Binary files /dev/null and b/assets/images/ptc2022/B02-thumb.png differ diff --git a/assets/images/ptc2022/B03-thumb.png b/assets/images/ptc2022/B03-thumb.png new file mode 100644 index 000000000000..b2c09fb51665 Binary files /dev/null and b/assets/images/ptc2022/B03-thumb.png differ diff --git a/assets/images/ptc2022/B04-thumb.png b/assets/images/ptc2022/B04-thumb.png new file mode 100644 index 000000000000..b9106eec9fae Binary files /dev/null and b/assets/images/ptc2022/B04-thumb.png differ diff --git a/assets/images/ptc2022/B05-thumb.png b/assets/images/ptc2022/B05-thumb.png new file mode 100644 index 000000000000..cc1eb7cd3ade Binary files /dev/null and b/assets/images/ptc2022/B05-thumb.png differ diff --git a/assets/images/ptc2022/B06-thumb.png b/assets/images/ptc2022/B06-thumb.png new file mode 100644 index 000000000000..78eb39eaecbf Binary files /dev/null and b/assets/images/ptc2022/B06-thumb.png differ diff --git a/assets/images/ptc2022/B07-thumb.png b/assets/images/ptc2022/B07-thumb.png new file mode 100644 index 000000000000..3ad33fb6bf29 Binary files /dev/null and b/assets/images/ptc2022/B07-thumb.png differ diff --git a/assets/images/ptc2022/B08-thumb.png b/assets/images/ptc2022/B08-thumb.png new file mode 100644 index 000000000000..d3721427e83a Binary files /dev/null and b/assets/images/ptc2022/B08-thumb.png differ diff --git a/assets/images/ptc2022/B09-thumb.png b/assets/images/ptc2022/B09-thumb.png new file mode 100644 index 000000000000..821546e46457 Binary files /dev/null and b/assets/images/ptc2022/B09-thumb.png differ diff --git a/assets/images/ptc2022/B10-thumb.png b/assets/images/ptc2022/B10-thumb.png new file mode 100644 index 000000000000..da219489bb6f Binary files /dev/null and b/assets/images/ptc2022/B10-thumb.png differ diff --git a/assets/images/ptc2022/B11-thumb.png b/assets/images/ptc2022/B11-thumb.png new file mode 100644 index 000000000000..8b38fb801fd9 Binary files /dev/null and b/assets/images/ptc2022/B11-thumb.png differ diff --git a/assets/images/ptc2022/B12-thumb.png b/assets/images/ptc2022/B12-thumb.png new file mode 100644 index 000000000000..0eef1c3ef9c3 Binary files /dev/null and b/assets/images/ptc2022/B12-thumb.png differ diff --git a/assets/images/ptc2022/B13-thumb.png b/assets/images/ptc2022/B13-thumb.png new file mode 100644 index 000000000000..7ae9801ab7e0 Binary files /dev/null and b/assets/images/ptc2022/B13-thumb.png differ diff --git a/assets/images/ptc2022/B14-thumb.png b/assets/images/ptc2022/B14-thumb.png new file mode 100644 index 000000000000..778d0525ff5b Binary files /dev/null and b/assets/images/ptc2022/B14-thumb.png differ diff --git a/assets/images/ptc2022/B15-thumb.png b/assets/images/ptc2022/B15-thumb.png new file mode 100644 index 000000000000..91d34ad4337e Binary files /dev/null and b/assets/images/ptc2022/B15-thumb.png differ diff --git a/assets/images/ptc2022/B17-thumb.png b/assets/images/ptc2022/B17-thumb.png new file mode 100644 index 000000000000..18e8ffa4a5a9 Binary 
files /dev/null and b/assets/images/ptc2022/B17-thumb.png differ diff --git a/assets/images/ptc2022/B18-thumb.png b/assets/images/ptc2022/B18-thumb.png new file mode 100644 index 000000000000..23a89ecf854d Binary files /dev/null and b/assets/images/ptc2022/B18-thumb.png differ diff --git a/assets/images/ptc2022/C01-thumb.png b/assets/images/ptc2022/C01-thumb.png new file mode 100644 index 000000000000..3fd3392338f0 Binary files /dev/null and b/assets/images/ptc2022/C01-thumb.png differ diff --git a/assets/images/ptc2022/C02-thumb.png b/assets/images/ptc2022/C02-thumb.png new file mode 100644 index 000000000000..a3ce300139cf Binary files /dev/null and b/assets/images/ptc2022/C02-thumb.png differ diff --git a/assets/images/ptc2022/C03-thumb.png b/assets/images/ptc2022/C03-thumb.png new file mode 100644 index 000000000000..e9d70f5b218e Binary files /dev/null and b/assets/images/ptc2022/C03-thumb.png differ diff --git a/assets/images/ptc2022/C04-thumb.png b/assets/images/ptc2022/C04-thumb.png new file mode 100644 index 000000000000..e904fec7f796 Binary files /dev/null and b/assets/images/ptc2022/C04-thumb.png differ diff --git a/assets/images/ptc2022/D01-thumb.png b/assets/images/ptc2022/D01-thumb.png new file mode 100644 index 000000000000..fe3f9d0a52d2 Binary files /dev/null and b/assets/images/ptc2022/D01-thumb.png differ diff --git a/assets/images/ptc2022/D02-thumb.png b/assets/images/ptc2022/D02-thumb.png new file mode 100644 index 000000000000..8b399b1b247d Binary files /dev/null and b/assets/images/ptc2022/D02-thumb.png differ diff --git a/assets/images/ptc2022/D03-thumb.png b/assets/images/ptc2022/D03-thumb.png new file mode 100644 index 000000000000..db9a54517809 Binary files /dev/null and b/assets/images/ptc2022/D03-thumb.png differ diff --git a/assets/images/ptc2022/E01-thumb.png b/assets/images/ptc2022/E01-thumb.png new file mode 100644 index 000000000000..b269491b39ae Binary files /dev/null and b/assets/images/ptc2022/E01-thumb.png differ diff --git a/assets/images/ptc2022/E02-thumb.png b/assets/images/ptc2022/E02-thumb.png new file mode 100644 index 000000000000..1d178791aafd Binary files /dev/null and b/assets/images/ptc2022/E02-thumb.png differ diff --git a/assets/images/ptc2022/E03-thumb.png b/assets/images/ptc2022/E03-thumb.png new file mode 100644 index 000000000000..2ae7d9284546 Binary files /dev/null and b/assets/images/ptc2022/E03-thumb.png differ diff --git a/assets/images/ptc2022/E04-thumb.png b/assets/images/ptc2022/E04-thumb.png new file mode 100644 index 000000000000..beb967a77dc3 Binary files /dev/null and b/assets/images/ptc2022/E04-thumb.png differ diff --git a/assets/images/ptc2022/F01-thumb.png b/assets/images/ptc2022/F01-thumb.png new file mode 100644 index 000000000000..71cac68a45e5 Binary files /dev/null and b/assets/images/ptc2022/F01-thumb.png differ diff --git a/assets/images/ptc2022/F02-thumb.png b/assets/images/ptc2022/F02-thumb.png new file mode 100644 index 000000000000..2d4ef4df46ab Binary files /dev/null and b/assets/images/ptc2022/F02-thumb.png differ diff --git a/assets/images/ptc2022/F03-thumb.png b/assets/images/ptc2022/F03-thumb.png new file mode 100644 index 000000000000..c2179ea02c9a Binary files /dev/null and b/assets/images/ptc2022/F03-thumb.png differ diff --git a/assets/images/ptc2022/F04-thumb.png b/assets/images/ptc2022/F04-thumb.png new file mode 100644 index 000000000000..27ad230ae7f3 Binary files /dev/null and b/assets/images/ptc2022/F04-thumb.png differ diff --git a/assets/images/ptc2022/G01-thumb.png 
b/assets/images/ptc2022/G01-thumb.png new file mode 100644 index 000000000000..010d72c363c1 Binary files /dev/null and b/assets/images/ptc2022/G01-thumb.png differ diff --git a/assets/images/ptc2022/H01-thumb.png b/assets/images/ptc2022/H01-thumb.png new file mode 100644 index 000000000000..87ddcb179e8c Binary files /dev/null and b/assets/images/ptc2022/H01-thumb.png differ diff --git a/assets/images/ptdevday21.gif b/assets/images/ptdevday21.gif new file mode 100644 index 000000000000..fe8cc65d5ff7 Binary files /dev/null and b/assets/images/ptdevday21.gif differ diff --git a/assets/images/pte-azure-lockup-stacked.png b/assets/images/pte-azure-lockup-stacked.png new file mode 100644 index 000000000000..ab710ed29471 Binary files /dev/null and b/assets/images/pte-azure-lockup-stacked.png differ diff --git a/assets/images/pte-azure-lockup.png b/assets/images/pte-azure-lockup.png new file mode 100644 index 000000000000..9fa4d58ec6ef Binary files /dev/null and b/assets/images/pte-azure-lockup.png differ diff --git a/assets/images/py20-ask-engs.png b/assets/images/py20-ask-engs.png new file mode 100644 index 000000000000..9fc91ad1a7a9 Binary files /dev/null and b/assets/images/py20-ask-engs.png differ diff --git a/assets/images/pyt-dev-day-2021.gif b/assets/images/pyt-dev-day-2021.gif new file mode 100644 index 000000000000..fe8cc65d5ff7 Binary files /dev/null and b/assets/images/pyt-dev-day-2021.gif differ diff --git a/assets/images/pytconf24-color-whitetext.svg b/assets/images/pytconf24-color-whitetext.svg new file mode 100644 index 000000000000..f04c78e89050 --- /dev/null +++ b/assets/images/pytconf24-color-whitetext.svg @@ -0,0 +1 @@ +pytconf24-color-whitetext.svg \ No newline at end of file diff --git a/assets/images/pytorch-2-7-intel-gpus/fg1.png b/assets/images/pytorch-2-7-intel-gpus/fg1.png new file mode 100644 index 000000000000..a0b4ee57da90 Binary files /dev/null and b/assets/images/pytorch-2-7-intel-gpus/fg1.png differ diff --git a/assets/images/pytorch-2-7-intel-gpus/fg2.png b/assets/images/pytorch-2-7-intel-gpus/fg2.png new file mode 100644 index 000000000000..cb39643891c1 Binary files /dev/null and b/assets/images/pytorch-2-7-intel-gpus/fg2.png differ diff --git a/assets/images/pytorch-2.0-feature-img.png b/assets/images/pytorch-2.0-feature-img.png new file mode 100644 index 000000000000..57f90a1873fc Binary files /dev/null and b/assets/images/pytorch-2.0-feature-img.png differ diff --git a/assets/images/pytorch-2.0-img10.png b/assets/images/pytorch-2.0-img10.png new file mode 100644 index 000000000000..4c408b78dceb Binary files /dev/null and b/assets/images/pytorch-2.0-img10.png differ diff --git a/assets/images/pytorch-2.0-img11.png b/assets/images/pytorch-2.0-img11.png new file mode 100644 index 000000000000..1e737c5291f7 Binary files /dev/null and b/assets/images/pytorch-2.0-img11.png differ diff --git a/assets/images/pytorch-2.0-img12.png b/assets/images/pytorch-2.0-img12.png new file mode 100644 index 000000000000..55d1dde5b68f Binary files /dev/null and b/assets/images/pytorch-2.0-img12.png differ diff --git a/assets/images/pytorch-2.0-img2.png b/assets/images/pytorch-2.0-img2.png new file mode 100644 index 000000000000..92bbc3ad6e48 Binary files /dev/null and b/assets/images/pytorch-2.0-img2.png differ diff --git a/assets/images/pytorch-2.0-img3.gif b/assets/images/pytorch-2.0-img3.gif new file mode 100644 index 000000000000..36a5035411ea Binary files /dev/null and b/assets/images/pytorch-2.0-img3.gif differ diff --git a/assets/images/pytorch-2.0-img4.jpg 
b/assets/images/pytorch-2.0-img4.jpg new file mode 100644 index 000000000000..ffebd754be6c Binary files /dev/null and b/assets/images/pytorch-2.0-img4.jpg differ diff --git a/assets/images/pytorch-2.0-img5.png b/assets/images/pytorch-2.0-img5.png new file mode 100644 index 000000000000..b63079389790 Binary files /dev/null and b/assets/images/pytorch-2.0-img5.png differ diff --git a/assets/images/pytorch-2.0-img6.png b/assets/images/pytorch-2.0-img6.png new file mode 100644 index 000000000000..d30c3c9fe8ab Binary files /dev/null and b/assets/images/pytorch-2.0-img6.png differ diff --git a/assets/images/pytorch-2.0-img7.png b/assets/images/pytorch-2.0-img7.png new file mode 100644 index 000000000000..d5f1143f9c09 Binary files /dev/null and b/assets/images/pytorch-2.0-img7.png differ diff --git a/assets/images/pytorch-2.0-img8.png b/assets/images/pytorch-2.0-img8.png new file mode 100644 index 000000000000..fe75d443f49c Binary files /dev/null and b/assets/images/pytorch-2.0-img8.png differ diff --git a/assets/images/pytorch-2.0-img9.png b/assets/images/pytorch-2.0-img9.png new file mode 100644 index 000000000000..cb1d40571047 Binary files /dev/null and b/assets/images/pytorch-2.0-img9.png differ diff --git a/assets/images/pytorch-at-gtc.jpg b/assets/images/pytorch-at-gtc.jpg new file mode 100644 index 000000000000..7380cfb0125a Binary files /dev/null and b/assets/images/pytorch-at-gtc.jpg differ diff --git a/assets/images/pytorch-compile-to-speed-up-inference/fig1.jpg b/assets/images/pytorch-compile-to-speed-up-inference/fig1.jpg new file mode 100644 index 000000000000..34e0c358967f Binary files /dev/null and b/assets/images/pytorch-compile-to-speed-up-inference/fig1.jpg differ diff --git a/assets/images/pytorch-compile-to-speed-up-inference/fig2.jpg b/assets/images/pytorch-compile-to-speed-up-inference/fig2.jpg new file mode 100644 index 000000000000..9aeb22fad369 Binary files /dev/null and b/assets/images/pytorch-compile-to-speed-up-inference/fig2.jpg differ diff --git a/assets/images/pytorch-compile-to-speed-up-inference/fig3.jpg b/assets/images/pytorch-compile-to-speed-up-inference/fig3.jpg new file mode 100644 index 000000000000..055aa6764044 Binary files /dev/null and b/assets/images/pytorch-compile-to-speed-up-inference/fig3.jpg differ diff --git a/assets/images/pytorch-conf-2023.png b/assets/images/pytorch-conf-2023.png new file mode 100644 index 000000000000..707636856a3d Binary files /dev/null and b/assets/images/pytorch-conf-2023.png differ diff --git a/assets/images/pytorch-conf-2024.png b/assets/images/pytorch-conf-2024.png new file mode 100644 index 000000000000..4c55f276b771 Binary files /dev/null and b/assets/images/pytorch-conf-2024.png differ diff --git a/assets/images/pytorch-conf-2025.jpg b/assets/images/pytorch-conf-2025.jpg new file mode 100644 index 000000000000..256c9efefd59 Binary files /dev/null and b/assets/images/pytorch-conf-2025.jpg differ diff --git a/assets/images/pytorch-conference-2022.png b/assets/images/pytorch-conference-2022.png new file mode 100644 index 000000000000..34cfce56e4cb Binary files /dev/null and b/assets/images/pytorch-conference-2022.png differ diff --git a/assets/images/pytorch-conference-2024-recap/54006027240_be489d89a3_k.jpg b/assets/images/pytorch-conference-2024-recap/54006027240_be489d89a3_k.jpg new file mode 100644 index 000000000000..e147fa969de2 Binary files /dev/null and b/assets/images/pytorch-conference-2024-recap/54006027240_be489d89a3_k.jpg differ diff --git 
a/assets/images/pytorch-conference-2024-recap/54017358432_8d9b53a2c8_k.jpg b/assets/images/pytorch-conference-2024-recap/54017358432_8d9b53a2c8_k.jpg new file mode 100644 index 000000000000..e130aee3a96d Binary files /dev/null and b/assets/images/pytorch-conference-2024-recap/54017358432_8d9b53a2c8_k.jpg differ diff --git a/assets/images/pytorch-conference-2024-recap/54018197476_9fce5b234d_k.jpg b/assets/images/pytorch-conference-2024-recap/54018197476_9fce5b234d_k.jpg new file mode 100644 index 000000000000..ef082b2dcc8a Binary files /dev/null and b/assets/images/pytorch-conference-2024-recap/54018197476_9fce5b234d_k.jpg differ diff --git a/assets/images/pytorch-conference-2024-recap/54018500933_4df67cbbd4_k.jpg b/assets/images/pytorch-conference-2024-recap/54018500933_4df67cbbd4_k.jpg new file mode 100644 index 000000000000..744f84a0ea64 Binary files /dev/null and b/assets/images/pytorch-conference-2024-recap/54018500933_4df67cbbd4_k.jpg differ diff --git a/assets/images/pytorch-conference-2024-recap/54018555324_daae473637_k.jpg b/assets/images/pytorch-conference-2024-recap/54018555324_daae473637_k.jpg new file mode 100644 index 000000000000..5524f00cbb9e Binary files /dev/null and b/assets/images/pytorch-conference-2024-recap/54018555324_daae473637_k.jpg differ diff --git a/assets/images/pytorch-conference-2024-recap/54036162068_0afdec2ca6_k.jpg b/assets/images/pytorch-conference-2024-recap/54036162068_0afdec2ca6_k.jpg new file mode 100644 index 000000000000..867fedab1a7b Binary files /dev/null and b/assets/images/pytorch-conference-2024-recap/54036162068_0afdec2ca6_k.jpg differ diff --git a/assets/images/pytorch-conference-2024-recap/sponsors.png b/assets/images/pytorch-conference-2024-recap/sponsors.png new file mode 100644 index 000000000000..9b987bb5b654 Binary files /dev/null and b/assets/images/pytorch-conference-2024-recap/sponsors.png differ diff --git a/assets/images/pytorch-developer-day-2021.png b/assets/images/pytorch-developer-day-2021.png new file mode 100644 index 000000000000..ebc8ad93b678 Binary files /dev/null and b/assets/images/pytorch-developer-day-2021.png differ diff --git a/assets/images/pytorch-ecosystem.png b/assets/images/pytorch-ecosystem.png new file mode 100644 index 000000000000..d453c5b26add Binary files /dev/null and b/assets/images/pytorch-ecosystem.png differ diff --git a/assets/images/pytorch-edge-arrows.svg b/assets/images/pytorch-edge-arrows.svg new file mode 100644 index 000000000000..f25e6771b125 --- /dev/null +++ b/assets/images/pytorch-edge-arrows.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/pytorch-foundation-blog-image.jpg b/assets/images/pytorch-foundation-blog-image.jpg new file mode 100644 index 000000000000..bd4344b5125a Binary files /dev/null and b/assets/images/pytorch-foundation-blog-image.jpg differ diff --git a/assets/images/pytorch-hub-arrow.svg b/assets/images/pytorch-hub-arrow.svg new file mode 100644 index 000000000000..d5c383a2d7ee --- /dev/null +++ b/assets/images/pytorch-hub-arrow.svg @@ -0,0 +1,12 @@ + + + + + + + < + + + + + diff --git a/assets/images/pytorch-ibm-headshot.png b/assets/images/pytorch-ibm-headshot.png new file mode 100644 index 000000000000..04b8fd348252 Binary files /dev/null and b/assets/images/pytorch-ibm-headshot.png differ diff --git a/assets/images/pytorch-ibm-logo.png b/assets/images/pytorch-ibm-logo.png new file mode 100644 index 000000000000..cc46f33c2682 Binary files /dev/null and b/assets/images/pytorch-ibm-logo.png differ diff --git 
a/assets/images/pytorch-ibm-thumbnail.png b/assets/images/pytorch-ibm-thumbnail.png new file mode 100644 index 000000000000..382783a3f04e Binary files /dev/null and b/assets/images/pytorch-ibm-thumbnail.png differ diff --git a/assets/images/pytorch-logo.jpg b/assets/images/pytorch-logo.jpg new file mode 100644 index 000000000000..02fb6a004945 Binary files /dev/null and b/assets/images/pytorch-logo.jpg differ diff --git a/assets/images/pytorch-logo.png b/assets/images/pytorch-logo.png new file mode 100755 index 000000000000..bad49bf30b4a Binary files /dev/null and b/assets/images/pytorch-logo.png differ diff --git a/assets/images/pytorch-mobile.png b/assets/images/pytorch-mobile.png new file mode 100644 index 000000000000..1812bccd8f8a Binary files /dev/null and b/assets/images/pytorch-mobile.png differ diff --git a/assets/images/pytorch-profiler-bottleneck.png b/assets/images/pytorch-profiler-bottleneck.png new file mode 100644 index 000000000000..33bba0cf7d18 Binary files /dev/null and b/assets/images/pytorch-profiler-bottleneck.png differ diff --git a/assets/images/pytorch-profiler-vscode-launch.png b/assets/images/pytorch-profiler-vscode-launch.png new file mode 100644 index 000000000000..5fa0299f3be5 Binary files /dev/null and b/assets/images/pytorch-profiler-vscode-launch.png differ diff --git a/assets/images/pytorch-profiler-vscode.png b/assets/images/pytorch-profiler-vscode.png new file mode 100644 index 000000000000..4a7c47ac33f5 Binary files /dev/null and b/assets/images/pytorch-profiler-vscode.png differ diff --git a/assets/images/pytorch-profiler.gif b/assets/images/pytorch-profiler.gif new file mode 100644 index 000000000000..8b61b6e3b61e Binary files /dev/null and b/assets/images/pytorch-profiler.gif differ diff --git a/assets/images/pytorch-shanghai-notes/fg1.jpg b/assets/images/pytorch-shanghai-notes/fg1.jpg new file mode 100644 index 000000000000..da549f68b9ee Binary files /dev/null and b/assets/images/pytorch-shanghai-notes/fg1.jpg differ diff --git a/assets/images/pytorch-shanghai-notes/fg2.jpg b/assets/images/pytorch-shanghai-notes/fg2.jpg new file mode 100644 index 000000000000..1e1542621e3c Binary files /dev/null and b/assets/images/pytorch-shanghai-notes/fg2.jpg differ diff --git a/assets/images/pytorch-shanghai-notes/fg3.jpg b/assets/images/pytorch-shanghai-notes/fg3.jpg new file mode 100644 index 000000000000..5c50748e72f6 Binary files /dev/null and b/assets/images/pytorch-shanghai-notes/fg3.jpg differ diff --git a/assets/images/pytorch-shanghai-notes/fg4.jpg b/assets/images/pytorch-shanghai-notes/fg4.jpg new file mode 100644 index 000000000000..3f711588bffd Binary files /dev/null and b/assets/images/pytorch-shanghai-notes/fg4.jpg differ diff --git a/assets/images/pytorch-shanghai-notes/fg5.jpg b/assets/images/pytorch-shanghai-notes/fg5.jpg new file mode 100644 index 000000000000..04747f86613c Binary files /dev/null and b/assets/images/pytorch-shanghai-notes/fg5.jpg differ diff --git a/assets/images/pytorch-shanghai-notes/fg6.jpg b/assets/images/pytorch-shanghai-notes/fg6.jpg new file mode 100644 index 000000000000..00f2f7c793e2 Binary files /dev/null and b/assets/images/pytorch-shanghai-notes/fg6.jpg differ diff --git a/assets/images/pytorch-timeline.svg b/assets/images/pytorch-timeline.svg new file mode 100644 index 000000000000..6997d2833cb9 --- /dev/null +++ b/assets/images/pytorch-timeline.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/pytorch-x.svg b/assets/images/pytorch-x.svg new file mode 100644 index 000000000000..74856ea9fdae 
--- /dev/null +++ b/assets/images/pytorch-x.svg @@ -0,0 +1,10 @@ + + + + + + + diff --git a/assets/images/pytorch-xla-spmd/fig1.png b/assets/images/pytorch-xla-spmd/fig1.png new file mode 100644 index 000000000000..d3bc80b637b8 Binary files /dev/null and b/assets/images/pytorch-xla-spmd/fig1.png differ diff --git a/assets/images/pytorch-xla-spmd/fig2.png b/assets/images/pytorch-xla-spmd/fig2.png new file mode 100644 index 000000000000..07b01e017404 Binary files /dev/null and b/assets/images/pytorch-xla-spmd/fig2.png differ diff --git a/assets/images/pytorch-xla-spmd/fig3.png b/assets/images/pytorch-xla-spmd/fig3.png new file mode 100644 index 000000000000..537dc8cc4c17 Binary files /dev/null and b/assets/images/pytorch-xla-spmd/fig3.png differ diff --git a/assets/images/pytorch-xla-spmd/fig4.png b/assets/images/pytorch-xla-spmd/fig4.png new file mode 100644 index 000000000000..1eeda0566b7b Binary files /dev/null and b/assets/images/pytorch-xla-spmd/fig4.png differ diff --git a/assets/images/pytorch1.6.png b/assets/images/pytorch1.6.png new file mode 100644 index 000000000000..2c173d9a7cc0 Binary files /dev/null and b/assets/images/pytorch1.6.png differ diff --git a/assets/images/pytorch20post.png b/assets/images/pytorch20post.png new file mode 100644 index 000000000000..c309f6384665 Binary files /dev/null and b/assets/images/pytorch20post.png differ diff --git a/assets/images/pytorch_better_transformer_chart1.png b/assets/images/pytorch_better_transformer_chart1.png new file mode 100644 index 000000000000..6a7420180898 Binary files /dev/null and b/assets/images/pytorch_better_transformer_chart1.png differ diff --git a/assets/images/pytorch_bg1.jpg b/assets/images/pytorch_bg1.jpg new file mode 100644 index 000000000000..6d3b1e389c70 Binary files /dev/null and b/assets/images/pytorch_bg1.jpg differ diff --git a/assets/images/pytorch_bg2.jpg b/assets/images/pytorch_bg2.jpg new file mode 100644 index 000000000000..ec5a9da99986 Binary files /dev/null and b/assets/images/pytorch_bg2.jpg differ diff --git a/assets/images/pytorch_bg3.jpg b/assets/images/pytorch_bg3.jpg new file mode 100644 index 000000000000..67bbb0df9f19 Binary files /dev/null and b/assets/images/pytorch_bg3.jpg differ diff --git a/assets/images/pytorch_bg4.jpg b/assets/images/pytorch_bg4.jpg new file mode 100644 index 000000000000..f4ddafca8030 Binary files /dev/null and b/assets/images/pytorch_bg4.jpg differ diff --git a/assets/images/pytorch_bg5.jpg b/assets/images/pytorch_bg5.jpg new file mode 100644 index 000000000000..1b3d0fa83438 Binary files /dev/null and b/assets/images/pytorch_bg5.jpg differ diff --git a/assets/images/pytorch_bg_purple.jpg b/assets/images/pytorch_bg_purple.jpg new file mode 100644 index 000000000000..2f48b15de324 Binary files /dev/null and b/assets/images/pytorch_bg_purple.jpg differ diff --git "a/assets/images/pytorch_conference\342\200\223dec_2nd_2022_new.gif" "b/assets/images/pytorch_conference\342\200\223dec_2nd_2022_new.gif" new file mode 100644 index 000000000000..aff5200d885a Binary files /dev/null and "b/assets/images/pytorch_conference\342\200\223dec_2nd_2022_new.gif" differ diff --git "a/assets/images/pytorch_conference\342\200\223dec_2nd_2022_ty.jpg" "b/assets/images/pytorch_conference\342\200\223dec_2nd_2022_ty.jpg" new file mode 100644 index 000000000000..95cac06bd39c Binary files /dev/null and "b/assets/images/pytorch_conference\342\200\223dec_2nd_2022_ty.jpg" differ diff --git a/assets/images/pytorch_developer_day21.gif b/assets/images/pytorch_developer_day21.gif new file mode 100644 
index 000000000000..3577408f7274 Binary files /dev/null and b/assets/images/pytorch_developer_day21.gif differ diff --git a/assets/images/pytorch_developer_day_2020.png b/assets/images/pytorch_developer_day_2020.png new file mode 100644 index 000000000000..37ba3c990c9e Binary files /dev/null and b/assets/images/pytorch_developer_day_2020.png differ diff --git a/assets/images/pytorch_developer_day_2021.png b/assets/images/pytorch_developer_day_2021.png new file mode 100644 index 000000000000..fb3ef3efbd0b Binary files /dev/null and b/assets/images/pytorch_developer_day_2021.png differ diff --git a/assets/images/pytorch_ecosystem_day_2021.jpeg b/assets/images/pytorch_ecosystem_day_2021.jpeg new file mode 100644 index 000000000000..65e8102426af Binary files /dev/null and b/assets/images/pytorch_ecosystem_day_2021.jpeg differ diff --git a/assets/images/pytorch_hackathon_2021.gif b/assets/images/pytorch_hackathon_2021.gif new file mode 100644 index 000000000000..ada7ca1fcef4 Binary files /dev/null and b/assets/images/pytorch_hackathon_2021.gif differ diff --git a/assets/images/pytorch_wechat_qr_code.png b/assets/images/pytorch_wechat_qr_code.png new file mode 100644 index 000000000000..f11087d5b53d Binary files /dev/null and b/assets/images/pytorch_wechat_qr_code.png differ diff --git a/assets/images/pytorchmobile.png b/assets/images/pytorchmobile.png new file mode 100644 index 000000000000..8f5c87142858 Binary files /dev/null and b/assets/images/pytorchmobile.png differ diff --git a/assets/images/pytorchs-tracing-based-selective-build_Figure1.png b/assets/images/pytorchs-tracing-based-selective-build_Figure1.png new file mode 100644 index 000000000000..2206c89064f0 Binary files /dev/null and b/assets/images/pytorchs-tracing-based-selective-build_Figure1.png differ diff --git a/assets/images/pytorchs-tracing-based-selective-build_Figure2.png b/assets/images/pytorchs-tracing-based-selective-build_Figure2.png new file mode 100644 index 000000000000..d90412d8feed Binary files /dev/null and b/assets/images/pytorchs-tracing-based-selective-build_Figure2.png differ diff --git a/assets/images/pytorchs-tracing-based-selective-build_Figure3.png b/assets/images/pytorchs-tracing-based-selective-build_Figure3.png new file mode 100644 index 000000000000..f7cd0695dcd9 Binary files /dev/null and b/assets/images/pytorchs-tracing-based-selective-build_Figure3.png differ diff --git a/assets/images/pytorchs-tracing-based-selective-build_Figure_4.png b/assets/images/pytorchs-tracing-based-selective-build_Figure_4.png new file mode 100644 index 000000000000..589c1fbf266b Binary files /dev/null and b/assets/images/pytorchs-tracing-based-selective-build_Figure_4.png differ diff --git a/assets/images/pytorchwebdataset1.png b/assets/images/pytorchwebdataset1.png new file mode 100644 index 000000000000..5c61a77808ff Binary files /dev/null and b/assets/images/pytorchwebdataset1.png differ diff --git a/assets/images/qaid.gif b/assets/images/qaid.gif new file mode 100644 index 000000000000..4815089d0898 Binary files /dev/null and b/assets/images/qaid.gif differ diff --git a/assets/images/quantization-aware-training/fg1.jpg b/assets/images/quantization-aware-training/fg1.jpg new file mode 100644 index 000000000000..f66052949e37 Binary files /dev/null and b/assets/images/quantization-aware-training/fg1.jpg differ diff --git a/assets/images/quantization-aware-training/fg2.png b/assets/images/quantization-aware-training/fg2.png new file mode 100644 index 000000000000..26c9da5e1250 Binary files /dev/null and 
b/assets/images/quantization-aware-training/fg2.png differ diff --git a/assets/images/quantization-aware-training/fg3a.png b/assets/images/quantization-aware-training/fg3a.png new file mode 100644 index 000000000000..4ac97a712206 Binary files /dev/null and b/assets/images/quantization-aware-training/fg3a.png differ diff --git a/assets/images/quantization-aware-training/fg3b.png b/assets/images/quantization-aware-training/fg3b.png new file mode 100644 index 000000000000..1fdac64bc2b9 Binary files /dev/null and b/assets/images/quantization-aware-training/fg3b.png differ diff --git a/assets/images/quantization-aware-training/fg4a.png b/assets/images/quantization-aware-training/fg4a.png new file mode 100644 index 000000000000..c1240a2ea021 Binary files /dev/null and b/assets/images/quantization-aware-training/fg4a.png differ diff --git a/assets/images/quantization-aware-training/fg4b.png b/assets/images/quantization-aware-training/fg4b.png new file mode 100644 index 000000000000..faf1a7f1167f Binary files /dev/null and b/assets/images/quantization-aware-training/fg4b.png differ diff --git a/assets/images/quantization-aware-training/fg5a.png b/assets/images/quantization-aware-training/fg5a.png new file mode 100644 index 000000000000..fb00e0fb7f72 Binary files /dev/null and b/assets/images/quantization-aware-training/fg5a.png differ diff --git a/assets/images/quantization-aware-training/fg5b.png b/assets/images/quantization-aware-training/fg5b.png new file mode 100644 index 000000000000..e9cf21856ce2 Binary files /dev/null and b/assets/images/quantization-aware-training/fg5b.png differ diff --git a/assets/images/quantization-aware-training/fg6a.png b/assets/images/quantization-aware-training/fg6a.png new file mode 100644 index 000000000000..1e1ac0e07c6d Binary files /dev/null and b/assets/images/quantization-aware-training/fg6a.png differ diff --git a/assets/images/quantization-aware-training/fg6b.png b/assets/images/quantization-aware-training/fg6b.png new file mode 100644 index 000000000000..381389dd3ce8 Binary files /dev/null and b/assets/images/quantization-aware-training/fg6b.png differ diff --git a/assets/images/quantization-practice/affine-symmetric.png b/assets/images/quantization-practice/affine-symmetric.png new file mode 100644 index 000000000000..30248e49a948 Binary files /dev/null and b/assets/images/quantization-practice/affine-symmetric.png differ diff --git a/assets/images/quantization-practice/compare_output_ns.png b/assets/images/quantization-practice/compare_output_ns.png new file mode 100644 index 000000000000..4ece4d114833 Binary files /dev/null and b/assets/images/quantization-practice/compare_output_ns.png differ diff --git a/assets/images/quantization-practice/hero.gif b/assets/images/quantization-practice/hero.gif new file mode 100644 index 000000000000..829c50371882 Binary files /dev/null and b/assets/images/quantization-practice/hero.gif differ diff --git a/assets/images/quantization-practice/per-channel-tensor.svg b/assets/images/quantization-practice/per-channel-tensor.svg new file mode 100644 index 000000000000..04449b308062 --- /dev/null +++ b/assets/images/quantization-practice/per-channel-tensor.svg @@ -0,0 +1,16 @@ + + + + + + + S0, Z0S1, Z1S2, Z2S, Zper-channel quantizationper-tensor quantization \ No newline at end of file diff --git a/assets/images/quantization-practice/ptq-flowchart.svg b/assets/images/quantization-practice/ptq-flowchart.svg new file mode 100644 index 000000000000..1e2502034b59 --- /dev/null +++ 
b/assets/images/quantization-practice/ptq-flowchart.svg @@ -0,0 +1,16 @@ + + + + + + + Pre-trained modelFuse modulesInsert stubs & observersCalibration dataCalibrationQuantizationPTQ Model \ No newline at end of file diff --git a/assets/images/quantization-practice/ptq_vs_qat.png b/assets/images/quantization-practice/ptq_vs_qat.png new file mode 100644 index 000000000000..c2557e1826c6 Binary files /dev/null and b/assets/images/quantization-practice/ptq_vs_qat.png differ diff --git a/assets/images/quantization-practice/qat-fake-quantization.png b/assets/images/quantization-practice/qat-fake-quantization.png new file mode 100644 index 000000000000..4aa1d87ded97 Binary files /dev/null and b/assets/images/quantization-practice/qat-fake-quantization.png differ diff --git a/assets/images/quantization-practice/qat-flowchart.svg b/assets/images/quantization-practice/qat-flowchart.svg new file mode 100644 index 000000000000..0b03ab4a76c2 --- /dev/null +++ b/assets/images/quantization-practice/qat-flowchart.svg @@ -0,0 +1,16 @@ + + + + + + + Pre-trained modelFuse modulesInsert stubs & observersTrainingdataTraining / FinetuningQuantizationQAT Model \ No newline at end of file diff --git a/assets/images/quantization-practice/quantization-flowchart2.png b/assets/images/quantization-practice/quantization-flowchart2.png new file mode 100644 index 000000000000..428ddfc5b52e Binary files /dev/null and b/assets/images/quantization-practice/quantization-flowchart2.png differ diff --git a/assets/images/real-time-speech-rec/cropped.gif b/assets/images/real-time-speech-rec/cropped.gif new file mode 100644 index 000000000000..96cbdf948f98 Binary files /dev/null and b/assets/images/real-time-speech-rec/cropped.gif differ diff --git a/assets/images/real-time-speech-rec/detected.gif b/assets/images/real-time-speech-rec/detected.gif new file mode 100644 index 000000000000..f5e563b5252c Binary files /dev/null and b/assets/images/real-time-speech-rec/detected.gif differ diff --git a/assets/images/real-time-speech-rec/model.jpg b/assets/images/real-time-speech-rec/model.jpg new file mode 100644 index 000000000000..af4a7714a34b Binary files /dev/null and b/assets/images/real-time-speech-rec/model.jpg differ diff --git a/assets/images/real-time-speech-rec/original.gif b/assets/images/real-time-speech-rec/original.gif new file mode 100644 index 000000000000..b3b1f9ebfd23 Binary files /dev/null and b/assets/images/real-time-speech-rec/original.gif differ diff --git a/assets/images/real-time-speech-rec/pipeline.jpg b/assets/images/real-time-speech-rec/pipeline.jpg new file mode 100644 index 000000000000..0102a9820995 Binary files /dev/null and b/assets/images/real-time-speech-rec/pipeline.jpg differ diff --git a/assets/images/real-time-speech-rec/transformed.gif b/assets/images/real-time-speech-rec/transformed.gif new file mode 100644 index 000000000000..1c1924cb0371 Binary files /dev/null and b/assets/images/real-time-speech-rec/transformed.gif differ diff --git a/assets/images/rebellions-logo.svg b/assets/images/rebellions-logo.svg new file mode 100644 index 000000000000..200a62c80653 --- /dev/null +++ b/assets/images/rebellions-logo.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/reducing-checkpointing-times/fg1.png b/assets/images/reducing-checkpointing-times/fg1.png new file mode 100644 index 000000000000..04d685834bd0 Binary files /dev/null and b/assets/images/reducing-checkpointing-times/fg1.png differ diff --git a/assets/images/reducing-checkpointing-times/fg2.png 
b/assets/images/reducing-checkpointing-times/fg2.png new file mode 100644 index 000000000000..35c4592d45dc Binary files /dev/null and b/assets/images/reducing-checkpointing-times/fg2.png differ diff --git a/assets/images/reducing-checkpointing-times/fg3.png b/assets/images/reducing-checkpointing-times/fg3.png new file mode 100644 index 000000000000..df96b630e4d2 Binary files /dev/null and b/assets/images/reducing-checkpointing-times/fg3.png differ diff --git a/assets/images/reducing-checkpointing-times/fg4.png b/assets/images/reducing-checkpointing-times/fg4.png new file mode 100644 index 000000000000..3b8218f08885 Binary files /dev/null and b/assets/images/reducing-checkpointing-times/fg4.png differ diff --git a/assets/images/reducing-checkpointing-times/fg5.png b/assets/images/reducing-checkpointing-times/fg5.png new file mode 100644 index 000000000000..48eb41ae4677 Binary files /dev/null and b/assets/images/reducing-checkpointing-times/fg5.png differ diff --git a/assets/images/resnest.jpg b/assets/images/resnest.jpg new file mode 100644 index 000000000000..994dc6ff00ee Binary files /dev/null and b/assets/images/resnest.jpg differ diff --git a/assets/images/resnet.png b/assets/images/resnet.png new file mode 100644 index 000000000000..81b7829677a3 Binary files /dev/null and b/assets/images/resnet.png differ diff --git a/assets/images/resnext.png b/assets/images/resnext.png new file mode 100644 index 000000000000..f74c4eb9025e Binary files /dev/null and b/assets/images/resnext.png differ diff --git a/assets/images/salesforce.png b/assets/images/salesforce.png new file mode 100644 index 000000000000..22bf99e04c01 Binary files /dev/null and b/assets/images/salesforce.png differ diff --git a/assets/images/scaling-multimodal-image1-diagram-of-multimodal-flava-new.png b/assets/images/scaling-multimodal-image1-diagram-of-multimodal-flava-new.png new file mode 100644 index 000000000000..3ff891e443df Binary files /dev/null and b/assets/images/scaling-multimodal-image1-diagram-of-multimodal-flava-new.png differ diff --git a/assets/images/scaling-multimodal-image2-diagram-of-standard-data-parallel-training.png b/assets/images/scaling-multimodal-image2-diagram-of-standard-data-parallel-training.png new file mode 100644 index 000000000000..8a624eb31a5b Binary files /dev/null and b/assets/images/scaling-multimodal-image2-diagram-of-standard-data-parallel-training.png differ diff --git a/assets/images/scaling-multimodal-image3-diagram-of-fully-shared-data-parallel-training.png b/assets/images/scaling-multimodal-image3-diagram-of-fully-shared-data-parallel-training.png new file mode 100644 index 000000000000..eef57f428471 Binary files /dev/null and b/assets/images/scaling-multimodal-image3-diagram-of-fully-shared-data-parallel-training.png differ diff --git a/assets/images/scaling-multimodal-image4-graph-experiments-figure1.png b/assets/images/scaling-multimodal-image4-graph-experiments-figure1.png new file mode 100644 index 000000000000..f7b379604f01 Binary files /dev/null and b/assets/images/scaling-multimodal-image4-graph-experiments-figure1.png differ diff --git a/assets/images/scaling-multimodal-image5-graph-experiments-figure-2.png b/assets/images/scaling-multimodal-image5-graph-experiments-figure-2.png new file mode 100644 index 000000000000..9cd8b15ff182 Binary files /dev/null and b/assets/images/scaling-multimodal-image5-graph-experiments-figure-2.png differ diff --git a/assets/images/scaling-pytorch-fsdp-image1-IBM_scaling_FSDP_visual_new.png 
b/assets/images/scaling-pytorch-fsdp-image1-IBM_scaling_FSDP_visual_new.png new file mode 100644 index 000000000000..e145e1e4a32a Binary files /dev/null and b/assets/images/scaling-pytorch-fsdp-image1-IBM_scaling_FSDP_visual_new.png differ diff --git a/assets/images/scaling-pytorch-fsdp-image2-tflops_per_second_new.png b/assets/images/scaling-pytorch-fsdp-image2-tflops_per_second_new.png new file mode 100644 index 000000000000..351ff7b5ecc1 Binary files /dev/null and b/assets/images/scaling-pytorch-fsdp-image2-tflops_per_second_new.png differ diff --git a/assets/images/scaling-pytorch-fsdp-image3-cli-and-dashboard.png b/assets/images/scaling-pytorch-fsdp-image3-cli-and-dashboard.png new file mode 100644 index 000000000000..a1388ed9c890 Binary files /dev/null and b/assets/images/scaling-pytorch-fsdp-image3-cli-and-dashboard.png differ diff --git a/assets/images/scaling-pytorch-models-on-cloud-tpus-with-fsdp.jpg b/assets/images/scaling-pytorch-models-on-cloud-tpus-with-fsdp.jpg new file mode 100644 index 000000000000..a14b6d6166e2 Binary files /dev/null and b/assets/images/scaling-pytorch-models-on-cloud-tpus-with-fsdp.jpg differ diff --git a/assets/images/scaling-recommendation-2d-sparse-parallelism/fg1.png b/assets/images/scaling-recommendation-2d-sparse-parallelism/fg1.png new file mode 100644 index 000000000000..08674e9efda3 Binary files /dev/null and b/assets/images/scaling-recommendation-2d-sparse-parallelism/fg1.png differ diff --git a/assets/images/scaling-recommendation-2d-sparse-parallelism/fg2.png b/assets/images/scaling-recommendation-2d-sparse-parallelism/fg2.png new file mode 100644 index 000000000000..45b60ca30c15 Binary files /dev/null and b/assets/images/scaling-recommendation-2d-sparse-parallelism/fg2.png differ diff --git a/assets/images/scaling-vision-figure_1-solutions-to-the-challenges.png b/assets/images/scaling-vision-figure_1-solutions-to-the-challenges.png new file mode 100644 index 000000000000..f5830c5e5a9b Binary files /dev/null and b/assets/images/scaling-vision-figure_1-solutions-to-the-challenges.png differ diff --git a/assets/images/scaling-vision-figure_2-image-classification-scaling-result.png b/assets/images/scaling-vision-figure_2-image-classification-scaling-result.png new file mode 100644 index 000000000000..6af0be90e396 Binary files /dev/null and b/assets/images/scaling-vision-figure_2-image-classification-scaling-result.png differ diff --git a/assets/images/scaling-vision-figure_3-object-detection-scaling-result.png b/assets/images/scaling-vision-figure_3-object-detection-scaling-result.png new file mode 100644 index 000000000000..b34d815959cd Binary files /dev/null and b/assets/images/scaling-vision-figure_3-object-detection-scaling-result.png differ diff --git a/assets/images/scaling-vision-figure_4-video-understanding-scaling-result.png b/assets/images/scaling-vision-figure_4-video-understanding-scaling-result.png new file mode 100644 index 000000000000..2799d9ddacb8 Binary files /dev/null and b/assets/images/scaling-vision-figure_4-video-understanding-scaling-result.png differ diff --git a/assets/images/scaling-vision-figure_5-training-speedups-from-various-optimizations.png b/assets/images/scaling-vision-figure_5-training-speedups-from-various-optimizations.png new file mode 100644 index 000000000000..8931c0293f3b Binary files /dev/null and b/assets/images/scaling-vision-figure_5-training-speedups-from-various-optimizations.png differ diff --git a/assets/images/screencast.png b/assets/images/screencast.png new file mode 100644 index 
000000000000..81579c56c590 Binary files /dev/null and b/assets/images/screencast.png differ diff --git a/assets/images/screenshot_live_image_classification1.png b/assets/images/screenshot_live_image_classification1.png new file mode 100644 index 000000000000..2627b6ef7072 Binary files /dev/null and b/assets/images/screenshot_live_image_classification1.png differ diff --git a/assets/images/screenshot_live_image_classification2.png b/assets/images/screenshot_live_image_classification2.png new file mode 100644 index 000000000000..5101d36257a1 Binary files /dev/null and b/assets/images/screenshot_live_image_classification2.png differ diff --git a/assets/images/screenshot_mobile_asr1.png b/assets/images/screenshot_mobile_asr1.png new file mode 100644 index 000000000000..4b77963ad6ea Binary files /dev/null and b/assets/images/screenshot_mobile_asr1.png differ diff --git a/assets/images/screenshot_mobile_asr2.png b/assets/images/screenshot_mobile_asr2.png new file mode 100644 index 000000000000..2e73728fbd9c Binary files /dev/null and b/assets/images/screenshot_mobile_asr2.png differ diff --git a/assets/images/screenshot_mobile_d2go1.png b/assets/images/screenshot_mobile_d2go1.png new file mode 100644 index 000000000000..885a0c035b23 Binary files /dev/null and b/assets/images/screenshot_mobile_d2go1.png differ diff --git a/assets/images/screenshot_mobile_d2go2.png b/assets/images/screenshot_mobile_d2go2.png new file mode 100644 index 000000000000..ff6721cb4472 Binary files /dev/null and b/assets/images/screenshot_mobile_d2go2.png differ diff --git a/assets/images/screenshot_mobile_digit_recognition1.png b/assets/images/screenshot_mobile_digit_recognition1.png new file mode 100644 index 000000000000..73bf5df124ce Binary files /dev/null and b/assets/images/screenshot_mobile_digit_recognition1.png differ diff --git a/assets/images/screenshot_mobile_digit_recognition2.png b/assets/images/screenshot_mobile_digit_recognition2.png new file mode 100644 index 000000000000..1d4f541cc097 Binary files /dev/null and b/assets/images/screenshot_mobile_digit_recognition2.png differ diff --git a/assets/images/screenshot_mobile_helloworld.png b/assets/images/screenshot_mobile_helloworld.png new file mode 100644 index 000000000000..48b5857eee22 Binary files /dev/null and b/assets/images/screenshot_mobile_helloworld.png differ diff --git a/assets/images/screenshot_mobile_machinetranslation1.png b/assets/images/screenshot_mobile_machinetranslation1.png new file mode 100644 index 000000000000..8911fa4fac08 Binary files /dev/null and b/assets/images/screenshot_mobile_machinetranslation1.png differ diff --git a/assets/images/screenshot_mobile_machinetranslation2.png b/assets/images/screenshot_mobile_machinetranslation2.png new file mode 100644 index 000000000000..563a07d1d20f Binary files /dev/null and b/assets/images/screenshot_mobile_machinetranslation2.png differ diff --git a/assets/images/screenshot_mobile_object_detection1.png b/assets/images/screenshot_mobile_object_detection1.png new file mode 100644 index 000000000000..4caf2434798b Binary files /dev/null and b/assets/images/screenshot_mobile_object_detection1.png differ diff --git a/assets/images/screenshot_mobile_object_detection2.png b/assets/images/screenshot_mobile_object_detection2.png new file mode 100644 index 000000000000..9da5343c4662 Binary files /dev/null and b/assets/images/screenshot_mobile_object_detection2.png differ diff --git a/assets/images/screenshot_mobile_qa1.png b/assets/images/screenshot_mobile_qa1.png new file mode 100644 index 
000000000000..82c9fbab6e52 Binary files /dev/null and b/assets/images/screenshot_mobile_qa1.png differ diff --git a/assets/images/screenshot_mobile_qa2.png b/assets/images/screenshot_mobile_qa2.png new file mode 100644 index 000000000000..11272c0fd34f Binary files /dev/null and b/assets/images/screenshot_mobile_qa2.png differ diff --git a/assets/images/screenshot_mobile_textclassification1.png b/assets/images/screenshot_mobile_textclassification1.png new file mode 100644 index 000000000000..8b435f53dc56 Binary files /dev/null and b/assets/images/screenshot_mobile_textclassification1.png differ diff --git a/assets/images/screenshot_mobile_textclassification2.png b/assets/images/screenshot_mobile_textclassification2.png new file mode 100644 index 000000000000..dd2de564ca60 Binary files /dev/null and b/assets/images/screenshot_mobile_textclassification2.png differ diff --git a/assets/images/screenshot_mobile_video1.png b/assets/images/screenshot_mobile_video1.png new file mode 100644 index 000000000000..831281acc3e4 Binary files /dev/null and b/assets/images/screenshot_mobile_video1.png differ diff --git a/assets/images/screenshot_mobile_video2.png b/assets/images/screenshot_mobile_video2.png new file mode 100644 index 000000000000..c802f9f1cf6a Binary files /dev/null and b/assets/images/screenshot_mobile_video2.png differ diff --git a/assets/images/seamless.png b/assets/images/seamless.png new file mode 100644 index 000000000000..ad0b9d8f2a8b Binary files /dev/null and b/assets/images/seamless.png differ diff --git a/assets/images/search-icon-orange.svg b/assets/images/search-icon-orange.svg new file mode 100644 index 000000000000..0e66c2c77b40 --- /dev/null +++ b/assets/images/search-icon-orange.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/assets/images/search-icon-white.svg b/assets/images/search-icon-white.svg new file mode 100644 index 000000000000..70b2f6e5b203 --- /dev/null +++ b/assets/images/search-icon-white.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/assets/images/search-icon.svg b/assets/images/search-icon.svg new file mode 100644 index 000000000000..ade68ef1aab3 --- /dev/null +++ b/assets/images/search-icon.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/assets/images/segmentation.png b/assets/images/segmentation.png new file mode 100644 index 000000000000..3744d060d293 Binary files /dev/null and b/assets/images/segmentation.png differ diff --git a/assets/images/sentiment.png b/assets/images/sentiment.png new file mode 100644 index 000000000000..27f3ed12bf7b Binary files /dev/null and b/assets/images/sentiment.png differ diff --git a/assets/images/sglang-join-pytorch/fg1.png b/assets/images/sglang-join-pytorch/fg1.png new file mode 100644 index 000000000000..a7838c59ac6e Binary files /dev/null and b/assets/images/sglang-join-pytorch/fg1.png differ diff --git a/assets/images/sglang-join-pytorch/fg2.png b/assets/images/sglang-join-pytorch/fg2.png new file mode 100644 index 000000000000..5e7e3b1d1f0a Binary files /dev/null and b/assets/images/sglang-join-pytorch/fg2.png differ diff --git a/assets/images/shufflenet_v2_1.png b/assets/images/shufflenet_v2_1.png new file mode 100644 index 000000000000..2e6750b419d3 Binary files /dev/null and b/assets/images/shufflenet_v2_1.png differ diff --git a/assets/images/shufflenet_v2_2.png b/assets/images/shufflenet_v2_2.png new file mode 100644 index 000000000000..44960faacc4d Binary files /dev/null and b/assets/images/shufflenet_v2_2.png differ diff --git a/assets/images/sigsep_logo_inria.png 
b/assets/images/sigsep_logo_inria.png new file mode 100644 index 000000000000..066ea8861253 Binary files /dev/null and b/assets/images/sigsep_logo_inria.png differ diff --git a/assets/images/sigsep_umx-diagram.png b/assets/images/sigsep_umx-diagram.png new file mode 100644 index 000000000000..9cb5c4a3591d Binary files /dev/null and b/assets/images/sigsep_umx-diagram.png differ diff --git a/assets/images/silero_imagenet_moment.png b/assets/images/silero_imagenet_moment.png new file mode 100644 index 000000000000..faa16dc5ce49 Binary files /dev/null and b/assets/images/silero_imagenet_moment.png differ diff --git a/assets/images/silero_logo.jpg b/assets/images/silero_logo.jpg new file mode 100644 index 000000000000..0ced1942afa6 Binary files /dev/null and b/assets/images/silero_logo.jpg differ diff --git a/assets/images/silero_stt_model.jpg b/assets/images/silero_stt_model.jpg new file mode 100644 index 000000000000..2e67c11c2d31 Binary files /dev/null and b/assets/images/silero_stt_model.jpg differ diff --git a/assets/images/silero_vad_performance.png b/assets/images/silero_vad_performance.png new file mode 100644 index 000000000000..9d1d9f4f1479 Binary files /dev/null and b/assets/images/silero_vad_performance.png differ diff --git a/assets/images/simplenet.jpg b/assets/images/simplenet.jpg new file mode 100644 index 000000000000..e3bc71437dc9 Binary files /dev/null and b/assets/images/simplenet.jpg differ diff --git a/assets/images/sketch_animator.png b/assets/images/sketch_animator.png new file mode 100644 index 000000000000..073d86c68637 Binary files /dev/null and b/assets/images/sketch_animator.png differ diff --git a/assets/images/slowfast.png b/assets/images/slowfast.png new file mode 100644 index 000000000000..c5f542a1f81e Binary files /dev/null and b/assets/images/slowfast.png differ diff --git a/assets/images/snnmlp.png b/assets/images/snnmlp.png new file mode 100644 index 000000000000..f08f8ea86f6d Binary files /dev/null and b/assets/images/snnmlp.png differ diff --git a/assets/images/snowflake-logo.svg b/assets/images/snowflake-logo.svg new file mode 100644 index 000000000000..479b911b7333 --- /dev/null +++ b/assets/images/snowflake-logo.svg @@ -0,0 +1,26 @@ + + + + Group + Created with Sketch. 
+ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/social-share.jpg b/assets/images/social-share.jpg new file mode 100644 index 000000000000..0f2e7b94bf7e Binary files /dev/null and b/assets/images/social-share.jpg differ diff --git a/assets/images/social_hackathon21.png b/assets/images/social_hackathon21.png new file mode 100644 index 000000000000..c6ef2f3cc56c Binary files /dev/null and b/assets/images/social_hackathon21.png differ diff --git a/assets/images/sota/Best ResNet50 trained with 176 Resolution.png b/assets/images/sota/Best ResNet50 trained with 176 Resolution.png new file mode 100644 index 000000000000..0ce924ceae15 Binary files /dev/null and b/assets/images/sota/Best ResNet50 trained with 176 Resolution.png differ diff --git a/assets/images/sota/Best ResNet50 trained with 224 Resolution.png b/assets/images/sota/Best ResNet50 trained with 224 Resolution.png new file mode 100644 index 000000000000..4f58e3697e70 Binary files /dev/null and b/assets/images/sota/Best ResNet50 trained with 224 Resolution.png differ diff --git a/assets/images/sota/Cumulative Accuracy Improvements for ResNet50.png b/assets/images/sota/Cumulative Accuracy Improvements for ResNet50.png new file mode 100644 index 000000000000..d352f33117c3 Binary files /dev/null and b/assets/images/sota/Cumulative Accuracy Improvements for ResNet50.png differ diff --git a/assets/images/sota/ResNet101 Inference Resize.png b/assets/images/sota/ResNet101 Inference Resize.png new file mode 100644 index 000000000000..ab09c22feb53 Binary files /dev/null and b/assets/images/sota/ResNet101 Inference Resize.png differ diff --git a/assets/images/sota/ResNet152 Inference Resize.png b/assets/images/sota/ResNet152 Inference Resize.png new file mode 100644 index 000000000000..f86d69bf2cf2 Binary files /dev/null and b/assets/images/sota/ResNet152 Inference Resize.png differ diff --git a/assets/images/sota/ResNet50 Inference Resize.png b/assets/images/sota/ResNet50 Inference Resize.png new file mode 100644 index 000000000000..04aa6cd18c80 Binary files /dev/null and b/assets/images/sota/ResNet50 Inference Resize.png differ diff --git a/assets/images/spectrograms.png b/assets/images/spectrograms.png new file mode 100644 index 000000000000..48f6a1ef50d8 Binary files /dev/null and b/assets/images/spectrograms.png differ diff --git a/assets/images/speeding-up-vits/fig1.png b/assets/images/speeding-up-vits/fig1.png new file mode 100644 index 000000000000..758b2be72f2b Binary files /dev/null and b/assets/images/speeding-up-vits/fig1.png differ diff --git a/assets/images/speeding-up-vits/fig2.png b/assets/images/speeding-up-vits/fig2.png new file mode 100644 index 000000000000..87229b169381 Binary files /dev/null and b/assets/images/speeding-up-vits/fig2.png differ diff --git a/assets/images/speeding-up-vits/fig3.png b/assets/images/speeding-up-vits/fig3.png new file mode 100644 index 000000000000..7744b9b41d6a Binary files /dev/null and b/assets/images/speeding-up-vits/fig3.png differ diff --git a/assets/images/speeding-up-vits/fig4.png b/assets/images/speeding-up-vits/fig4.png new file mode 100644 index 000000000000..550b5e3ea06a Binary files /dev/null and b/assets/images/speeding-up-vits/fig4.png differ diff --git a/assets/images/speeding-up-vits/fig5.png b/assets/images/speeding-up-vits/fig5.png new file mode 100644 index 000000000000..52eb171cdac3 Binary files /dev/null and b/assets/images/speeding-up-vits/fig5.png differ diff --git a/assets/images/speeding-up-vits/fig6.png 
b/assets/images/speeding-up-vits/fig6.png new file mode 100644 index 000000000000..ba5d79ed0304 Binary files /dev/null and b/assets/images/speeding-up-vits/fig6.png differ diff --git a/assets/images/speeding-up-vits/fig7.png b/assets/images/speeding-up-vits/fig7.png new file mode 100644 index 000000000000..8f59f3b5b8fd Binary files /dev/null and b/assets/images/speeding-up-vits/fig7.png differ diff --git a/assets/images/squares-icon.svg b/assets/images/squares-icon.svg new file mode 100644 index 000000000000..2b9a75a338e6 --- /dev/null +++ b/assets/images/squares-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/squeezenet.png b/assets/images/squeezenet.png new file mode 100644 index 000000000000..f98b670f4dfb Binary files /dev/null and b/assets/images/squeezenet.png differ diff --git a/assets/images/ssd.png b/assets/images/ssd.png new file mode 100644 index 000000000000..a7bdff94eca2 Binary files /dev/null and b/assets/images/ssd.png differ diff --git a/assets/images/ssd_diagram.png b/assets/images/ssd_diagram.png new file mode 100644 index 000000000000..cbb0e69bc04e Binary files /dev/null and b/assets/images/ssd_diagram.png differ diff --git a/assets/images/ssdlite-pre-trained.png b/assets/images/ssdlite-pre-trained.png new file mode 100644 index 000000000000..1f0d1b618310 Binary files /dev/null and b/assets/images/ssdlite-pre-trained.png differ diff --git a/assets/images/ssl-image.png b/assets/images/ssl-image.png new file mode 100644 index 000000000000..0fa72e245f1a Binary files /dev/null and b/assets/images/ssl-image.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_1_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_1_n_iter_2_sd2.png new file mode 100644 index 000000000000..e2a6101e8568 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_1_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_2_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_2_n_iter_2_sd2.png new file mode 100644 index 000000000000..4c0d22cb5d1e Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_2_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_4_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_4_n_iter_2_sd2.png new file mode 100644 index 000000000000..341d29ca76c8 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_A10_n_samples_4_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_1_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_1_n_iter_2_sd2.png new file mode 100644 index 000000000000..73108eb6d594 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_1_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_2_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_2_n_iter_2_sd2.png new file mode 100644 index 000000000000..3de1a6113759 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_2_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_4_n_iter_2_sd2.png 
b/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_4_n_iter_2_sd2.png new file mode 100644 index 000000000000..1483e742c6db Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_a100_n_samples_4_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_1_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_1_n_iter_2_sd2.png new file mode 100644 index 000000000000..d78b0343a919 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_1_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_2_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_2_n_iter_2_sd2.png new file mode 100644 index 000000000000..c0f22f11ca1d Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_2_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_4_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_4_n_iter_2_sd2.png new file mode 100644 index 000000000000..ae5e32fe4eb7 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_p100_n_samples_4_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_1_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_1_n_iter_2_sd2.png new file mode 100644 index 000000000000..33ccc7fec6f6 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_1_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_2_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_2_n_iter_2_sd2.png new file mode 100644 index 000000000000..9d9dd2b60628 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_2_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_4_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_4_n_iter_2_sd2.png new file mode 100644 index 000000000000..30743c5a8d91 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_t4_n_samples_4_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_1_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_1_n_iter_2_sd2.png new file mode 100644 index 000000000000..b2722aacb30e Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_1_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_2_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_2_n_iter_2_sd2.png new file mode 100644 index 000000000000..34d425307242 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_2_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_4_n_iter_2_sd2.png b/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_4_n_iter_2_sd2.png new file mode 100644 index 000000000000..7d458d412272 Binary files /dev/null and b/assets/images/stable-diffusion/original_vs_optimized_v100_n_samples_4_n_iter_2_sd2.png differ diff --git 
a/assets/images/stable-diffusion/summary_n_samples_1_n_iter_2_sd2.png b/assets/images/stable-diffusion/summary_n_samples_1_n_iter_2_sd2.png new file mode 100644 index 000000000000..0551ac830fe7 Binary files /dev/null and b/assets/images/stable-diffusion/summary_n_samples_1_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/summary_n_samples_2_n_iter_2_sd2.png b/assets/images/stable-diffusion/summary_n_samples_2_n_iter_2_sd2.png new file mode 100644 index 000000000000..5fb3a85960a5 Binary files /dev/null and b/assets/images/stable-diffusion/summary_n_samples_2_n_iter_2_sd2.png differ diff --git a/assets/images/stable-diffusion/summary_n_samples_4_n_iter_2_sd2.png b/assets/images/stable-diffusion/summary_n_samples_4_n_iter_2_sd2.png new file mode 100644 index 000000000000..ea25fd59e3d9 Binary files /dev/null and b/assets/images/stable-diffusion/summary_n_samples_4_n_iter_2_sd2.png differ diff --git a/assets/images/staff/bazil-sterling.jpg b/assets/images/staff/bazil-sterling.jpg new file mode 100644 index 000000000000..d4541b257c77 Binary files /dev/null and b/assets/images/staff/bazil-sterling.jpg differ diff --git a/assets/images/staff/chris-abraham.jpg b/assets/images/staff/chris-abraham.jpg new file mode 100644 index 000000000000..5b6a51acdca8 Binary files /dev/null and b/assets/images/staff/chris-abraham.jpg differ diff --git a/assets/images/staff/deb-giles.jpg b/assets/images/staff/deb-giles.jpg new file mode 100644 index 000000000000..e51b60a6bcf6 Binary files /dev/null and b/assets/images/staff/deb-giles.jpg differ diff --git a/assets/images/staff/jennifer-bly.jpg b/assets/images/staff/jennifer-bly.jpg new file mode 100644 index 000000000000..898a30d112db Binary files /dev/null and b/assets/images/staff/jennifer-bly.jpg differ diff --git a/assets/images/staff/jessica-gonzalez.jpg b/assets/images/staff/jessica-gonzalez.jpg new file mode 100644 index 000000000000..5a513db6c675 Binary files /dev/null and b/assets/images/staff/jessica-gonzalez.jpg differ diff --git a/assets/images/staff/matt-white.jpg b/assets/images/staff/matt-white.jpg new file mode 100644 index 000000000000..98a69008a480 Binary files /dev/null and b/assets/images/staff/matt-white.jpg differ diff --git a/assets/images/staff/nancy-rausch.jpg b/assets/images/staff/nancy-rausch.jpg new file mode 100644 index 000000000000..90b1dc2930a5 Binary files /dev/null and b/assets/images/staff/nancy-rausch.jpg differ diff --git a/assets/images/staff/regina-nkenchor.jpg b/assets/images/staff/regina-nkenchor.jpg new file mode 100644 index 000000000000..14732beb36fa Binary files /dev/null and b/assets/images/staff/regina-nkenchor.jpg differ diff --git a/assets/images/staff/thanh-ha.jpg b/assets/images/staff/thanh-ha.jpg new file mode 100644 index 000000000000..23e11b08f6f3 Binary files /dev/null and b/assets/images/staff/thanh-ha.jpg differ diff --git a/assets/images/stanford-university.png b/assets/images/stanford-university.png new file mode 100644 index 000000000000..c18454477daf Binary files /dev/null and b/assets/images/stanford-university.png differ diff --git a/assets/images/stencil-image-1.png b/assets/images/stencil-image-1.png new file mode 100644 index 000000000000..4017cc5bac2d Binary files /dev/null and b/assets/images/stencil-image-1.png differ diff --git a/assets/images/stencil-image-2.png b/assets/images/stencil-image-2.png new file mode 100644 index 000000000000..f67fec4895ca Binary files /dev/null and b/assets/images/stencil-image-2.png differ diff --git a/assets/images/stencil-image-3.png 
b/assets/images/stencil-image-3.png new file mode 100644 index 000000000000..308954dd5b95 Binary files /dev/null and b/assets/images/stencil-image-3.png differ diff --git a/assets/images/stopwatch-icon.svg b/assets/images/stopwatch-icon.svg new file mode 100644 index 000000000000..eed1bb869459 --- /dev/null +++ b/assets/images/stopwatch-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/images/straggler-mitigation/straggler-mitigation-1.png b/assets/images/straggler-mitigation/straggler-mitigation-1.png new file mode 100644 index 000000000000..25ce8f43b932 Binary files /dev/null and b/assets/images/straggler-mitigation/straggler-mitigation-1.png differ diff --git a/assets/images/straggler-mitigation/straggler-mitigation-2.png b/assets/images/straggler-mitigation/straggler-mitigation-2.png new file mode 100644 index 000000000000..4ceda3efd7c9 Binary files /dev/null and b/assets/images/straggler-mitigation/straggler-mitigation-2.png differ diff --git a/assets/images/straggler-mitigation/straggler-mitigation-3.png b/assets/images/straggler-mitigation/straggler-mitigation-3.png new file mode 100644 index 000000000000..1115d7ecae88 Binary files /dev/null and b/assets/images/straggler-mitigation/straggler-mitigation-3.png differ diff --git a/assets/images/straggler-mitigation/straggler-mitigation-4.png b/assets/images/straggler-mitigation/straggler-mitigation-4.png new file mode 100644 index 000000000000..9cfa68d88d04 Binary files /dev/null and b/assets/images/straggler-mitigation/straggler-mitigation-4.png differ diff --git a/assets/images/straggler-mitigation/straggler-mitigation-5.png b/assets/images/straggler-mitigation/straggler-mitigation-5.png new file mode 100644 index 000000000000..4941951b9300 Binary files /dev/null and b/assets/images/straggler-mitigation/straggler-mitigation-5.png differ diff --git a/assets/images/straggler-mitigation/straggler-mitigation-6.png b/assets/images/straggler-mitigation/straggler-mitigation-6.png new file mode 100644 index 000000000000..d0b8b6bf5d52 Binary files /dev/null and b/assets/images/straggler-mitigation/straggler-mitigation-6.png differ diff --git a/assets/images/straggler-mitigation/straggler-mitigation-7.png b/assets/images/straggler-mitigation/straggler-mitigation-7.png new file mode 100644 index 000000000000..ff9f3cc97e9c Binary files /dev/null and b/assets/images/straggler-mitigation/straggler-mitigation-7.png differ diff --git a/assets/images/streamingapi.jpeg b/assets/images/streamingapi.jpeg new file mode 100644 index 000000000000..c2035f850ac0 Binary files /dev/null and b/assets/images/streamingapi.jpeg differ diff --git a/assets/images/submit-to-speak/fg1.png b/assets/images/submit-to-speak/fg1.png new file mode 100644 index 000000000000..37ce1065eddc Binary files /dev/null and b/assets/images/submit-to-speak/fg1.png differ diff --git a/assets/images/submit-to-speak/fg2.jpg b/assets/images/submit-to-speak/fg2.jpg new file mode 100644 index 000000000000..b8e0a6f2cbfc Binary files /dev/null and b/assets/images/submit-to-speak/fg2.jpg differ diff --git a/assets/images/submit-to-speak/fg3.jpg b/assets/images/submit-to-speak/fg3.jpg new file mode 100644 index 000000000000..7beba8019952 Binary files /dev/null and b/assets/images/submit-to-speak/fg3.jpg differ diff --git a/assets/images/summer_hackathon_2020.jpeg b/assets/images/summer_hackathon_2020.jpeg new file mode 100644 index 000000000000..6eaff0ce2d08 Binary files /dev/null and b/assets/images/summer_hackathon_2020.jpeg differ diff --git 
a/assets/images/summer_hackathon_2020.png b/assets/images/summer_hackathon_2020.png new file mode 100644 index 000000000000..c25df2e04ef4 Binary files /dev/null and b/assets/images/summer_hackathon_2020.png differ diff --git a/assets/images/swa/Figure1.png b/assets/images/swa/Figure1.png new file mode 100644 index 000000000000..ebf3deae777e Binary files /dev/null and b/assets/images/swa/Figure1.png differ diff --git a/assets/images/swa/Figure2.png b/assets/images/swa/Figure2.png new file mode 100644 index 000000000000..d341f62eb360 Binary files /dev/null and b/assets/images/swa/Figure2.png differ diff --git a/assets/images/swa/Figure3.png b/assets/images/swa/Figure3.png new file mode 100644 index 000000000000..07c6e5bc3056 Binary files /dev/null and b/assets/images/swa/Figure3.png differ diff --git a/assets/images/swa/Figure4.png b/assets/images/swa/Figure4.png new file mode 100644 index 000000000000..1d428dfd4115 Binary files /dev/null and b/assets/images/swa/Figure4.png differ diff --git a/assets/images/swa/Figure5.png b/assets/images/swa/Figure5.png new file mode 100644 index 000000000000..c26321ad4de1 Binary files /dev/null and b/assets/images/swa/Figure5.png differ diff --git a/assets/images/swa/Figure6.png b/assets/images/swa/Figure6.png new file mode 100644 index 000000000000..aeb271d7c142 Binary files /dev/null and b/assets/images/swa/Figure6.png differ diff --git a/assets/images/swa/Figure7.png b/assets/images/swa/Figure7.png new file mode 100644 index 000000000000..2626bb663e4b Binary files /dev/null and b/assets/images/swa/Figure7.png differ diff --git a/assets/images/swa/Figure8.png b/assets/images/swa/Figure8.png new file mode 100644 index 000000000000..a96e90935292 Binary files /dev/null and b/assets/images/swa/Figure8.png differ diff --git a/assets/images/swa/Figure_rl.png b/assets/images/swa/Figure_rl.png new file mode 100644 index 000000000000..f09b1fae8271 Binary files /dev/null and b/assets/images/swa/Figure_rl.png differ diff --git a/assets/images/swa/figure2-highres.png b/assets/images/swa/figure2-highres.png new file mode 100644 index 000000000000..cce95473c07f Binary files /dev/null and b/assets/images/swa/figure2-highres.png differ diff --git a/assets/images/swa/figure3-highres.png b/assets/images/swa/figure3-highres.png new file mode 100644 index 000000000000..549710f5d600 Binary files /dev/null and b/assets/images/swa/figure3-highres.png differ diff --git a/assets/images/swapytorch1.png b/assets/images/swapytorch1.png new file mode 100644 index 000000000000..784e30f87d6b Binary files /dev/null and b/assets/images/swapytorch1.png differ diff --git a/assets/images/swapytorch10.png b/assets/images/swapytorch10.png new file mode 100644 index 000000000000..0b292faced4a Binary files /dev/null and b/assets/images/swapytorch10.png differ diff --git a/assets/images/swapytorch2.png b/assets/images/swapytorch2.png new file mode 100644 index 000000000000..9a4150da86ce Binary files /dev/null and b/assets/images/swapytorch2.png differ diff --git a/assets/images/swapytorch3.jpg b/assets/images/swapytorch3.jpg new file mode 100644 index 000000000000..6189bd88c28a Binary files /dev/null and b/assets/images/swapytorch3.jpg differ diff --git a/assets/images/swapytorch4.png b/assets/images/swapytorch4.png new file mode 100644 index 000000000000..a64656629601 Binary files /dev/null and b/assets/images/swapytorch4.png differ diff --git a/assets/images/swapytorch5.png b/assets/images/swapytorch5.png new file mode 100644 index 000000000000..cc3c1f8d45eb Binary files /dev/null and 
b/assets/images/swapytorch5.png differ diff --git a/assets/images/swapytorch6.png b/assets/images/swapytorch6.png new file mode 100644 index 000000000000..0be145c26a79 Binary files /dev/null and b/assets/images/swapytorch6.png differ diff --git a/assets/images/swapytorch7.png b/assets/images/swapytorch7.png new file mode 100644 index 000000000000..3e70edeb9f69 Binary files /dev/null and b/assets/images/swapytorch7.png differ diff --git a/assets/images/swapytorch8.jpg b/assets/images/swapytorch8.jpg new file mode 100644 index 000000000000..6dc85f1f3ecf Binary files /dev/null and b/assets/images/swapytorch8.jpg differ diff --git a/assets/images/swapytorch8.png b/assets/images/swapytorch8.png new file mode 100644 index 000000000000..3b1ba9e8bc32 Binary files /dev/null and b/assets/images/swapytorch8.png differ diff --git a/assets/images/swapytorch9.png b/assets/images/swapytorch9.png new file mode 100644 index 000000000000..64cd952912ed Binary files /dev/null and b/assets/images/swapytorch9.png differ diff --git a/assets/images/t-vs-eager-mode.svg b/assets/images/t-vs-eager-mode.svg new file mode 100644 index 000000000000..f56363d3bc2e --- /dev/null +++ b/assets/images/t-vs-eager-mode.svg @@ -0,0 +1,80 @@ + + + + + + + + ~1.5X + ~1.5X + ~1.7X + ~2.3X + 1X + eager-mode + Eager Mode + Eager Mode + + + DistillGPT2 + TorchInductor + CamemBert + T5Small + \ No newline at end of file diff --git a/assets/images/tac-elects-new-leadership/fg1.png b/assets/images/tac-elects-new-leadership/fg1.png new file mode 100644 index 000000000000..cd3b8160f4ca Binary files /dev/null and b/assets/images/tac-elects-new-leadership/fg1.png differ diff --git a/assets/images/tac-elects-new-leadership/jiong-gong.jpg b/assets/images/tac-elects-new-leadership/jiong-gong.jpg new file mode 100644 index 000000000000..9602c15bc774 Binary files /dev/null and b/assets/images/tac-elects-new-leadership/jiong-gong.jpg differ diff --git a/assets/images/tac-elects-new-leadership/luca-antiga.jpg b/assets/images/tac-elects-new-leadership/luca-antiga.jpg new file mode 100644 index 000000000000..fcf521477241 Binary files /dev/null and b/assets/images/tac-elects-new-leadership/luca-antiga.jpg differ diff --git a/assets/images/tac/brian-granger.jpg b/assets/images/tac/brian-granger.jpg new file mode 100644 index 000000000000..f42ac299d5d1 Binary files /dev/null and b/assets/images/tac/brian-granger.jpg differ diff --git a/assets/images/tac/generic-avatar.svg b/assets/images/tac/generic-avatar.svg new file mode 100644 index 000000000000..23a47429d5b7 --- /dev/null +++ b/assets/images/tac/generic-avatar.svg @@ -0,0 +1 @@ +generic-avatar.svg \ No newline at end of file diff --git a/assets/images/tac/gregory-chanan.jpg b/assets/images/tac/gregory-chanan.jpg new file mode 100644 index 000000000000..5550cd8ade60 Binary files /dev/null and b/assets/images/tac/gregory-chanan.jpg differ diff --git a/assets/images/tac/jeff-daily.jpg b/assets/images/tac/jeff-daily.jpg new file mode 100644 index 000000000000..35f0a855e82c Binary files /dev/null and b/assets/images/tac/jeff-daily.jpg differ diff --git a/assets/images/tac/jiong-gong.jpg b/assets/images/tac/jiong-gong.jpg new file mode 100644 index 000000000000..9602c15bc774 Binary files /dev/null and b/assets/images/tac/jiong-gong.jpg differ diff --git a/assets/images/tac/luca-antiga.jpg b/assets/images/tac/luca-antiga.jpg new file mode 100644 index 000000000000..fcf521477241 Binary files /dev/null and b/assets/images/tac/luca-antiga.jpg differ diff --git a/assets/images/tac/milos-puzovic.jpg 
b/assets/images/tac/milos-puzovic.jpg new file mode 100644 index 000000000000..acd11d33520e Binary files /dev/null and b/assets/images/tac/milos-puzovic.jpg differ diff --git a/assets/images/tac/mudhakar-srivatsa.jpg b/assets/images/tac/mudhakar-srivatsa.jpg new file mode 100644 index 000000000000..1056048ebe46 Binary files /dev/null and b/assets/images/tac/mudhakar-srivatsa.jpg differ diff --git a/assets/images/tac/piotr-bialecki.jpg b/assets/images/tac/piotr-bialecki.jpg new file mode 100644 index 000000000000..36b647f9e199 Binary files /dev/null and b/assets/images/tac/piotr-bialecki.jpg differ diff --git a/assets/images/tac/shauheen-zahirazami.jpg b/assets/images/tac/shauheen-zahirazami.jpg new file mode 100644 index 000000000000..fec650f87e91 Binary files /dev/null and b/assets/images/tac/shauheen-zahirazami.jpg differ diff --git a/assets/images/tac/soumith-chintala.jpg b/assets/images/tac/soumith-chintala.jpg new file mode 100644 index 000000000000..98c2a2fa1fd9 Binary files /dev/null and b/assets/images/tac/soumith-chintala.jpg differ diff --git a/assets/images/tac/xavier-dupre.jpg b/assets/images/tac/xavier-dupre.jpg new file mode 100644 index 000000000000..cfba0b1698df Binary files /dev/null and b/assets/images/tac/xavier-dupre.jpg differ diff --git a/assets/images/tac/yikun-jiang.jpg b/assets/images/tac/yikun-jiang.jpg new file mode 100644 index 000000000000..8844f4dd2209 Binary files /dev/null and b/assets/images/tac/yikun-jiang.jpg differ diff --git a/assets/images/tacotron2_diagram.png b/assets/images/tacotron2_diagram.png new file mode 100644 index 000000000000..6efb12f93461 Binary files /dev/null and b/assets/images/tacotron2_diagram.png differ diff --git a/assets/images/techge-nwpu.png b/assets/images/techge-nwpu.png new file mode 100644 index 000000000000..08a4e82c4490 Binary files /dev/null and b/assets/images/techge-nwpu.png differ diff --git a/assets/images/techgeo-inria.png b/assets/images/techgeo-inria.png new file mode 100644 index 000000000000..e4c013dae228 Binary files /dev/null and b/assets/images/techgeo-inria.png differ diff --git a/assets/images/techgeo-reproject.png b/assets/images/techgeo-reproject.png new file mode 100644 index 000000000000..03af326d4808 Binary files /dev/null and b/assets/images/techgeo-reproject.png differ diff --git a/assets/images/techgeo-sample.png b/assets/images/techgeo-sample.png new file mode 100644 index 000000000000..c2272b5f0d1a Binary files /dev/null and b/assets/images/techgeo-sample.png differ diff --git a/assets/images/techgeo-true-ndvi.png b/assets/images/techgeo-true-ndvi.png new file mode 100644 index 000000000000..b4fe17592dfc Binary files /dev/null and b/assets/images/techgeo-true-ndvi.png differ diff --git a/assets/images/tensor/image1.png b/assets/images/tensor/image1.png new file mode 100644 index 000000000000..21fe00e8a60b Binary files /dev/null and b/assets/images/tensor/image1.png differ diff --git a/assets/images/tensor/image2.png b/assets/images/tensor/image2.png new file mode 100644 index 000000000000..6000c3766e50 Binary files /dev/null and b/assets/images/tensor/image2.png differ diff --git a/assets/images/tensor/image3.png b/assets/images/tensor/image3.png new file mode 100644 index 000000000000..a34b9a522686 Binary files /dev/null and b/assets/images/tensor/image3.png differ diff --git a/assets/images/tensorboard_model.png b/assets/images/tensorboard_model.png new file mode 100644 index 000000000000..e4222bc9f02c Binary files /dev/null and b/assets/images/tensorboard_model.png differ diff --git 
a/assets/images/text-backbone-image-1.png b/assets/images/text-backbone-image-1.png new file mode 100644 index 000000000000..c41e4c03eac2 Binary files /dev/null and b/assets/images/text-backbone-image-1.png differ diff --git a/assets/images/tochvisionmobile.png b/assets/images/tochvisionmobile.png new file mode 100644 index 000000000000..bf84a4667924 Binary files /dev/null and b/assets/images/tochvisionmobile.png differ diff --git a/assets/images/topological-number-fig-2.png b/assets/images/topological-number-fig-2.png new file mode 100644 index 000000000000..210635276445 Binary files /dev/null and b/assets/images/topological-number-fig-2.png differ diff --git a/assets/images/torch-multimodal-feature-image.jpeg b/assets/images/torch-multimodal-feature-image.jpeg new file mode 100644 index 000000000000..bbd0b5c8c484 Binary files /dev/null and b/assets/images/torch-multimodal-feature-image.jpeg differ diff --git a/assets/images/torch_op_warpping_2.png b/assets/images/torch_op_warpping_2.png new file mode 100644 index 000000000000..8e9286593b45 Binary files /dev/null and b/assets/images/torch_op_warpping_2.png differ diff --git a/assets/images/torch_op_wrapping_1.png b/assets/images/torch_op_wrapping_1.png new file mode 100644 index 000000000000..d80a6da4e045 Binary files /dev/null and b/assets/images/torch_op_wrapping_1.png differ diff --git a/assets/images/torch_stack1.png b/assets/images/torch_stack1.png new file mode 100644 index 000000000000..2a5c509084dd Binary files /dev/null and b/assets/images/torch_stack1.png differ diff --git a/assets/images/torchaudio-0-12-streaming-ASR-2.gif b/assets/images/torchaudio-0-12-streaming-ASR-2.gif new file mode 100644 index 000000000000..73699b1575d9 Binary files /dev/null and b/assets/images/torchaudio-0-12-streaming-ASR-2.gif differ diff --git a/assets/images/torchaudio-rnnt-emformer-demo.gif b/assets/images/torchaudio-rnnt-emformer-demo.gif new file mode 100644 index 000000000000..be1a9fbe9565 Binary files /dev/null and b/assets/images/torchaudio-rnnt-emformer-demo.gif differ diff --git a/assets/images/torchchat.png b/assets/images/torchchat.png new file mode 100644 index 000000000000..2018dfd1e50a Binary files /dev/null and b/assets/images/torchchat.png differ diff --git a/assets/images/torchcsprng.png b/assets/images/torchcsprng.png new file mode 100644 index 000000000000..9d9273a0794d Binary files /dev/null and b/assets/images/torchcsprng.png differ diff --git a/assets/images/torchgeo-geospatial-data.png b/assets/images/torchgeo-geospatial-data.png new file mode 100644 index 000000000000..bccf150371f0 Binary files /dev/null and b/assets/images/torchgeo-geospatial-data.png differ diff --git a/assets/images/torchgeo-hurricane.jpeg b/assets/images/torchgeo-hurricane.jpeg new file mode 100644 index 000000000000..0ed0473b36b5 Binary files /dev/null and b/assets/images/torchgeo-hurricane.jpeg differ diff --git a/assets/images/torchgeo-hurricane.jpg b/assets/images/torchgeo-hurricane.jpg new file mode 100644 index 000000000000..bedec6d5e136 Binary files /dev/null and b/assets/images/torchgeo-hurricane.jpg differ diff --git a/assets/images/torchgeo-inria.png b/assets/images/torchgeo-inria.png new file mode 100644 index 000000000000..e4c013dae228 Binary files /dev/null and b/assets/images/torchgeo-inria.png differ diff --git a/assets/images/torchgeo-logo.png b/assets/images/torchgeo-logo.png new file mode 100644 index 000000000000..c8c31b845ba7 Binary files /dev/null and b/assets/images/torchgeo-logo.png differ diff --git a/assets/images/torchgeo-map.png 
b/assets/images/torchgeo-map.png new file mode 100644 index 000000000000..997054603a76 Binary files /dev/null and b/assets/images/torchgeo-map.png differ diff --git a/assets/images/torchgeo-nwpu.png b/assets/images/torchgeo-nwpu.png new file mode 100644 index 000000000000..08a4e82c4490 Binary files /dev/null and b/assets/images/torchgeo-nwpu.png differ diff --git a/assets/images/torchgeo-reproject.png b/assets/images/torchgeo-reproject.png new file mode 100644 index 000000000000..03af326d4808 Binary files /dev/null and b/assets/images/torchgeo-reproject.png differ diff --git a/assets/images/torchgeo-sample.png b/assets/images/torchgeo-sample.png new file mode 100644 index 000000000000..c2272b5f0d1a Binary files /dev/null and b/assets/images/torchgeo-sample.png differ diff --git a/assets/images/torchgeo-true-ndvi.png b/assets/images/torchgeo-true-ndvi.png new file mode 100644 index 000000000000..b4fe17592dfc Binary files /dev/null and b/assets/images/torchgeo-true-ndvi.png differ diff --git a/assets/images/torchrec-1.png b/assets/images/torchrec-1.png new file mode 100644 index 000000000000..7e9ebe08a84f Binary files /dev/null and b/assets/images/torchrec-1.png differ diff --git a/assets/images/torchrec-2.png b/assets/images/torchrec-2.png new file mode 100644 index 000000000000..4f4071aa109f Binary files /dev/null and b/assets/images/torchrec-2.png differ diff --git a/assets/images/torchvision_0.3_headline.png b/assets/images/torchvision_0.3_headline.png new file mode 100644 index 000000000000..270ed5f3c3d3 Binary files /dev/null and b/assets/images/torchvision_0.3_headline.png differ diff --git a/assets/images/torchvision_chart1.png b/assets/images/torchvision_chart1.png new file mode 100644 index 000000000000..61702ea43849 Binary files /dev/null and b/assets/images/torchvision_chart1.png differ diff --git a/assets/images/torchvision_featured.jpg b/assets/images/torchvision_featured.jpg new file mode 100644 index 000000000000..1c74c9cfd533 Binary files /dev/null and b/assets/images/torchvision_featured.jpg differ diff --git a/assets/images/torchvision_gif.gif b/assets/images/torchvision_gif.gif new file mode 100644 index 000000000000..80c2f202f105 Binary files /dev/null and b/assets/images/torchvision_gif.gif differ diff --git a/assets/images/trace-image1.png b/assets/images/trace-image1.png new file mode 100644 index 000000000000..6ee4e393bc47 Binary files /dev/null and b/assets/images/trace-image1.png differ diff --git a/assets/images/trace-image2.png b/assets/images/trace-image2.png new file mode 100644 index 000000000000..9df5383b5d9c Binary files /dev/null and b/assets/images/trace-image2.png differ diff --git a/assets/images/trace-image3.png b/assets/images/trace-image3.png new file mode 100644 index 000000000000..bf60325343b1 Binary files /dev/null and b/assets/images/trace-image3.png differ diff --git a/assets/images/trace-image4.png b/assets/images/trace-image4.png new file mode 100644 index 000000000000..f8f09d753dd1 Binary files /dev/null and b/assets/images/trace-image4.png differ diff --git a/assets/images/trace-image5.png b/assets/images/trace-image5.png new file mode 100644 index 000000000000..030f9909846d Binary files /dev/null and b/assets/images/trace-image5.png differ diff --git a/assets/images/trace-image6.png b/assets/images/trace-image6.png new file mode 100644 index 000000000000..ef993b50bec9 Binary files /dev/null and b/assets/images/trace-image6.png differ diff --git a/assets/images/training-moes/fg1.png b/assets/images/training-moes/fg1.png new file mode 100644 
index 000000000000..70242caaa3ed Binary files /dev/null and b/assets/images/training-moes/fg1.png differ diff --git a/assets/images/training-moes/fg2.png b/assets/images/training-moes/fg2.png new file mode 100644 index 000000000000..0543d9062e0b Binary files /dev/null and b/assets/images/training-moes/fg2.png differ diff --git a/assets/images/training-moes/fg3.png b/assets/images/training-moes/fg3.png new file mode 100644 index 000000000000..525e2c110f91 Binary files /dev/null and b/assets/images/training-moes/fg3.png differ diff --git a/assets/images/training-moes/fg4.png b/assets/images/training-moes/fg4.png new file mode 100644 index 000000000000..40b6187addf9 Binary files /dev/null and b/assets/images/training-moes/fg4.png differ diff --git a/assets/images/training-moes/fg5.png b/assets/images/training-moes/fg5.png new file mode 100644 index 000000000000..0b8c79fd44e1 Binary files /dev/null and b/assets/images/training-moes/fg5.png differ diff --git a/assets/images/training-production-ai-models/blog-fig1.jpg b/assets/images/training-production-ai-models/blog-fig1.jpg new file mode 100644 index 000000000000..f52fa121d3c3 Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig1.jpg differ diff --git a/assets/images/training-production-ai-models/blog-fig1s.jpg b/assets/images/training-production-ai-models/blog-fig1s.jpg new file mode 100644 index 000000000000..c8835ff92c8f Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig1s.jpg differ diff --git a/assets/images/training-production-ai-models/blog-fig2.jpg b/assets/images/training-production-ai-models/blog-fig2.jpg new file mode 100644 index 000000000000..3aaafe38b7d1 Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig2.jpg differ diff --git a/assets/images/training-production-ai-models/blog-fig3.jpg b/assets/images/training-production-ai-models/blog-fig3.jpg new file mode 100644 index 000000000000..9d1ea6b39913 Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig3.jpg differ diff --git a/assets/images/training-production-ai-models/blog-fig4.jpg b/assets/images/training-production-ai-models/blog-fig4.jpg new file mode 100644 index 000000000000..ff6fe4b226b8 Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig4.jpg differ diff --git a/assets/images/training-production-ai-models/blog-fig5.jpg b/assets/images/training-production-ai-models/blog-fig5.jpg new file mode 100644 index 000000000000..1a5f8f851710 Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig5.jpg differ diff --git a/assets/images/training-production-ai-models/blog-fig6.jpg b/assets/images/training-production-ai-models/blog-fig6.jpg new file mode 100644 index 000000000000..363314b4e8f3 Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig6.jpg differ diff --git a/assets/images/training-production-ai-models/blog-fig7.jpg b/assets/images/training-production-ai-models/blog-fig7.jpg new file mode 100644 index 000000000000..c6b5429b4dc3 Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig7.jpg differ diff --git a/assets/images/training-production-ai-models/blog-fig8.jpg b/assets/images/training-production-ai-models/blog-fig8.jpg new file mode 100644 index 000000000000..9ff8d831c4da Binary files /dev/null and b/assets/images/training-production-ai-models/blog-fig8.jpg differ diff --git a/assets/images/training-using-float8-fsdp2/fg1.png 
b/assets/images/training-using-float8-fsdp2/fg1.png new file mode 100644 index 000000000000..d7df9c7c9d27 Binary files /dev/null and b/assets/images/training-using-float8-fsdp2/fg1.png differ diff --git a/assets/images/training-using-float8-fsdp2/fg2.png b/assets/images/training-using-float8-fsdp2/fg2.png new file mode 100644 index 000000000000..f515b980221c Binary files /dev/null and b/assets/images/training-using-float8-fsdp2/fg2.png differ diff --git a/assets/images/transformer.png b/assets/images/transformer.png new file mode 100644 index 000000000000..a84f91f1b288 Binary files /dev/null and b/assets/images/transformer.png differ diff --git a/assets/images/transformer_block.png b/assets/images/transformer_block.png new file mode 100644 index 000000000000..5957bb3d6a94 Binary files /dev/null and b/assets/images/transformer_block.png differ diff --git a/assets/images/transformer_fastpath.png b/assets/images/transformer_fastpath.png new file mode 100644 index 000000000000..ffce2d4fa883 Binary files /dev/null and b/assets/images/transformer_fastpath.png differ diff --git a/assets/images/triton-kernel-compilation-stages.jpg b/assets/images/triton-kernel-compilation-stages.jpg new file mode 100644 index 000000000000..c65829b5181f Binary files /dev/null and b/assets/images/triton-kernel-compilation-stages.jpg differ diff --git a/assets/images/tutorial.png b/assets/images/tutorial.png new file mode 100644 index 000000000000..774cfb36620c Binary files /dev/null and b/assets/images/tutorial.png differ diff --git a/assets/images/tutorialhomepage.png b/assets/images/tutorialhomepage.png new file mode 100644 index 000000000000..ebe3e7fea168 Binary files /dev/null and b/assets/images/tutorialhomepage.png differ diff --git a/assets/images/tweet_1_visual.png b/assets/images/tweet_1_visual.png new file mode 100644 index 000000000000..cbb3bfe2fc3d Binary files /dev/null and b/assets/images/tweet_1_visual.png differ diff --git a/assets/images/tweet_2_wrapper.png b/assets/images/tweet_2_wrapper.png new file mode 100644 index 000000000000..28e9be54ecdf Binary files /dev/null and b/assets/images/tweet_2_wrapper.png differ diff --git a/assets/images/tweet_3_FSDP_Model_size_increase_4x.png b/assets/images/tweet_3_FSDP_Model_size_increase_4x.png new file mode 100644 index 000000000000..b6739e019bbb Binary files /dev/null and b/assets/images/tweet_3_FSDP_Model_size_increase_4x.png differ diff --git a/assets/images/tweet_4.png b/assets/images/tweet_4.png new file mode 100644 index 000000000000..d22d00baad03 Binary files /dev/null and b/assets/images/tweet_4.png differ diff --git a/assets/images/udacity.png b/assets/images/udacity.png new file mode 100644 index 000000000000..413f42e84d84 Binary files /dev/null and b/assets/images/udacity.png differ diff --git a/assets/images/ultralytics_yolov5_img0.jpg b/assets/images/ultralytics_yolov5_img0.jpg new file mode 100644 index 000000000000..b4147e36764a Binary files /dev/null and b/assets/images/ultralytics_yolov5_img0.jpg differ diff --git a/assets/images/ultralytics_yolov5_img1.png b/assets/images/ultralytics_yolov5_img1.png new file mode 100644 index 000000000000..73b996b237df Binary files /dev/null and b/assets/images/ultralytics_yolov5_img1.png differ diff --git a/assets/images/ultralytics_yolov5_img2.png b/assets/images/ultralytics_yolov5_img2.png new file mode 100644 index 000000000000..4e648fba938d Binary files /dev/null and b/assets/images/ultralytics_yolov5_img2.png differ diff --git a/assets/images/understanding-gpu-memory-1/awaitable_code_snippet.png 
b/assets/images/understanding-gpu-memory-1/awaitable_code_snippet.png new file mode 100644 index 000000000000..d67ec45dd9c1 Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/awaitable_code_snippet.png differ diff --git a/assets/images/understanding-gpu-memory-1/awaitable_leak_cycle.png b/assets/images/understanding-gpu-memory-1/awaitable_leak_cycle.png new file mode 100644 index 000000000000..0fa4f72a8a75 Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/awaitable_leak_cycle.png differ diff --git a/assets/images/understanding-gpu-memory-1/fig1.png b/assets/images/understanding-gpu-memory-1/fig1.png new file mode 100644 index 000000000000..6ea32120acce Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/fig1.png differ diff --git a/assets/images/understanding-gpu-memory-1/fig2.png b/assets/images/understanding-gpu-memory-1/fig2.png new file mode 100644 index 000000000000..0de3e9563f8b Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/fig2.png differ diff --git a/assets/images/understanding-gpu-memory-1/fig3.png b/assets/images/understanding-gpu-memory-1/fig3.png new file mode 100644 index 000000000000..951a2c0a24b1 Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/fig3.png differ diff --git a/assets/images/understanding-gpu-memory-1/fig4.png b/assets/images/understanding-gpu-memory-1/fig4.png new file mode 100644 index 000000000000..fe586e7ebf7e Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/fig4.png differ diff --git a/assets/images/understanding-gpu-memory-1/fig5.png b/assets/images/understanding-gpu-memory-1/fig5.png new file mode 100644 index 000000000000..fcfce76302c5 Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/fig5.png differ diff --git a/assets/images/understanding-gpu-memory-1/fig6.png b/assets/images/understanding-gpu-memory-1/fig6.png new file mode 100644 index 000000000000..797f9273fea3 Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/fig6.png differ diff --git a/assets/images/understanding-gpu-memory-1/memory_leak_awaitable.jpg b/assets/images/understanding-gpu-memory-1/memory_leak_awaitable.jpg new file mode 100644 index 000000000000..4a43a19a1b8c Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/memory_leak_awaitable.jpg differ diff --git a/assets/images/understanding-gpu-memory-1/memory_leak_gc_collect.jpg b/assets/images/understanding-gpu-memory-1/memory_leak_gc_collect.jpg new file mode 100644 index 000000000000..1e37d87ac919 Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/memory_leak_gc_collect.jpg differ diff --git a/assets/images/understanding-gpu-memory-1/memory_leak_oom.jpg b/assets/images/understanding-gpu-memory-1/memory_leak_oom.jpg new file mode 100644 index 000000000000..9db333814881 Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/memory_leak_oom.jpg differ diff --git a/assets/images/understanding-gpu-memory-1/simple_reference_cycle.png b/assets/images/understanding-gpu-memory-1/simple_reference_cycle.png new file mode 100644 index 000000000000..51b6f9ec8d6e Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/simple_reference_cycle.png differ diff --git a/assets/images/understanding-gpu-memory-1/snapshot.html b/assets/images/understanding-gpu-memory-1/snapshot.html new file mode 100644 index 000000000000..8ca25ea3e6f3 --- /dev/null +++ b/assets/images/understanding-gpu-memory-1/snapshot.html @@ -0,0 +1,12 @@ + + + + + 
+ + + diff --git a/assets/images/understanding-gpu-memory-1/torchrec_code_snippet.png b/assets/images/understanding-gpu-memory-1/torchrec_code_snippet.png new file mode 100644 index 000000000000..23c0f8e49882 Binary files /dev/null and b/assets/images/understanding-gpu-memory-1/torchrec_code_snippet.png differ diff --git a/assets/images/unet_brain_mri.png b/assets/images/unet_brain_mri.png new file mode 100644 index 000000000000..397719f3701d Binary files /dev/null and b/assets/images/unet_brain_mri.png differ diff --git a/assets/images/unet_tcga_cs_4944.png b/assets/images/unet_tcga_cs_4944.png new file mode 100644 index 000000000000..d7e556675504 Binary files /dev/null and b/assets/images/unet_tcga_cs_4944.png differ diff --git a/assets/images/unleashing-ai-mobile/fg1.png b/assets/images/unleashing-ai-mobile/fg1.png new file mode 100644 index 000000000000..f88a96cef291 Binary files /dev/null and b/assets/images/unleashing-ai-mobile/fg1.png differ diff --git a/assets/images/unleashing-ai-mobile/fg2.png b/assets/images/unleashing-ai-mobile/fg2.png new file mode 100644 index 000000000000..ab0fca053e7c Binary files /dev/null and b/assets/images/unleashing-ai-mobile/fg2.png differ diff --git a/assets/images/unleashing-ai-mobile/fg3.jpg b/assets/images/unleashing-ai-mobile/fg3.jpg new file mode 100644 index 000000000000..3d97eba9005a Binary files /dev/null and b/assets/images/unleashing-ai-mobile/fg3.jpg differ diff --git a/assets/images/unleashing-ai-mobile/fg4.png b/assets/images/unleashing-ai-mobile/fg4.png new file mode 100644 index 000000000000..b105d3ba5e28 Binary files /dev/null and b/assets/images/unleashing-ai-mobile/fg4.png differ diff --git a/assets/images/unleashing-ai-mobile/fg5.png b/assets/images/unleashing-ai-mobile/fg5.png new file mode 100644 index 000000000000..28962913d31a Binary files /dev/null and b/assets/images/unleashing-ai-mobile/fg5.png differ diff --git a/assets/images/unleashing-ai-mobile/fg6.png b/assets/images/unleashing-ai-mobile/fg6.png new file mode 100644 index 000000000000..4a817f080a74 Binary files /dev/null and b/assets/images/unleashing-ai-mobile/fg6.png differ diff --git a/assets/images/unlocking-pt-2-6-intel.png b/assets/images/unlocking-pt-2-6-intel.png new file mode 100644 index 000000000000..94d372662a2c Binary files /dev/null and b/assets/images/unlocking-pt-2-6-intel.png differ diff --git a/assets/images/unlowered-op.jpg b/assets/images/unlowered-op.jpg new file mode 100644 index 000000000000..3557e0ef1403 Binary files /dev/null and b/assets/images/unlowered-op.jpg differ diff --git a/assets/images/vgg.png b/assets/images/vgg.png new file mode 100644 index 000000000000..f7a03d160f8f Binary files /dev/null and b/assets/images/vgg.png differ diff --git a/assets/images/video-backbone-image-1.png b/assets/images/video-backbone-image-1.png new file mode 100644 index 000000000000..d9f20067a376 Binary files /dev/null and b/assets/images/video-backbone-image-1.png differ diff --git a/assets/images/visdom.png b/assets/images/visdom.png new file mode 100644 index 000000000000..2234ef271bf7 Binary files /dev/null and b/assets/images/visdom.png differ diff --git a/assets/images/visualization of ground truths.png b/assets/images/visualization of ground truths.png new file mode 100644 index 000000000000..59a9b1440479 Binary files /dev/null and b/assets/images/visualization of ground truths.png differ diff --git a/assets/images/vllm.png b/assets/images/vllm.png new file mode 100644 index 000000000000..5b272828f498 Binary files /dev/null and 
b/assets/images/vllm.png differ diff --git a/assets/images/warp-specialization/fg1.jpg b/assets/images/warp-specialization/fg1.jpg new file mode 100644 index 000000000000..cea07c126baa Binary files /dev/null and b/assets/images/warp-specialization/fg1.jpg differ diff --git a/assets/images/warp-specialization/fg2.jpg b/assets/images/warp-specialization/fg2.jpg new file mode 100644 index 000000000000..f5121c1d0469 Binary files /dev/null and b/assets/images/warp-specialization/fg2.jpg differ diff --git a/assets/images/warp-specialization/fg3.png b/assets/images/warp-specialization/fg3.png new file mode 100644 index 000000000000..3f190a06a40c Binary files /dev/null and b/assets/images/warp-specialization/fg3.png differ diff --git a/assets/images/warp-specialization/fg4.png b/assets/images/warp-specialization/fg4.png new file mode 100644 index 000000000000..183441f59f88 Binary files /dev/null and b/assets/images/warp-specialization/fg4.png differ diff --git a/assets/images/waveglow_diagram.png b/assets/images/waveglow_diagram.png new file mode 100644 index 000000000000..3ea45444ac0f Binary files /dev/null and b/assets/images/waveglow_diagram.png differ diff --git a/assets/images/webdataset1.png b/assets/images/webdataset1.png new file mode 100644 index 000000000000..9bf62cf89dbf Binary files /dev/null and b/assets/images/webdataset1.png differ diff --git a/assets/images/webdataset2.png b/assets/images/webdataset2.png new file mode 100644 index 000000000000..b660928e38cc Binary files /dev/null and b/assets/images/webdataset2.png differ diff --git a/assets/images/webdataset3.png b/assets/images/webdataset3.png new file mode 100644 index 000000000000..f77d7108ef5f Binary files /dev/null and b/assets/images/webdataset3.png differ diff --git a/assets/images/wei-li.jpg b/assets/images/wei-li.jpg new file mode 100644 index 000000000000..d4a78626d601 Binary files /dev/null and b/assets/images/wei-li.jpg differ diff --git a/assets/images/wide_resnet.png b/assets/images/wide_resnet.png new file mode 100644 index 000000000000..21ab12edb687 Binary files /dev/null and b/assets/images/wide_resnet.png differ diff --git a/assets/images/wsl-image.png b/assets/images/wsl-image.png new file mode 100644 index 000000000000..0da85d8dd1fe Binary files /dev/null and b/assets/images/wsl-image.png differ diff --git a/assets/images/x3d.png b/assets/images/x3d.png new file mode 100644 index 000000000000..7f86e44b724f Binary files /dev/null and b/assets/images/x3d.png differ diff --git a/assets/images/yolop.png b/assets/images/yolop.png new file mode 100644 index 000000000000..1a6088452dc7 Binary files /dev/null and b/assets/images/yolop.png differ diff --git a/assets/images/zeus/fig1.png b/assets/images/zeus/fig1.png new file mode 100644 index 000000000000..05be1cc99a0a Binary files /dev/null and b/assets/images/zeus/fig1.png differ diff --git a/assets/images/zeus/fig2.png b/assets/images/zeus/fig2.png new file mode 100644 index 000000000000..e7486983e387 Binary files /dev/null and b/assets/images/zeus/fig2.png differ diff --git a/assets/images/zeus/fig3.png b/assets/images/zeus/fig3.png new file mode 100644 index 000000000000..caf4e0a1af4b Binary files /dev/null and b/assets/images/zeus/fig3.png differ diff --git a/assets/load-tweets.js b/assets/load-tweets.js new file mode 100644 index 000000000000..eabb5fb62bca --- /dev/null +++ b/assets/load-tweets.js @@ -0,0 +1,80 @@ +var twitter = { + bind: function() { + twitter.waitForIframe(); + }, + + updateInitialText(text) { + $("[data-target='twitter-timeline']").text(text); 
+  },
+
+  waitForIframe: function() {
+    var count = 0;
+    var interval = setInterval(function() {
+      var iframe = document.getElementById("twitter-widget-0");
+
+      if (iframe !== null) {
+        clearInterval(interval);
+        twitter.updateInitialText("");
+        twitter.copyContent(iframe);
+      }
+
+      // Give up after 5 seconds
+      if (count >= 5) {
+        clearInterval(interval);
+        twitter.updateInitialText("Twitter widget could not be loaded.");
+      } else {
+        count += 1;
+      }
+    }, 1000);
+  },
+
+  copyContent(iframe) {
+    var tweets = $(iframe.contentWindow.document).
+      find("ol.timeline-TweetList > li").
+      map(function() {
+        return {
+          isRetweet: $(this).find('.timeline-Tweet-retweetCredit').length > 0,
+          tweetAuthor: $(this).find('.tweetAuthor-screenName').text(),
+          inReplyTo: $(this).find('.timeline-Tweet-inReplyTo').text(),
+          tweetHTML: $(this).find('p.timeline-tweet-text').html()
+        }
+      }).get();
+
+    $("#twitter-widget-0").remove();
+
+    twitter.populateCustomTweets(tweets);
+  },
+
+  populateCustomTweets(tweets) {
+    var tweetsWrapper = $("");
+
+    tweets.forEach(function(tweet) {
+      var tweetWrapper = $("");
+      var metadata = $("");
+
+      if (tweet.isRetweet) {
+        metadata.append("PyTorch Retweeted " + tweet.tweetAuthor + "");
+      }
+
+      if (tweet.inReplyTo) {
+        metadata.append("" + tweet.inReplyTo + "");
+      }
+
+      tweetWrapper.append(metadata);
+
+      tweetWrapper.append("" + tweet.tweetHTML + "");
+
+      tweetWrapper.append(
+        "\
+          PyTorch, @pytorch \
+        ");
+
+      tweetWrapper.prepend("
"); + + tweetsWrapper.append(tweetWrapper); + }); + + $("[data-target='twitter-timeline']").append(tweetsWrapper); + } +} diff --git a/assets/logo-datarock.png b/assets/logo-datarock.png new file mode 100644 index 000000000000..2ca1b273473e Binary files /dev/null and b/assets/logo-datarock.png differ diff --git a/assets/logo-wadhwaniAI.png b/assets/logo-wadhwaniAI.png new file mode 100644 index 000000000000..58e2ce02f0c7 Binary files /dev/null and b/assets/logo-wadhwaniAI.png differ diff --git a/assets/main-menu-dropdown.js b/assets/main-menu-dropdown.js new file mode 100644 index 000000000000..3da2d3220480 --- /dev/null +++ b/assets/main-menu-dropdown.js @@ -0,0 +1,15 @@ +$("[data-toggle='resources-dropdown']").hover(function() { + toggleDropdown($(this).attr("data-toggle")); +}); + +function toggleDropdown(menuToggle) { + var showMenuClass = "show-menu"; + var menuClass = "." + menuToggle + "-menu"; + + if ($(menuClass).hasClass(showMenuClass)) { + $(menuClass).removeClass(showMenuClass); + } else { + $("[data-toggle=" + menuToggle + "].show-menu").removeClass(showMenuClass); + $(menuClass).addClass(showMenuClass); + } +} diff --git a/assets/main.css b/assets/main.css new file mode 100644 index 000000000000..aa7eb1db6cdd --- /dev/null +++ b/assets/main.css @@ -0,0 +1,6 @@ +/*! + * Bootstrap v4.3.1 (https://getbootstrap.com/) + * Copyright 2011-2019 The Bootstrap Authors + * Copyright 2011-2019 Twitter, Inc. + * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) + */:root{--blue: #007bff;--indigo: #6610f2;--purple: #6f42c1;--pink: #e83e8c;--red: #dc3545;--orange: #fd7e14;--yellow: #ffc107;--green: #28a745;--teal: #20c997;--cyan: #17a2b8;--white: #fff;--gray: #6c757d;--gray-dark: #343a40;--primary: #007bff;--secondary: #6c757d;--success: #28a745;--info: #17a2b8;--warning: #ffc107;--danger: #dc3545;--light: #f8f9fa;--dark: #343a40;--breakpoint-xs: 0;--breakpoint-sm: 576px;--breakpoint-md: 768px;--breakpoint-lg: 992px;--breakpoint-xl: 1200px;--font-family-sans-serif: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";--font-family-monospace: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace}*,*::before,*::after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}article,aside,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;text-align:left;background-color:#fff}[tabindex="-1"]:focus{outline:0 !important}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[title],abbr[data-original-title]{text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;border-bottom:0;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}ol,ul,dl{margin-top:0;margin-bottom:1rem}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 
1rem}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#007bff;text-decoration:none;background-color:transparent}a:hover{color:#0056b3;text-decoration:underline}a:not([href]):not([tabindex]){color:inherit;text-decoration:none}a:not([href]):not([tabindex]):hover,a:not([href]):not([tabindex]):focus{color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus{outline:0}pre,code,kbd,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 1rem}img{vertical-align:middle;border-style:none}svg{overflow:hidden;vertical-align:middle}table{border-collapse:collapse}caption{padding-top:.75rem;padding-bottom:.75rem;color:#6c757d;text-align:left;caption-side:bottom}th{text-align:inherit}label{display:inline-block;margin-bottom:.5rem}button{border-radius:0}button:focus{outline:1px dotted;outline:5px auto -webkit-focus-ring-color}input,button,select,optgroup,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}select{word-wrap:normal}button,[type="button"],[type="reset"],[type="submit"]{-webkit-appearance:button}button:not(:disabled),[type="button"]:not(:disabled),[type="reset"]:not(:disabled),[type="submit"]:not(:disabled){cursor:pointer}button::-moz-focus-inner,[type="button"]::-moz-focus-inner,[type="reset"]::-moz-focus-inner,[type="submit"]::-moz-focus-inner{padding:0;border-style:none}input[type="radio"],input[type="checkbox"]{box-sizing:border-box;padding:0}input[type="date"],input[type="time"],input[type="datetime-local"],input[type="month"]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;max-width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit;color:inherit;white-space:normal}progress{vertical-align:baseline}[type="number"]::-webkit-inner-spin-button,[type="number"]::-webkit-outer-spin-button{height:auto}[type="search"]{outline-offset:-2px;-webkit-appearance:none}[type="search"]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item;cursor:pointer}template{display:none}[hidden]{display:none !important}h1,h2,h3,h4,h5,h6,.h1,.h2,.h3,.h4,.h5,.h6{margin-bottom:.5rem;font-weight:500;line-height:1.2}h1,.h1{font-size:2.5rem}h2,.h2{font-size:2rem}h3,.h3{font-size:1.75rem}h4,.h4{font-size:1.5rem}h5,.h5{font-size:1.25rem}h6,.h6{font-size:1rem}.lead{font-size:1.25rem;font-weight:300}.display-1{font-size:6rem;font-weight:300;line-height:1.2}.display-2{font-size:5.5rem;font-weight:300;line-height:1.2}.display-3{font-size:4.5rem;font-weight:300;line-height:1.2}.display-4{font-size:3.5rem;font-weight:300;line-height:1.2}hr{margin-top:1rem;margin-bottom:1rem;border:0;border-top:1px solid 
rgba(0,0,0,0.1)}small,.small{font-size:80%;font-weight:400}mark,.mark{padding:.2em;background-color:#fcf8e3}.list-unstyled{padding-left:0;list-style:none}.list-inline{padding-left:0;list-style:none}.list-inline-item{display:inline-block}.list-inline-item:not(:last-child){margin-right:.5rem}.initialism{font-size:90%;text-transform:uppercase}.blockquote{margin-bottom:1rem;font-size:1.25rem}.blockquote-footer{display:block;font-size:80%;color:#6c757d}.blockquote-footer::before{content:"\2014\00A0"}.img-fluid{max-width:100%;height:auto}.img-thumbnail{padding:.25rem;background-color:#fff;border:1px solid #dee2e6;border-radius:.25rem;max-width:100%;height:auto}.figure{display:inline-block}.figure-img{margin-bottom:.5rem;line-height:1}.figure-caption{font-size:90%;color:#6c757d}code{font-size:87.5%;color:#e83e8c;word-break:break-word}a>code{color:inherit}kbd{padding:.2rem .4rem;font-size:87.5%;color:#fff;background-color:#212529;border-radius:.2rem}kbd kbd{padding:0;font-size:100%;font-weight:700}pre{display:block;font-size:87.5%;color:#212529}pre code{font-size:inherit;color:inherit;word-break:normal}.pre-scrollable{max-height:340px;overflow-y:scroll}.container{width:100%;padding-right:15px;padding-left:15px;margin-right:auto;margin-left:auto}@media (min-width: 576px){.container{max-width:540px}}@media (min-width: 768px){.container{max-width:720px}}@media (min-width: 992px){.container{max-width:960px}}@media (min-width: 1200px){.container{max-width:1140px}}.container-fluid{width:100%;padding-right:15px;padding-left:15px;margin-right:auto;margin-left:auto}.row{display:flex;flex-wrap:wrap;margin-right:-15px;margin-left:-15px}.no-gutters{margin-right:0;margin-left:0}.no-gutters>.col,.no-gutters>[class*="col-"]{padding-right:0;padding-left:0}.col-1,.col-2,.col-3,.col-4,.col-5,.col-6,.col-7,.col-8,.col-9,.col-10,.col-11,.col-12,.col,.col-auto,.col-sm-1,.col-sm-2,.col-sm-3,.col-sm-4,.col-sm-5,.col-sm-6,.col-sm-7,.col-sm-8,.col-sm-9,.col-sm-10,.col-sm-11,.col-sm-12,.col-sm,.col-sm-auto,.col-md-1,.col-md-2,.col-md-3,.col-md-4,.col-md-5,.col-md-6,.col-md-7,.col-md-8,.col-md-9,.col-md-10,.col-md-11,.col-md-12,.col-md,.col-md-auto,.col-lg-1,.col-lg-2,.col-lg-3,.col-lg-4,.col-lg-5,.col-lg-6,.col-lg-7,.col-lg-8,.col-lg-9,.col-lg-10,.col-lg-11,.col-lg-12,.col-lg,.col-lg-auto,.col-xl-1,.col-xl-2,.col-xl-3,.col-xl-4,.col-xl-5,.col-xl-6,.col-xl-7,.col-xl-8,.col-xl-9,.col-xl-10,.col-xl-11,.col-xl-12,.col-xl,.col-xl-auto{position:relative;width:100%;padding-right:15px;padding-left:15px}.col{flex-basis:0;flex-grow:1;max-width:100%}.col-auto{flex:0 0 auto;width:auto;max-width:100%}.col-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-3{flex:0 0 25%;max-width:25%}.col-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-6{flex:0 0 50%;max-width:50%}.col-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-9{flex:0 0 75%;max-width:75%}.col-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-12{flex:0 0 
100%;max-width:100%}.order-first{order:-1}.order-last{order:13}.order-0{order:0}.order-1{order:1}.order-2{order:2}.order-3{order:3}.order-4{order:4}.order-5{order:5}.order-6{order:6}.order-7{order:7}.order-8{order:8}.order-9{order:9}.order-10{order:10}.order-11{order:11}.order-12{order:12}.offset-1{margin-left:8.3333333333%}.offset-2{margin-left:16.6666666667%}.offset-3{margin-left:25%}.offset-4{margin-left:33.3333333333%}.offset-5{margin-left:41.6666666667%}.offset-6{margin-left:50%}.offset-7{margin-left:58.3333333333%}.offset-8{margin-left:66.6666666667%}.offset-9{margin-left:75%}.offset-10{margin-left:83.3333333333%}.offset-11{margin-left:91.6666666667%}@media (min-width: 576px){.col-sm{flex-basis:0;flex-grow:1;max-width:100%}.col-sm-auto{flex:0 0 auto;width:auto;max-width:100%}.col-sm-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-sm-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-sm-3{flex:0 0 25%;max-width:25%}.col-sm-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-sm-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-sm-6{flex:0 0 50%;max-width:50%}.col-sm-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-sm-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-sm-9{flex:0 0 75%;max-width:75%}.col-sm-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-sm-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-sm-12{flex:0 0 100%;max-width:100%}.order-sm-first{order:-1}.order-sm-last{order:13}.order-sm-0{order:0}.order-sm-1{order:1}.order-sm-2{order:2}.order-sm-3{order:3}.order-sm-4{order:4}.order-sm-5{order:5}.order-sm-6{order:6}.order-sm-7{order:7}.order-sm-8{order:8}.order-sm-9{order:9}.order-sm-10{order:10}.order-sm-11{order:11}.order-sm-12{order:12}.offset-sm-0{margin-left:0}.offset-sm-1{margin-left:8.3333333333%}.offset-sm-2{margin-left:16.6666666667%}.offset-sm-3{margin-left:25%}.offset-sm-4{margin-left:33.3333333333%}.offset-sm-5{margin-left:41.6666666667%}.offset-sm-6{margin-left:50%}.offset-sm-7{margin-left:58.3333333333%}.offset-sm-8{margin-left:66.6666666667%}.offset-sm-9{margin-left:75%}.offset-sm-10{margin-left:83.3333333333%}.offset-sm-11{margin-left:91.6666666667%}}@media (min-width: 768px){.col-md{flex-basis:0;flex-grow:1;max-width:100%}.col-md-auto{flex:0 0 auto;width:auto;max-width:100%}.col-md-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-md-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-md-3{flex:0 0 25%;max-width:25%}.col-md-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-md-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-md-6{flex:0 0 50%;max-width:50%}.col-md-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-md-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-md-9{flex:0 0 75%;max-width:75%}.col-md-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-md-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-md-12{flex:0 0 
100%;max-width:100%}.order-md-first{order:-1}.order-md-last{order:13}.order-md-0{order:0}.order-md-1{order:1}.order-md-2{order:2}.order-md-3{order:3}.order-md-4{order:4}.order-md-5{order:5}.order-md-6{order:6}.order-md-7{order:7}.order-md-8{order:8}.order-md-9{order:9}.order-md-10{order:10}.order-md-11{order:11}.order-md-12{order:12}.offset-md-0{margin-left:0}.offset-md-1{margin-left:8.3333333333%}.offset-md-2{margin-left:16.6666666667%}.offset-md-3{margin-left:25%}.offset-md-4{margin-left:33.3333333333%}.offset-md-5{margin-left:41.6666666667%}.offset-md-6{margin-left:50%}.offset-md-7{margin-left:58.3333333333%}.offset-md-8{margin-left:66.6666666667%}.offset-md-9{margin-left:75%}.offset-md-10{margin-left:83.3333333333%}.offset-md-11{margin-left:91.6666666667%}}@media (min-width: 992px){.col-lg{flex-basis:0;flex-grow:1;max-width:100%}.col-lg-auto{flex:0 0 auto;width:auto;max-width:100%}.col-lg-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-lg-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-lg-3{flex:0 0 25%;max-width:25%}.col-lg-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-lg-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-lg-6{flex:0 0 50%;max-width:50%}.col-lg-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-lg-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-lg-9{flex:0 0 75%;max-width:75%}.col-lg-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-lg-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-lg-12{flex:0 0 100%;max-width:100%}.order-lg-first{order:-1}.order-lg-last{order:13}.order-lg-0{order:0}.order-lg-1{order:1}.order-lg-2{order:2}.order-lg-3{order:3}.order-lg-4{order:4}.order-lg-5{order:5}.order-lg-6{order:6}.order-lg-7{order:7}.order-lg-8{order:8}.order-lg-9{order:9}.order-lg-10{order:10}.order-lg-11{order:11}.order-lg-12{order:12}.offset-lg-0{margin-left:0}.offset-lg-1{margin-left:8.3333333333%}.offset-lg-2{margin-left:16.6666666667%}.offset-lg-3{margin-left:25%}.offset-lg-4{margin-left:33.3333333333%}.offset-lg-5{margin-left:41.6666666667%}.offset-lg-6{margin-left:50%}.offset-lg-7{margin-left:58.3333333333%}.offset-lg-8{margin-left:66.6666666667%}.offset-lg-9{margin-left:75%}.offset-lg-10{margin-left:83.3333333333%}.offset-lg-11{margin-left:91.6666666667%}}@media (min-width: 1200px){.col-xl{flex-basis:0;flex-grow:1;max-width:100%}.col-xl-auto{flex:0 0 auto;width:auto;max-width:100%}.col-xl-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-xl-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-xl-3{flex:0 0 25%;max-width:25%}.col-xl-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-xl-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-xl-6{flex:0 0 50%;max-width:50%}.col-xl-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-xl-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-xl-9{flex:0 0 75%;max-width:75%}.col-xl-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-xl-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-xl-12{flex:0 0 
100%;max-width:100%}.order-xl-first{order:-1}.order-xl-last{order:13}.order-xl-0{order:0}.order-xl-1{order:1}.order-xl-2{order:2}.order-xl-3{order:3}.order-xl-4{order:4}.order-xl-5{order:5}.order-xl-6{order:6}.order-xl-7{order:7}.order-xl-8{order:8}.order-xl-9{order:9}.order-xl-10{order:10}.order-xl-11{order:11}.order-xl-12{order:12}.offset-xl-0{margin-left:0}.offset-xl-1{margin-left:8.3333333333%}.offset-xl-2{margin-left:16.6666666667%}.offset-xl-3{margin-left:25%}.offset-xl-4{margin-left:33.3333333333%}.offset-xl-5{margin-left:41.6666666667%}.offset-xl-6{margin-left:50%}.offset-xl-7{margin-left:58.3333333333%}.offset-xl-8{margin-left:66.6666666667%}.offset-xl-9{margin-left:75%}.offset-xl-10{margin-left:83.3333333333%}.offset-xl-11{margin-left:91.6666666667%}}.table{width:100%;margin-bottom:1rem;color:#212529}.table th,.table td{padding:.75rem;vertical-align:top;border-top:1px solid #dee2e6}.table thead th{vertical-align:bottom;border-bottom:2px solid #dee2e6}.table tbody+tbody{border-top:2px solid #dee2e6}.table-sm th,.table-sm td{padding:.3rem}.table-bordered{border:1px solid #dee2e6}.table-bordered th,.table-bordered td{border:1px solid #dee2e6}.table-bordered thead th,.table-bordered thead td{border-bottom-width:2px}.table-borderless th,.table-borderless td,.table-borderless thead th,.table-borderless tbody+tbody{border:0}.table-striped tbody tr:nth-of-type(odd){background-color:rgba(0,0,0,0.05)}.table-hover tbody tr:hover{color:#212529;background-color:rgba(0,0,0,0.075)}.table-primary,.table-primary>th,.table-primary>td{background-color:#b8daff}.table-primary th,.table-primary td,.table-primary thead th,.table-primary tbody+tbody{border-color:#7abaff}.table-hover .table-primary:hover{background-color:#9fcdff}.table-hover .table-primary:hover>td,.table-hover .table-primary:hover>th{background-color:#9fcdff}.table-secondary,.table-secondary>th,.table-secondary>td{background-color:#d6d8db}.table-secondary th,.table-secondary td,.table-secondary thead th,.table-secondary tbody+tbody{border-color:#b3b7bb}.table-hover .table-secondary:hover{background-color:#c8cbcf}.table-hover .table-secondary:hover>td,.table-hover .table-secondary:hover>th{background-color:#c8cbcf}.table-success,.table-success>th,.table-success>td{background-color:#c3e6cb}.table-success th,.table-success td,.table-success thead th,.table-success tbody+tbody{border-color:#8fd19e}.table-hover .table-success:hover{background-color:#b1dfbb}.table-hover .table-success:hover>td,.table-hover .table-success:hover>th{background-color:#b1dfbb}.table-info,.table-info>th,.table-info>td{background-color:#bee5eb}.table-info th,.table-info td,.table-info thead th,.table-info tbody+tbody{border-color:#86cfda}.table-hover .table-info:hover{background-color:#abdde5}.table-hover .table-info:hover>td,.table-hover .table-info:hover>th{background-color:#abdde5}.table-warning,.table-warning>th,.table-warning>td{background-color:#ffeeba}.table-warning th,.table-warning td,.table-warning thead th,.table-warning tbody+tbody{border-color:#ffdf7e}.table-hover .table-warning:hover{background-color:#ffe8a1}.table-hover .table-warning:hover>td,.table-hover .table-warning:hover>th{background-color:#ffe8a1}.table-danger,.table-danger>th,.table-danger>td{background-color:#f5c6cb}.table-danger th,.table-danger td,.table-danger thead th,.table-danger tbody+tbody{border-color:#ed969e}.table-hover .table-danger:hover{background-color:#f1b0b7}.table-hover .table-danger:hover>td,.table-hover 
.table-danger:hover>th{background-color:#f1b0b7}.table-light,.table-light>th,.table-light>td{background-color:#fdfdfe}.table-light th,.table-light td,.table-light thead th,.table-light tbody+tbody{border-color:#fbfcfc}.table-hover .table-light:hover{background-color:#ececf6}.table-hover .table-light:hover>td,.table-hover .table-light:hover>th{background-color:#ececf6}.table-dark,.table-dark>th,.table-dark>td{background-color:#c6c8ca}.table-dark th,.table-dark td,.table-dark thead th,.table-dark tbody+tbody{border-color:#95999c}.table-hover .table-dark:hover{background-color:#b9bbbe}.table-hover .table-dark:hover>td,.table-hover .table-dark:hover>th{background-color:#b9bbbe}.table-active,.table-active>th,.table-active>td{background-color:rgba(0,0,0,0.075)}.table-hover .table-active:hover{background-color:rgba(0,0,0,0.075)}.table-hover .table-active:hover>td,.table-hover .table-active:hover>th{background-color:rgba(0,0,0,0.075)}.table .thead-dark th{color:#fff;background-color:#343a40;border-color:#454d55}.table .thead-light th{color:#495057;background-color:#e9ecef;border-color:#dee2e6}.table-dark{color:#fff;background-color:#343a40}.table-dark th,.table-dark td,.table-dark thead th{border-color:#454d55}.table-dark.table-bordered{border:0}.table-dark.table-striped tbody tr:nth-of-type(odd){background-color:rgba(255,255,255,0.05)}.table-dark.table-hover tbody tr:hover{color:#fff;background-color:rgba(255,255,255,0.075)}@media (max-width: 575.98px){.table-responsive-sm{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-sm>.table-bordered{border:0}}@media (max-width: 767.98px){.table-responsive-md{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-md>.table-bordered{border:0}}@media (max-width: 991.98px){.table-responsive-lg{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-lg>.table-bordered{border:0}}@media (max-width: 1199.98px){.table-responsive-xl{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-xl>.table-bordered{border:0}}.table-responsive{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive>.table-bordered{border:0}.form-control{display:block;width:100%;height:calc(1.5em + .75rem + 2px);padding:.375rem .75rem;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;background-color:#fff;background-clip:padding-box;border:1px solid #ced4da;border-radius:.25rem;transition:border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.form-control{transition:none}}.form-control::-ms-expand{background-color:transparent;border:0}.form-control:focus{color:#495057;background-color:#fff;border-color:#80bdff;outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.form-control::-moz-placeholder{color:#6c757d;opacity:1}.form-control:-ms-input-placeholder{color:#6c757d;opacity:1}.form-control::-ms-input-placeholder{color:#6c757d;opacity:1}.form-control::placeholder{color:#6c757d;opacity:1}.form-control:disabled,.form-control[readonly]{background-color:#e9ecef;opacity:1}select.form-control:focus::-ms-value{color:#495057;background-color:#fff}.form-control-file,.form-control-range{display:block;width:100%}.col-form-label{padding-top:calc(.375rem + 1px);padding-bottom:calc(.375rem + 1px);margin-bottom:0;font-size:inherit;line-height:1.5}.col-form-label-lg{padding-top:calc(.5rem + 1px);padding-bottom:calc(.5rem + 
1px);font-size:1.25rem;line-height:1.5}.col-form-label-sm{padding-top:calc(.25rem + 1px);padding-bottom:calc(.25rem + 1px);font-size:.875rem;line-height:1.5}.form-control-plaintext{display:block;width:100%;padding-top:.375rem;padding-bottom:.375rem;margin-bottom:0;line-height:1.5;color:#212529;background-color:transparent;border:solid transparent;border-width:1px 0}.form-control-plaintext.form-control-sm,.form-control-plaintext.form-control-lg{padding-right:0;padding-left:0}.form-control-sm{height:calc(1.5em + .5rem + 2px);padding:.25rem .5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.form-control-lg{height:calc(1.5em + 1rem + 2px);padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}select.form-control[size],select.form-control[multiple]{height:auto}textarea.form-control{height:auto}.form-group{margin-bottom:1rem}.form-text{display:block;margin-top:.25rem}.form-row{display:flex;flex-wrap:wrap;margin-right:-5px;margin-left:-5px}.form-row>.col,.form-row>[class*="col-"]{padding-right:5px;padding-left:5px}.form-check{position:relative;display:block;padding-left:1.25rem}.form-check-input{position:absolute;margin-top:.3rem;margin-left:-1.25rem}.form-check-input:disabled ~ .form-check-label{color:#6c757d}.form-check-label{margin-bottom:0}.form-check-inline{display:inline-flex;align-items:center;padding-left:0;margin-right:.75rem}.form-check-inline .form-check-input{position:static;margin-top:0;margin-right:.3125rem;margin-left:0}.valid-feedback{display:none;width:100%;margin-top:.25rem;font-size:80%;color:#28a745}.valid-tooltip{position:absolute;top:100%;z-index:5;display:none;max-width:100%;padding:.25rem .5rem;margin-top:.1rem;font-size:.875rem;line-height:1.5;color:#fff;background-color:rgba(40,167,69,0.9);border-radius:.25rem}.was-validated .form-control:valid,.form-control.is-valid{border-color:#28a745;padding-right:calc(1.5em + .75rem);background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath fill='%2328a745' d='M2.3 6.73L.6 4.53c-.4-1.04.46-1.4 1.1-.8l1.1 1.4 3.4-3.8c.6-.63 1.6-.27 1.2.7l-4 4.6c-.43.5-.8.4-1.1.1z'/%3e%3c/svg%3e");background-repeat:no-repeat;background-position:center right calc(.375em + .1875rem);background-size:calc(.75em + .375rem) calc(.75em + .375rem)}.was-validated .form-control:valid:focus,.form-control.is-valid:focus{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,0.25)}.was-validated .form-control:valid ~ .valid-feedback,.was-validated .form-control:valid ~ .valid-tooltip,.form-control.is-valid ~ .valid-feedback,.form-control.is-valid ~ .valid-tooltip{display:block}.was-validated textarea.form-control:valid,textarea.form-control.is-valid{padding-right:calc(1.5em + .75rem);background-position:top calc(.375em + .1875rem) right calc(.375em + .1875rem)}.was-validated .custom-select:valid,.custom-select.is-valid{border-color:#28a745;padding-right:calc((1em + .75rem) * 3 / 4 + 1.75rem);background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px,url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath fill='%2328a745' d='M2.3 6.73L.6 4.53c-.4-1.04.46-1.4 1.1-.8l1.1 1.4 3.4-3.8c.6-.63 1.6-.27 1.2.7l-4 4.6c-.43.5-.8.4-1.1.1z'/%3e%3c/svg%3e") #fff no-repeat center right 1.75rem/calc(.75em + .375rem) calc(.75em + .375rem)}.was-validated 
.custom-select:valid:focus,.custom-select.is-valid:focus{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,0.25)}.was-validated .custom-select:valid ~ .valid-feedback,.was-validated .custom-select:valid ~ .valid-tooltip,.custom-select.is-valid ~ .valid-feedback,.custom-select.is-valid ~ .valid-tooltip{display:block}.was-validated .form-control-file:valid ~ .valid-feedback,.was-validated .form-control-file:valid ~ .valid-tooltip,.form-control-file.is-valid ~ .valid-feedback,.form-control-file.is-valid ~ .valid-tooltip{display:block}.was-validated .form-check-input:valid ~ .form-check-label,.form-check-input.is-valid ~ .form-check-label{color:#28a745}.was-validated .form-check-input:valid ~ .valid-feedback,.was-validated .form-check-input:valid ~ .valid-tooltip,.form-check-input.is-valid ~ .valid-feedback,.form-check-input.is-valid ~ .valid-tooltip{display:block}.was-validated .custom-control-input:valid ~ .custom-control-label,.custom-control-input.is-valid ~ .custom-control-label{color:#28a745}.was-validated .custom-control-input:valid ~ .custom-control-label::before,.custom-control-input.is-valid ~ .custom-control-label::before{border-color:#28a745}.was-validated .custom-control-input:valid ~ .valid-feedback,.was-validated .custom-control-input:valid ~ .valid-tooltip,.custom-control-input.is-valid ~ .valid-feedback,.custom-control-input.is-valid ~ .valid-tooltip{display:block}.was-validated .custom-control-input:valid:checked ~ .custom-control-label::before,.custom-control-input.is-valid:checked ~ .custom-control-label::before{border-color:#34ce57;background-color:#34ce57}.was-validated .custom-control-input:valid:focus ~ .custom-control-label::before,.custom-control-input.is-valid:focus ~ .custom-control-label::before{box-shadow:0 0 0 .2rem rgba(40,167,69,0.25)}.was-validated .custom-control-input:valid:focus:not(:checked) ~ .custom-control-label::before,.custom-control-input.is-valid:focus:not(:checked) ~ .custom-control-label::before{border-color:#28a745}.was-validated .custom-file-input:valid ~ .custom-file-label,.custom-file-input.is-valid ~ .custom-file-label{border-color:#28a745}.was-validated .custom-file-input:valid ~ .valid-feedback,.was-validated .custom-file-input:valid ~ .valid-tooltip,.custom-file-input.is-valid ~ .valid-feedback,.custom-file-input.is-valid ~ .valid-tooltip{display:block}.was-validated .custom-file-input:valid:focus ~ .custom-file-label,.custom-file-input.is-valid:focus ~ .custom-file-label{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,0.25)}.invalid-feedback{display:none;width:100%;margin-top:.25rem;font-size:80%;color:#dc3545}.invalid-tooltip{position:absolute;top:100%;z-index:5;display:none;max-width:100%;padding:.25rem .5rem;margin-top:.1rem;font-size:.875rem;line-height:1.5;color:#fff;background-color:rgba(220,53,69,0.9);border-radius:.25rem}.was-validated .form-control:invalid,.form-control.is-invalid{border-color:#dc3545;padding-right:calc(1.5em + .75rem);background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23dc3545' viewBox='-2 -2 7 7'%3e%3cpath stroke='%23dc3545' d='M0 0l3 3m0-3L0 3'/%3e%3ccircle r='.5'/%3e%3ccircle cx='3' r='.5'/%3e%3ccircle cy='3' r='.5'/%3e%3ccircle cx='3' cy='3' r='.5'/%3e%3c/svg%3E");background-repeat:no-repeat;background-position:center right calc(.375em + .1875rem);background-size:calc(.75em + .375rem) calc(.75em + .375rem)}.was-validated .form-control:invalid:focus,.form-control.is-invalid:focus{border-color:#dc3545;box-shadow:0 0 0 .2rem 
rgba(220,53,69,0.25)}.was-validated .form-control:invalid ~ .invalid-feedback,.was-validated .form-control:invalid ~ .invalid-tooltip,.form-control.is-invalid ~ .invalid-feedback,.form-control.is-invalid ~ .invalid-tooltip{display:block}.was-validated textarea.form-control:invalid,textarea.form-control.is-invalid{padding-right:calc(1.5em + .75rem);background-position:top calc(.375em + .1875rem) right calc(.375em + .1875rem)}.was-validated .custom-select:invalid,.custom-select.is-invalid{border-color:#dc3545;padding-right:calc((1em + .75rem) * 3 / 4 + 1.75rem);background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px,url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23dc3545' viewBox='-2 -2 7 7'%3e%3cpath stroke='%23dc3545' d='M0 0l3 3m0-3L0 3'/%3e%3ccircle r='.5'/%3e%3ccircle cx='3' r='.5'/%3e%3ccircle cy='3' r='.5'/%3e%3ccircle cx='3' cy='3' r='.5'/%3e%3c/svg%3E") #fff no-repeat center right 1.75rem/calc(.75em + .375rem) calc(.75em + .375rem)}.was-validated .custom-select:invalid:focus,.custom-select.is-invalid:focus{border-color:#dc3545;box-shadow:0 0 0 .2rem rgba(220,53,69,0.25)}.was-validated .custom-select:invalid ~ .invalid-feedback,.was-validated .custom-select:invalid ~ .invalid-tooltip,.custom-select.is-invalid ~ .invalid-feedback,.custom-select.is-invalid ~ .invalid-tooltip{display:block}.was-validated .form-control-file:invalid ~ .invalid-feedback,.was-validated .form-control-file:invalid ~ .invalid-tooltip,.form-control-file.is-invalid ~ .invalid-feedback,.form-control-file.is-invalid ~ .invalid-tooltip{display:block}.was-validated .form-check-input:invalid ~ .form-check-label,.form-check-input.is-invalid ~ .form-check-label{color:#dc3545}.was-validated .form-check-input:invalid ~ .invalid-feedback,.was-validated .form-check-input:invalid ~ .invalid-tooltip,.form-check-input.is-invalid ~ .invalid-feedback,.form-check-input.is-invalid ~ .invalid-tooltip{display:block}.was-validated .custom-control-input:invalid ~ .custom-control-label,.custom-control-input.is-invalid ~ .custom-control-label{color:#dc3545}.was-validated .custom-control-input:invalid ~ .custom-control-label::before,.custom-control-input.is-invalid ~ .custom-control-label::before{border-color:#dc3545}.was-validated .custom-control-input:invalid ~ .invalid-feedback,.was-validated .custom-control-input:invalid ~ .invalid-tooltip,.custom-control-input.is-invalid ~ .invalid-feedback,.custom-control-input.is-invalid ~ .invalid-tooltip{display:block}.was-validated .custom-control-input:invalid:checked ~ .custom-control-label::before,.custom-control-input.is-invalid:checked ~ .custom-control-label::before{border-color:#e4606d;background-color:#e4606d}.was-validated .custom-control-input:invalid:focus ~ .custom-control-label::before,.custom-control-input.is-invalid:focus ~ .custom-control-label::before{box-shadow:0 0 0 .2rem rgba(220,53,69,0.25)}.was-validated .custom-control-input:invalid:focus:not(:checked) ~ .custom-control-label::before,.custom-control-input.is-invalid:focus:not(:checked) ~ .custom-control-label::before{border-color:#dc3545}.was-validated .custom-file-input:invalid ~ .custom-file-label,.custom-file-input.is-invalid ~ .custom-file-label{border-color:#dc3545}.was-validated .custom-file-input:invalid ~ .invalid-feedback,.was-validated .custom-file-input:invalid ~ .invalid-tooltip,.custom-file-input.is-invalid ~ 
.invalid-feedback,.custom-file-input.is-invalid ~ .invalid-tooltip{display:block}.was-validated .custom-file-input:invalid:focus ~ .custom-file-label,.custom-file-input.is-invalid:focus ~ .custom-file-label{border-color:#dc3545;box-shadow:0 0 0 .2rem rgba(220,53,69,0.25)}.form-inline{display:flex;flex-flow:row wrap;align-items:center}.form-inline .form-check{width:100%}@media (min-width: 576px){.form-inline label{display:flex;align-items:center;justify-content:center;margin-bottom:0}.form-inline .form-group{display:flex;flex:0 0 auto;flex-flow:row wrap;align-items:center;margin-bottom:0}.form-inline .form-control{display:inline-block;width:auto;vertical-align:middle}.form-inline .form-control-plaintext{display:inline-block}.form-inline .input-group,.form-inline .custom-select{width:auto}.form-inline .form-check{display:flex;align-items:center;justify-content:center;width:auto;padding-left:0}.form-inline .form-check-input{position:relative;flex-shrink:0;margin-top:0;margin-right:.25rem;margin-left:0}.form-inline .custom-control{align-items:center;justify-content:center}.form-inline .custom-control-label{margin-bottom:0}}.btn{display:inline-block;font-weight:400;color:#212529;text-align:center;vertical-align:middle;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;background-color:transparent;border:1px solid transparent;padding:.375rem .75rem;font-size:1rem;line-height:1.5;border-radius:.25rem;transition:color 0.15s ease-in-out,background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.btn{transition:none}}.btn:hover{color:#212529;text-decoration:none}.btn:focus,.btn.focus{outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.btn.disabled,.btn:disabled{opacity:.65}a.btn.disabled,fieldset:disabled a.btn{pointer-events:none}.btn-primary{color:#fff;background-color:#007bff;border-color:#007bff}.btn-primary:hover{color:#fff;background-color:#0069d9;border-color:#0062cc}.btn-primary:focus,.btn-primary.focus{box-shadow:0 0 0 .2rem rgba(38,143,255,0.5)}.btn-primary.disabled,.btn-primary:disabled{color:#fff;background-color:#007bff;border-color:#007bff}.btn-primary:not(:disabled):not(.disabled):active,.btn-primary:not(:disabled):not(.disabled).active,.show>.btn-primary.dropdown-toggle{color:#fff;background-color:#0062cc;border-color:#005cbf}.btn-primary:not(:disabled):not(.disabled):active:focus,.btn-primary:not(:disabled):not(.disabled).active:focus,.show>.btn-primary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(38,143,255,0.5)}.btn-secondary{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-secondary:hover{color:#fff;background-color:#5a6268;border-color:#545b62}.btn-secondary:focus,.btn-secondary.focus{box-shadow:0 0 0 .2rem rgba(130,138,145,0.5)}.btn-secondary.disabled,.btn-secondary:disabled{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-secondary:not(:disabled):not(.disabled):active,.btn-secondary:not(:disabled):not(.disabled).active,.show>.btn-secondary.dropdown-toggle{color:#fff;background-color:#545b62;border-color:#4e555b}.btn-secondary:not(:disabled):not(.disabled):active:focus,.btn-secondary:not(:disabled):not(.disabled).active:focus,.show>.btn-secondary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(130,138,145,0.5)}.btn-success{color:#fff;background-color:#28a745;border-color:#28a745}.btn-success:hover{color:#fff;background-color:#218838;border-color:#1e7e34}.btn-success:focus,.btn-success.focus{box-shadow:0 0 0 .2rem 
rgba(72,180,97,0.5)}.btn-success.disabled,.btn-success:disabled{color:#fff;background-color:#28a745;border-color:#28a745}.btn-success:not(:disabled):not(.disabled):active,.btn-success:not(:disabled):not(.disabled).active,.show>.btn-success.dropdown-toggle{color:#fff;background-color:#1e7e34;border-color:#1c7430}.btn-success:not(:disabled):not(.disabled):active:focus,.btn-success:not(:disabled):not(.disabled).active:focus,.show>.btn-success.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(72,180,97,0.5)}.btn-info{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-info:hover{color:#fff;background-color:#138496;border-color:#117a8b}.btn-info:focus,.btn-info.focus{box-shadow:0 0 0 .2rem rgba(58,176,195,0.5)}.btn-info.disabled,.btn-info:disabled{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-info:not(:disabled):not(.disabled):active,.btn-info:not(:disabled):not(.disabled).active,.show>.btn-info.dropdown-toggle{color:#fff;background-color:#117a8b;border-color:#10707f}.btn-info:not(:disabled):not(.disabled):active:focus,.btn-info:not(:disabled):not(.disabled).active:focus,.show>.btn-info.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(58,176,195,0.5)}.btn-warning{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-warning:hover{color:#212529;background-color:#e0a800;border-color:#d39e00}.btn-warning:focus,.btn-warning.focus{box-shadow:0 0 0 .2rem rgba(222,170,12,0.5)}.btn-warning.disabled,.btn-warning:disabled{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-warning:not(:disabled):not(.disabled):active,.btn-warning:not(:disabled):not(.disabled).active,.show>.btn-warning.dropdown-toggle{color:#212529;background-color:#d39e00;border-color:#c69500}.btn-warning:not(:disabled):not(.disabled):active:focus,.btn-warning:not(:disabled):not(.disabled).active:focus,.show>.btn-warning.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(222,170,12,0.5)}.btn-danger{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-danger:hover{color:#fff;background-color:#c82333;border-color:#bd2130}.btn-danger:focus,.btn-danger.focus{box-shadow:0 0 0 .2rem rgba(225,83,97,0.5)}.btn-danger.disabled,.btn-danger:disabled{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-danger:not(:disabled):not(.disabled):active,.btn-danger:not(:disabled):not(.disabled).active,.show>.btn-danger.dropdown-toggle{color:#fff;background-color:#bd2130;border-color:#b21f2d}.btn-danger:not(:disabled):not(.disabled):active:focus,.btn-danger:not(:disabled):not(.disabled).active:focus,.show>.btn-danger.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(225,83,97,0.5)}.btn-light{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-light:hover{color:#212529;background-color:#e2e6ea;border-color:#dae0e5}.btn-light:focus,.btn-light.focus{box-shadow:0 0 0 .2rem rgba(216,217,219,0.5)}.btn-light.disabled,.btn-light:disabled{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-light:not(:disabled):not(.disabled):active,.btn-light:not(:disabled):not(.disabled).active,.show>.btn-light.dropdown-toggle{color:#212529;background-color:#dae0e5;border-color:#d3d9df}.btn-light:not(:disabled):not(.disabled):active:focus,.btn-light:not(:disabled):not(.disabled).active:focus,.show>.btn-light.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(216,217,219,0.5)}.btn-dark{color:#fff;background-color:#343a40;border-color:#343a40}.btn-dark:hover{color:#fff;background-color:#23272b;border-color:#1d2124}.btn-dark:focus,.btn-dark.focus{box-shadow:0 0 0 .2rem 
rgba(82,88,93,0.5)}.btn-dark.disabled,.btn-dark:disabled{color:#fff;background-color:#343a40;border-color:#343a40}.btn-dark:not(:disabled):not(.disabled):active,.btn-dark:not(:disabled):not(.disabled).active,.show>.btn-dark.dropdown-toggle{color:#fff;background-color:#1d2124;border-color:#171a1d}.btn-dark:not(:disabled):not(.disabled):active:focus,.btn-dark:not(:disabled):not(.disabled).active:focus,.show>.btn-dark.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(82,88,93,0.5)}.btn-outline-primary{color:#007bff;border-color:#007bff}.btn-outline-primary:hover{color:#fff;background-color:#007bff;border-color:#007bff}.btn-outline-primary:focus,.btn-outline-primary.focus{box-shadow:0 0 0 .2rem rgba(0,123,255,0.5)}.btn-outline-primary.disabled,.btn-outline-primary:disabled{color:#007bff;background-color:transparent}.btn-outline-primary:not(:disabled):not(.disabled):active,.btn-outline-primary:not(:disabled):not(.disabled).active,.show>.btn-outline-primary.dropdown-toggle{color:#fff;background-color:#007bff;border-color:#007bff}.btn-outline-primary:not(:disabled):not(.disabled):active:focus,.btn-outline-primary:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-primary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(0,123,255,0.5)}.btn-outline-secondary{color:#6c757d;border-color:#6c757d}.btn-outline-secondary:hover{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-outline-secondary:focus,.btn-outline-secondary.focus{box-shadow:0 0 0 .2rem rgba(108,117,125,0.5)}.btn-outline-secondary.disabled,.btn-outline-secondary:disabled{color:#6c757d;background-color:transparent}.btn-outline-secondary:not(:disabled):not(.disabled):active,.btn-outline-secondary:not(:disabled):not(.disabled).active,.show>.btn-outline-secondary.dropdown-toggle{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-outline-secondary:not(:disabled):not(.disabled):active:focus,.btn-outline-secondary:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-secondary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(108,117,125,0.5)}.btn-outline-success{color:#28a745;border-color:#28a745}.btn-outline-success:hover{color:#fff;background-color:#28a745;border-color:#28a745}.btn-outline-success:focus,.btn-outline-success.focus{box-shadow:0 0 0 .2rem rgba(40,167,69,0.5)}.btn-outline-success.disabled,.btn-outline-success:disabled{color:#28a745;background-color:transparent}.btn-outline-success:not(:disabled):not(.disabled):active,.btn-outline-success:not(:disabled):not(.disabled).active,.show>.btn-outline-success.dropdown-toggle{color:#fff;background-color:#28a745;border-color:#28a745}.btn-outline-success:not(:disabled):not(.disabled):active:focus,.btn-outline-success:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-success.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(40,167,69,0.5)}.btn-outline-info{color:#17a2b8;border-color:#17a2b8}.btn-outline-info:hover{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-outline-info:focus,.btn-outline-info.focus{box-shadow:0 0 0 .2rem rgba(23,162,184,0.5)}.btn-outline-info.disabled,.btn-outline-info:disabled{color:#17a2b8;background-color:transparent}.btn-outline-info:not(:disabled):not(.disabled):active,.btn-outline-info:not(:disabled):not(.disabled).active,.show>.btn-outline-info.dropdown-toggle{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-outline-info:not(:disabled):not(.disabled):active:focus,.btn-outline-info:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-info.dropdown-toggle:focus{box-shadow:0 0 0 
.2rem rgba(23,162,184,0.5)}.btn-outline-warning{color:#ffc107;border-color:#ffc107}.btn-outline-warning:hover{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-outline-warning:focus,.btn-outline-warning.focus{box-shadow:0 0 0 .2rem rgba(255,193,7,0.5)}.btn-outline-warning.disabled,.btn-outline-warning:disabled{color:#ffc107;background-color:transparent}.btn-outline-warning:not(:disabled):not(.disabled):active,.btn-outline-warning:not(:disabled):not(.disabled).active,.show>.btn-outline-warning.dropdown-toggle{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-outline-warning:not(:disabled):not(.disabled):active:focus,.btn-outline-warning:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-warning.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(255,193,7,0.5)}.btn-outline-danger{color:#dc3545;border-color:#dc3545}.btn-outline-danger:hover{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-outline-danger:focus,.btn-outline-danger.focus{box-shadow:0 0 0 .2rem rgba(220,53,69,0.5)}.btn-outline-danger.disabled,.btn-outline-danger:disabled{color:#dc3545;background-color:transparent}.btn-outline-danger:not(:disabled):not(.disabled):active,.btn-outline-danger:not(:disabled):not(.disabled).active,.show>.btn-outline-danger.dropdown-toggle{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-outline-danger:not(:disabled):not(.disabled):active:focus,.btn-outline-danger:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-danger.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(220,53,69,0.5)}.btn-outline-light{color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light:hover{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light:focus,.btn-outline-light.focus{box-shadow:0 0 0 .2rem rgba(248,249,250,0.5)}.btn-outline-light.disabled,.btn-outline-light:disabled{color:#f8f9fa;background-color:transparent}.btn-outline-light:not(:disabled):not(.disabled):active,.btn-outline-light:not(:disabled):not(.disabled).active,.show>.btn-outline-light.dropdown-toggle{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light:not(:disabled):not(.disabled):active:focus,.btn-outline-light:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-light.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(248,249,250,0.5)}.btn-outline-dark{color:#343a40;border-color:#343a40}.btn-outline-dark:hover{color:#fff;background-color:#343a40;border-color:#343a40}.btn-outline-dark:focus,.btn-outline-dark.focus{box-shadow:0 0 0 .2rem rgba(52,58,64,0.5)}.btn-outline-dark.disabled,.btn-outline-dark:disabled{color:#343a40;background-color:transparent}.btn-outline-dark:not(:disabled):not(.disabled):active,.btn-outline-dark:not(:disabled):not(.disabled).active,.show>.btn-outline-dark.dropdown-toggle{color:#fff;background-color:#343a40;border-color:#343a40}.btn-outline-dark:not(:disabled):not(.disabled):active:focus,.btn-outline-dark:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-dark.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(52,58,64,0.5)}.btn-link{font-weight:400;color:#007bff;text-decoration:none}.btn-link:hover{color:#0056b3;text-decoration:underline}.btn-link:focus,.btn-link.focus{text-decoration:underline;box-shadow:none}.btn-link:disabled,.btn-link.disabled{color:#6c757d;pointer-events:none}.btn-lg,.btn-group-lg>.btn{padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}.btn-sm,.btn-group-sm>.btn{padding:.25rem 
.5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.btn-block{display:block;width:100%}.btn-block+.btn-block{margin-top:.5rem}input[type="submit"].btn-block,input[type="reset"].btn-block,input[type="button"].btn-block{width:100%}.fade{transition:opacity 0.15s linear}@media (prefers-reduced-motion: reduce){.fade{transition:none}}.fade:not(.show){opacity:0}.collapse:not(.show){display:none}.collapsing{position:relative;height:0;overflow:hidden;transition:height 0.35s ease}@media (prefers-reduced-motion: reduce){.collapsing{transition:none}}.dropup,.dropright,.dropdown,.dropleft{position:relative}.dropdown-toggle{white-space:nowrap}.dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:.3em solid;border-right:.3em solid transparent;border-bottom:0;border-left:.3em solid transparent}.dropdown-toggle:empty::after{margin-left:0}.dropdown-menu{position:absolute;top:100%;left:0;z-index:1000;display:none;float:left;min-width:10rem;padding:.5rem 0;margin:.125rem 0 0;font-size:1rem;color:#212529;text-align:left;list-style:none;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,0.15);border-radius:.25rem}.dropdown-menu-left{right:auto;left:0}.dropdown-menu-right{right:0;left:auto}@media (min-width: 576px){.dropdown-menu-sm-left{right:auto;left:0}.dropdown-menu-sm-right{right:0;left:auto}}@media (min-width: 768px){.dropdown-menu-md-left{right:auto;left:0}.dropdown-menu-md-right{right:0;left:auto}}@media (min-width: 992px){.dropdown-menu-lg-left{right:auto;left:0}.dropdown-menu-lg-right{right:0;left:auto}}@media (min-width: 1200px){.dropdown-menu-xl-left{right:auto;left:0}.dropdown-menu-xl-right{right:0;left:auto}}.dropup .dropdown-menu{top:auto;bottom:100%;margin-top:0;margin-bottom:.125rem}.dropup .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:0;border-right:.3em solid transparent;border-bottom:.3em solid;border-left:.3em solid transparent}.dropup .dropdown-toggle:empty::after{margin-left:0}.dropright .dropdown-menu{top:0;right:auto;left:100%;margin-top:0;margin-left:.125rem}.dropright .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:.3em solid transparent;border-right:0;border-bottom:.3em solid transparent;border-left:.3em solid}.dropright .dropdown-toggle:empty::after{margin-left:0}.dropright .dropdown-toggle::after{vertical-align:0}.dropleft .dropdown-menu{top:0;right:100%;left:auto;margin-top:0;margin-right:.125rem}.dropleft .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:""}.dropleft .dropdown-toggle::after{display:none}.dropleft .dropdown-toggle::before{display:inline-block;margin-right:.255em;vertical-align:.255em;content:"";border-top:.3em solid transparent;border-right:.3em solid;border-bottom:.3em solid transparent}.dropleft .dropdown-toggle:empty::after{margin-left:0}.dropleft .dropdown-toggle::before{vertical-align:0}.dropdown-menu[x-placement^="top"],.dropdown-menu[x-placement^="right"],.dropdown-menu[x-placement^="bottom"],.dropdown-menu[x-placement^="left"]{right:auto;bottom:auto}.dropdown-divider{height:0;margin:.5rem 0;overflow:hidden;border-top:1px solid #e9ecef}.dropdown-item{display:block;width:100%;padding:.25rem 
1.5rem;clear:both;font-weight:400;color:#212529;text-align:inherit;white-space:nowrap;background-color:transparent;border:0}.dropdown-item:hover,.dropdown-item:focus{color:#16181b;text-decoration:none;background-color:#f8f9fa}.dropdown-item.active,.dropdown-item:active{color:#fff;text-decoration:none;background-color:#007bff}.dropdown-item.disabled,.dropdown-item:disabled{color:#6c757d;pointer-events:none;background-color:transparent}.dropdown-menu.show{display:block}.dropdown-header{display:block;padding:.5rem 1.5rem;margin-bottom:0;font-size:.875rem;color:#6c757d;white-space:nowrap}.dropdown-item-text{display:block;padding:.25rem 1.5rem;color:#212529}.btn-group,.btn-group-vertical{position:relative;display:inline-flex;vertical-align:middle}.btn-group>.btn,.btn-group-vertical>.btn{position:relative;flex:1 1 auto}.btn-group>.btn:hover,.btn-group-vertical>.btn:hover{z-index:1}.btn-group>.btn:focus,.btn-group>.btn:active,.btn-group>.btn.active,.btn-group-vertical>.btn:focus,.btn-group-vertical>.btn:active,.btn-group-vertical>.btn.active{z-index:1}.btn-toolbar{display:flex;flex-wrap:wrap;justify-content:flex-start}.btn-toolbar .input-group{width:auto}.btn-group>.btn:not(:first-child),.btn-group>.btn-group:not(:first-child){margin-left:-1px}.btn-group>.btn:not(:last-child):not(.dropdown-toggle),.btn-group>.btn-group:not(:last-child)>.btn{border-top-right-radius:0;border-bottom-right-radius:0}.btn-group>.btn:not(:first-child),.btn-group>.btn-group:not(:first-child)>.btn{border-top-left-radius:0;border-bottom-left-radius:0}.dropdown-toggle-split{padding-right:.5625rem;padding-left:.5625rem}.dropdown-toggle-split::after,.dropup .dropdown-toggle-split::after,.dropright .dropdown-toggle-split::after{margin-left:0}.dropleft .dropdown-toggle-split::before{margin-right:0}.btn-sm+.dropdown-toggle-split,.btn-group-sm>.btn+.dropdown-toggle-split{padding-right:.375rem;padding-left:.375rem}.btn-lg+.dropdown-toggle-split,.btn-group-lg>.btn+.dropdown-toggle-split{padding-right:.75rem;padding-left:.75rem}.btn-group-vertical{flex-direction:column;align-items:flex-start;justify-content:center}.btn-group-vertical>.btn,.btn-group-vertical>.btn-group{width:100%}.btn-group-vertical>.btn:not(:first-child),.btn-group-vertical>.btn-group:not(:first-child){margin-top:-1px}.btn-group-vertical>.btn:not(:last-child):not(.dropdown-toggle),.btn-group-vertical>.btn-group:not(:last-child)>.btn{border-bottom-right-radius:0;border-bottom-left-radius:0}.btn-group-vertical>.btn:not(:first-child),.btn-group-vertical>.btn-group:not(:first-child)>.btn{border-top-left-radius:0;border-top-right-radius:0}.btn-group-toggle>.btn,.btn-group-toggle>.btn-group>.btn{margin-bottom:0}.btn-group-toggle>.btn input[type="radio"],.btn-group-toggle>.btn input[type="checkbox"],.btn-group-toggle>.btn-group>.btn input[type="radio"],.btn-group-toggle>.btn-group>.btn input[type="checkbox"]{position:absolute;clip:rect(0, 0, 0, 0);pointer-events:none}.input-group{position:relative;display:flex;flex-wrap:wrap;align-items:stretch;width:100%}.input-group>.form-control,.input-group>.form-control-plaintext,.input-group>.custom-select,.input-group>.custom-file{position:relative;flex:1 1 
auto;width:1%;margin-bottom:0}.input-group>.form-control+.form-control,.input-group>.form-control+.custom-select,.input-group>.form-control+.custom-file,.input-group>.form-control-plaintext+.form-control,.input-group>.form-control-plaintext+.custom-select,.input-group>.form-control-plaintext+.custom-file,.input-group>.custom-select+.form-control,.input-group>.custom-select+.custom-select,.input-group>.custom-select+.custom-file,.input-group>.custom-file+.form-control,.input-group>.custom-file+.custom-select,.input-group>.custom-file+.custom-file{margin-left:-1px}.input-group>.form-control:focus,.input-group>.custom-select:focus,.input-group>.custom-file .custom-file-input:focus ~ .custom-file-label{z-index:3}.input-group>.custom-file .custom-file-input:focus{z-index:4}.input-group>.form-control:not(:last-child),.input-group>.custom-select:not(:last-child){border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.form-control:not(:first-child),.input-group>.custom-select:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.input-group>.custom-file{display:flex;align-items:center}.input-group>.custom-file:not(:last-child) .custom-file-label,.input-group>.custom-file:not(:last-child) .custom-file-label::after{border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.custom-file:not(:first-child) .custom-file-label{border-top-left-radius:0;border-bottom-left-radius:0}.input-group-prepend,.input-group-append{display:flex}.input-group-prepend .btn,.input-group-append .btn{position:relative;z-index:2}.input-group-prepend .btn:focus,.input-group-append .btn:focus{z-index:3}.input-group-prepend .btn+.btn,.input-group-prepend .btn+.input-group-text,.input-group-prepend .input-group-text+.input-group-text,.input-group-prepend .input-group-text+.btn,.input-group-append .btn+.btn,.input-group-append .btn+.input-group-text,.input-group-append .input-group-text+.input-group-text,.input-group-append .input-group-text+.btn{margin-left:-1px}.input-group-prepend{margin-right:-1px}.input-group-append{margin-left:-1px}.input-group-text{display:flex;align-items:center;padding:.375rem .75rem;margin-bottom:0;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;text-align:center;white-space:nowrap;background-color:#e9ecef;border:1px solid #ced4da;border-radius:.25rem}.input-group-text input[type="radio"],.input-group-text input[type="checkbox"]{margin-top:0}.input-group-lg>.form-control:not(textarea),.input-group-lg>.custom-select{height:calc(1.5em + 1rem + 2px)}.input-group-lg>.form-control,.input-group-lg>.custom-select,.input-group-lg>.input-group-prepend>.input-group-text,.input-group-lg>.input-group-append>.input-group-text,.input-group-lg>.input-group-prepend>.btn,.input-group-lg>.input-group-append>.btn{padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}.input-group-sm>.form-control:not(textarea),.input-group-sm>.custom-select{height:calc(1.5em + .5rem + 2px)}.input-group-sm>.form-control,.input-group-sm>.custom-select,.input-group-sm>.input-group-prepend>.input-group-text,.input-group-sm>.input-group-append>.input-group-text,.input-group-sm>.input-group-prepend>.btn,.input-group-sm>.input-group-append>.btn{padding:.25rem 
.5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.input-group-lg>.custom-select,.input-group-sm>.custom-select{padding-right:1.75rem}.input-group>.input-group-prepend>.btn,.input-group>.input-group-prepend>.input-group-text,.input-group>.input-group-append:not(:last-child)>.btn,.input-group>.input-group-append:not(:last-child)>.input-group-text,.input-group>.input-group-append:last-child>.btn:not(:last-child):not(.dropdown-toggle),.input-group>.input-group-append:last-child>.input-group-text:not(:last-child){border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.input-group-append>.btn,.input-group>.input-group-append>.input-group-text,.input-group>.input-group-prepend:not(:first-child)>.btn,.input-group>.input-group-prepend:not(:first-child)>.input-group-text,.input-group>.input-group-prepend:first-child>.btn:not(:first-child),.input-group>.input-group-prepend:first-child>.input-group-text:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.custom-control{position:relative;display:block;min-height:1.5rem;padding-left:1.5rem}.custom-control-inline{display:inline-flex;margin-right:1rem}.custom-control-input{position:absolute;z-index:-1;opacity:0}.custom-control-input:checked ~ .custom-control-label::before{color:#fff;border-color:#007bff;background-color:#007bff}.custom-control-input:focus ~ .custom-control-label::before{box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.custom-control-input:focus:not(:checked) ~ .custom-control-label::before{border-color:#80bdff}.custom-control-input:not(:disabled):active ~ .custom-control-label::before{color:#fff;background-color:#b3d7ff;border-color:#b3d7ff}.custom-control-input:disabled ~ .custom-control-label{color:#6c757d}.custom-control-input:disabled ~ .custom-control-label::before{background-color:#e9ecef}.custom-control-label{position:relative;margin-bottom:0;vertical-align:top}.custom-control-label::before{position:absolute;top:.25rem;left:-1.5rem;display:block;width:1rem;height:1rem;pointer-events:none;content:"";background-color:#fff;border:#adb5bd solid 1px}.custom-control-label::after{position:absolute;top:.25rem;left:-1.5rem;display:block;width:1rem;height:1rem;content:"";background:no-repeat 50% / 50% 50%}.custom-checkbox .custom-control-label::before{border-radius:.25rem}.custom-checkbox .custom-control-input:checked ~ .custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath fill='%23fff' d='M6.564.75l-3.59 3.612-1.538-1.55L0 4.26 2.974 7.25 8 2.193z'/%3e%3c/svg%3e")}.custom-checkbox .custom-control-input:indeterminate ~ .custom-control-label::before{border-color:#007bff;background-color:#007bff}.custom-checkbox .custom-control-input:indeterminate ~ .custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 4'%3e%3cpath stroke='%23fff' d='M0 2h4'/%3e%3c/svg%3e")}.custom-checkbox .custom-control-input:disabled:checked ~ .custom-control-label::before{background-color:rgba(0,123,255,0.5)}.custom-checkbox .custom-control-input:disabled:indeterminate ~ .custom-control-label::before{background-color:rgba(0,123,255,0.5)}.custom-radio .custom-control-label::before{border-radius:50%}.custom-radio .custom-control-input:checked ~ .custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='-4 -4 8 8'%3e%3ccircle r='3' fill='%23fff'/%3e%3c/svg%3e")}.custom-radio .custom-control-input:disabled:checked ~ 
.custom-control-label::before{background-color:rgba(0,123,255,0.5)}.custom-switch{padding-left:2.25rem}.custom-switch .custom-control-label::before{left:-2.25rem;width:1.75rem;pointer-events:all;border-radius:.5rem}.custom-switch .custom-control-label::after{top:calc(.25rem + 2px);left:calc(-2.25rem + 2px);width:calc(1rem - 4px);height:calc(1rem - 4px);background-color:#adb5bd;border-radius:.5rem;transition:transform 0.15s ease-in-out,background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.custom-switch .custom-control-label::after{transition:none}}.custom-switch .custom-control-input:checked ~ .custom-control-label::after{background-color:#fff;transform:translateX(.75rem)}.custom-switch .custom-control-input:disabled:checked ~ .custom-control-label::before{background-color:rgba(0,123,255,0.5)}.custom-select{display:inline-block;width:100%;height:calc(1.5em + .75rem + 2px);padding:.375rem 1.75rem .375rem .75rem;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;vertical-align:middle;background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px;background-color:#fff;border:1px solid #ced4da;border-radius:.25rem;-webkit-appearance:none;-moz-appearance:none;appearance:none}.custom-select:focus{border-color:#80bdff;outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.custom-select:focus::-ms-value{color:#495057;background-color:#fff}.custom-select[multiple],.custom-select[size]:not([size="1"]){height:auto;padding-right:.75rem;background-image:none}.custom-select:disabled{color:#6c757d;background-color:#e9ecef}.custom-select::-ms-expand{display:none}.custom-select-sm{height:calc(1.5em + .5rem + 2px);padding-top:.25rem;padding-bottom:.25rem;padding-left:.5rem;font-size:.875rem}.custom-select-lg{height:calc(1.5em + 1rem + 2px);padding-top:.5rem;padding-bottom:.5rem;padding-left:1rem;font-size:1.25rem}.custom-file{position:relative;display:inline-block;width:100%;height:calc(1.5em + .75rem + 2px);margin-bottom:0}.custom-file-input{position:relative;z-index:2;width:100%;height:calc(1.5em + .75rem + 2px);margin:0;opacity:0}.custom-file-input:focus ~ .custom-file-label{border-color:#80bdff;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.custom-file-input:disabled ~ .custom-file-label{background-color:#e9ecef}.custom-file-input:lang(en) ~ .custom-file-label::after{content:"Browse"}.custom-file-input ~ .custom-file-label[data-browse]::after{content:attr(data-browse)}.custom-file-label{position:absolute;top:0;right:0;left:0;z-index:1;height:calc(1.5em + .75rem + 2px);padding:.375rem .75rem;font-weight:400;line-height:1.5;color:#495057;background-color:#fff;border:1px solid #ced4da;border-radius:.25rem}.custom-file-label::after{position:absolute;top:0;right:0;bottom:0;z-index:3;display:block;height:calc(1.5em + .75rem);padding:.375rem .75rem;line-height:1.5;color:#495057;content:"Browse";background-color:#e9ecef;border-left:inherit;border-radius:0 .25rem .25rem 0}.custom-range{width:100%;height:calc(1rem + .4rem);padding:0;background-color:transparent;-webkit-appearance:none;-moz-appearance:none;appearance:none}.custom-range:focus{outline:none}.custom-range:focus::-webkit-slider-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem rgba(0,123,255,0.25)}.custom-range:focus::-moz-range-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem 
rgba(0,123,255,0.25)}.custom-range:focus::-ms-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem rgba(0,123,255,0.25)}.custom-range::-moz-focus-outer{border:0}.custom-range::-webkit-slider-thumb{width:1rem;height:1rem;margin-top:-.25rem;background-color:#007bff;border:0;border-radius:1rem;-webkit-transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;-webkit-appearance:none;appearance:none}@media (prefers-reduced-motion: reduce){.custom-range::-webkit-slider-thumb{-webkit-transition:none;transition:none}}.custom-range::-webkit-slider-thumb:active{background-color:#b3d7ff}.custom-range::-webkit-slider-runnable-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:#dee2e6;border-color:transparent;border-radius:1rem}.custom-range::-moz-range-thumb{width:1rem;height:1rem;background-color:#007bff;border:0;border-radius:1rem;-moz-transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;-moz-appearance:none;appearance:none}@media (prefers-reduced-motion: reduce){.custom-range::-moz-range-thumb{-moz-transition:none;transition:none}}.custom-range::-moz-range-thumb:active{background-color:#b3d7ff}.custom-range::-moz-range-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:#dee2e6;border-color:transparent;border-radius:1rem}.custom-range::-ms-thumb{width:1rem;height:1rem;margin-top:0;margin-right:.2rem;margin-left:.2rem;background-color:#007bff;border:0;border-radius:1rem;-ms-transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;appearance:none}@media (prefers-reduced-motion: reduce){.custom-range::-ms-thumb{-ms-transition:none;transition:none}}.custom-range::-ms-thumb:active{background-color:#b3d7ff}.custom-range::-ms-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:transparent;border-color:transparent;border-width:.5rem}.custom-range::-ms-fill-lower{background-color:#dee2e6;border-radius:1rem}.custom-range::-ms-fill-upper{margin-right:15px;background-color:#dee2e6;border-radius:1rem}.custom-range:disabled::-webkit-slider-thumb{background-color:#adb5bd}.custom-range:disabled::-webkit-slider-runnable-track{cursor:default}.custom-range:disabled::-moz-range-thumb{background-color:#adb5bd}.custom-range:disabled::-moz-range-track{cursor:default}.custom-range:disabled::-ms-thumb{background-color:#adb5bd}.custom-control-label::before,.custom-file-label,.custom-select{transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.custom-control-label::before,.custom-file-label,.custom-select{transition:none}}.nav{display:flex;flex-wrap:wrap;padding-left:0;margin-bottom:0;list-style:none}.nav-link{display:block;padding:.5rem 1rem}.nav-link:hover,.nav-link:focus{text-decoration:none}.nav-link.disabled{color:#6c757d;pointer-events:none;cursor:default}.nav-tabs{border-bottom:1px solid #dee2e6}.nav-tabs .nav-item{margin-bottom:-1px}.nav-tabs .nav-link{border:1px solid transparent;border-top-left-radius:.25rem;border-top-right-radius:.25rem}.nav-tabs .nav-link:hover,.nav-tabs 
.nav-link:focus{border-color:#e9ecef #e9ecef #dee2e6}.nav-tabs .nav-link.disabled{color:#6c757d;background-color:transparent;border-color:transparent}.nav-tabs .nav-link.active,.nav-tabs .nav-item.show .nav-link{color:#495057;background-color:#fff;border-color:#dee2e6 #dee2e6 #fff}.nav-tabs .dropdown-menu{margin-top:-1px;border-top-left-radius:0;border-top-right-radius:0}.nav-pills .nav-link{border-radius:.25rem}.nav-pills .nav-link.active,.nav-pills .show>.nav-link{color:#fff;background-color:#007bff}.nav-fill .nav-item{flex:1 1 auto;text-align:center}.nav-justified .nav-item{flex-basis:0;flex-grow:1;text-align:center}.tab-content>.tab-pane{display:none}.tab-content>.active{display:block}.navbar{position:relative;display:flex;flex-wrap:wrap;align-items:center;justify-content:space-between;padding:.5rem 1rem}.navbar>.container,.navbar>.container-fluid{display:flex;flex-wrap:wrap;align-items:center;justify-content:space-between}.navbar-brand{display:inline-block;padding-top:.3125rem;padding-bottom:.3125rem;margin-right:1rem;font-size:1.25rem;line-height:inherit;white-space:nowrap}.navbar-brand:hover,.navbar-brand:focus{text-decoration:none}.navbar-nav{display:flex;flex-direction:column;padding-left:0;margin-bottom:0;list-style:none}.navbar-nav .nav-link{padding-right:0;padding-left:0}.navbar-nav .dropdown-menu{position:static;float:none}.navbar-text{display:inline-block;padding-top:.5rem;padding-bottom:.5rem}.navbar-collapse{flex-basis:100%;flex-grow:1;align-items:center}.navbar-toggler{padding:.25rem .75rem;font-size:1.25rem;line-height:1;background-color:transparent;border:1px solid transparent;border-radius:.25rem}.navbar-toggler:hover,.navbar-toggler:focus{text-decoration:none}.navbar-toggler-icon{display:inline-block;width:1.5em;height:1.5em;vertical-align:middle;content:"";background:no-repeat center center;background-size:100% 100%}@media (max-width: 575.98px){.navbar-expand-sm>.container,.navbar-expand-sm>.container-fluid{padding-right:0;padding-left:0}}@media (min-width: 576px){.navbar-expand-sm{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand-sm .navbar-nav{flex-direction:row}.navbar-expand-sm .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-sm .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-sm>.container,.navbar-expand-sm>.container-fluid{flex-wrap:nowrap}.navbar-expand-sm .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand-sm .navbar-toggler{display:none}}@media (max-width: 767.98px){.navbar-expand-md>.container,.navbar-expand-md>.container-fluid{padding-right:0;padding-left:0}}@media (min-width: 768px){.navbar-expand-md{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand-md .navbar-nav{flex-direction:row}.navbar-expand-md .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-md .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-md>.container,.navbar-expand-md>.container-fluid{flex-wrap:nowrap}.navbar-expand-md .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand-md .navbar-toggler{display:none}}@media (max-width: 991.98px){.navbar-expand-lg>.container,.navbar-expand-lg>.container-fluid{padding-right:0;padding-left:0}}@media (min-width: 992px){.navbar-expand-lg{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand-lg .navbar-nav{flex-direction:row}.navbar-expand-lg .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-lg .navbar-nav 
.nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-lg>.container,.navbar-expand-lg>.container-fluid{flex-wrap:nowrap}.navbar-expand-lg .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand-lg .navbar-toggler{display:none}}@media (max-width: 1199.98px){.navbar-expand-xl>.container,.navbar-expand-xl>.container-fluid{padding-right:0;padding-left:0}}@media (min-width: 1200px){.navbar-expand-xl{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand-xl .navbar-nav{flex-direction:row}.navbar-expand-xl .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-xl .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-xl>.container,.navbar-expand-xl>.container-fluid{flex-wrap:nowrap}.navbar-expand-xl .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand-xl .navbar-toggler{display:none}}.navbar-expand{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand>.container,.navbar-expand>.container-fluid{padding-right:0;padding-left:0}.navbar-expand .navbar-nav{flex-direction:row}.navbar-expand .navbar-nav .dropdown-menu{position:absolute}.navbar-expand .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand>.container,.navbar-expand>.container-fluid{flex-wrap:nowrap}.navbar-expand .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand .navbar-toggler{display:none}.navbar-light .navbar-brand{color:rgba(0,0,0,0.9)}.navbar-light .navbar-brand:hover,.navbar-light .navbar-brand:focus{color:rgba(0,0,0,0.9)}.navbar-light .navbar-nav .nav-link{color:rgba(0,0,0,0.5)}.navbar-light .navbar-nav .nav-link:hover,.navbar-light .navbar-nav .nav-link:focus{color:rgba(0,0,0,0.7)}.navbar-light .navbar-nav .nav-link.disabled{color:rgba(0,0,0,0.3)}.navbar-light .navbar-nav .show>.nav-link,.navbar-light .navbar-nav .active>.nav-link,.navbar-light .navbar-nav .nav-link.show,.navbar-light .navbar-nav .nav-link.active{color:rgba(0,0,0,0.9)}.navbar-light .navbar-toggler{color:rgba(0,0,0,0.5);border-color:rgba(0,0,0,0.1)}.navbar-light .navbar-toggler-icon{background-image:url("data:image/svg+xml,%3csvg viewBox='0 0 30 30' xmlns='http://www.w3.org/2000/svg'%3e%3cpath stroke='rgba(0,0,0,0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 7h22M4 15h22M4 23h22'/%3e%3c/svg%3e")}.navbar-light .navbar-text{color:rgba(0,0,0,0.5)}.navbar-light .navbar-text a{color:rgba(0,0,0,0.9)}.navbar-light .navbar-text a:hover,.navbar-light .navbar-text a:focus{color:rgba(0,0,0,0.9)}.navbar-dark .navbar-brand{color:#fff}.navbar-dark .navbar-brand:hover,.navbar-dark .navbar-brand:focus{color:#fff}.navbar-dark .navbar-nav .nav-link{color:rgba(255,255,255,0.5)}.navbar-dark .navbar-nav .nav-link:hover,.navbar-dark .navbar-nav .nav-link:focus{color:rgba(255,255,255,0.75)}.navbar-dark .navbar-nav .nav-link.disabled{color:rgba(255,255,255,0.25)}.navbar-dark .navbar-nav .show>.nav-link,.navbar-dark .navbar-nav .active>.nav-link,.navbar-dark .navbar-nav .nav-link.show,.navbar-dark .navbar-nav .nav-link.active{color:#fff}.navbar-dark .navbar-toggler{color:rgba(255,255,255,0.5);border-color:rgba(255,255,255,0.1)}.navbar-dark .navbar-toggler-icon{background-image:url("data:image/svg+xml,%3csvg viewBox='0 0 30 30' xmlns='http://www.w3.org/2000/svg'%3e%3cpath stroke='rgba(255,255,255,0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 7h22M4 15h22M4 23h22'/%3e%3c/svg%3e")}.navbar-dark .navbar-text{color:rgba(255,255,255,0.5)}.navbar-dark .navbar-text a{color:#fff}.navbar-dark .navbar-text 
a:hover,.navbar-dark .navbar-text a:focus{color:#fff}.card{position:relative;display:flex;flex-direction:column;min-width:0;word-wrap:break-word;background-color:#fff;background-clip:border-box;border:1px solid rgba(0,0,0,0.125);border-radius:.25rem}.card>hr{margin-right:0;margin-left:0}.card>.list-group:first-child .list-group-item:first-child{border-top-left-radius:.25rem;border-top-right-radius:.25rem}.card>.list-group:last-child .list-group-item:last-child{border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.card-body{flex:1 1 auto;padding:1.25rem}.card-title{margin-bottom:.75rem}.card-subtitle{margin-top:-.375rem;margin-bottom:0}.card-text:last-child{margin-bottom:0}.card-link:hover{text-decoration:none}.card-link+.card-link{margin-left:1.25rem}.card-header{padding:.75rem 1.25rem;margin-bottom:0;background-color:rgba(0,0,0,0.03);border-bottom:1px solid rgba(0,0,0,0.125)}.card-header:first-child{border-radius:calc(.25rem - 1px) calc(.25rem - 1px) 0 0}.card-header+.list-group .list-group-item:first-child{border-top:0}.card-footer{padding:.75rem 1.25rem;background-color:rgba(0,0,0,0.03);border-top:1px solid rgba(0,0,0,0.125)}.card-footer:last-child{border-radius:0 0 calc(.25rem - 1px) calc(.25rem - 1px)}.card-header-tabs{margin-right:-.625rem;margin-bottom:-0.75rem;margin-left:-.625rem;border-bottom:0}.card-header-pills{margin-right:-.625rem;margin-left:-.625rem}.card-img-overlay{position:absolute;top:0;right:0;bottom:0;left:0;padding:1.25rem}.card-img{width:100%;border-radius:calc(.25rem - 1px)}.card-img-top{width:100%;border-top-left-radius:calc(.25rem - 1px);border-top-right-radius:calc(.25rem - 1px)}.card-img-bottom{width:100%;border-bottom-right-radius:calc(.25rem - 1px);border-bottom-left-radius:calc(.25rem - 1px)}.card-deck{display:flex;flex-direction:column}.card-deck .card{margin-bottom:15px}@media (min-width: 576px){.card-deck{flex-flow:row wrap;margin-right:-15px;margin-left:-15px}.card-deck .card{display:flex;flex:1 0 0%;flex-direction:column;margin-right:15px;margin-bottom:0;margin-left:15px}}.card-group{display:flex;flex-direction:column}.card-group>.card{margin-bottom:15px}@media (min-width: 576px){.card-group{flex-flow:row wrap}.card-group>.card{flex:1 0 0%;margin-bottom:0}.card-group>.card+.card{margin-left:0;border-left:0}.card-group>.card:not(:last-child){border-top-right-radius:0;border-bottom-right-radius:0}.card-group>.card:not(:last-child) .card-img-top,.card-group>.card:not(:last-child) .card-header{border-top-right-radius:0}.card-group>.card:not(:last-child) .card-img-bottom,.card-group>.card:not(:last-child) .card-footer{border-bottom-right-radius:0}.card-group>.card:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.card-group>.card:not(:first-child) .card-img-top,.card-group>.card:not(:first-child) .card-header{border-top-left-radius:0}.card-group>.card:not(:first-child) .card-img-bottom,.card-group>.card:not(:first-child) .card-footer{border-bottom-left-radius:0}}.card-columns .card{margin-bottom:.75rem}@media (min-width: 576px){.card-columns{-moz-column-count:3;column-count:3;-moz-column-gap:1.25rem;column-gap:1.25rem;orphans:1;widows:1}.card-columns .card{display:inline-block;width:100%}}.accordion>.card{overflow:hidden}.accordion>.card:not(:first-of-type) 
.card-header:first-child{border-radius:0}.accordion>.card:not(:first-of-type):not(:last-of-type){border-bottom:0;border-radius:0}.accordion>.card:first-of-type{border-bottom:0;border-bottom-right-radius:0;border-bottom-left-radius:0}.accordion>.card:last-of-type{border-top-left-radius:0;border-top-right-radius:0}.accordion>.card .card-header{margin-bottom:-1px}.breadcrumb{display:flex;flex-wrap:wrap;padding:.75rem 1rem;margin-bottom:1rem;list-style:none;background-color:#e9ecef;border-radius:.25rem}.breadcrumb-item+.breadcrumb-item{padding-left:.5rem}.breadcrumb-item+.breadcrumb-item::before{display:inline-block;padding-right:.5rem;color:#6c757d;content:"/"}.breadcrumb-item+.breadcrumb-item:hover::before{text-decoration:underline}.breadcrumb-item+.breadcrumb-item:hover::before{text-decoration:none}.breadcrumb-item.active{color:#6c757d}.pagination{display:flex;padding-left:0;list-style:none;border-radius:.25rem}.page-link{position:relative;display:block;padding:.5rem .75rem;margin-left:-1px;line-height:1.25;color:#007bff;background-color:#fff;border:1px solid #dee2e6}.page-link:hover{z-index:2;color:#0056b3;text-decoration:none;background-color:#e9ecef;border-color:#dee2e6}.page-link:focus{z-index:2;outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.page-item:first-child .page-link{margin-left:0;border-top-left-radius:.25rem;border-bottom-left-radius:.25rem}.page-item:last-child .page-link{border-top-right-radius:.25rem;border-bottom-right-radius:.25rem}.page-item.active .page-link{z-index:1;color:#fff;background-color:#007bff;border-color:#007bff}.page-item.disabled .page-link{color:#6c757d;pointer-events:none;cursor:auto;background-color:#fff;border-color:#dee2e6}.pagination-lg .page-link{padding:.75rem 1.5rem;font-size:1.25rem;line-height:1.5}.pagination-lg .page-item:first-child .page-link{border-top-left-radius:.3rem;border-bottom-left-radius:.3rem}.pagination-lg .page-item:last-child .page-link{border-top-right-radius:.3rem;border-bottom-right-radius:.3rem}.pagination-sm .page-link{padding:.25rem .5rem;font-size:.875rem;line-height:1.5}.pagination-sm .page-item:first-child .page-link{border-top-left-radius:.2rem;border-bottom-left-radius:.2rem}.pagination-sm .page-item:last-child .page-link{border-top-right-radius:.2rem;border-bottom-right-radius:.2rem}.badge{display:inline-block;padding:.25em .4em;font-size:75%;font-weight:700;line-height:1;text-align:center;white-space:nowrap;vertical-align:baseline;border-radius:.25rem;transition:color 0.15s ease-in-out,background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.badge{transition:none}}a.badge:hover,a.badge:focus{text-decoration:none}.badge:empty{display:none}.btn .badge{position:relative;top:-1px}.badge-pill{padding-right:.6em;padding-left:.6em;border-radius:10rem}.badge-primary{color:#fff;background-color:#007bff}a.badge-primary:hover,a.badge-primary:focus{color:#fff;background-color:#0062cc}a.badge-primary:focus,a.badge-primary.focus{outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.5)}.badge-secondary{color:#fff;background-color:#6c757d}a.badge-secondary:hover,a.badge-secondary:focus{color:#fff;background-color:#545b62}a.badge-secondary:focus,a.badge-secondary.focus{outline:0;box-shadow:0 0 0 .2rem rgba(108,117,125,0.5)}.badge-success{color:#fff;background-color:#28a745}a.badge-success:hover,a.badge-success:focus{color:#fff;background-color:#1e7e34}a.badge-success:focus,a.badge-success.focus{outline:0;box-shadow:0 0 0 .2rem 
rgba(40,167,69,0.5)}.badge-info{color:#fff;background-color:#17a2b8}a.badge-info:hover,a.badge-info:focus{color:#fff;background-color:#117a8b}a.badge-info:focus,a.badge-info.focus{outline:0;box-shadow:0 0 0 .2rem rgba(23,162,184,0.5)}.badge-warning{color:#212529;background-color:#ffc107}a.badge-warning:hover,a.badge-warning:focus{color:#212529;background-color:#d39e00}a.badge-warning:focus,a.badge-warning.focus{outline:0;box-shadow:0 0 0 .2rem rgba(255,193,7,0.5)}.badge-danger{color:#fff;background-color:#dc3545}a.badge-danger:hover,a.badge-danger:focus{color:#fff;background-color:#bd2130}a.badge-danger:focus,a.badge-danger.focus{outline:0;box-shadow:0 0 0 .2rem rgba(220,53,69,0.5)}.badge-light{color:#212529;background-color:#f8f9fa}a.badge-light:hover,a.badge-light:focus{color:#212529;background-color:#dae0e5}a.badge-light:focus,a.badge-light.focus{outline:0;box-shadow:0 0 0 .2rem rgba(248,249,250,0.5)}.badge-dark{color:#fff;background-color:#343a40}a.badge-dark:hover,a.badge-dark:focus{color:#fff;background-color:#1d2124}a.badge-dark:focus,a.badge-dark.focus{outline:0;box-shadow:0 0 0 .2rem rgba(52,58,64,0.5)}.jumbotron{padding:2rem 1rem;margin-bottom:2rem;background-color:#e9ecef;border-radius:.3rem}@media (min-width: 576px){.jumbotron{padding:4rem 2rem}}.jumbotron-fluid{padding-right:0;padding-left:0;border-radius:0}.alert{position:relative;padding:.75rem 1.25rem;margin-bottom:1rem;border:1px solid transparent;border-radius:.25rem}.alert-heading{color:inherit}.alert-link{font-weight:700}.alert-dismissible{padding-right:4rem}.alert-dismissible .close{position:absolute;top:0;right:0;padding:.75rem 1.25rem;color:inherit}.alert-primary{color:#004085;background-color:#cce5ff;border-color:#b8daff}.alert-primary hr{border-top-color:#9fcdff}.alert-primary .alert-link{color:#002752}.alert-secondary{color:#383d41;background-color:#e2e3e5;border-color:#d6d8db}.alert-secondary hr{border-top-color:#c8cbcf}.alert-secondary .alert-link{color:#202326}.alert-success{color:#155724;background-color:#d4edda;border-color:#c3e6cb}.alert-success hr{border-top-color:#b1dfbb}.alert-success .alert-link{color:#0b2e13}.alert-info{color:#0c5460;background-color:#d1ecf1;border-color:#bee5eb}.alert-info hr{border-top-color:#abdde5}.alert-info .alert-link{color:#062c33}.alert-warning{color:#856404;background-color:#fff3cd;border-color:#ffeeba}.alert-warning hr{border-top-color:#ffe8a1}.alert-warning .alert-link{color:#533f03}.alert-danger{color:#721c24;background-color:#f8d7da;border-color:#f5c6cb}.alert-danger hr{border-top-color:#f1b0b7}.alert-danger .alert-link{color:#491217}.alert-light{color:#818182;background-color:#fefefe;border-color:#fdfdfe}.alert-light hr{border-top-color:#ececf6}.alert-light .alert-link{color:#686868}.alert-dark{color:#1b1e21;background-color:#d6d8d9;border-color:#c6c8ca}.alert-dark hr{border-top-color:#b9bbbe}.alert-dark .alert-link{color:#040505}@-webkit-keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}@keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}.progress{display:flex;height:1rem;overflow:hidden;font-size:.75rem;background-color:#e9ecef;border-radius:.25rem}.progress-bar{display:flex;flex-direction:column;justify-content:center;color:#fff;text-align:center;white-space:nowrap;background-color:#007bff;transition:width 0.6s ease}@media (prefers-reduced-motion: reduce){.progress-bar{transition:none}}.progress-bar-striped{background-image:linear-gradient(45deg, rgba(255,255,255,0.15) 25%, transparent 
25%, transparent 50%, rgba(255,255,255,0.15) 50%, rgba(255,255,255,0.15) 75%, transparent 75%, transparent);background-size:1rem 1rem}.progress-bar-animated{-webkit-animation:progress-bar-stripes 1s linear infinite;animation:progress-bar-stripes 1s linear infinite}@media (prefers-reduced-motion: reduce){.progress-bar-animated{-webkit-animation:none;animation:none}}.media{display:flex;align-items:flex-start}.media-body{flex:1}.list-group{display:flex;flex-direction:column;padding-left:0;margin-bottom:0}.list-group-item-action{width:100%;color:#495057;text-align:inherit}.list-group-item-action:hover,.list-group-item-action:focus{z-index:1;color:#495057;text-decoration:none;background-color:#f8f9fa}.list-group-item-action:active{color:#212529;background-color:#e9ecef}.list-group-item{position:relative;display:block;padding:.75rem 1.25rem;margin-bottom:-1px;background-color:#fff;border:1px solid rgba(0,0,0,0.125)}.list-group-item:first-child{border-top-left-radius:.25rem;border-top-right-radius:.25rem}.list-group-item:last-child{margin-bottom:0;border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.list-group-item.disabled,.list-group-item:disabled{color:#6c757d;pointer-events:none;background-color:#fff}.list-group-item.active{z-index:2;color:#fff;background-color:#007bff;border-color:#007bff}.list-group-horizontal{flex-direction:row}.list-group-horizontal .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}@media (min-width: 576px){.list-group-horizontal-sm{flex-direction:row}.list-group-horizontal-sm .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-sm .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-sm .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width: 768px){.list-group-horizontal-md{flex-direction:row}.list-group-horizontal-md .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-md .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-md .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width: 992px){.list-group-horizontal-lg{flex-direction:row}.list-group-horizontal-lg .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-lg .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-lg .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width: 1200px){.list-group-horizontal-xl{flex-direction:row}.list-group-horizontal-xl .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-xl .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-xl 
.list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}.list-group-flush .list-group-item{border-right:0;border-left:0;border-radius:0}.list-group-flush .list-group-item:last-child{margin-bottom:-1px}.list-group-flush:first-child .list-group-item:first-child{border-top:0}.list-group-flush:last-child .list-group-item:last-child{margin-bottom:0;border-bottom:0}.list-group-item-primary{color:#004085;background-color:#b8daff}.list-group-item-primary.list-group-item-action:hover,.list-group-item-primary.list-group-item-action:focus{color:#004085;background-color:#9fcdff}.list-group-item-primary.list-group-item-action.active{color:#fff;background-color:#004085;border-color:#004085}.list-group-item-secondary{color:#383d41;background-color:#d6d8db}.list-group-item-secondary.list-group-item-action:hover,.list-group-item-secondary.list-group-item-action:focus{color:#383d41;background-color:#c8cbcf}.list-group-item-secondary.list-group-item-action.active{color:#fff;background-color:#383d41;border-color:#383d41}.list-group-item-success{color:#155724;background-color:#c3e6cb}.list-group-item-success.list-group-item-action:hover,.list-group-item-success.list-group-item-action:focus{color:#155724;background-color:#b1dfbb}.list-group-item-success.list-group-item-action.active{color:#fff;background-color:#155724;border-color:#155724}.list-group-item-info{color:#0c5460;background-color:#bee5eb}.list-group-item-info.list-group-item-action:hover,.list-group-item-info.list-group-item-action:focus{color:#0c5460;background-color:#abdde5}.list-group-item-info.list-group-item-action.active{color:#fff;background-color:#0c5460;border-color:#0c5460}.list-group-item-warning{color:#856404;background-color:#ffeeba}.list-group-item-warning.list-group-item-action:hover,.list-group-item-warning.list-group-item-action:focus{color:#856404;background-color:#ffe8a1}.list-group-item-warning.list-group-item-action.active{color:#fff;background-color:#856404;border-color:#856404}.list-group-item-danger{color:#721c24;background-color:#f5c6cb}.list-group-item-danger.list-group-item-action:hover,.list-group-item-danger.list-group-item-action:focus{color:#721c24;background-color:#f1b0b7}.list-group-item-danger.list-group-item-action.active{color:#fff;background-color:#721c24;border-color:#721c24}.list-group-item-light{color:#818182;background-color:#fdfdfe}.list-group-item-light.list-group-item-action:hover,.list-group-item-light.list-group-item-action:focus{color:#818182;background-color:#ececf6}.list-group-item-light.list-group-item-action.active{color:#fff;background-color:#818182;border-color:#818182}.list-group-item-dark{color:#1b1e21;background-color:#c6c8ca}.list-group-item-dark.list-group-item-action:hover,.list-group-item-dark.list-group-item-action:focus{color:#1b1e21;background-color:#b9bbbe}.list-group-item-dark.list-group-item-action.active{color:#fff;background-color:#1b1e21;border-color:#1b1e21}.close{float:right;font-size:1.5rem;font-weight:700;line-height:1;color:#000;text-shadow:0 1px 0 #fff;opacity:.5}.close:hover{color:#000;text-decoration:none}.close:not(:disabled):not(.disabled):hover,.close:not(:disabled):not(.disabled):focus{opacity:.75}button.close{padding:0;background-color:transparent;border:0;-webkit-appearance:none;-moz-appearance:none;appearance:none}a.close.disabled{pointer-events:none}.toast{max-width:350px;overflow:hidden;font-size:.875rem;background-color:rgba(255,255,255,0.85);background-clip:padding-box;border:1px 
solid rgba(0,0,0,0.1);box-shadow:0 0.25rem 0.75rem rgba(0,0,0,0.1);-webkit-backdrop-filter:blur(10px);backdrop-filter:blur(10px);opacity:0;border-radius:.25rem}.toast:not(:last-child){margin-bottom:.75rem}.toast.showing{opacity:1}.toast.show{display:block;opacity:1}.toast.hide{display:none}.toast-header{display:flex;align-items:center;padding:.25rem .75rem;color:#6c757d;background-color:rgba(255,255,255,0.85);background-clip:padding-box;border-bottom:1px solid rgba(0,0,0,0.05)}.toast-body{padding:.75rem}.modal-open{overflow:hidden}.modal-open .modal{overflow-x:hidden;overflow-y:auto}.modal{position:fixed;top:0;left:0;z-index:1050;display:none;width:100%;height:100%;overflow:hidden;outline:0}.modal-dialog{position:relative;width:auto;margin:.5rem;pointer-events:none}.modal.fade .modal-dialog{transition:transform 0.3s ease-out;transform:translate(0, -50px)}@media (prefers-reduced-motion: reduce){.modal.fade .modal-dialog{transition:none}}.modal.show .modal-dialog{transform:none}.modal-dialog-scrollable{display:flex;max-height:calc(100% - 1rem)}.modal-dialog-scrollable .modal-content{max-height:calc(100vh - 1rem);overflow:hidden}.modal-dialog-scrollable .modal-header,.modal-dialog-scrollable .modal-footer{flex-shrink:0}.modal-dialog-scrollable .modal-body{overflow-y:auto}.modal-dialog-centered{display:flex;align-items:center;min-height:calc(100% - 1rem)}.modal-dialog-centered::before{display:block;height:calc(100vh - 1rem);content:""}.modal-dialog-centered.modal-dialog-scrollable{flex-direction:column;justify-content:center;height:100%}.modal-dialog-centered.modal-dialog-scrollable .modal-content{max-height:none}.modal-dialog-centered.modal-dialog-scrollable::before{content:none}.modal-content{position:relative;display:flex;flex-direction:column;width:100%;pointer-events:auto;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,0.2);border-radius:.3rem;outline:0}.modal-backdrop{position:fixed;top:0;left:0;z-index:1040;width:100vw;height:100vh;background-color:#000}.modal-backdrop.fade{opacity:0}.modal-backdrop.show{opacity:.5}.modal-header{display:flex;align-items:flex-start;justify-content:space-between;padding:1rem 1rem;border-bottom:1px solid #dee2e6;border-top-left-radius:.3rem;border-top-right-radius:.3rem}.modal-header .close{padding:1rem 1rem;margin:-1rem -1rem -1rem auto}.modal-title{margin-bottom:0;line-height:1.5}.modal-body{position:relative;flex:1 1 auto;padding:1rem}.modal-footer{display:flex;align-items:center;justify-content:flex-end;padding:1rem;border-top:1px solid #dee2e6;border-bottom-right-radius:.3rem;border-bottom-left-radius:.3rem}.modal-footer>:not(:first-child){margin-left:.25rem}.modal-footer>:not(:last-child){margin-right:.25rem}.modal-scrollbar-measure{position:absolute;top:-9999px;width:50px;height:50px;overflow:scroll}@media (min-width: 576px){.modal-dialog{max-width:500px;margin:1.75rem auto}.modal-dialog-scrollable{max-height:calc(100% - 3.5rem)}.modal-dialog-scrollable .modal-content{max-height:calc(100vh - 3.5rem)}.modal-dialog-centered{min-height:calc(100% - 3.5rem)}.modal-dialog-centered::before{height:calc(100vh - 3.5rem)}.modal-sm{max-width:300px}}@media (min-width: 992px){.modal-lg,.modal-xl{max-width:800px}}@media (min-width: 1200px){.modal-xl{max-width:1140px}}.tooltip{position:absolute;z-index:1070;display:block;margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color 
Emoji";font-style:normal;font-weight:400;line-height:1.5;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;letter-spacing:normal;word-break:normal;word-spacing:normal;white-space:normal;line-break:auto;font-size:.875rem;word-wrap:break-word;opacity:0}.tooltip.show{opacity:.9}.tooltip .arrow{position:absolute;display:block;width:.8rem;height:.4rem}.tooltip .arrow::before{position:absolute;content:"";border-color:transparent;border-style:solid}.bs-tooltip-top,.bs-tooltip-auto[x-placement^="top"]{padding:.4rem 0}.bs-tooltip-top .arrow,.bs-tooltip-auto[x-placement^="top"] .arrow{bottom:0}.bs-tooltip-top .arrow::before,.bs-tooltip-auto[x-placement^="top"] .arrow::before{top:0;border-width:.4rem .4rem 0;border-top-color:#000}.bs-tooltip-right,.bs-tooltip-auto[x-placement^="right"]{padding:0 .4rem}.bs-tooltip-right .arrow,.bs-tooltip-auto[x-placement^="right"] .arrow{left:0;width:.4rem;height:.8rem}.bs-tooltip-right .arrow::before,.bs-tooltip-auto[x-placement^="right"] .arrow::before{right:0;border-width:.4rem .4rem .4rem 0;border-right-color:#000}.bs-tooltip-bottom,.bs-tooltip-auto[x-placement^="bottom"]{padding:.4rem 0}.bs-tooltip-bottom .arrow,.bs-tooltip-auto[x-placement^="bottom"] .arrow{top:0}.bs-tooltip-bottom .arrow::before,.bs-tooltip-auto[x-placement^="bottom"] .arrow::before{bottom:0;border-width:0 .4rem .4rem;border-bottom-color:#000}.bs-tooltip-left,.bs-tooltip-auto[x-placement^="left"]{padding:0 .4rem}.bs-tooltip-left .arrow,.bs-tooltip-auto[x-placement^="left"] .arrow{right:0;width:.4rem;height:.8rem}.bs-tooltip-left .arrow::before,.bs-tooltip-auto[x-placement^="left"] .arrow::before{left:0;border-width:.4rem 0 .4rem .4rem;border-left-color:#000}.tooltip-inner{max-width:200px;padding:.25rem .5rem;color:#fff;text-align:center;background-color:#000;border-radius:.25rem}.popover{position:absolute;top:0;left:0;z-index:1060;display:block;max-width:276px;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-style:normal;font-weight:400;line-height:1.5;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;letter-spacing:normal;word-break:normal;word-spacing:normal;white-space:normal;line-break:auto;font-size:.875rem;word-wrap:break-word;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,0.2);border-radius:.3rem}.popover .arrow{position:absolute;display:block;width:1rem;height:.5rem;margin:0 .3rem}.popover .arrow::before,.popover .arrow::after{position:absolute;display:block;content:"";border-color:transparent;border-style:solid}.bs-popover-top,.bs-popover-auto[x-placement^="top"]{margin-bottom:.5rem}.bs-popover-top>.arrow,.bs-popover-auto[x-placement^="top"]>.arrow{bottom:calc((.5rem + 1px) * -1)}.bs-popover-top>.arrow::before,.bs-popover-auto[x-placement^="top"]>.arrow::before{bottom:0;border-width:.5rem .5rem 0;border-top-color:rgba(0,0,0,0.25)}.bs-popover-top>.arrow::after,.bs-popover-auto[x-placement^="top"]>.arrow::after{bottom:1px;border-width:.5rem .5rem 0;border-top-color:#fff}.bs-popover-right,.bs-popover-auto[x-placement^="right"]{margin-left:.5rem}.bs-popover-right>.arrow,.bs-popover-auto[x-placement^="right"]>.arrow{left:calc((.5rem + 1px) * -1);width:.5rem;height:1rem;margin:.3rem 0}.bs-popover-right>.arrow::before,.bs-popover-auto[x-placement^="right"]>.arrow::before{left:0;border-width:.5rem .5rem .5rem 
0;border-right-color:rgba(0,0,0,0.25)}.bs-popover-right>.arrow::after,.bs-popover-auto[x-placement^="right"]>.arrow::after{left:1px;border-width:.5rem .5rem .5rem 0;border-right-color:#fff}.bs-popover-bottom,.bs-popover-auto[x-placement^="bottom"]{margin-top:.5rem}.bs-popover-bottom>.arrow,.bs-popover-auto[x-placement^="bottom"]>.arrow{top:calc((.5rem + 1px) * -1)}.bs-popover-bottom>.arrow::before,.bs-popover-auto[x-placement^="bottom"]>.arrow::before{top:0;border-width:0 .5rem .5rem .5rem;border-bottom-color:rgba(0,0,0,0.25)}.bs-popover-bottom>.arrow::after,.bs-popover-auto[x-placement^="bottom"]>.arrow::after{top:1px;border-width:0 .5rem .5rem .5rem;border-bottom-color:#fff}.bs-popover-bottom .popover-header::before,.bs-popover-auto[x-placement^="bottom"] .popover-header::before{position:absolute;top:0;left:50%;display:block;width:1rem;margin-left:-.5rem;content:"";border-bottom:1px solid #f7f7f7}.bs-popover-left,.bs-popover-auto[x-placement^="left"]{margin-right:.5rem}.bs-popover-left>.arrow,.bs-popover-auto[x-placement^="left"]>.arrow{right:calc((.5rem + 1px) * -1);width:.5rem;height:1rem;margin:.3rem 0}.bs-popover-left>.arrow::before,.bs-popover-auto[x-placement^="left"]>.arrow::before{right:0;border-width:.5rem 0 .5rem .5rem;border-left-color:rgba(0,0,0,0.25)}.bs-popover-left>.arrow::after,.bs-popover-auto[x-placement^="left"]>.arrow::after{right:1px;border-width:.5rem 0 .5rem .5rem;border-left-color:#fff}.popover-header{padding:.5rem .75rem;margin-bottom:0;font-size:1rem;background-color:#f7f7f7;border-bottom:1px solid #ebebeb;border-top-left-radius:calc(.3rem - 1px);border-top-right-radius:calc(.3rem - 1px)}.popover-header:empty{display:none}.popover-body{padding:.5rem .75rem;color:#212529}.carousel{position:relative}.carousel.pointer-event{touch-action:pan-y}.carousel-inner{position:relative;width:100%;overflow:hidden}.carousel-inner::after{display:block;clear:both;content:""}.carousel-item{position:relative;display:none;float:left;width:100%;margin-right:-100%;-webkit-backface-visibility:hidden;backface-visibility:hidden;transition:transform .6s ease-in-out}@media (prefers-reduced-motion: reduce){.carousel-item{transition:none}}.carousel-item.active,.carousel-item-next,.carousel-item-prev{display:block}.carousel-item-next:not(.carousel-item-left),.active.carousel-item-right{transform:translateX(100%)}.carousel-item-prev:not(.carousel-item-right),.active.carousel-item-left{transform:translateX(-100%)}.carousel-fade .carousel-item{opacity:0;transition-property:opacity;transform:none}.carousel-fade .carousel-item.active,.carousel-fade .carousel-item-next.carousel-item-left,.carousel-fade .carousel-item-prev.carousel-item-right{z-index:1;opacity:1}.carousel-fade .active.carousel-item-left,.carousel-fade .active.carousel-item-right{z-index:0;opacity:0;transition:0s .6s opacity}@media (prefers-reduced-motion: reduce){.carousel-fade .active.carousel-item-left,.carousel-fade .active.carousel-item-right{transition:none}}.carousel-control-prev,.carousel-control-next{position:absolute;top:0;bottom:0;z-index:1;display:flex;align-items:center;justify-content:center;width:15%;color:#fff;text-align:center;opacity:.5;transition:opacity 0.15s ease}@media (prefers-reduced-motion: 
reduce){.carousel-control-prev,.carousel-control-next{transition:none}}.carousel-control-prev:hover,.carousel-control-prev:focus,.carousel-control-next:hover,.carousel-control-next:focus{color:#fff;text-decoration:none;outline:0;opacity:.9}.carousel-control-prev{left:0}.carousel-control-next{right:0}.carousel-control-prev-icon,.carousel-control-next-icon{display:inline-block;width:20px;height:20px;background:no-repeat 50% / 100% 100%}.carousel-control-prev-icon{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3e%3cpath d='M5.25 0l-4 4 4 4 1.5-1.5-2.5-2.5 2.5-2.5-1.5-1.5z'/%3e%3c/svg%3e")}.carousel-control-next-icon{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3e%3cpath d='M2.75 0l-1.5 1.5 2.5 2.5-2.5 2.5 1.5 1.5 4-4-4-4z'/%3e%3c/svg%3e")}.carousel-indicators{position:absolute;right:0;bottom:0;left:0;z-index:15;display:flex;justify-content:center;padding-left:0;margin-right:15%;margin-left:15%;list-style:none}.carousel-indicators li{box-sizing:content-box;flex:0 1 auto;width:30px;height:3px;margin-right:3px;margin-left:3px;text-indent:-999px;cursor:pointer;background-color:#fff;background-clip:padding-box;border-top:10px solid transparent;border-bottom:10px solid transparent;opacity:.5;transition:opacity 0.6s ease}@media (prefers-reduced-motion: reduce){.carousel-indicators li{transition:none}}.carousel-indicators .active{opacity:1}.carousel-caption{position:absolute;right:15%;bottom:20px;left:15%;z-index:10;padding-top:20px;padding-bottom:20px;color:#fff;text-align:center}@-webkit-keyframes spinner-border{to{transform:rotate(360deg)}}@keyframes spinner-border{to{transform:rotate(360deg)}}.spinner-border{display:inline-block;width:2rem;height:2rem;vertical-align:text-bottom;border:.25em solid currentColor;border-right-color:transparent;border-radius:50%;-webkit-animation:spinner-border .75s linear infinite;animation:spinner-border .75s linear infinite}.spinner-border-sm{width:1rem;height:1rem;border-width:.2em}@-webkit-keyframes spinner-grow{0%{transform:scale(0)}50%{opacity:1}}@keyframes spinner-grow{0%{transform:scale(0)}50%{opacity:1}}.spinner-grow{display:inline-block;width:2rem;height:2rem;vertical-align:text-bottom;background-color:currentColor;border-radius:50%;opacity:0;-webkit-animation:spinner-grow .75s linear infinite;animation:spinner-grow .75s linear infinite}.spinner-grow-sm{width:1rem;height:1rem}.align-baseline{vertical-align:baseline !important}.align-top{vertical-align:top !important}.align-middle{vertical-align:middle !important}.align-bottom{vertical-align:bottom !important}.align-text-bottom{vertical-align:text-bottom !important}.align-text-top{vertical-align:text-top !important}.bg-primary{background-color:#007bff !important}a.bg-primary:hover,a.bg-primary:focus,button.bg-primary:hover,button.bg-primary:focus{background-color:#0062cc !important}.bg-secondary{background-color:#6c757d !important}a.bg-secondary:hover,a.bg-secondary:focus,button.bg-secondary:hover,button.bg-secondary:focus{background-color:#545b62 !important}.bg-success{background-color:#28a745 !important}a.bg-success:hover,a.bg-success:focus,button.bg-success:hover,button.bg-success:focus{background-color:#1e7e34 !important}.bg-info{background-color:#17a2b8 !important}a.bg-info:hover,a.bg-info:focus,button.bg-info:hover,button.bg-info:focus{background-color:#117a8b !important}.bg-warning{background-color:#ffc107 
!important}a.bg-warning:hover,a.bg-warning:focus,button.bg-warning:hover,button.bg-warning:focus{background-color:#d39e00 !important}.bg-danger{background-color:#dc3545 !important}a.bg-danger:hover,a.bg-danger:focus,button.bg-danger:hover,button.bg-danger:focus{background-color:#bd2130 !important}.bg-light{background-color:#f8f9fa !important}a.bg-light:hover,a.bg-light:focus,button.bg-light:hover,button.bg-light:focus{background-color:#dae0e5 !important}.bg-dark{background-color:#343a40 !important}a.bg-dark:hover,a.bg-dark:focus,button.bg-dark:hover,button.bg-dark:focus{background-color:#1d2124 !important}.bg-white{background-color:#fff !important}.bg-transparent{background-color:transparent !important}.border{border:1px solid #dee2e6 !important}.border-top{border-top:1px solid #dee2e6 !important}.border-right{border-right:1px solid #dee2e6 !important}.border-bottom{border-bottom:1px solid #dee2e6 !important}.border-left{border-left:1px solid #dee2e6 !important}.border-0{border:0 !important}.border-top-0{border-top:0 !important}.border-right-0{border-right:0 !important}.border-bottom-0{border-bottom:0 !important}.border-left-0{border-left:0 !important}.border-primary{border-color:#007bff !important}.border-secondary{border-color:#6c757d !important}.border-success{border-color:#28a745 !important}.border-info{border-color:#17a2b8 !important}.border-warning{border-color:#ffc107 !important}.border-danger{border-color:#dc3545 !important}.border-light{border-color:#f8f9fa !important}.border-dark{border-color:#343a40 !important}.border-white{border-color:#fff !important}.rounded-sm{border-radius:.2rem !important}.rounded{border-radius:.25rem !important}.rounded-top{border-top-left-radius:.25rem !important;border-top-right-radius:.25rem !important}.rounded-right{border-top-right-radius:.25rem !important;border-bottom-right-radius:.25rem !important}.rounded-bottom{border-bottom-right-radius:.25rem !important;border-bottom-left-radius:.25rem !important}.rounded-left{border-top-left-radius:.25rem !important;border-bottom-left-radius:.25rem !important}.rounded-lg{border-radius:.3rem !important}.rounded-circle{border-radius:50% !important}.rounded-pill{border-radius:50rem !important}.rounded-0{border-radius:0 !important}.clearfix::after{display:block;clear:both;content:""}.d-none{display:none !important}.d-inline{display:inline !important}.d-inline-block{display:inline-block !important}.d-block{display:block !important}.d-table{display:table !important}.d-table-row{display:table-row !important}.d-table-cell{display:table-cell !important}.d-flex{display:flex !important}.d-inline-flex{display:inline-flex !important}@media (min-width: 576px){.d-sm-none{display:none !important}.d-sm-inline{display:inline !important}.d-sm-inline-block{display:inline-block !important}.d-sm-block{display:block !important}.d-sm-table{display:table !important}.d-sm-table-row{display:table-row !important}.d-sm-table-cell{display:table-cell !important}.d-sm-flex{display:flex !important}.d-sm-inline-flex{display:inline-flex !important}}@media (min-width: 768px){.d-md-none{display:none !important}.d-md-inline{display:inline !important}.d-md-inline-block{display:inline-block !important}.d-md-block{display:block !important}.d-md-table{display:table !important}.d-md-table-row{display:table-row !important}.d-md-table-cell{display:table-cell !important}.d-md-flex{display:flex !important}.d-md-inline-flex{display:inline-flex !important}}@media (min-width: 992px){.d-lg-none{display:none !important}.d-lg-inline{display:inline 
!important}.d-lg-inline-block{display:inline-block !important}.d-lg-block{display:block !important}.d-lg-table{display:table !important}.d-lg-table-row{display:table-row !important}.d-lg-table-cell{display:table-cell !important}.d-lg-flex{display:flex !important}.d-lg-inline-flex{display:inline-flex !important}}@media (min-width: 1200px){.d-xl-none{display:none !important}.d-xl-inline{display:inline !important}.d-xl-inline-block{display:inline-block !important}.d-xl-block{display:block !important}.d-xl-table{display:table !important}.d-xl-table-row{display:table-row !important}.d-xl-table-cell{display:table-cell !important}.d-xl-flex{display:flex !important}.d-xl-inline-flex{display:inline-flex !important}}@media print{.d-print-none{display:none !important}.d-print-inline{display:inline !important}.d-print-inline-block{display:inline-block !important}.d-print-block{display:block !important}.d-print-table{display:table !important}.d-print-table-row{display:table-row !important}.d-print-table-cell{display:table-cell !important}.d-print-flex{display:flex !important}.d-print-inline-flex{display:inline-flex !important}}.embed-responsive{position:relative;display:block;width:100%;padding:0;overflow:hidden}.embed-responsive::before{display:block;content:""}.embed-responsive .embed-responsive-item,.embed-responsive iframe,.embed-responsive embed,.embed-responsive object,.embed-responsive video{position:absolute;top:0;bottom:0;left:0;width:100%;height:100%;border:0}.embed-responsive-21by9::before{padding-top:42.8571428571%}.embed-responsive-16by9::before{padding-top:56.25%}.embed-responsive-4by3::before{padding-top:75%}.embed-responsive-1by1::before{padding-top:100%}.flex-row{flex-direction:row !important}.flex-column{flex-direction:column !important}.flex-row-reverse{flex-direction:row-reverse !important}.flex-column-reverse{flex-direction:column-reverse !important}.flex-wrap{flex-wrap:wrap !important}.flex-nowrap{flex-wrap:nowrap !important}.flex-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-fill{flex:1 1 auto !important}.flex-grow-0{flex-grow:0 !important}.flex-grow-1{flex-grow:1 !important}.flex-shrink-0{flex-shrink:0 !important}.flex-shrink-1{flex-shrink:1 !important}.justify-content-start{justify-content:flex-start !important}.justify-content-end{justify-content:flex-end !important}.justify-content-center{justify-content:center !important}.justify-content-between{justify-content:space-between !important}.justify-content-around{justify-content:space-around !important}.align-items-start{align-items:flex-start !important}.align-items-end{align-items:flex-end !important}.align-items-center{align-items:center !important}.align-items-baseline{align-items:baseline !important}.align-items-stretch{align-items:stretch !important}.align-content-start{align-content:flex-start !important}.align-content-end{align-content:flex-end !important}.align-content-center{align-content:center !important}.align-content-between{align-content:space-between !important}.align-content-around{align-content:space-around !important}.align-content-stretch{align-content:stretch !important}.align-self-auto{align-self:auto !important}.align-self-start{align-self:flex-start !important}.align-self-end{align-self:flex-end !important}.align-self-center{align-self:center !important}.align-self-baseline{align-self:baseline !important}.align-self-stretch{align-self:stretch !important}@media (min-width: 576px){.flex-sm-row{flex-direction:row !important}.flex-sm-column{flex-direction:column 
!important}.flex-sm-row-reverse{flex-direction:row-reverse !important}.flex-sm-column-reverse{flex-direction:column-reverse !important}.flex-sm-wrap{flex-wrap:wrap !important}.flex-sm-nowrap{flex-wrap:nowrap !important}.flex-sm-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-sm-fill{flex:1 1 auto !important}.flex-sm-grow-0{flex-grow:0 !important}.flex-sm-grow-1{flex-grow:1 !important}.flex-sm-shrink-0{flex-shrink:0 !important}.flex-sm-shrink-1{flex-shrink:1 !important}.justify-content-sm-start{justify-content:flex-start !important}.justify-content-sm-end{justify-content:flex-end !important}.justify-content-sm-center{justify-content:center !important}.justify-content-sm-between{justify-content:space-between !important}.justify-content-sm-around{justify-content:space-around !important}.align-items-sm-start{align-items:flex-start !important}.align-items-sm-end{align-items:flex-end !important}.align-items-sm-center{align-items:center !important}.align-items-sm-baseline{align-items:baseline !important}.align-items-sm-stretch{align-items:stretch !important}.align-content-sm-start{align-content:flex-start !important}.align-content-sm-end{align-content:flex-end !important}.align-content-sm-center{align-content:center !important}.align-content-sm-between{align-content:space-between !important}.align-content-sm-around{align-content:space-around !important}.align-content-sm-stretch{align-content:stretch !important}.align-self-sm-auto{align-self:auto !important}.align-self-sm-start{align-self:flex-start !important}.align-self-sm-end{align-self:flex-end !important}.align-self-sm-center{align-self:center !important}.align-self-sm-baseline{align-self:baseline !important}.align-self-sm-stretch{align-self:stretch !important}}@media (min-width: 768px){.flex-md-row{flex-direction:row !important}.flex-md-column{flex-direction:column !important}.flex-md-row-reverse{flex-direction:row-reverse !important}.flex-md-column-reverse{flex-direction:column-reverse !important}.flex-md-wrap{flex-wrap:wrap !important}.flex-md-nowrap{flex-wrap:nowrap !important}.flex-md-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-md-fill{flex:1 1 auto !important}.flex-md-grow-0{flex-grow:0 !important}.flex-md-grow-1{flex-grow:1 !important}.flex-md-shrink-0{flex-shrink:0 !important}.flex-md-shrink-1{flex-shrink:1 !important}.justify-content-md-start{justify-content:flex-start !important}.justify-content-md-end{justify-content:flex-end !important}.justify-content-md-center{justify-content:center !important}.justify-content-md-between{justify-content:space-between !important}.justify-content-md-around{justify-content:space-around !important}.align-items-md-start{align-items:flex-start !important}.align-items-md-end{align-items:flex-end !important}.align-items-md-center{align-items:center !important}.align-items-md-baseline{align-items:baseline !important}.align-items-md-stretch{align-items:stretch !important}.align-content-md-start{align-content:flex-start !important}.align-content-md-end{align-content:flex-end !important}.align-content-md-center{align-content:center !important}.align-content-md-between{align-content:space-between !important}.align-content-md-around{align-content:space-around !important}.align-content-md-stretch{align-content:stretch !important}.align-self-md-auto{align-self:auto !important}.align-self-md-start{align-self:flex-start !important}.align-self-md-end{align-self:flex-end !important}.align-self-md-center{align-self:center !important}.align-self-md-baseline{align-self:baseline 
!important}.align-self-md-stretch{align-self:stretch !important}}@media (min-width: 992px){.flex-lg-row{flex-direction:row !important}.flex-lg-column{flex-direction:column !important}.flex-lg-row-reverse{flex-direction:row-reverse !important}.flex-lg-column-reverse{flex-direction:column-reverse !important}.flex-lg-wrap{flex-wrap:wrap !important}.flex-lg-nowrap{flex-wrap:nowrap !important}.flex-lg-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-lg-fill{flex:1 1 auto !important}.flex-lg-grow-0{flex-grow:0 !important}.flex-lg-grow-1{flex-grow:1 !important}.flex-lg-shrink-0{flex-shrink:0 !important}.flex-lg-shrink-1{flex-shrink:1 !important}.justify-content-lg-start{justify-content:flex-start !important}.justify-content-lg-end{justify-content:flex-end !important}.justify-content-lg-center{justify-content:center !important}.justify-content-lg-between{justify-content:space-between !important}.justify-content-lg-around{justify-content:space-around !important}.align-items-lg-start{align-items:flex-start !important}.align-items-lg-end{align-items:flex-end !important}.align-items-lg-center{align-items:center !important}.align-items-lg-baseline{align-items:baseline !important}.align-items-lg-stretch{align-items:stretch !important}.align-content-lg-start{align-content:flex-start !important}.align-content-lg-end{align-content:flex-end !important}.align-content-lg-center{align-content:center !important}.align-content-lg-between{align-content:space-between !important}.align-content-lg-around{align-content:space-around !important}.align-content-lg-stretch{align-content:stretch !important}.align-self-lg-auto{align-self:auto !important}.align-self-lg-start{align-self:flex-start !important}.align-self-lg-end{align-self:flex-end !important}.align-self-lg-center{align-self:center !important}.align-self-lg-baseline{align-self:baseline !important}.align-self-lg-stretch{align-self:stretch !important}}@media (min-width: 1200px){.flex-xl-row{flex-direction:row !important}.flex-xl-column{flex-direction:column !important}.flex-xl-row-reverse{flex-direction:row-reverse !important}.flex-xl-column-reverse{flex-direction:column-reverse !important}.flex-xl-wrap{flex-wrap:wrap !important}.flex-xl-nowrap{flex-wrap:nowrap !important}.flex-xl-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-xl-fill{flex:1 1 auto !important}.flex-xl-grow-0{flex-grow:0 !important}.flex-xl-grow-1{flex-grow:1 !important}.flex-xl-shrink-0{flex-shrink:0 !important}.flex-xl-shrink-1{flex-shrink:1 !important}.justify-content-xl-start{justify-content:flex-start !important}.justify-content-xl-end{justify-content:flex-end !important}.justify-content-xl-center{justify-content:center !important}.justify-content-xl-between{justify-content:space-between !important}.justify-content-xl-around{justify-content:space-around !important}.align-items-xl-start{align-items:flex-start !important}.align-items-xl-end{align-items:flex-end !important}.align-items-xl-center{align-items:center !important}.align-items-xl-baseline{align-items:baseline !important}.align-items-xl-stretch{align-items:stretch !important}.align-content-xl-start{align-content:flex-start !important}.align-content-xl-end{align-content:flex-end !important}.align-content-xl-center{align-content:center !important}.align-content-xl-between{align-content:space-between !important}.align-content-xl-around{align-content:space-around !important}.align-content-xl-stretch{align-content:stretch !important}.align-self-xl-auto{align-self:auto !important}.align-self-xl-start{align-self:flex-start 
!important}.align-self-xl-end{align-self:flex-end !important}.align-self-xl-center{align-self:center !important}.align-self-xl-baseline{align-self:baseline !important}.align-self-xl-stretch{align-self:stretch !important}}.float-left{float:left !important}.float-right{float:right !important}.float-none{float:none !important}@media (min-width: 576px){.float-sm-left{float:left !important}.float-sm-right{float:right !important}.float-sm-none{float:none !important}}@media (min-width: 768px){.float-md-left{float:left !important}.float-md-right{float:right !important}.float-md-none{float:none !important}}@media (min-width: 992px){.float-lg-left{float:left !important}.float-lg-right{float:right !important}.float-lg-none{float:none !important}}@media (min-width: 1200px){.float-xl-left{float:left !important}.float-xl-right{float:right !important}.float-xl-none{float:none !important}}.overflow-auto{overflow:auto !important}.overflow-hidden{overflow:hidden !important}.position-static{position:static !important}.position-relative{position:relative !important}.position-absolute{position:absolute !important}.position-fixed{position:fixed !important}.position-sticky{position:-webkit-sticky !important;position:sticky !important}.fixed-top{position:fixed;top:0;right:0;left:0;z-index:1030}.fixed-bottom{position:fixed;right:0;bottom:0;left:0;z-index:1030}@supports ((position: -webkit-sticky) or (position: sticky)){.sticky-top{position:-webkit-sticky;position:sticky;top:0;z-index:1020}}.sr-only{position:absolute;width:1px;height:1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);white-space:nowrap;border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;overflow:visible;clip:auto;white-space:normal}.shadow-sm{box-shadow:0 0.125rem 0.25rem rgba(0,0,0,0.075) !important}.shadow{box-shadow:0 0.5rem 1rem rgba(0,0,0,0.15) !important}.shadow-lg{box-shadow:0 1rem 3rem rgba(0,0,0,0.175) !important}.shadow-none{box-shadow:none !important}.w-25{width:25% !important}.w-50{width:50% !important}.w-75{width:75% !important}.w-100{width:100% !important}.w-auto{width:auto !important}.h-25{height:25% !important}.h-50{height:50% !important}.h-75{height:75% !important}.h-100{height:100% !important}.h-auto{height:auto !important}.mw-100{max-width:100% !important}.mh-100{max-height:100% !important}.min-vw-100{min-width:100vw !important}.min-vh-100{min-height:100vh !important}.vw-100{width:100vw !important}.vh-100{height:100vh !important}.stretched-link::after{position:absolute;top:0;right:0;bottom:0;left:0;z-index:1;pointer-events:auto;content:"";background-color:transparent}.m-0{margin:0 !important}.mt-0,.my-0{margin-top:0 !important}.mr-0,.mx-0{margin-right:0 !important}.mb-0,.my-0{margin-bottom:0 !important}.ml-0,.mx-0{margin-left:0 !important}.m-1{margin:.25rem !important}.mt-1,.my-1{margin-top:.25rem !important}.mr-1,.mx-1{margin-right:.25rem !important}.mb-1,.my-1{margin-bottom:.25rem !important}.ml-1,.mx-1{margin-left:.25rem !important}.m-2{margin:.5rem !important}.mt-2,.my-2{margin-top:.5rem !important}.mr-2,.mx-2{margin-right:.5rem !important}.mb-2,.my-2{margin-bottom:.5rem !important}.ml-2,.mx-2{margin-left:.5rem !important}.m-3{margin:1rem !important}.mt-3,.my-3{margin-top:1rem !important}.mr-3,.mx-3{margin-right:1rem !important}.mb-3,.my-3{margin-bottom:1rem !important}.ml-3,.mx-3{margin-left:1rem !important}.m-4{margin:1.5rem !important}.mt-4,.my-4{margin-top:1.5rem !important}.mr-4,.mx-4{margin-right:1.5rem !important}.mb-4,.my-4{margin-bottom:1.5rem 
!important}.ml-4,.mx-4{margin-left:1.5rem !important}.m-5{margin:3rem !important}.mt-5,.my-5{margin-top:3rem !important}.mr-5,.mx-5{margin-right:3rem !important}.mb-5,.my-5{margin-bottom:3rem !important}.ml-5,.mx-5{margin-left:3rem !important}.p-0{padding:0 !important}.pt-0,.py-0{padding-top:0 !important}.pr-0,.px-0{padding-right:0 !important}.pb-0,.py-0{padding-bottom:0 !important}.pl-0,.px-0{padding-left:0 !important}.p-1{padding:.25rem !important}.pt-1,.py-1{padding-top:.25rem !important}.pr-1,.px-1{padding-right:.25rem !important}.pb-1,.py-1{padding-bottom:.25rem !important}.pl-1,.px-1{padding-left:.25rem !important}.p-2{padding:.5rem !important}.pt-2,.py-2{padding-top:.5rem !important}.pr-2,.px-2{padding-right:.5rem !important}.pb-2,.py-2{padding-bottom:.5rem !important}.pl-2,.px-2{padding-left:.5rem !important}.p-3{padding:1rem !important}.pt-3,.py-3{padding-top:1rem !important}.pr-3,.px-3{padding-right:1rem !important}.pb-3,.py-3{padding-bottom:1rem !important}.pl-3,.px-3{padding-left:1rem !important}.p-4{padding:1.5rem !important}.pt-4,.py-4{padding-top:1.5rem !important}.pr-4,.px-4{padding-right:1.5rem !important}.pb-4,.py-4{padding-bottom:1.5rem !important}.pl-4,.px-4{padding-left:1.5rem !important}.p-5{padding:3rem !important}.pt-5,.py-5{padding-top:3rem !important}.pr-5,.px-5{padding-right:3rem !important}.pb-5,.py-5{padding-bottom:3rem !important}.pl-5,.px-5{padding-left:3rem !important}.m-n1{margin:-.25rem !important}.mt-n1,.my-n1{margin-top:-.25rem !important}.mr-n1,.mx-n1{margin-right:-.25rem !important}.mb-n1,.my-n1{margin-bottom:-.25rem !important}.ml-n1,.mx-n1{margin-left:-.25rem !important}.m-n2{margin:-.5rem !important}.mt-n2,.my-n2{margin-top:-.5rem !important}.mr-n2,.mx-n2{margin-right:-.5rem !important}.mb-n2,.my-n2{margin-bottom:-.5rem !important}.ml-n2,.mx-n2{margin-left:-.5rem !important}.m-n3{margin:-1rem !important}.mt-n3,.my-n3{margin-top:-1rem !important}.mr-n3,.mx-n3{margin-right:-1rem !important}.mb-n3,.my-n3{margin-bottom:-1rem !important}.ml-n3,.mx-n3{margin-left:-1rem !important}.m-n4{margin:-1.5rem !important}.mt-n4,.my-n4{margin-top:-1.5rem !important}.mr-n4,.mx-n4{margin-right:-1.5rem !important}.mb-n4,.my-n4{margin-bottom:-1.5rem !important}.ml-n4,.mx-n4{margin-left:-1.5rem !important}.m-n5{margin:-3rem !important}.mt-n5,.my-n5{margin-top:-3rem !important}.mr-n5,.mx-n5{margin-right:-3rem !important}.mb-n5,.my-n5{margin-bottom:-3rem !important}.ml-n5,.mx-n5{margin-left:-3rem !important}.m-auto{margin:auto !important}.mt-auto,.my-auto{margin-top:auto !important}.mr-auto,.mx-auto{margin-right:auto !important}.mb-auto,.my-auto{margin-bottom:auto !important}.ml-auto,.mx-auto{margin-left:auto !important}@media (min-width: 576px){.m-sm-0{margin:0 !important}.mt-sm-0,.my-sm-0{margin-top:0 !important}.mr-sm-0,.mx-sm-0{margin-right:0 !important}.mb-sm-0,.my-sm-0{margin-bottom:0 !important}.ml-sm-0,.mx-sm-0{margin-left:0 !important}.m-sm-1{margin:.25rem !important}.mt-sm-1,.my-sm-1{margin-top:.25rem !important}.mr-sm-1,.mx-sm-1{margin-right:.25rem !important}.mb-sm-1,.my-sm-1{margin-bottom:.25rem !important}.ml-sm-1,.mx-sm-1{margin-left:.25rem !important}.m-sm-2{margin:.5rem !important}.mt-sm-2,.my-sm-2{margin-top:.5rem !important}.mr-sm-2,.mx-sm-2{margin-right:.5rem !important}.mb-sm-2,.my-sm-2{margin-bottom:.5rem !important}.ml-sm-2,.mx-sm-2{margin-left:.5rem !important}.m-sm-3{margin:1rem !important}.mt-sm-3,.my-sm-3{margin-top:1rem !important}.mr-sm-3,.mx-sm-3{margin-right:1rem !important}.mb-sm-3,.my-sm-3{margin-bottom:1rem 
!important}.ml-sm-3,.mx-sm-3{margin-left:1rem !important}.m-sm-4{margin:1.5rem !important}.mt-sm-4,.my-sm-4{margin-top:1.5rem !important}.mr-sm-4,.mx-sm-4{margin-right:1.5rem !important}.mb-sm-4,.my-sm-4{margin-bottom:1.5rem !important}.ml-sm-4,.mx-sm-4{margin-left:1.5rem !important}.m-sm-5{margin:3rem !important}.mt-sm-5,.my-sm-5{margin-top:3rem !important}.mr-sm-5,.mx-sm-5{margin-right:3rem !important}.mb-sm-5,.my-sm-5{margin-bottom:3rem !important}.ml-sm-5,.mx-sm-5{margin-left:3rem !important}.p-sm-0{padding:0 !important}.pt-sm-0,.py-sm-0{padding-top:0 !important}.pr-sm-0,.px-sm-0{padding-right:0 !important}.pb-sm-0,.py-sm-0{padding-bottom:0 !important}.pl-sm-0,.px-sm-0{padding-left:0 !important}.p-sm-1{padding:.25rem !important}.pt-sm-1,.py-sm-1{padding-top:.25rem !important}.pr-sm-1,.px-sm-1{padding-right:.25rem !important}.pb-sm-1,.py-sm-1{padding-bottom:.25rem !important}.pl-sm-1,.px-sm-1{padding-left:.25rem !important}.p-sm-2{padding:.5rem !important}.pt-sm-2,.py-sm-2{padding-top:.5rem !important}.pr-sm-2,.px-sm-2{padding-right:.5rem !important}.pb-sm-2,.py-sm-2{padding-bottom:.5rem !important}.pl-sm-2,.px-sm-2{padding-left:.5rem !important}.p-sm-3{padding:1rem !important}.pt-sm-3,.py-sm-3{padding-top:1rem !important}.pr-sm-3,.px-sm-3{padding-right:1rem !important}.pb-sm-3,.py-sm-3{padding-bottom:1rem !important}.pl-sm-3,.px-sm-3{padding-left:1rem !important}.p-sm-4{padding:1.5rem !important}.pt-sm-4,.py-sm-4{padding-top:1.5rem !important}.pr-sm-4,.px-sm-4{padding-right:1.5rem !important}.pb-sm-4,.py-sm-4{padding-bottom:1.5rem !important}.pl-sm-4,.px-sm-4{padding-left:1.5rem !important}.p-sm-5{padding:3rem !important}.pt-sm-5,.py-sm-5{padding-top:3rem !important}.pr-sm-5,.px-sm-5{padding-right:3rem !important}.pb-sm-5,.py-sm-5{padding-bottom:3rem !important}.pl-sm-5,.px-sm-5{padding-left:3rem !important}.m-sm-n1{margin:-.25rem !important}.mt-sm-n1,.my-sm-n1{margin-top:-.25rem !important}.mr-sm-n1,.mx-sm-n1{margin-right:-.25rem !important}.mb-sm-n1,.my-sm-n1{margin-bottom:-.25rem !important}.ml-sm-n1,.mx-sm-n1{margin-left:-.25rem !important}.m-sm-n2{margin:-.5rem !important}.mt-sm-n2,.my-sm-n2{margin-top:-.5rem !important}.mr-sm-n2,.mx-sm-n2{margin-right:-.5rem !important}.mb-sm-n2,.my-sm-n2{margin-bottom:-.5rem !important}.ml-sm-n2,.mx-sm-n2{margin-left:-.5rem !important}.m-sm-n3{margin:-1rem !important}.mt-sm-n3,.my-sm-n3{margin-top:-1rem !important}.mr-sm-n3,.mx-sm-n3{margin-right:-1rem !important}.mb-sm-n3,.my-sm-n3{margin-bottom:-1rem !important}.ml-sm-n3,.mx-sm-n3{margin-left:-1rem !important}.m-sm-n4{margin:-1.5rem !important}.mt-sm-n4,.my-sm-n4{margin-top:-1.5rem !important}.mr-sm-n4,.mx-sm-n4{margin-right:-1.5rem !important}.mb-sm-n4,.my-sm-n4{margin-bottom:-1.5rem !important}.ml-sm-n4,.mx-sm-n4{margin-left:-1.5rem !important}.m-sm-n5{margin:-3rem !important}.mt-sm-n5,.my-sm-n5{margin-top:-3rem !important}.mr-sm-n5,.mx-sm-n5{margin-right:-3rem !important}.mb-sm-n5,.my-sm-n5{margin-bottom:-3rem !important}.ml-sm-n5,.mx-sm-n5{margin-left:-3rem !important}.m-sm-auto{margin:auto !important}.mt-sm-auto,.my-sm-auto{margin-top:auto !important}.mr-sm-auto,.mx-sm-auto{margin-right:auto !important}.mb-sm-auto,.my-sm-auto{margin-bottom:auto !important}.ml-sm-auto,.mx-sm-auto{margin-left:auto !important}}@media (min-width: 768px){.m-md-0{margin:0 !important}.mt-md-0,.my-md-0{margin-top:0 !important}.mr-md-0,.mx-md-0{margin-right:0 !important}.mb-md-0,.my-md-0{margin-bottom:0 !important}.ml-md-0,.mx-md-0{margin-left:0 !important}.m-md-1{margin:.25rem 
!important}.mt-md-1,.my-md-1{margin-top:.25rem !important}.mr-md-1,.mx-md-1{margin-right:.25rem !important}.mb-md-1,.my-md-1{margin-bottom:.25rem !important}.ml-md-1,.mx-md-1{margin-left:.25rem !important}.m-md-2{margin:.5rem !important}.mt-md-2,.my-md-2{margin-top:.5rem !important}.mr-md-2,.mx-md-2{margin-right:.5rem !important}.mb-md-2,.my-md-2{margin-bottom:.5rem !important}.ml-md-2,.mx-md-2{margin-left:.5rem !important}.m-md-3{margin:1rem !important}.mt-md-3,.my-md-3{margin-top:1rem !important}.mr-md-3,.mx-md-3{margin-right:1rem !important}.mb-md-3,.my-md-3{margin-bottom:1rem !important}.ml-md-3,.mx-md-3{margin-left:1rem !important}.m-md-4{margin:1.5rem !important}.mt-md-4,.my-md-4{margin-top:1.5rem !important}.mr-md-4,.mx-md-4{margin-right:1.5rem !important}.mb-md-4,.my-md-4{margin-bottom:1.5rem !important}.ml-md-4,.mx-md-4{margin-left:1.5rem !important}.m-md-5{margin:3rem !important}.mt-md-5,.my-md-5{margin-top:3rem !important}.mr-md-5,.mx-md-5{margin-right:3rem !important}.mb-md-5,.my-md-5{margin-bottom:3rem !important}.ml-md-5,.mx-md-5{margin-left:3rem !important}.p-md-0{padding:0 !important}.pt-md-0,.py-md-0{padding-top:0 !important}.pr-md-0,.px-md-0{padding-right:0 !important}.pb-md-0,.py-md-0{padding-bottom:0 !important}.pl-md-0,.px-md-0{padding-left:0 !important}.p-md-1{padding:.25rem !important}.pt-md-1,.py-md-1{padding-top:.25rem !important}.pr-md-1,.px-md-1{padding-right:.25rem !important}.pb-md-1,.py-md-1{padding-bottom:.25rem !important}.pl-md-1,.px-md-1{padding-left:.25rem !important}.p-md-2{padding:.5rem !important}.pt-md-2,.py-md-2{padding-top:.5rem !important}.pr-md-2,.px-md-2{padding-right:.5rem !important}.pb-md-2,.py-md-2{padding-bottom:.5rem !important}.pl-md-2,.px-md-2{padding-left:.5rem !important}.p-md-3{padding:1rem !important}.pt-md-3,.py-md-3{padding-top:1rem !important}.pr-md-3,.px-md-3{padding-right:1rem !important}.pb-md-3,.py-md-3{padding-bottom:1rem !important}.pl-md-3,.px-md-3{padding-left:1rem !important}.p-md-4{padding:1.5rem !important}.pt-md-4,.py-md-4{padding-top:1.5rem !important}.pr-md-4,.px-md-4{padding-right:1.5rem !important}.pb-md-4,.py-md-4{padding-bottom:1.5rem !important}.pl-md-4,.px-md-4{padding-left:1.5rem !important}.p-md-5{padding:3rem !important}.pt-md-5,.py-md-5{padding-top:3rem !important}.pr-md-5,.px-md-5{padding-right:3rem !important}.pb-md-5,.py-md-5{padding-bottom:3rem !important}.pl-md-5,.px-md-5{padding-left:3rem !important}.m-md-n1{margin:-.25rem !important}.mt-md-n1,.my-md-n1{margin-top:-.25rem !important}.mr-md-n1,.mx-md-n1{margin-right:-.25rem !important}.mb-md-n1,.my-md-n1{margin-bottom:-.25rem !important}.ml-md-n1,.mx-md-n1{margin-left:-.25rem !important}.m-md-n2{margin:-.5rem !important}.mt-md-n2,.my-md-n2{margin-top:-.5rem !important}.mr-md-n2,.mx-md-n2{margin-right:-.5rem !important}.mb-md-n2,.my-md-n2{margin-bottom:-.5rem !important}.ml-md-n2,.mx-md-n2{margin-left:-.5rem !important}.m-md-n3{margin:-1rem !important}.mt-md-n3,.my-md-n3{margin-top:-1rem !important}.mr-md-n3,.mx-md-n3{margin-right:-1rem !important}.mb-md-n3,.my-md-n3{margin-bottom:-1rem !important}.ml-md-n3,.mx-md-n3{margin-left:-1rem !important}.m-md-n4{margin:-1.5rem !important}.mt-md-n4,.my-md-n4{margin-top:-1.5rem !important}.mr-md-n4,.mx-md-n4{margin-right:-1.5rem !important}.mb-md-n4,.my-md-n4{margin-bottom:-1.5rem !important}.ml-md-n4,.mx-md-n4{margin-left:-1.5rem !important}.m-md-n5{margin:-3rem !important}.mt-md-n5,.my-md-n5{margin-top:-3rem !important}.mr-md-n5,.mx-md-n5{margin-right:-3rem !important}.mb-md-n5,.my-md-n5{margin-bottom:-3rem 
!important}.ml-md-n5,.mx-md-n5{margin-left:-3rem !important}.m-md-auto{margin:auto !important}.mt-md-auto,.my-md-auto{margin-top:auto !important}.mr-md-auto,.mx-md-auto{margin-right:auto !important}.mb-md-auto,.my-md-auto{margin-bottom:auto !important}.ml-md-auto,.mx-md-auto{margin-left:auto !important}}@media (min-width: 992px){.m-lg-0{margin:0 !important}.mt-lg-0,.my-lg-0{margin-top:0 !important}.mr-lg-0,.mx-lg-0{margin-right:0 !important}.mb-lg-0,.my-lg-0{margin-bottom:0 !important}.ml-lg-0,.mx-lg-0{margin-left:0 !important}.m-lg-1{margin:.25rem !important}.mt-lg-1,.my-lg-1{margin-top:.25rem !important}.mr-lg-1,.mx-lg-1{margin-right:.25rem !important}.mb-lg-1,.my-lg-1{margin-bottom:.25rem !important}.ml-lg-1,.mx-lg-1{margin-left:.25rem !important}.m-lg-2{margin:.5rem !important}.mt-lg-2,.my-lg-2{margin-top:.5rem !important}.mr-lg-2,.mx-lg-2{margin-right:.5rem !important}.mb-lg-2,.my-lg-2{margin-bottom:.5rem !important}.ml-lg-2,.mx-lg-2{margin-left:.5rem !important}.m-lg-3{margin:1rem !important}.mt-lg-3,.my-lg-3{margin-top:1rem !important}.mr-lg-3,.mx-lg-3{margin-right:1rem !important}.mb-lg-3,.my-lg-3{margin-bottom:1rem !important}.ml-lg-3,.mx-lg-3{margin-left:1rem !important}.m-lg-4{margin:1.5rem !important}.mt-lg-4,.my-lg-4{margin-top:1.5rem !important}.mr-lg-4,.mx-lg-4{margin-right:1.5rem !important}.mb-lg-4,.my-lg-4{margin-bottom:1.5rem !important}.ml-lg-4,.mx-lg-4{margin-left:1.5rem !important}.m-lg-5{margin:3rem !important}.mt-lg-5,.my-lg-5{margin-top:3rem !important}.mr-lg-5,.mx-lg-5{margin-right:3rem !important}.mb-lg-5,.my-lg-5{margin-bottom:3rem !important}.ml-lg-5,.mx-lg-5{margin-left:3rem !important}.p-lg-0{padding:0 !important}.pt-lg-0,.py-lg-0{padding-top:0 !important}.pr-lg-0,.px-lg-0{padding-right:0 !important}.pb-lg-0,.py-lg-0{padding-bottom:0 !important}.pl-lg-0,.px-lg-0{padding-left:0 !important}.p-lg-1{padding:.25rem !important}.pt-lg-1,.py-lg-1{padding-top:.25rem !important}.pr-lg-1,.px-lg-1{padding-right:.25rem !important}.pb-lg-1,.py-lg-1{padding-bottom:.25rem !important}.pl-lg-1,.px-lg-1{padding-left:.25rem !important}.p-lg-2{padding:.5rem !important}.pt-lg-2,.py-lg-2{padding-top:.5rem !important}.pr-lg-2,.px-lg-2{padding-right:.5rem !important}.pb-lg-2,.py-lg-2{padding-bottom:.5rem !important}.pl-lg-2,.px-lg-2{padding-left:.5rem !important}.p-lg-3{padding:1rem !important}.pt-lg-3,.py-lg-3{padding-top:1rem !important}.pr-lg-3,.px-lg-3{padding-right:1rem !important}.pb-lg-3,.py-lg-3{padding-bottom:1rem !important}.pl-lg-3,.px-lg-3{padding-left:1rem !important}.p-lg-4{padding:1.5rem !important}.pt-lg-4,.py-lg-4{padding-top:1.5rem !important}.pr-lg-4,.px-lg-4{padding-right:1.5rem !important}.pb-lg-4,.py-lg-4{padding-bottom:1.5rem !important}.pl-lg-4,.px-lg-4{padding-left:1.5rem !important}.p-lg-5{padding:3rem !important}.pt-lg-5,.py-lg-5{padding-top:3rem !important}.pr-lg-5,.px-lg-5{padding-right:3rem !important}.pb-lg-5,.py-lg-5{padding-bottom:3rem !important}.pl-lg-5,.px-lg-5{padding-left:3rem !important}.m-lg-n1{margin:-.25rem !important}.mt-lg-n1,.my-lg-n1{margin-top:-.25rem !important}.mr-lg-n1,.mx-lg-n1{margin-right:-.25rem !important}.mb-lg-n1,.my-lg-n1{margin-bottom:-.25rem !important}.ml-lg-n1,.mx-lg-n1{margin-left:-.25rem !important}.m-lg-n2{margin:-.5rem !important}.mt-lg-n2,.my-lg-n2{margin-top:-.5rem !important}.mr-lg-n2,.mx-lg-n2{margin-right:-.5rem !important}.mb-lg-n2,.my-lg-n2{margin-bottom:-.5rem !important}.ml-lg-n2,.mx-lg-n2{margin-left:-.5rem !important}.m-lg-n3{margin:-1rem !important}.mt-lg-n3,.my-lg-n3{margin-top:-1rem 
!important}.mr-lg-n3,.mx-lg-n3{margin-right:-1rem !important}.mb-lg-n3,.my-lg-n3{margin-bottom:-1rem !important}.ml-lg-n3,.mx-lg-n3{margin-left:-1rem !important}.m-lg-n4{margin:-1.5rem !important}.mt-lg-n4,.my-lg-n4{margin-top:-1.5rem !important}.mr-lg-n4,.mx-lg-n4{margin-right:-1.5rem !important}.mb-lg-n4,.my-lg-n4{margin-bottom:-1.5rem !important}.ml-lg-n4,.mx-lg-n4{margin-left:-1.5rem !important}.m-lg-n5{margin:-3rem !important}.mt-lg-n5,.my-lg-n5{margin-top:-3rem !important}.mr-lg-n5,.mx-lg-n5{margin-right:-3rem !important}.mb-lg-n5,.my-lg-n5{margin-bottom:-3rem !important}.ml-lg-n5,.mx-lg-n5{margin-left:-3rem !important}.m-lg-auto{margin:auto !important}.mt-lg-auto,.my-lg-auto{margin-top:auto !important}.mr-lg-auto,.mx-lg-auto{margin-right:auto !important}.mb-lg-auto,.my-lg-auto{margin-bottom:auto !important}.ml-lg-auto,.mx-lg-auto{margin-left:auto !important}}@media (min-width: 1200px){.m-xl-0{margin:0 !important}.mt-xl-0,.my-xl-0{margin-top:0 !important}.mr-xl-0,.mx-xl-0{margin-right:0 !important}.mb-xl-0,.my-xl-0{margin-bottom:0 !important}.ml-xl-0,.mx-xl-0{margin-left:0 !important}.m-xl-1{margin:.25rem !important}.mt-xl-1,.my-xl-1{margin-top:.25rem !important}.mr-xl-1,.mx-xl-1{margin-right:.25rem !important}.mb-xl-1,.my-xl-1{margin-bottom:.25rem !important}.ml-xl-1,.mx-xl-1{margin-left:.25rem !important}.m-xl-2{margin:.5rem !important}.mt-xl-2,.my-xl-2{margin-top:.5rem !important}.mr-xl-2,.mx-xl-2{margin-right:.5rem !important}.mb-xl-2,.my-xl-2{margin-bottom:.5rem !important}.ml-xl-2,.mx-xl-2{margin-left:.5rem !important}.m-xl-3{margin:1rem !important}.mt-xl-3,.my-xl-3{margin-top:1rem !important}.mr-xl-3,.mx-xl-3{margin-right:1rem !important}.mb-xl-3,.my-xl-3{margin-bottom:1rem !important}.ml-xl-3,.mx-xl-3{margin-left:1rem !important}.m-xl-4{margin:1.5rem !important}.mt-xl-4,.my-xl-4{margin-top:1.5rem !important}.mr-xl-4,.mx-xl-4{margin-right:1.5rem !important}.mb-xl-4,.my-xl-4{margin-bottom:1.5rem !important}.ml-xl-4,.mx-xl-4{margin-left:1.5rem !important}.m-xl-5{margin:3rem !important}.mt-xl-5,.my-xl-5{margin-top:3rem !important}.mr-xl-5,.mx-xl-5{margin-right:3rem !important}.mb-xl-5,.my-xl-5{margin-bottom:3rem !important}.ml-xl-5,.mx-xl-5{margin-left:3rem !important}.p-xl-0{padding:0 !important}.pt-xl-0,.py-xl-0{padding-top:0 !important}.pr-xl-0,.px-xl-0{padding-right:0 !important}.pb-xl-0,.py-xl-0{padding-bottom:0 !important}.pl-xl-0,.px-xl-0{padding-left:0 !important}.p-xl-1{padding:.25rem !important}.pt-xl-1,.py-xl-1{padding-top:.25rem !important}.pr-xl-1,.px-xl-1{padding-right:.25rem !important}.pb-xl-1,.py-xl-1{padding-bottom:.25rem !important}.pl-xl-1,.px-xl-1{padding-left:.25rem !important}.p-xl-2{padding:.5rem !important}.pt-xl-2,.py-xl-2{padding-top:.5rem !important}.pr-xl-2,.px-xl-2{padding-right:.5rem !important}.pb-xl-2,.py-xl-2{padding-bottom:.5rem !important}.pl-xl-2,.px-xl-2{padding-left:.5rem !important}.p-xl-3{padding:1rem !important}.pt-xl-3,.py-xl-3{padding-top:1rem !important}.pr-xl-3,.px-xl-3{padding-right:1rem !important}.pb-xl-3,.py-xl-3{padding-bottom:1rem !important}.pl-xl-3,.px-xl-3{padding-left:1rem !important}.p-xl-4{padding:1.5rem !important}.pt-xl-4,.py-xl-4{padding-top:1.5rem !important}.pr-xl-4,.px-xl-4{padding-right:1.5rem !important}.pb-xl-4,.py-xl-4{padding-bottom:1.5rem !important}.pl-xl-4,.px-xl-4{padding-left:1.5rem !important}.p-xl-5{padding:3rem !important}.pt-xl-5,.py-xl-5{padding-top:3rem !important}.pr-xl-5,.px-xl-5{padding-right:3rem !important}.pb-xl-5,.py-xl-5{padding-bottom:3rem !important}.pl-xl-5,.px-xl-5{padding-left:3rem 
!important}.m-xl-n1{margin:-.25rem !important}.mt-xl-n1,.my-xl-n1{margin-top:-.25rem !important}.mr-xl-n1,.mx-xl-n1{margin-right:-.25rem !important}.mb-xl-n1,.my-xl-n1{margin-bottom:-.25rem !important}.ml-xl-n1,.mx-xl-n1{margin-left:-.25rem !important}.m-xl-n2{margin:-.5rem !important}.mt-xl-n2,.my-xl-n2{margin-top:-.5rem !important}.mr-xl-n2,.mx-xl-n2{margin-right:-.5rem !important}.mb-xl-n2,.my-xl-n2{margin-bottom:-.5rem !important}.ml-xl-n2,.mx-xl-n2{margin-left:-.5rem !important}.m-xl-n3{margin:-1rem !important}.mt-xl-n3,.my-xl-n3{margin-top:-1rem !important}.mr-xl-n3,.mx-xl-n3{margin-right:-1rem !important}.mb-xl-n3,.my-xl-n3{margin-bottom:-1rem !important}.ml-xl-n3,.mx-xl-n3{margin-left:-1rem !important}.m-xl-n4{margin:-1.5rem !important}.mt-xl-n4,.my-xl-n4{margin-top:-1.5rem !important}.mr-xl-n4,.mx-xl-n4{margin-right:-1.5rem !important}.mb-xl-n4,.my-xl-n4{margin-bottom:-1.5rem !important}.ml-xl-n4,.mx-xl-n4{margin-left:-1.5rem !important}.m-xl-n5{margin:-3rem !important}.mt-xl-n5,.my-xl-n5{margin-top:-3rem !important}.mr-xl-n5,.mx-xl-n5{margin-right:-3rem !important}.mb-xl-n5,.my-xl-n5{margin-bottom:-3rem !important}.ml-xl-n5,.mx-xl-n5{margin-left:-3rem !important}.m-xl-auto{margin:auto !important}.mt-xl-auto,.my-xl-auto{margin-top:auto !important}.mr-xl-auto,.mx-xl-auto{margin-right:auto !important}.mb-xl-auto,.my-xl-auto{margin-bottom:auto !important}.ml-xl-auto,.mx-xl-auto{margin-left:auto !important}}.text-monospace{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace !important}.text-justify{text-align:justify !important}.text-wrap{white-space:normal !important}.text-nowrap{white-space:nowrap !important}.text-truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.text-left{text-align:left !important}.text-right{text-align:right !important}.text-center{text-align:center !important}@media (min-width: 576px){.text-sm-left{text-align:left !important}.text-sm-right{text-align:right !important}.text-sm-center{text-align:center !important}}@media (min-width: 768px){.text-md-left{text-align:left !important}.text-md-right{text-align:right !important}.text-md-center{text-align:center !important}}@media (min-width: 992px){.text-lg-left{text-align:left !important}.text-lg-right{text-align:right !important}.text-lg-center{text-align:center !important}}@media (min-width: 1200px){.text-xl-left{text-align:left !important}.text-xl-right{text-align:right !important}.text-xl-center{text-align:center !important}}.text-lowercase{text-transform:lowercase !important}.text-uppercase{text-transform:uppercase !important}.text-capitalize{text-transform:capitalize !important}.font-weight-light{font-weight:300 !important}.font-weight-lighter{font-weight:lighter !important}.font-weight-normal{font-weight:400 !important}.font-weight-bold{font-weight:700 !important}.font-weight-bolder{font-weight:bolder !important}.font-italic{font-style:italic !important}.text-white{color:#fff !important}.text-primary{color:#007bff !important}a.text-primary:hover,a.text-primary:focus{color:#0056b3 !important}.text-secondary{color:#6c757d !important}a.text-secondary:hover,a.text-secondary:focus{color:#494f54 !important}.text-success{color:#28a745 !important}a.text-success:hover,a.text-success:focus{color:#19692c !important}.text-info{color:#17a2b8 !important}a.text-info:hover,a.text-info:focus{color:#0f6674 !important}.text-warning{color:#ffc107 !important}a.text-warning:hover,a.text-warning:focus{color:#ba8b00 !important}.text-danger{color:#dc3545 
!important}a.text-danger:hover,a.text-danger:focus{color:#a71d2a !important}.text-light{color:#f8f9fa !important}a.text-light:hover,a.text-light:focus{color:#cbd3da !important}.text-dark{color:#343a40 !important}a.text-dark:hover,a.text-dark:focus{color:#121416 !important}.text-body{color:#212529 !important}.text-muted{color:#6c757d !important}.text-black-50{color:rgba(0,0,0,0.5) !important}.text-white-50{color:rgba(255,255,255,0.5) !important}.text-hide{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0}.text-decoration-none{text-decoration:none !important}.text-break{word-break:break-word !important;overflow-wrap:break-word !important}.text-reset{color:inherit !important}.visible{visibility:visible !important}.invisible{visibility:hidden !important}@media print{*,*::before,*::after{text-shadow:none !important;box-shadow:none !important}a:not(.btn){text-decoration:underline}abbr[title]::after{content:" (" attr(title) ")"}pre{white-space:pre-wrap !important}pre,blockquote{border:1px solid #adb5bd;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}p,h2,h3{orphans:3;widows:3}h2,h3{page-break-after:avoid}@page{size:a3}body{min-width:992px !important}.container{min-width:992px !important}.navbar{display:none}.badge{border:1px solid #000}.table{border-collapse:collapse !important}.table td,.table th{background-color:#fff !important}.table-bordered th,.table-bordered td{border:1px solid #dee2e6 !important}.table-dark{color:inherit}.table-dark th,.table-dark td,.table-dark thead th,.table-dark tbody+tbody{border-color:#dee2e6}.table .thead-dark th{color:inherit;border-color:#dee2e6}}.highlight table td{padding:5px}.highlight table pre{margin:0}.highlight .cm{color:#999988;font-style:italic}.highlight .cp{color:#999999;font-weight:bold}.highlight .c1{color:#999988;font-style:italic}.highlight .cs{color:#999999;font-weight:bold;font-style:italic}.highlight .c,.highlight .cd{color:#8c8c8c;font-style:italic}.highlight .err{color:#a61717;background-color:#e3d2d2}.highlight .gd{color:#000000;background-color:#ffdddd}.highlight .ge{color:#000000;font-style:italic}.highlight .gr{color:#aa0000}.highlight .gh{color:#999999}.highlight .gi{color:#000000;background-color:#ddffdd}.highlight .go{color:#888888}.highlight .gp{color:#555555}.highlight .gs{font-weight:bold}.highlight .gu{color:#aaaaaa}.highlight .gt{color:#aa0000}.highlight .kc{color:#000000;font-weight:bold}.highlight .kd{color:#000000;font-weight:bold}.highlight .kn{color:#000000;font-weight:bold}.highlight .kp{color:#000000;font-weight:bold}.highlight .kr{color:#000000;font-weight:bold}.highlight .kt{color:#445588;font-weight:bold}.highlight .k,.highlight .kv{color:#000000;font-weight:bold}.highlight .mf{color:#009999}.highlight .mh{color:#009999}.highlight .il{color:#009999}.highlight .mi{color:#009999}.highlight .mo{color:#009999}.highlight .m,.highlight .mb,.highlight .mx{color:#009999}.highlight .sb{color:#d14}.highlight .sc{color:#d14}.highlight .sd{color:#d14}.highlight .s2{color:#d14}.highlight .se{color:#d14}.highlight .sh{color:#d14}.highlight .si{color:#d14}.highlight .sx{color:#d14}.highlight .sr{color:#009926}.highlight .s1{color:#d14}.highlight .ss{color:#990073}.highlight .s{color:#d14}.highlight .na{color:#008080}.highlight .bp{color:#999999}.highlight .nb{color:#0086B3}.highlight .nc{color:#445588;font-weight:bold}.highlight .no{color:#008080}.highlight .nd{color:#3c5d5d;font-weight:bold}.highlight .ni{color:#800080}.highlight 
.ne{color:#990000;font-weight:bold}.highlight .nf{color:#990000;font-weight:bold}.highlight .nl{color:#990000;font-weight:bold}.highlight .nn{color:#555555}.highlight .nt{color:#000080}.highlight .vc{color:#008080}.highlight .vg{color:#008080}.highlight .vi{color:#008080}.highlight .nv{color:#008080}.highlight .ow{color:#000000;font-weight:bold}.highlight .o{color:#000000;font-weight:bold}.highlight .w{color:#bbbbbb}.highlight{background-color:#f8f8f8}.container{padding-left:30px;padding-right:30px;max-width:1240px}.container-fluid{padding-left:0;padding-right:0}@font-face{font-family:FreightSans;font-weight:700;font-style:normal;src:url("/assets/fonts/FreightSans/freight-sans-bold.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-bold.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:700;font-style:italic;src:url("/assets/fonts/FreightSans/freight-sans-bold-italic.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-bold-italic.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:500;font-style:normal;src:url("/assets/fonts/FreightSans/freight-sans-medium.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-medium.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:500;font-style:italic;src:url("/assets/fonts/FreightSans/freight-sans-medium-italic.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-medium-italic.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:100;font-style:normal;src:url("/assets/fonts/FreightSans/freight-sans-light.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-light.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:100;font-style:italic;src:url("/assets/fonts/FreightSans/freight-sans-light-italic.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-light-italic.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:400;font-style:italic;src:url("/assets/fonts/FreightSans/freight-sans-book-italic.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-book-italic.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:400;font-style:normal;src:url("/assets/fonts/FreightSans/freight-sans-book.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-book.woff") format("woff")}@font-face{font-family:IBMPlexMono;font-weight:600;font-style:normal;unicode-range:u+0020-007f;src:local("IBMPlexMono-SemiBold"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2") format("woff2"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff") format("woff")}@font-face{font-family:IBMPlexMono;font-weight:500;font-style:normal;unicode-range:u+0020-007f;src:local("IBMPlexMono-Medium"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2") format("woff2"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff") format("woff")}@font-face{font-family:IBMPlexMono;font-weight:400;font-style:normal;unicode-range:u+0020-007f;src:local("IBMPlexMono-Regular"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2") format("woff2"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff") format("woff")}@font-face{font-family:IBMPlexMono;font-weight:300;font-style:normal;unicode-range:u+0020-007f;src:local("IBMPlexMono-Light"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff2") format("woff2"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff") format("woff")}*{font-family:FreightSans, Helvetica Neue, Helvetica, Arial, 
sans-serif;font-weight:400}h1,h2,h3,h4,h5,h6{font-family:FreightSans}p{margin-bottom:1.25rem}a,em,i,b,strong,u,span{font-size:inherit}a:link,a:visited,a:hover{text-decoration:none;color:#ee4c2c}p a:link,p a:visited,p a:hover{color:#ee4c2c;text-decoration:none}@media screen and (min-width: 768px){p a:hover{text-decoration:underline}p a.social-icon:hover{text-decoration:none}}.btn,a.btn{border-radius:0;border:none;background-color:#f3f4f7;color:#6c6c6d;font-weight:400;position:relative;letter-spacing:0.25px}.btn.btn-lg,.btn-group-lg>.btn,a.btn.btn-lg,.btn-group-lg>a.btn{font-size:1.125rem;padding-top:.5rem}.btn.btn-white,a.btn.btn-white{background-color:#fff}.btn.btn-orange,a.btn.btn-orange{background-color:#ee4c2c}.btn.btn-demo,a.btn.btn-demo{color:#fff}@media screen and (min-width: 768px){.btn:after,a.btn:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.btn:hover:after,a.btn:hover:after{width:100%}.btn:hover,a.btn:hover{color:#262626}}.navbar{padding-left:0;padding-right:0}html{position:relative;min-height:100%;font-size:12px}@media screen and (min-width: 768px){html{font-size:16px}}@media screen and (min-width: 768px){body{margin:0 0 620px}}body.no-scroll{height:100%;overflow:hidden}a.with-right-arrow,.btn.with-right-arrow{padding-right:2rem;position:relative;background-image:url("/assets/images/chevron-right-orange.svg");background-size:6px 13px;background-position:top 10px right 11px;background-repeat:no-repeat}@media screen and (min-width: 768px){a.with-right-arrow,.btn.with-right-arrow{background-size:8px 14px;background-position:top 15px right 12px;padding-right:2rem}}a.with-left-arrow,.btn.with-left-arrow{padding-left:2rem;position:relative;background-image:url("/assets/images/chevron-left-grey.svg");background-size:6px 13px;background-position:top 10px left 11px;background-repeat:no-repeat}@media screen and (min-width: 768px){a.with-left-arrow,.btn.with-left-arrow{background-size:8px 14px;background-position:top 16px left 12px;padding-left:2rem}}.main-background{position:absolute;top:0;left:0;width:100%;height:350px;background-size:100% 100%;background-repeat:no-repeat;background-image:url("/assets/images/pytorch_bg_purple.jpg")}@media screen and (min-width: 768px){.main-background{height:640px}}.main-background.home-page-background{z-index:-1;height:350px}@media screen and (min-width: 768px){.main-background.home-page-background{height:570px}}.main-background.hub-background{height:380px}@media screen and (min-width: 768px){.main-background.hub-background{height:495px}}@media screen and (min-width: 768px){.main-background.ecosystem-background{height:472px}}@media screen and (min-width: 768px){.main-background.events-background{height:472px}}@media screen and (min-width: 768px){.main-background.ecosystem-join-background{height:435px}}.main-background.resources-background{height:380px}@media screen and (min-width: 768px){.main-background.resources-background{height:472px}}.main-background.get-started-background{height:275px}@media screen and (min-width: 768px){.main-background.get-started-background{height:380px}}.main-background.comm-stories-background{height:275px}@media screen and (min-width: 768px){.main-background.comm-stories-background{height:380px}}.main-background.features-background{height:335px}@media screen and (min-width: 768px){.main-background.features-background{height:300px}}.bg-light-grey{background-color:#f3f4f7}.text-dark-grey{color:#6c6c6d}.sidebar-links 
.top-section{color:#000}.sidebar-links ul{list-style-type:none;padding-left:0}.sidebar-links ul li{color:#6c6c6d;margin-left:20px}.sidebar-links ul li a{color:inherit}.sidebar-links .with-sub-sections.top-section:before{content:"+ ";font-family:"Courier New", Courier, monospace;width:50px}.sidebar-links .with-sub-sections.top-section.open:before{content:"- ";font-family:"Courier New", Courier, monospace;width:50px}.bg-very-light-grey{background-color:#f3f4f7}.email-subscribe-form input.email{color:#ee4c2c;border:none;border-bottom:1px solid #939393;width:100%;background-color:transparent;outline:none;font-size:1.125rem;letter-spacing:0.25px;line-height:2.25rem}.email-subscribe-form ::-webkit-input-placeholder{color:#ee4c2c}.email-subscribe-form ::-moz-placeholder{color:#ee4c2c}.email-subscribe-form :-ms-input-placeholder{color:#ee4c2c}.email-subscribe-form :-moz-placeholder{color:#ee4c2c}.email-subscribe-form input[type="submit"]{position:absolute;right:0;top:10px;height:15px;width:15px;background-image:url("/assets/images/arrow-right-with-tail.svg");background-color:transparent;background-repeat:no-repeat;background-size:15px 15px;background-position:center center;-webkit-appearance:none;-moz-appearance:none;appearance:none;border:0}.email-subscribe-form-fields-wrapper{position:relative}.bg-slate{background-color:#262626}.tweets-wrapper{width:100%}.tweets-wrapper p{font-size:1rem;line-height:1.5rem;letter-spacing:0.22px}.tweets-wrapper ol{padding-left:0}.tweets-wrapper a{color:#ee4c2c}.tweets-wrapper img,.tweets-wrapper .timeline-Tweet-actions,.tweets-wrapper .timeline-Tweet-media,.tweets-wrapper .MediaCard{display:none !important}.tweet{margin-bottom:2.2rem;word-wrap:break-word}.tweet a{color:#ee4c2c;display:inline}.tweet a span{color:inherit}.tweet p,.tweet span{font-size:1rem;line-height:1.5rem;letter-spacing:0.22px;color:#A0A0A1}@media screen and (min-width: 1240px){.tweet p{padding-right:40px}}.tweet span.retweeted,.tweet span.in-reply-to{font-size:.8125rem}.tweet p.tweet-header{margin-bottom:.3125rem;line-height:.75rem}.tweet .tweet-bird:before{content:"";position:relative;left:0;background-image:url("/assets/images/logo-twitter-grey.svg");background-size:20px 16px;display:inline-block;width:20px;height:16px}@media screen and (min-width: 768px){.tweet .tweet-bird:before{margin-bottom:.625rem}}.anchorjs-link{color:#6c6c6d !important}@media screen and (min-width: 768px){.anchorjs-link:hover{color:inherit;text-decoration:none !important}}.article-page-module{background-color:#f3f4f7;padding-top:1.875rem;padding-bottom:1.875rem}@media screen and (min-width: 768px){.article-page-module{padding-top:3.75rem;padding-bottom:3.75rem}}@media screen and (min-width: 1240px){.article-page-module .col-md-3{padding-left:20px;padding-right:20px}}.article-page-module .module-link-col .btn{padding-left:0}@media screen and (min-width: 768px){.article-page-module .module-link-col{text-align:right}.article-page-module .module-link-col .btn{padding-left:inherit}}.article-page-module .module-content-wrapper{margin-top:1.25rem;margin-bottom:1.25rem}@media screen and (min-width: 768px){.article-page-module .module-content-wrapper{margin-top:0;margin-bottom:0}}.article-page-module img{margin-bottom:1.875rem;width:100%}.article-page-module h3{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;text-transform:uppercase;margin-bottom:1.25rem}@media screen and (min-width: 768px){.article-page-module h3{margin-bottom:3.75rem}}.article-page-module h5,.article-page-module 
p{font-size:1rem;line-height:1.5rem}.article-page-module h5{color:#262626}.article-page-module p{color:#CCCDD1;letter-spacing:0.25px}.article-page-module .module-header{position:relative}.article-page-module .module-button{padding-left:0}@media screen and (min-width: 768px){.article-page-module .module-button{position:absolute;right:15px;top:0;padding-top:0;padding-bottom:.125rem;background-position:center right;padding-right:16px}}article.pytorch-article .note-card{border-radius:0;border:none;background-color:#ee4c2c;color:white;padding:30px;margin-bottom:50px}article.pytorch-article .note-card h4{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;text-transform:uppercase;color:white;margin-top:0;margin-bottom:1.125rem}article.pytorch-article .note-card p{font-size:1.125rem;line-height:1.5em;margin-bottom:0;color:white}article.pytorch-article .note-card p a{color:white;font-weight:700}.ecosystem-card,.resource-card,.hub-card{border-radius:0;border:none;height:110px;margin-bottom:1.25rem;margin-bottom:1.875rem;overflow:scroll}@media screen and (min-width: 1240px){.ecosystem-card,.resource-card,.hub-card{height:150px;overflow:inherit}}@media (min-width: 768px) and (max-width: 1239px){.ecosystem-card,.resource-card,.hub-card{height:170px;overflow:inherit}}.ecosystem-card p.card-summary,.resource-card p.card-summary,.hub-card p.card-summary{font-size:1.125rem;line-height:1.5rem;margin-bottom:0;color:#6c6c6d}.ecosystem-card h4,.resource-card h4,.hub-card h4{color:#262626;margin-bottom:1.125rem;overflow:hidden;white-space:nowrap;text-overflow:ellipsis}.ecosystem-card a,.resource-card a,.hub-card a{height:100%}@media screen and (min-width: 768px){.ecosystem-card a,.resource-card a,.hub-card a{min-height:190px}}@media (min-width: 768px) and (max-width: 1239px){.ecosystem-card a,.resource-card a,.hub-card a{min-height:234px}}@media screen and (min-width: 768px){.ecosystem-card:after,.resource-card:after,.hub-card:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.ecosystem-card:hover:after,.resource-card:hover:after,.hub-card:hover:after{width:100%}.ecosystem-card:hover,.resource-card:hover,.hub-card:hover{color:#262626}}.ecosystem-card:hover p.card-summary,.resource-card:hover p.card-summary,.hub-card:hover p.card-summary{color:#262626}.ecosystem-card .card-body{background-position:top 1.25rem right 1.25rem;background-repeat:no-repeat;padding:1.5625rem 1.875rem}.ecosystem-card .card-body.reasoning{background-image:url("/assets/images/logo-elf.svg");background-size:29px 25px}.ecosystem-card .card-body.tool{background-image:url("/assets/images/logo-wav2letter.svg");background-size:29px 25px}.ecosystem-card .card-body.language{background-image:url("/assets/images/logo-parlai.svg");background-size:29px 25px}.ecosystem-card .card-body.vision{background-image:url("/assets/images/logo-detectron.svg");background-size:29px 25px}.resource-card{border:1px solid #d6d7d8;background-color:transparent;margin-bottom:1.25rem}@media screen and (min-width: 768px){.resource-card{margin-bottom:0}}@media (min-width: 768px) and (max-width: 1239px){.resource-card{height:225px}}.resource-card .pytorch-image{position:relative;height:1.25rem;width:1.25rem;top:3.125rem}.resource-card a{letter-spacing:0.25px;color:#262626}.resource-card .card-body{display:block;padding:0 15px 0 0;position:relative;top:20px;margin-left:60px}@media (min-width: 768px) and (max-width: 1239px){.resource-card .card-body{top:18px}}@media screen and 
(min-width: 1240px){.resource-card .card-body{top:30px;margin-left:80px;padding-right:30px}}.resource-card.slack:before,.resource-card.github:before,.resource-card.pytorch-resource:before{content:"";background-size:32px 32px;background-repeat:no-repeat;display:block;position:absolute;height:32px;width:32px;top:15px;left:15px}@media screen and (min-width: 1240px){.resource-card.slack:before,.resource-card.github:before,.resource-card.pytorch-resource:before{left:30px;top:30px}}.resource-card.slack:before{background-image:url("/assets/images/logo-slack.svg")}.resource-card.github:before{background-image:url("/assets/images/logo-github.svg")}.resource-card.pytorch-resource:before{background-image:url("/assets/images/logo-icon.svg")}.resource-card .pytorch-discuss .discuss{color:#ee4c2c;font-weight:400}@media screen and (min-width: 768px){.resource-card:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.resource-card:hover:after{width:100%}.resource-card:hover{color:#262626}}.article-page-module.similar-projects .ecosystem-card p.card-summary{font-size:1rem;height:36px}@media screen and (min-width: 768px){.article-page-module.similar-projects .ecosystem-card p.card-summary{height:50px}}#twitter-widget iframe{display:none !important}body.general .main-content-wrapper{margin-top:80px}@media screen and (min-width: 768px){body.general .main-content-wrapper{margin-top:100px}}.domain-card{background-color:#f3f4f7;padding:40px 20px;margin:20px 0}.domain-card h4{color:#000}.domain-card p{color:#6c6c6d;margin-bottom:0}.domain-card:hover h4{color:#ee4c2c}code,kbd,pre,samp,code b{font-family:IBMPlexMono,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace}code span,kbd span,pre span,samp span,code b span{font-family:IBMPlexMono,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace}pre{padding:1.125rem;background-color:#f3f4f7}pre code{font-size:.875rem}pre.highlight{background-color:#f3f4f7;line-height:1.3125rem}code.highlighter-rouge{color:#6c6c6d;background-color:#f3f4f7;padding:2px 6px}a:link code.highlighter-rouge,a:visited code.highlighter-rouge,a:hover code.highlighter-rouge{color:#4974D1}a:link.has-code,a:visited.has-code,a:hover.has-code{color:#4974D1}p code,h1 code,h2 code,h3 code,h4 code,h5 code,h6 code{font-size:78.5%}.header-holder{height:68px;align-items:center;display:flex;left:0;margin-left:auto;margin-right:auto;position:fixed;right:0;top:0;width:100%;z-index:9999}@media screen and (min-width: 1200px){.header-holder{height:70px}}@media screen and (min-width: 1200px){.header-holder{top:32px}}.header-holder.blog-header,.header-holder.blog-detail-header,.header-holder.resources-header,.header-holder.get-started-header,.header-holder.features-header,.header-holder.comm-stories-header,.header-holder.ecosystem-header,.header-holder.announcement-header,.header-holder.hub-header,.header-holder.mobile-header{background-color:#fff;border-bottom:1px solid #e2e2e2}.hello-bar{display:none}@media screen and (min-width: 1200px){.hello-bar{background-color:#CC2F90;color:#fff;display:flex;letter-spacing:.34px;justify-content:center;padding:4px 0;position:fixed;top:0;text-align:center;z-index:9999;margin-left:auto;margin-right:auto;width:100%}.hello-bar 
a{color:#fff;text-decoration:underline}}.header-container{position:relative;display:flex;align-items:center}.header-container:before,.header-container:after{content:"";display:table}.header-container:after{clear:both}.header-container{*zoom:1}@media screen and (min-width: 1200px){.header-container{display:block}}.header-logo{height:23px;width:93px;background-image:url("/assets/images/logo.svg");background-repeat:no-repeat;background-size:93px 23px;display:block;float:left}@media screen and (min-width: 1200px){.header-logo{background-size:108px 27px;position:absolute;height:27px;width:108px;top:4px;float:none}}.main-menu-open-button{background-image:url("/assets/images/icon-menu-dots.svg");background-position:center center;background-size:25px 7px;background-repeat:no-repeat;width:25px;height:7px;position:absolute;right:0;top:4px}@media screen and (min-width: 1200px){.main-menu-open-button{display:none}}.header-holder .main-menu{display:none}@media screen and (min-width: 1200px){.header-holder .main-menu{display:flex;align-items:center;justify-content:flex-end}}.header-holder .main-menu ul{display:flex;align-items:center;margin:0}.header-holder .main-menu ul li{display:inline-block;margin-right:34px;position:relative}.header-holder .main-menu ul li.active:after{content:"•";bottom:-24px;color:#ee4c2c;font-size:1.375rem;left:0;position:absolute;right:0;text-align:center}.header-holder .main-menu ul li.active a{color:#ee4c2c}.header-holder .main-menu ul li.active .with-down-arrow{background-image:url("/assets/images/chevron-down-orange.svg")}.header-holder .main-menu ul li.resources-active:after{left:-27px}.header-holder .main-menu ul li:last-of-type{margin-right:0}.header-holder .main-menu ul li a{color:#fff;font-size:1.2rem;letter-spacing:0;line-height:2.125rem;text-align:center;text-decoration:none;padding-bottom:10px}@media screen and (min-width: 1200px){.header-holder .main-menu ul li a:hover{color:#ffffff;border-bottom:2px solid #ffffff}}.header-holder .main-menu ul li a.with-down-arrow{cursor:default;padding-right:2rem;position:relative;background-image:url("/assets/images/chevron-down-white.svg");background-size:14px 18px;background-position:top 7px right 10px;background-repeat:no-repeat;padding-bottom:20px}.header-holder .main-menu ul li a.with-down-arrow:hover{border-bottom:none}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu{border-radius:0;padding:0}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu .dropdown-item{color:#6c6c6d;border-bottom:1px solid #e2e2e2}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu .dropdown-item:last-of-type{border-bottom-color:transparent}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu .dropdown-item:hover{background-color:#ee4c2c}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu .dropdown-item p{font-size:1rem;color:#757575}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu a.dropdown-item:hover{color:#fff}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu a.dropdown-item:hover p{color:#fff}.mobile-main-menu{display:none}.mobile-main-menu.open{background-color:#262626;display:block;height:100%;left:0;margin-left:auto;margin-right:auto;min-height:100%;position:fixed;right:0;top:0;width:100%;z-index:99999}.mobile-main-menu .container-fluid{background-color:inherit;align-items:center;display:flex;height:68px;position:relative;z-index:1}.mobile-main-menu .container-fluid:before,.mobile-main-menu 
.container-fluid:after{content:"";display:table}.mobile-main-menu .container-fluid:after{clear:both}.mobile-main-menu .container-fluid{*zoom:1}.mobile-main-menu.open ul{list-style-type:none;padding:0}.mobile-main-menu.open ul li a,.mobile-main-menu.open .resources-mobile-menu-title{font-size:2rem;color:#fff;letter-spacing:0;line-height:4rem}.mobile-main-menu.open ul li.active a{color:#ee4c2c}.main-menu-close-button{background-image:url("/assets/images/icon-close.svg");background-position:center center;background-repeat:no-repeat;background-size:24px 24px;height:24px;position:absolute;right:0;width:24px;top:-4px}.mobile-main-menu-header-container{position:relative}.mobile-main-menu-links-container{display:flex;padding-left:2.8125rem;height:100%;min-height:100%;margin-top:20px;overflow-y:scroll}@media only screen and (max-width: 320px){.mobile-main-menu-links-container .main-menu{padding-top:5rem}}@media only screen and (max-width: 320px){.mobile-main-menu-links-container .navSearchWrapper{width:75%}}#topnav-gh-icon{background-image:url(/assets/social/github-white.svg);color:white;width:33px;height:33px;background-size:23px 23px;background-repeat:no-repeat;background-position:5px 4px;border-radius:25px}#topnav-gh-icon:hover{background-color:#88888833}.blog-header .header-logo,.blog-detail-header .header-logo,.resources-header .header-logo,.get-started-header .header-logo,.features-header .header-logo,.ecosystem-header .header-logo,.announcement-header .header-logo,.comm-stories-header .header-logo,.hub-header .header-logo,.mobile-header .header-logo{background-image:url("/assets/images/logo-dark.svg")}.blog-header .main-menu ul li a,.blog-detail-header .main-menu ul li a,.resources-header .main-menu ul li a,.get-started-header .main-menu ul li a,.features-header .main-menu ul li a,.ecosystem-header .main-menu ul li a,.announcement-header .main-menu ul li a,.comm-stories-header .main-menu ul li a,.hub-header .main-menu ul li a,.mobile-header .main-menu ul li a{color:#262626}@media screen and (min-width: 1200px){.blog-header .main-menu ul li a:hover,.blog-detail-header .main-menu ul li a:hover,.resources-header .main-menu ul li a:hover,.get-started-header .main-menu ul li a:hover,.features-header .main-menu ul li a:hover,.ecosystem-header .main-menu ul li a:hover,.announcement-header .main-menu ul li a:hover,.comm-stories-header .main-menu ul li a:hover,.hub-header .main-menu ul li a:hover,.mobile-header .main-menu ul li a:hover{color:#262626;border-bottom:2px solid #262626}}.blog-header .main-menu ul li a.with-down-arrow,.blog-detail-header .main-menu ul li a.with-down-arrow,.resources-header .main-menu ul li a.with-down-arrow,.get-started-header .main-menu ul li a.with-down-arrow,.features-header .main-menu ul li a.with-down-arrow,.ecosystem-header .main-menu ul li a.with-down-arrow,.announcement-header .main-menu ul li a.with-down-arrow,.comm-stories-header .main-menu ul li a.with-down-arrow,.hub-header .main-menu ul li a.with-down-arrow,.mobile-header .main-menu ul li a.with-down-arrow{background-image:url("/assets/images/chevron-down-black.svg")}.blog-header .main-menu-open-button,.blog-detail-header .main-menu-open-button,.resources-header .main-menu-open-button,.get-started-header .main-menu-open-button,.features-header .main-menu-open-button,.ecosystem-header .main-menu-open-button,.announcement-header .main-menu-open-button,.comm-stories-header .main-menu-open-button,.hub-header .main-menu-open-button,.mobile-header 
.main-menu-open-button{background-image:url("/assets/images/icon-menu-dots-dark.svg")}.blog-header #topnav-gh-icon,.blog-detail-header #topnav-gh-icon,.resources-header #topnav-gh-icon,.get-started-header #topnav-gh-icon,.features-header #topnav-gh-icon,.ecosystem-header #topnav-gh-icon,.announcement-header #topnav-gh-icon,.comm-stories-header #topnav-gh-icon,.hub-header #topnav-gh-icon,.mobile-header #topnav-gh-icon{background-image:url(/assets/social/github-black.svg)}.ecosystem-dropdown-menu,.resources-dropdown-menu{left:-25px;width:300px;display:none;position:absolute;z-index:1000;display:none;top:45px;float:left;min-width:10rem;padding:0.5rem 0;font-size:1rem;color:#212529;text-align:left;list-style:none;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,0.15);border-radius:0.25rem}.ecosystem-dropdown:hover .ecosystem-dropdown-menu,.ecosystem-dropdown:hover .resources-dropdown-menu,.resources-dropdown:hover .ecosystem-dropdown-menu,.resources-dropdown:hover .resources-dropdown-menu,.resources-active:hover .ecosystem-dropdown-menu,.resources-active:hover .resources-dropdown-menu{display:block}.main-menu ul li .ecosystem-dropdown-menu,.main-menu ul li .resources-dropdown-menu{border-radius:0;padding:0}.main-menu ul li .ecosystem-dropdown-menu .dropdown-item,.main-menu ul li .resources-dropdown-menu .dropdown-item{color:#6c6c6d;border-bottom:1px solid #e2e2e2}.header-holder .main-menu ul li a.nav-dropdown-item{display:block;font-size:1rem;line-height:1.3125rem;width:100%;padding:0.25rem 1.5rem;clear:both;font-weight:400;color:#757575;text-align:left;background-color:transparent;border-bottom:1px solid #e2e2e2}.header-holder .main-menu ul li a.nav-dropdown-item p{margin-bottom:.5rem}.header-holder .main-menu ul li a.nav-dropdown-item:last-of-type{border-bottom-color:transparent}.header-holder .main-menu ul li a.nav-dropdown-item:hover{background-color:#ee4c2c;color:white}.header-holder .main-menu ul li a.nav-dropdown-item .dropdown-title{font-size:1.125rem;color:#212529;letter-spacing:0;line-height:34px}.header-holder .main-menu ul li a.nav-dropdown-item .docs-title{display:block;padding-top:0.5rem}.header-holder .main-menu ul li a.nav-dropdown-item:hover .dropdown-title{background-color:#ee4c2c;color:white}.mobile-main-menu-links-container ul.resources-mobile-menu-items li{padding-left:15px}.mobile-main-menu-links-container ul.resources-mobile-menu-items li a{font-size:1.5rem;line-height:3rem}.jumbotron{background-color:transparent;position:absolute;left:0;right:0;margin-right:auto;margin-left:auto;padding:0;margin-bottom:0;display:flex;align-items:center;top:68px}@media screen and (min-width: 768px){.jumbotron{height:550px;top:90px}}.jumbotron .jumbotron-content{display:flex;align-items:center}.jumbotron .lead{font-weight:400;letter-spacing:0.25px;font-size:20px;line-height:1.2}@media screen and (min-width: 768px){.jumbotron .lead{font-size:29px}}.jumbotron h1{font-size:2rem;text-transform:uppercase;font-weight:lighter;letter-spacing:1.08px;margin-bottom:.625rem;line-height:1.05;margin-top:4rem}@media screen and (min-width: 768px){.jumbotron h1{font-size:3.875rem;margin-top:0}}.jumbotron h1 img{margin-bottom:1rem}.jumbotron p{font-size:1.125rem;margin-bottom:1.25rem}@media screen and (min-width: 1200px){.jumbotron p{width:50%}}.jumbotron.on-dark-background h1,.jumbotron.on-dark-background p{color:#fff}.jumbotron .btn{padding-top:.5625rem}@media screen and (min-width: 768px){.jumbotron .btn{margin-top:.625rem}}.homepage 
.main-content-wrapper{margin-top:315px}@media screen and (min-width: 768px){.homepage .main-content-wrapper{margin-top:472px}}.homepage h2{margin-bottom:1.5625rem;text-transform:uppercase;letter-spacing:1.78px;line-height:2.5rem}@media screen and (min-width: 768px){.homepage h2{margin-bottom:2.0625rem}}.homepage h3{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;text-transform:uppercase;margin-bottom:1.25rem}.homepage h5{margin-bottom:.5rem}@media screen and (min-width: 768px){.homepage h5{margin-bottom:.9375rem}}.homepage .jumbotron{height:195px}@media screen and (min-width: 768px){.homepage .jumbotron{height:395px}}.homepage .jumbotron .btn{margin-top:.375rem}.homepage .ecosystem-row .card{background-color:#f3f4f7}.homepage .homepage-header{background-color:rgba(0,0,0,0.165)}.homepage-feature-module{padding-top:2.5rem;padding-bottom:2.5rem}@media screen and (min-width: 768px){.homepage-feature-module{padding-top:3.875rem;padding-bottom:4.5rem}.homepage-feature-module .module-button{position:absolute;right:15px;top:0}}.homepage-feature-module p{color:#6c6c6d;font-size:1.125em}.homepage-feature-module .title{color:#000;font-weight:300;font-size:1.5rem}@media (min-width: 768px) and (max-width: 1239px){.homepage-feature-module .title{font-size:1.25rem}}.homepage-feature-module .pytorch-title{font-size:1.5rem;letter-spacing:0.33px;line-height:2.25rem}.homepage-feature-module .subtext{font-size:1.125rem;color:#8c8c8c;letter-spacing:0;line-height:1.5rem}@media (min-width: 768px) and (max-width: 1239px){.homepage-feature-module .subtext{font-size:.9375rem}}.key-features-module{padding-bottom:0}@media screen and (min-width: 768px){.key-features-module{padding-bottom:1.55rem}}.key-features-module .key-features-boxes{margin-top:2rem}@media screen and (min-width: 768px){.key-features-module .key-features-boxes{margin-top:0}}.key-features-module .key-feature-box{margin-bottom:2rem}.key-features-module .key-feature-box p{margin-bottom:0;letter-spacing:0.25px}@media screen and (min-width: 768px){.key-features-module .key-feature-box{margin-bottom:2.5rem}}.community-heading{margin-top:2rem}.community-module{background-color:#fff}.community-module .ecosystem-card{height:auto}@media (min-width: 768px) and (max-width: 1239px){.community-module .ecosystem-card{padding:.625rem}}.community-module h2{margin-bottom:0}.community-module h5{text-transform:uppercase;color:#c6000a;margin-bottom:1.25rem}.community-module .h2-subheadline{margin-top:1.25rem;margin-bottom:2.6rem}@media screen and (min-width: 768px){.community-module .h2-subheadline{margin-top:0}}@media (min-width: 768px) and (max-width: 1239px){.community-module .card-body{padding:.625rem}}.community-module .module-button{background-color:#f3f4f7}.community-module p{margin-bottom:2.5rem;letter-spacing:0.25px}.community-module .module-subtext{margin-right:15.625rem}.community-module .email-subscribe-form input.email{border-bottom:1px solid #d6d7d8;font-size:1.25rem;line-height:0;padding-bottom:.75rem}.community-module .email-subscribe-form input[type="submit"]{top:6px}@media screen and (min-width: 768px){.community-module .email-subscribe-form input[type="submit"]{top:10px}}.pytorch-users-module,.homepage-bottom-wrapper{background-color:#f3f4f7}@media screen and (min-width: 768px){.pytorch-users-module{padding-bottom:1.9rem}}.community-avatar{height:60px;width:60px}.community-logo-bottom{height:200px;background-color:#f3f4f7}.university-testimonials 
h2{margin-bottom:2.2rem}.university-testimonials-content{margin-top:2.5rem;margin-bottom:2rem}@media screen and (min-width: 768px){.university-testimonials-content{margin-top:0}}.university-testimonials-content .col-md-4{margin-bottom:2.5rem}.university-testimonials-content .case-study-title{font-size:1.5rem;margin-bottom:1.25rem}.university-testimonials-content p{color:#6c6c6d;font-size:1.125rem;letter-spacing:0.25px}.university-testimonials-content .btn{background-color:#fff}.follow-us-on-twitter h2{margin-bottom:1.25rem}@media screen and (min-width: 768px){.follow-us-on-twitter h2{margin-bottom:2.5rem}}.homepage-feature-module .tweets-wrapper p{font-size:1rem}.quick-starts p{font-size:1.125rem;line-height:1.75rem}.quick-start-guides{font-size:1.5rem;letter-spacing:0.25px;line-height:2.25rem;color:#a5a5a5}.quick-start-guides .step-counter{margin-bottom:.1875rem}.quick-start-guides ul{list-style-type:none;padding-left:0}.quick-start-guides ul li{margin-bottom:0;font-size:1.125rem}@media screen and (min-width: 768px){.quick-start-guides ul li{margin-bottom:.75rem}.quick-start-guides ul li:last-of-type{margin-bottom:0}}.quick-start-guides ul li.selected{color:#ee4c2c}.quick-start-guides ul li.selected:before{content:"\2022";position:absolute;left:0}@media screen and (min-width: 768px){.quick-start-guides ul li.selected:before{left:-5px}}.quick-start-guides .select-instructions{color:#262626;border-bottom:2px solid #a5a5a5;margin-bottom:1rem;font-size:1.125rem;display:inline-block}@media screen and (min-width: 768px){.quick-start-guides .select-instructions{margin-bottom:0}}.homepage .news-banner-container{background:#000;color:#fff;text-align:center;padding:20px;width:90%}.homepage .news-banner-container .right-arrow,.homepage .news-banner-container .left-arrow{height:15px;bottom:-3px;position:relative}@media screen and (min-width: 768px){.homepage .news-banner-container .right-arrow,.homepage .news-banner-container .left-arrow{bottom:-8px}}.homepage .news-banner-container .right-arrow:hover,.homepage .news-banner-container .left-arrow:hover{cursor:pointer}.homepage .news-banner-container .right-arrow{float:right}.homepage .news-banner-container .left-arrow{float:left}.homepage #news-items .pagination{display:none !important}.banner-info{display:inline-block;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;margin:auto;width:80%;font-size:1.125rem}@media screen and (min-width: 768px){.banner-info{padding-top:3px}}.banner-info:hover{cursor:pointer;color:#ee4c2c}.news-banner-text a{color:white}.news-banner-text a:hover{color:#ee4c2c}.no-banner{padding-bottom:2rem}.homepage-box-module div.col-md{background:#F3F4F7;margin:10px;padding:30px}@media screen and (min-width: 768px){.homepage-box-module div.col-md{margin:20px}}.site-footer{padding:3.75rem 0;width:100%;background:#000;background-size:100%;margin-left:0;margin-right:0}@media screen and (min-width: 768px){.site-footer{position:absolute;left:0;bottom:0;height:620px}}.site-footer p{color:#fff}.site-footer ul{list-style-type:none;padding-left:0;margin-bottom:0}.site-footer ul li{font-size:1.125rem;line-height:2rem;color:#A0A0A1;padding-bottom:.375rem}.site-footer ul li.list-title{padding-bottom:.75rem;color:#fff}.site-footer ul li.list-title p{margin-bottom:0}.site-footer a:link,.site-footer a:visited{color:inherit}@media screen and (min-width: 768px){.site-footer a:hover{color:#ee4c2c}}.site-footer .privacy-policy{background:#000000;border-top:1px solid #fff;display:flex;flex-direction:column;margin-top:40px}.site-footer 
.privacy-policy ul{border-bottom:1px solid white}.site-footer .privacy-policy ul .privacy-policy-links{padding-bottom:1rem;padding-top:1rem;padding-right:1rem;display:inline-flex;color:white}.site-footer .privacy-policy .copyright{padding-top:1rem}.site-footer .privacy-policy .copyright p{color:#dfdfdf;font-size:14px}.site-footer .privacy-policy .copyright a{color:#dfdfdf;font-weight:600}.site-footer .privacy-policy .copyright a:hover{color:#dfdfdf;font-weight:600}.docs-tutorials-resources{background-color:#262626;color:#fff;padding-top:2.5rem;padding-bottom:2.5rem}@media screen and (min-width: 768px){.docs-tutorials-resources{padding-top:4.125rem;padding-bottom:4.09rem}}.docs-tutorials-resources h2{font-size:1.5rem;letter-spacing:-0.25px;text-transform:none;margin-bottom:0.25rem}@media screen and (min-width: 768px){.docs-tutorials-resources h2{margin-bottom:1.25rem}}.docs-tutorials-resources .col-md-4{margin-bottom:2rem}@media screen and (min-width: 768px){.docs-tutorials-resources .col-md-4{margin-bottom:0}}.docs-tutorials-resources .with-right-arrow{margin-left:12px;background-position:top 3px right 11px}@media screen and (min-width: 768px){.docs-tutorials-resources .with-right-arrow{background-position:top 6px right 11px}}.docs-tutorials-resources .with-right-arrow:hover{background-image:url("/assets/images/chevron-right-white.svg")}.docs-tutorials-resources p{font-size:1rem;line-height:1.5rem;letter-spacing:0.22px;color:#A0A0A1;margin-bottom:.5rem}@media screen and (min-width: 768px){.docs-tutorials-resources p{margin-bottom:1.25rem}}.docs-tutorials-resources a{font-size:1.125rem;color:#ee4c2c}.docs-tutorials-resources a:hover{color:#fff}.footer-container{position:relative}.footer-logo-wrapper{display:none}@media screen and (min-width: 768px){.footer-logo-wrapper{display:flex;grid-column:span 6}}.footer-logo-wrapper .footer-logo img{width:40px}.footer-links-wrapper{display:flex;flex-wrap:wrap;padding-bottom:1rem;border-bottom:1px solid white}@media screen and (min-width: 768px){.footer-links-wrapper{flex-wrap:initial;justify-content:flex-end}}.footer-links-col{margin-bottom:3.75rem;width:50%}@media screen and (min-width: 768px){.footer-links-col{margin-bottom:0;width:14%;margin-right:23px}.footer-links-col.follow-us-col{width:18%;margin-right:0}}@media (min-width: 768px) and (max-width: 1239px){.footer-links-col{width:18%;margin-right:30px}}.footer-social-icons{margin:8.5625rem 0 2.5rem 0}.footer-social-icons a{height:32px;width:32px;display:inline-block;background-color:#CCCDD1;border-radius:50%;margin-right:5px}.footer-social-icons a.facebook{background-image:url("/assets/images/logo-facebook-dark.svg");background-position:center center;background-size:9px 18px;background-repeat:no-repeat}.footer-social-icons a.twitter{background-image:url("/assets/images/logo-twitter-dark.svg");background-position:center center;background-size:17px 17px;background-repeat:no-repeat}.footer-social-icons a.youtube{background-image:url("/assets/images/logo-youtube-dark.svg");background-position:center center;background-repeat:no-repeat}.site-footer .mc-field-group{margin-top:-2px}.site-footer .email-subscribe-form input[type="submit"]{top:9px}@media screen and (min-width: 768px){.site-footer .email-subscribe-form input[type="submit"]{top:13px}}.social-links{grid-column:span 12;display:grid;grid-column-gap:3%;grid-row-gap:30px;grid-template-columns:repeat(6, minmax(0, 1fr))}@media (min-width: 600px){.social-links{grid-column:span 8}}@media screen and (min-width: 768px){.social-links{grid-column:span 
6;align-self:end}}@media (max-width: 999px){.social-links{margin-left:10px;margin-right:10px}}.social-links li{text-align:center}.social-links svg{height:25px;max-width:30px;fill:#fff;color:#fff}.social-links svg:hover{fill:#ee4c2c;color:#ee4c2c}.lf-grid{grid-column-gap:3%;grid-row-gap:30px;display:grid;grid-template-columns:repeat(12, 1fr)}.hs-recaptcha{display:none}.newsletter{line-height:140%;margin-bottom:80px}.newsletter__title{line-height:140%;font-size:24px}@media (min-width: 1000px){.newsletter__title{font-size:40px}}.newsletter .legal-consent-container{display:none}.newsletter p.newsletter__privacy{max-width:860px;margin-top:30px;line-height:21px;font-size:14px;color:#dfdfdf}.newsletter p.newsletter__privacy a{color:#dfdfdf;font-weight:600}.newsletter p.newsletter__privacy a:hover{color:#dfdfdf;font-weight:600}.newsletter .hbspt-form{min-height:300px}@media (min-width: 500px){.newsletter .hbspt-form{min-height:100px}}@media (min-width: 1000px){.newsletter .hbspt-form{min-height:20px}}.newsletter .hbspt-form .hs-error-msg{display:block;margin-right:8px;color:#ee4c2c;font-size:14px;line-height:1.1em;width:95%;padding-top:15px}.newsletter .hbspt-form .hs-form{display:grid;grid-template-columns:1fr;grid-gap:30px}@media (min-width: 500px){.newsletter .hbspt-form .hs-form{grid-template-columns:minmax(0, 1fr) minmax(0, 1fr)}}@media (min-width: 700px){.newsletter .hbspt-form .hs-form{grid-template-columns:repeat(3, minmax(0, 1fr))}}@media (min-width: 950px){.newsletter .hbspt-form .hs-form{grid-template-columns:1fr 1fr 1fr 1fr 1fr;grid-row-gap:1.5rem;grid-column-gap:1.5rem}}.newsletter .hbspt-form .hs-form input[type='text'],.newsletter .hbspt-form .hs-form input[type='email']{height:50px;width:100%;background:transparent;border:none;border-bottom:2px solid #fff;border-radius:0;transition:all 0.25s ease;color:#fff;font-size:16px;line-height:105%}@media (min-width: 500px){.newsletter .hbspt-form .hs-form input[type='text'],.newsletter .hbspt-form .hs-form input[type='email']{height:42px}}@media (min-width: 500px){.newsletter .hbspt-form .hs-form input[type='text'],.newsletter .hbspt-form .hs-form input[type='email']{font-size:20px}}.newsletter .hbspt-form .hs-form input[type='text']::-moz-placeholder, .newsletter .hbspt-form .hs-form input[type='email']::-moz-placeholder{color:#fff;font-size:16px;line-height:105%}.newsletter .hbspt-form .hs-form input[type='text']:-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:-ms-input-placeholder{color:#fff;font-size:16px;line-height:105%}.newsletter .hbspt-form .hs-form input[type='text']::-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']::-ms-input-placeholder{color:#fff;font-size:16px;line-height:105%}.newsletter .hbspt-form .hs-form input[type='text']::placeholder,.newsletter .hbspt-form .hs-form input[type='email']::placeholder{color:#fff;font-size:16px;line-height:105%}@media (min-width: 500px){.newsletter .hbspt-form .hs-form input[type='text']::-moz-placeholder, .newsletter .hbspt-form .hs-form input[type='email']::-moz-placeholder{font-size:20px}.newsletter .hbspt-form .hs-form input[type='text']:-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:-ms-input-placeholder{font-size:20px}.newsletter .hbspt-form .hs-form input[type='text']::-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']::-ms-input-placeholder{font-size:20px}.newsletter .hbspt-form .hs-form input[type='text']::placeholder,.newsletter .hbspt-form .hs-form 
input[type='email']::placeholder{font-size:20px}}.newsletter .hbspt-form .hs-form input[type='text']:focus,.newsletter .hbspt-form .hs-form input[type='email']:focus{outline:0;border-bottom:2px solid #ee4c2c;transition:color 0.25s ease}.newsletter .hbspt-form .hs-form input[type='text']:focus::-moz-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:focus::-moz-placeholder{-moz-transition:color 0.25s ease;transition:color 0.25s ease;color:transparent}.newsletter .hbspt-form .hs-form input[type='text']:focus:-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:focus:-ms-input-placeholder{-ms-transition:color 0.25s ease;transition:color 0.25s ease;color:transparent}.newsletter .hbspt-form .hs-form input[type='text']:focus::-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:focus::-ms-input-placeholder{-ms-transition:color 0.25s ease;transition:color 0.25s ease;color:transparent}.newsletter .hbspt-form .hs-form input[type='text']:focus::placeholder,.newsletter .hbspt-form .hs-form input[type='email']:focus::placeholder{transition:color 0.25s ease;color:transparent}.newsletter .hbspt-form .hs-form input:-webkit-autofill,.newsletter .hbspt-form .hs-form input:-webkit-autofill:hover,.newsletter .hbspt-form .hs-form input:-webkit-autofill:focus,.newsletter .hbspt-form .hs-form textarea:-webkit-autofill,.newsletter .hbspt-form .hs-form textarea:-webkit-autofill:hover,.newsletter .hbspt-form .hs-form textarea:-webkit-autofill:focus,.newsletter .hbspt-form .hs-form select:-webkit-autofill,.newsletter .hbspt-form .hs-form select:-webkit-autofill:hover,.newsletter .hbspt-form .hs-form select:-webkit-autofill:focus{-webkit-text-fill-color:#fff}.newsletter .hbspt-form .hs-form select{-webkit-appearance:none;-moz-appearance:none;appearance:none;background:transparent;border:0px solid transparent;border-bottom:2px solid #fff;border-radius:0;box-shadow:0 1px 0 1px transparent;display:block;height:50px;margin:0;max-width:100%;padding:0.25em 0 calc(0.25em + 1px) 5px;transition:all 0.25s ease;width:100%;color:#fff;font-size:16px;line-height:105%}@media (min-width: 500px){.newsletter .hbspt-form .hs-form select{height:42px}}@media (min-width: 500px){.newsletter .hbspt-form .hs-form select{font-size:20px}}.newsletter .hbspt-form .hs-form select::-ms-expand{display:none}.newsletter .hbspt-form .hs-form select:focus{outline:0;border-bottom:2px solid #ee4c2c}.newsletter .hbspt-form .hs-form select:focus::-moz-placeholder{-moz-transition:color 0.4s ease;transition:color 0.4s ease;color:transparent}.newsletter .hbspt-form .hs-form select:focus:-ms-input-placeholder{-ms-transition:color 0.4s ease;transition:color 0.4s ease;color:transparent}.newsletter .hbspt-form .hs-form select:focus::-ms-input-placeholder{-ms-transition:color 0.4s ease;transition:color 0.4s ease;color:transparent}.newsletter .hbspt-form .hs-form select:focus::placeholder{transition:color 0.4s ease;color:transparent}.newsletter .hbspt-form .hs-form select option{font-weight:normal;color:black}.newsletter .hbspt-form .hs-form .hs-button{border-radius:5px;margin-top:20px;border:none;background-color:#ee4c2c;color:#fff;font-weight:400;padding:11px 40px;font-size:16px;font-weight:700;text-decoration:none}.newsletter .hbspt-form .hs-form .hs-input.invalid{border-bottom:2px dashed red !important}.newsletter .hbspt-form .hs-form .hs_error_rollup{display:none}.newsletter .submitted-message{display:flex;align-content:center;align-items:center;justify-content:center;border:2px solid 
#fff;min-height:280px;font-size:18px;padding:20px 20px 0;line-height:1.1em}@media (min-width: 500px){.newsletter .submitted-message{min-height:80px}}@media (min-width: 1000px){.newsletter .submitted-message{min-height:unset}}.newsletter .submitted-message p{max-width:none}.main-content-wrapper{margin-top:300px}@media screen and (min-width: 768px){.main-content-wrapper{margin-top:540px;min-height:400px}}.main-content{padding-top:1.5rem;padding-bottom:1.5rem}@media screen and (min-width: 768px){.main-content{padding-top:2.625rem}}.main-content-menu{margin-bottom:1.25rem}@media screen and (min-width: 768px){.main-content-menu{margin-bottom:5rem}}.main-content-menu .navbar-nav .nav-link{color:#262626;padding-left:1.875rem;padding-right:1.875rem}@media screen and (min-width: 768px){.main-content-menu .navbar-nav .nav-link:first-of-type{padding-left:0}}article.pytorch-article{max-width:920px;margin:0 auto;padding-bottom:90px}article.pytorch-article h2,article.pytorch-article h3,article.pytorch-article h4,article.pytorch-article h5,article.pytorch-article h6{margin-top:1.875rem;margin-bottom:1.5rem;color:#262626}article.pytorch-article h2{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;margin-top:3.125rem;text-transform:uppercase}article.pytorch-article h3{font-size:1.5rem;letter-spacing:-0.25px;line-height:1.875rem;text-transform:none}article.pytorch-article h4,article.pytorch-article h5,article.pytorch-article h6{font-size:1.125rem;letter-spacing:-0.19px;line-height:1.875rem}article.pytorch-article p{margin-bottom:1.125rem}article.pytorch-article p,article.pytorch-article ul li,article.pytorch-article ol li,article.pytorch-article dl dt,article.pytorch-article dl dd,article.pytorch-article blockquote{font-size:1.125rem;line-height:1.875rem;color:#6c6c6d}article.pytorch-article table{margin-bottom:2.5rem;width:100%}article.pytorch-article table thead{border-bottom:1px solid #cacaca}article.pytorch-article table th,article.pytorch-article table tr,article.pytorch-article table td{color:#6c6c6d;font-size:1rem;letter-spacing:-0.17px}article.pytorch-article table th{padding:.625rem;color:#262626}article.pytorch-article table td{padding:.3125rem}article.pytorch-article ul,article.pytorch-article ol{margin:1.5rem 0 3.125rem 0}@media screen and (min-width: 768px){article.pytorch-article ul,article.pytorch-article ol{padding-left:6.25rem}}article.pytorch-article ul li,article.pytorch-article ol li{margin-bottom:.625rem}article.pytorch-article dl{margin-bottom:2.5rem}article.pytorch-article dl dt{margin-bottom:.75rem;font-weight:400}article.pytorch-article pre{margin-bottom:2.5rem}article.pytorch-article hr{margin-top:4.6875rem;margin-bottom:4.6875rem}article.pytorch-article blockquote{font-size:.75rem;font-style:italic;padding:15px 15px 5px 15px;width:100%;background-color:rgba(211,211,211,0.3);border-left:2px solid #000000}article.pytorch-article h3.no_toc{margin:0px}article.pytorch-article nav{float:right;display:block;overflow-y:auto;background-color:white;margin-left:20px;border-left:1px #717171}article.pytorch-article nav li{font-size:12px;line-height:20px;padding-top:0px;list-style:none}article.pytorch-article nav a{color:#717171;font-weight:bold}article.pytorch-article ul#markdown-toc{padding-left:1em;margin:0px}article.pytorch-article ul#markdown-toc ul{margin:0px;padding-left:1em}article.pytorch-article ul#markdown-toc li{margin:0px}.get-started article{margin-bottom:5rem}.get-started .quick-start-guides ul{margin-bottom:0;padding-left:0}.get-started 
.main-content-wrapper{margin-top:275px}@media screen and (min-width: 768px){.get-started .main-content-wrapper{margin-top:350px}}.get-started .jumbotron{height:190px}@media screen and (min-width: 768px){.get-started .jumbotron{height:260px}}.get-started .main-content .navbar{background-color:#f3f4f7;padding-left:0;padding-bottom:0;padding-top:0}@media (min-width: 992px){.get-started .main-content .navbar li:first-of-type{padding-left:3.4375rem}.get-started .main-content .navbar .nav-item{padding:1rem;cursor:pointer}.get-started .main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.get-started .main-content .navbar .nav-select{background-color:#fff}.get-started .main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.get-started .main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}.get-started .main-content .navbar .nav-link:hover{color:#ee4c2c}.get-started .main-content .navbar .get-started-nav-link{padding-left:1.25rem;padding-right:1.25rem}@media screen and (min-width: 768px){.get-started .main-content .navbar .get-started-nav-link{padding-left:1.875rem;padding-right:1.875rem}}.get-started .main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.get-started .main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.get-started .main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.get-started .main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.get-started .main-content .navbar .navbar-toggler{margin-left:2.5rem}.get-started .main-content{padding-top:0}@media screen and (min-width: 768px){.get-started .main-content{padding-top:1.9rem}}.get-started .quick-start-module{padding-bottom:0;padding-top:0;background-color:#fff}.get-started .quick-start-module .option,.get-started .quick-start-module #command{border:2px solid #fff;background:#f3f4f7}.get-started .quick-start-module .title-block{border:2px solid #fff}.get-started .quick-start-module .selected{background-color:#ee4c2c}.get-started .quick-start-module h1{font-size:2rem;letter-spacing:1.78px;line-height:2.5rem;text-transform:uppercase;margin-bottom:1.5rem}.get-started .nav-menu-wrapper{background-color:#f3f4f7}.get-started .nav-menu-wrapper .container{padding-left:0;padding-right:0}@media screen and (min-width: 768px){.get-started .nav-menu-wrapper .container{padding-left:30px;padding-right:30px}}.get-started .navbar-nav{flex-direction:row}#installation .os{display:none}#installation .selected{display:block}#cloud .platform{display:none}#cloud .selected{display:block}.screencast{display:none}.screencast iframe{width:100% !important}.get-started .quick-starts .row.ptbuild,.get-started .quick-starts .row.os,.get-started .quick-starts .row.package,.get-started .quick-starts .row.language,.get-started .quick-starts .row.cuda{margin-bottom:1.25rem}@media screen and (min-width: 768px){.get-started .quick-starts .row.ptbuild,.get-started .quick-starts .row.os,.get-started .quick-starts .row.package,.get-started .quick-starts .row.language,.get-started .quick-starts .row.cuda{margin-bottom:0}}@media (min-width: 768px) and (max-width: 1239px){.get-started .quick-starts{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 768px){.get-started .quick-starts{margin-bottom:2.5rem}.get-started .quick-starts .row{margin-bottom:0}}@media screen and (min-width: 1240px){.get-started 
.quick-starts{margin-bottom:0}}.get-started .get-started-locally-sidebar{padding-top:2.5rem;padding-bottom:2.5rem;top:15%;z-index:385}@media screen and (min-width: 768px){.get-started .get-started-locally-sidebar{padding-top:0;max-height:100vh;overflow:auto}}.get-started .get-started-locally-sidebar ul{padding-left:0}.get-started .get-started-locally-sidebar li{list-style-type:none;line-height:36px}.get-started .get-started-locally-sidebar li a{color:#8c8c8c}.get-started .get-started-locally-sidebar li a.active,.get-started .get-started-locally-sidebar li a:hover{color:#ee4c2c}.get-started .get-started-locally-sidebar li .subitem{padding-left:1.25rem}.get-started .get-started-locally-sidebar li.subitem{padding-left:1.25rem}.cloud-nav{display:none}.get-started .get-started-cloud-sidebar{padding-top:3.125rem;padding-bottom:2.5rem;top:15%}.get-started .get-started-cloud-sidebar ul{padding-left:0}.get-started .get-started-cloud-sidebar li{list-style-type:none;line-height:36px}.get-started .get-started-cloud-sidebar li a{color:#8c8c8c}.get-started .get-started-cloud-sidebar li a.active,.get-started .get-started-cloud-sidebar li a:hover{color:#ee4c2c}.get-started .get-started-cloud-sidebar li .subitem{padding-left:1.25rem}.get-started .get-started-cloud-sidebar li.subitem{padding-left:1.25rem}.pytorch-2 .article-wrapper article.pytorch-article table tr td:first-of-type{padding-left:10px}.pytorch-2 .article-wrapper article.pytorch-article table,.pytorch-2 .article-wrapper article.pytorch-article td{border:1px solid #A0A0A1;padding:10px}.pytorch-2 .article-wrapper article.pytorch-article b,.pytorch-2 .article-wrapper article.pytorch-article em,.pytorch-2 .article-wrapper article.pytorch-article h3,.pytorch-2 .article-wrapper article.pytorch-article h2,.pytorch-2 .article-wrapper article.pytorch-article p,.pytorch-2 .article-wrapper article.pytorch-article a,.pytorch-2 .article-wrapper article.pytorch-article strong,.pytorch-2 .article-wrapper article.pytorch-article td,.pytorch-2 .article-wrapper article.pytorch-article tr{font-family:Verdana}.pytorch-2 .article-wrapper article.pytorch-article ul,.pytorch-2 .article-wrapper article.pytorch-article ol{margin:1.5rem 0 1.5rem 0}.pytorch-2 .article-wrapper article.pytorch-article ul li,.pytorch-2 .article-wrapper article.pytorch-article ol li{font-family:Verdana}.pytorch-2 .article-wrapper article.pytorch-article code{font-family:IBMPlexMono,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;padding:2px;color:inherit;background-color:#f1f1f1}.pytorch-2 .article-wrapper article.pytorch-article p,.pytorch-2 .article-wrapper article.pytorch-article a{font-family:Verdana;word-break:break-word}.pytorch-2 .article-wrapper article.pytorch-article p strong,.pytorch-2 .article-wrapper article.pytorch-article a strong{font-family:Verdana}@media screen and (max-width: 418px){.pytorch-2 .article-wrapper article.pytorch-article .QnATable{max-width:95vw}}.ecosystem .jumbotron{height:170px}@media screen and (min-width: 768px){.ecosystem .jumbotron{height:300px}}.ecosystem .jumbotron h1{padding-top:8.4375rem;color:#fff}.ecosystem .jumbotron p.lead{margin-bottom:1.5625rem;padding-top:1.25rem;color:#fff}.ecosystem .jumbotron .ecosystem-join{margin-bottom:3rem}.ecosystem .jumbotron svg{margin-bottom:1.25rem}@media screen and (min-width: 768px){.ecosystem .main-content{padding-top:3.25rem}}.ecosystem .main-content-wrapper{background-color:#f3f4f7;margin-top:340px}@media screen and (min-width: 768px){.ecosystem 
.main-content-wrapper{margin-top:435px}}.ecosystem.ecosystem-detail .main-content-wrapper{background-color:#fff}.ecosystem-cards-wrapper{margin-bottom:1.125rem;padding-top:1.25rem}@media (min-width: 768px){.ecosystem-cards-wrapper .col-md-6{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 1240px){.ecosystem-cards-wrapper .col-md-6{flex:0 0 50%;max-width:50%}}.ecosystem .main-content-menu .navbar-nav .nav-link{font-size:1.125rem;color:#CCCDD1;padding-right:0;margin-right:1.875rem}.ecosystem .main-content-menu .navbar-nav .nav-link.selected{color:#ee4c2c;border-bottom:1px solid #ee4c2c}@media screen and (min-width: 768px){.ecosystem .main-content-menu .nav-item:last-of-type{position:absolute;right:0}.ecosystem .main-content-menu .nav-item:last-of-type a{margin-right:0}}.ecosystem.ecosystem-detail .main-content{padding-bottom:0}.ecosystem article.pytorch-article{counter-reset:article-list}.ecosystem article.pytorch-article>ol{padding-left:0;list-style-type:none}@media screen and (min-width: 1240px){.ecosystem article.pytorch-article>ol>li{position:relative}.ecosystem article.pytorch-article>ol>li:before{counter-increment:article-list;content:counter(article-list,decimal-leading-zero);color:#B932CC;line-height:2.5rem;letter-spacing:-0.34px;font-size:2rem;font-weight:300;position:absolute;left:-60px;top:-16px;padding:.625rem 0;background-color:#fff;z-index:10}.ecosystem article.pytorch-article>ol>li:after{content:"";width:2px;position:absolute;left:-42px;top:0;height:100%;background-color:#f3f3f3;z-index:9}}.ecosystem article.pytorch-article>ol>li>h4{color:#262626}.ecosystem article.pytorch-article>ol>li ul li{list-style-type:disc}.ecosystem .quick-starts{background:#ecedf1}.ecosystem .quick-starts .title-block,.ecosystem .quick-starts #command,.ecosystem .quick-starts .option,.ecosystem .quick-starts .cloud-option{border-color:#ecedf1}.ecosystem .join-link{color:inherit;text-decoration:underline}.ecosystem .join-notice{text-align:center;padding-top:1.25rem;padding-bottom:2.5rem}.ecosystem .join-notice p{color:#6c6c6d;margin-bottom:0;line-height:1.875rem}.ecosystem .join-jumbotron{width:90%}@media screen and (min-width: 768px){.ecosystem .join-jumbotron{height:262px}}.ecosystem .join-jumbotron .container{max-width:920px}.ecosystem .join-jumbotron h1{padding-top:.3125rem;color:#fff}.ecosystem .join-jumbotron h1 span{font-weight:300}.ecosystem .join-wrapper{background-color:#f3f4f7}@media screen and (min-width: 768px){.ecosystem .join-wrapper .main-content{padding-top:1.5rem}}.ecosystem .join-wrapper .container{max-width:920px}.ecosystem .join-wrapper #success-response{color:#6c6c6d}.ecosystem .join-intro{color:#6c6c6d;line-height:28px}.ecosystem .requirements span{color:#000;font-weight:bold}.ecosystem .requirements .join-number{color:#812CE5;display:flex;align-items:center}@media screen and (min-width: 768px){.ecosystem .requirements .join-number{padding-left:.625rem}}.ecosystem .requirements p{margin-bottom:0;margin-top:-.4375rem}@media screen and (min-width: 768px){.ecosystem .requirements p{padding-left:1.5rem}}@media screen and (min-width: 768px){.ecosystem .requirements .col-md-11{border-left:2px solid #f3f4f7}}.ecosystem .row.requirements{padding-bottom:2.5rem}.ecosystem .experimental .ecosystem-card-title-container{display:inline-flex}.ecosystem .experimental .ecosystem-card-title-container 
.experimental-badge{text-transform:uppercase;margin-left:15px;background-color:#e4e4e4;color:#262626;opacity:0.75;font-size:.625rem;letter-spacing:1px;line-height:1.375rem;height:1.25rem;width:6rem;text-align:center;margin-top:.25rem}.ecosystem .ecosystem-card-title-container .card-title{padding-left:0;font-size:1.5rem;color:#262626}.ecosystem .star-list{list-style:none;padding-left:0}.ecosystem .star-list li{display:inline}.ecosystem .star-list li.github-stars-count-whole-number{display:none}.ecosystem .icon-count-container{display:inline-block;vertical-align:text-bottom;margin-left:.5rem}.ecosystem .github-logo{height:15px;width:13px;margin-left:10px}.ecosystem .github-stars-count{color:#797676;position:relative;top:.25rem;font-size:14px;margin-left:0.125rem}@media screen and (min-width: 768px){.ecosystem .github-stars-count{top:.1875rem;font-size:initial}}.ecosystem-divider{position:relative;margin-bottom:4rem;margin-top:1.5rem;top:3rem}.ecosystem #dropdownSort,.ecosystem #dropdownSortLeft{margin-left:0}.ecosystem #dropdownSortLeft{font-size:19px;top:inherit;right:inherit}.ecosystem-filter-menu ul{list-style-type:none;padding-left:1.25rem}.ecosystem-filter-menu ul li{padding-right:1.25rem;word-break:break-all}.ecosystem-filter-menu ul li a{color:#797676}.ecosystem-filter-menu ul li a:hover{color:#ee4c2c}.ecosystem .ecosystem-filter{cursor:pointer}.ecosystem .ecosystem-filter ul{list-style-type:none}.ecosystem #dropdownFilter,#dropdownSort,#dropdownSortLeft{color:#797676;cursor:pointer;z-index:1;position:absolute}.ecosystem .pagination .page{border:1px solid #dee2e6;padding:0.5rem 0.75rem}.ecosystem .pagination .active .page{background-color:#dee2e6}.ecosystem-form .hbspt-form{padding-bottom:3rem}.ecosystem-form .hbspt-form .hs-form-field{width:100%}.ecosystem-form .hbspt-form .hs-form-field .input input{width:100%;border:none;border-bottom:2px solid #812CE5;height:2.75rem;outline:none;padding-left:.9375rem;margin-bottom:1.875rem}.ecosystem-form .hbspt-form .hs-richtext h3{text-transform:uppercase;padding-top:1.5625rem;padding-bottom:1.875rem}.ecosystem-form .hbspt-form label{color:#6c6c6d}.ecosystem-form .hbspt-form textarea{width:100%;border:none;border-bottom:2px solid #812CE5;outline:none;padding-left:.9375rem;margin-bottom:1.875rem;height:5.625rem;padding-top:.625rem}.ecosystem-form .hbspt-form ::-moz-placeholder{color:#6c6c6d;opacity:0.5}.ecosystem-form .hbspt-form :-ms-input-placeholder{color:#6c6c6d;opacity:0.5}.ecosystem-form .hbspt-form ::-ms-input-placeholder{color:#6c6c6d;opacity:0.5}.ecosystem-form .hbspt-form ::placeholder{color:#6c6c6d;opacity:0.5}.ecosystem-form .hbspt-form .actions{display:flex;width:100%;justify-content:center}.ecosystem-form .hbspt-form .hs-button{padding-left:.75rem;margin-top:2.5rem;background-color:#ee4c2c;color:#fff;cursor:pointer;border:none;width:30%;height:2.8125rem;text-align:left;background-repeat:no-repeat;background-image:url(/assets/images/arrow-right-with-tail-white.svg);background-size:30px 12px;background-position:right}@media screen and (min-width: 768px){.ecosystem-form .hbspt-form .hs-button{padding-left:1.125rem;background-origin:content-box;background-size:30px 15px}}.features .main-content{padding-bottom:0}.features .navbar-nav .nav-link{color:#000}.features .nav-logo{background-image:url("/assets/images/logo-dark.svg")}@media screen and (min-width: 768px){.features .main-background{height:575px}}.features .main-content-wrapper{margin-top:350px}@media screen and (min-width: 768px){.features 
.main-content-wrapper{margin-top:540px}}.features-row{padding-bottom:3.75rem;align-items:center}.features-row:first-of-type{margin-top:1.25rem}.features-row:last-of-type{padding-bottom:4.5rem}@media screen and (min-width: 768px){.features-row{padding-bottom:6rem}.features-row:first-of-type{margin-top:4.05rem}}.features-row h3{font-size:2rem;letter-spacing:1.78px;line-height:2.25rem;font-weight:400;text-transform:uppercase;margin-bottom:1.25rem;font-weight:300}@media (min-width: 768px) and (max-width: 1239px){.features-row h3{width:80%}}@media screen and (min-width: 1240px){.features-row h3{width:590px}}.features-row p{font-size:1.125rem;letter-spacing:0.25px;line-height:1.75rem;color:#6c6c6d;padding-right:1.875rem}@media (min-width: 768px) and (max-width: 1239px){.features-row p{width:80%}}@media screen and (min-width: 1240px){.features-row p{width:590px}}.features-row .feature-content-holder{width:100%}@media screen and (min-width: 1240px){.features-row .feature-content-holder{width:495px}}.features-row .feature-content-holder pre.highlight{margin-bottom:0}.features-row:nth-child(odd) .col-md-6:nth-child(1n){order:2}.features-row:nth-child(odd) .col-md-6:nth-child(2n){order:1}@media screen and (min-width: 768px){.features-row:nth-child(odd) .col-md-6:nth-child(1n){order:1}.features-row:nth-child(odd) .col-md-6:nth-child(2n){order:2}}.features-row:nth-child(1n) h3{color:#B73BC9}.features-row:nth-child(1n) .feature-content-holder{border-bottom:2px solid #B73BC9}.features-row:nth-child(2n) h3{color:#D92F4C}.features-row:nth-child(2n) .feature-content-holder{border-bottom:2px solid #D92F4C}.features-row:nth-child(3n) h3{color:#8038E0}.features-row:nth-child(3n) .feature-content-holder{border-bottom:2px solid #8038E0}@media screen and (min-width: 1240px){.features-row .col-md-6{padding-left:0;padding-right:0}}@media screen and (min-width: 768px){.features-row .col-md-6:nth-of-type(2) .feature-content{width:100%}.features-row .col-md-6:nth-of-type(2) .feature-content h3,.features-row .col-md-6:nth-of-type(2) .feature-content p,.features-row .col-md-6:nth-of-type(2) .feature-content .feature-content-holder{float:right}}.features .jumbotron{height:200px}@media screen and (min-width: 768px){.features .jumbotron{height:195px}}@media (max-width: 320px){.features .jumbotron{height:250px}}.features .jumbotron h1{padding-top:1.875rem}@media screen and (min-width: 768px){.features .jumbotron{height:468px}.features .jumbotron h1{padding-top:0}}.features .jumbotron h1,.features .jumbotron p{color:#fff}@media screen and (min-width: 768px){.features .jumbotron .btn{margin-top:.375rem}}.resources .jumbotron{align-items:flex-end;color:#fff;height:220px}@media screen and (min-width: 768px){.resources .jumbotron{height:300px}}.resources .jumbotron h1{padding-top:8.4375rem}.resources .jumbotron p.lead{margin-bottom:1.5625rem;padding-top:1.25rem}.resources .main-content-wrapper{margin-top:385px;margin-bottom:0.75rem}@media screen and (min-width: 768px){.resources .main-content-wrapper{margin-top:475px}}@media screen and (min-width: 768px){.resources .resource-card{margin-bottom:2.25rem}}.quick-starts{background:#f3f4f7}.quick-starts .col-md-2-4{position:relative;width:100%;min-height:1px;padding-right:15px;padding-left:15px}@media (min-width: 768px){.quick-starts .col-md-2-4{flex:0 0 20%;max-width:20%}}.quick-starts .start-locally-col{margin-bottom:1.25rem}.quick-starts .start-locally-col .row.ptbuild,.quick-starts .start-locally-col .row.os,.quick-starts .start-locally-col .row.package,.quick-starts 
.start-locally-col .row.language,.quick-starts .start-locally-col .row.cuda{margin-bottom:1.25rem}@media screen and (min-width: 768px){.quick-starts .start-locally-col .row.ptbuild,.quick-starts .start-locally-col .row.os,.quick-starts .start-locally-col .row.package,.quick-starts .start-locally-col .row.language,.quick-starts .start-locally-col .row.cuda{margin-bottom:0}}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .start-locally-col{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 768px){.quick-starts .start-locally-col{margin-bottom:2.5rem}.quick-starts .start-locally-col .row{margin-bottom:0}}@media screen and (min-width: 1240px){.quick-starts .start-locally-col{margin-bottom:0}}.quick-starts .start-locally-col pre{font-size:80% !important;background-color:#ffffff !important}.quick-starts .start-locally-col .prev-versions-btn{margin-top:30px}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .cloud-options-col{flex:0 0 100%;max-width:100%;margin-left:0;margin-top:1.25rem}}.quick-starts p{font-size:1.125rem;line-height:1.75rem}.quick-starts .card-body{flex:1 1 auto}.quick-starts .cloud-option-image{margin-left:.9375rem;margin-right:1.5625rem;margin-bottom:.3125rem}.quick-starts .cloud-option-row{margin-left:0;cursor:pointer}.quick-starts .option{border:2px solid #f3f4f7;font-size:1rem;color:#6c6c6d;letter-spacing:-0.22px;line-height:1.25rem;background:#fff;cursor:pointer}.quick-starts .option:hover{background-color:#ee4c2c;color:#fff}.quick-starts .selected{background-color:#ee4c2c;color:#fff}.quick-starts .block{margin-bottom:.0625rem;height:2.5rem;display:flex;align-items:center}.quick-starts .title-block{margin:.0625rem;height:2.5rem;border:2px solid #f3f4f7;font-size:1rem;color:#6c6c6d;line-height:1.25rem;display:flex;align-items:center}.quick-starts .title-block:before{display:block;content:".";color:transparent;border-left:2px solid #CCCDD1;height:100%;position:absolute;left:0}.quick-starts #command{color:#4a4a4a;background-color:#fff;padding:.9375rem;border:2px solid #f3f4f7;word-wrap:break-word;display:table-cell;vertical-align:middle}.quick-starts #command a{font-size:125%}@media screen and (min-width: 768px){.quick-starts #command a:hover{color:#ee4c2c}}.quick-starts #command pre{word-break:break-all;white-space:normal}.quick-starts .command-container{display:table;width:100%}@media screen and (min-width: 768px){.quick-starts .command-container{min-height:5.25rem}}.quick-starts .command-container pre{margin-bottom:0px;padding:0px;font-size:75%;background-color:#f3f4f7}.quick-starts .command-block{height:5.25rem;word-wrap:break-word;color:#6c6c6d}.quick-starts .command-block:before{border-left:2px solid #000}.quick-starts .quick-start-link{color:#6c6c6d}.quick-starts .mobile-heading{display:flex;align-items:center;font-weight:400}@media screen and (min-width: 768px){.quick-starts .mobile-heading{display:none}}.quick-starts .command-mobile-heading{display:flex;align-items:center;font-weight:400;color:#000}@media screen and (min-width: 768px){.quick-starts .command-mobile-heading{display:none}}.quick-starts .headings{display:none}@media screen and (min-width: 768px){.quick-starts .headings{display:block}}.quick-starts .cloud-options-col{margin-top:1.25rem}@media screen and (min-width: 768px){.quick-starts .cloud-options-col{margin-top:0}}@media (max-width: 978px){.quick-starts .os-text{margin-top:0}}.quick-start-guides{font-size:1.125rem;letter-spacing:0.25px;line-height:2.25rem;color:#CCCDD1}.quick-start-guides 
.select-instructions{color:#262626;border-bottom:2px solid #CCCDD1;margin-bottom:1rem;display:inline-block}@media screen and (min-width: 768px){.quick-start-guides .select-instructions{margin-bottom:0}}.quick-start-module{padding-top:2.5rem;padding-bottom:2.5rem}.quick-start-module .option-module{float:right}@media screen and (min-width: 768px){.quick-start-module{padding-top:4rem;padding-bottom:4.125rem}}.quick-start-module p{color:#6c6c6d;font-size:1.125em;letter-spacing:0.25px;padding-bottom:.9375rem;margin-bottom:1.4rem}.quick-start-module h3{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;text-transform:uppercase;margin-bottom:2.1rem}.quick-starts .cloud-option-body{display:flex;align-items:center;height:64px;padding:0 0 0 5rem;position:relative;background-image:url("/assets/images/chevron-right-orange.svg");background-size:6px 13px;background-position:center right 15px;background-repeat:no-repeat}@media screen and (min-width: 768px){.quick-starts .cloud-option-body:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.quick-starts .cloud-option-body:hover:after{width:100%}.quick-starts .cloud-option-body:hover{color:#262626}}@media screen and (min-width: 768px){.quick-starts .cloud-option-body{padding-right:2rem}}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .cloud-option-body{padding-right:1.25rem}}@media screen and (min-width: 768px){.quick-starts .cloud-option-body{background-size:8px 14px}}.quick-starts .cloud-option-body:before{opacity:0.5;position:absolute;left:1.875rem;top:21px}.quick-starts .cloud-option-body.aws:before{content:url("/assets/images/aws-logo.svg")}.quick-starts .cloud-option-body.microsoft-azure:before{content:url("/assets/images/microsoft-azure-logo.svg")}.quick-starts .cloud-option-body.lightning-studios:before{content:url("/assets/images/lightning-studios-logo.svg")}.quick-starts .cloud-option-body.google-cloud:before{content:url("/assets/images/google-cloud-logo.svg")}.quick-starts .cloud-option-body.colab:before{content:url("/assets/images/colab-logo.svg")}@media screen and (min-width: 768px){.quick-starts .cloud-option-body:hover:before{opacity:1}}.quick-starts .cloud-option{background-color:#fff;margin-bottom:.125rem;border:2px solid #f3f4f7;font-size:1.125rem;letter-spacing:-0.25px;line-height:1.875rem;color:#262626}.quick-starts .cloud-option #microsoft-azure p{color:#262626;margin:0;padding:0;font-size:inherit;line-height:1.3rem}.quick-starts .cloud-option #microsoft-azure span{margin-bottom:0;padding-bottom:0;color:#ee4c2c;padding:0px 35px 0px 8px;font-style:italic;line-height:1.3rem}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .cloud-option{font-size:1rem}}.quick-starts .cloud-option ul{display:none;width:100%;margin:0 0 1.25rem 0;padding:0}.quick-starts .cloud-option ul li{margin-top:0;position:relative;padding-left:5rem}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .cloud-option ul li{font-size:1rem}}.quick-starts .cloud-option ul li a{color:#6c6c6d;letter-spacing:-0.25px;line-height:30px}@media screen and (min-width: 768px){.quick-starts .cloud-option ul li a:hover{color:#ee4c2c}}@media screen and (min-width: 768px){.quick-starts .cloud-option ul li:hover:before{content:"\2022";color:#ee4c2c;position:absolute;left:36px}}.quick-starts .cloud-option ul li:first-of-type{margin-top:1.25rem}.quick-starts .cloud-option.open 
.cloud-option-body{background-image:url("/assets/images/chevron-down-orange.svg");background-size:14px 14px;border-bottom:1px solid #ee4c2c;color:#262626}@media screen and (min-width: 768px){.quick-starts .cloud-option.open .cloud-option-body{border-bottom:none}}.quick-starts .cloud-option.open .cloud-option-body:after{width:100%}.quick-starts .cloud-option.open .cloud-option-body:before{opacity:1}.quick-starts .cloud-option.open ul{display:block}.blog .navbar-nav .nav-link{color:#000}.blog .main-content{padding-bottom:1.5rem}@media screen and (min-width: 768px){.blog .main-content{padding-top:1.70rem;padding-bottom:3.5rem}}.blog .main-background{height:290px}@media screen and (min-width: 768px){.blog .main-background{height:485px}}.blog .blog-detail-background{height:300px}@media screen and (min-width: 768px){.blog .blog-detail-background{height:312px}}.blog .main-content-menu .navbar-nav .nav-link{text-transform:capitalize}.blog .main-content-menu .navbar-nav .nav-link.selected{color:#ee4c2c !important;text-decoration:underline;-webkit-text-decoration-color:#ee4c2c;text-decoration-color:#ee4c2c;opacity:0.75 !important}@media screen and (min-width: 768px){.blog .main-content-menu .nav-item:last-of-type{position:absolute;right:0}.blog .main-content-menu .nav-item:last-of-type a{margin-right:0}}.blog .zoom-in{cursor:zoom-in}.blog .zoomed{cursor:zoom-out}.blog .zoomed img{margin:auto !important;position:absolute;top:0;left:0;right:0;bottom:0;max-width:98%}.blog .nav-logo{background-image:url("/assets/images/logo-dark.svg")}.blog .main-content-wrapper{margin-top:275px}.blog .main-content-wrapper .row.blog-index{margin-top:30px}.blog .main-content-wrapper .row.blog-index p{color:#6c6c6d}.blog .main-content-wrapper .row.blog-vertical{display:block;max-width:100%;margin:auto}.blog .main-content-wrapper .row.blog-vertical .col-md-4{display:initial}.blog .main-content-wrapper .row.blog-vertical .btn{float:left}.blog .main-content-wrapper .vertical-blog-container{border-bottom:1px solid #E2E2E2;padding-bottom:3rem}.blog .main-content-wrapper .vertical-blog-container:last-of-type{margin-bottom:2rem}@media screen and (min-width: 768px){.blog .main-content-wrapper{margin-top:470px}.blog .main-content-wrapper .row.blog-index [class*="col-"]:not(:first-child):not(:last-child):not(:nth-child(3n)){padding-right:2.1875rem;padding-left:2.1875rem}.blog .main-content-wrapper .row.blog-index [class*="col-"]:nth-child(3n){padding-left:2.1875rem}.blog .main-content-wrapper .row.blog-index [class*="col-"]:nth-child(3n+1){padding-right:2.1875rem}.blog .main-content-wrapper .col-md-4{margin-bottom:1.4375rem}}.blog .main-content-wrapper h4 a{font-family:FreightSans;font-size:1.5rem;color:#000;letter-spacing:0;line-height:2rem;font-weight:400}.blog .main-content-wrapper .author{color:#ee4c2c;font-size:1.25rem;letter-spacing:0.25px;line-height:1.875rem;margin-bottom:1.875rem}.blog .main-content-wrapper .author-icon{position:relative;top:1.625rem;height:1.0625rem;width:1.1875rem}.blog .blog-detail-content{padding-bottom:2.8rem}@media screen and (min-width: 768px){.blog .blog-detail-wrapper{margin-top:324px}}.blog .jumbotron{top:6.5625rem}@media screen and (min-width: 768px){.blog .jumbotron{height:25.3125rem}}@media screen and (min-width: 768px){.blog .jumbotron .container{padding-bottom:2.8125rem}}.blog .jumbotron .blog-index-title{overflow:hidden;margin-top:1.5rem;white-space:nowrap;text-overflow:ellipsis;color:white}@media screen and (min-width: 768px){.blog .jumbotron 
.blog-index-title{overflow:unset;white-space:unset;text-overflow:unset}}.blog .jumbotron h1{letter-spacing:-1.65px;font-size:3.25rem;line-height:3.5rem;text-transform:none;color:#fff}.blog .jumbotron h1 a{color:#fff;word-wrap:break-word}.blog .jumbotron h2{color:#fff}.blog .jumbotron .blog-title{display:inline-flex}.blog .jumbotron .blog-title:hover{color:#fff}.blog .jumbotron .blog-detail-container{padding-top:4rem}@media screen and (min-width: 768px){.blog .jumbotron .blog-detail-container{padding-top:10.875rem}}.blog .jumbotron p{font-size:1.25rem;letter-spacing:0;line-height:1.875rem;color:#fff}.blog .jumbotron .btn{margin-top:.75rem;padding-top:.5625rem}.blog .jumbotron .blog-page-container p.blog-date{padding-top:.625rem}.blog .jumbotron .blog-page-container .btn{margin-bottom:.625rem}.blog .blog-detail-jumbotron{top:45px}@media screen and (min-width: 768px){.blog .blog-detail-jumbotron{height:107px;top:75px}}.blog p.blog-date{font-size:1.125rem;letter-spacing:0;line-height:1.5rem;margin-bottom:.625rem;color:#6c6c6d}.blog p.featured-post{font-size:1.125rem;letter-spacing:0;line-height:1.5rem;margin-bottom:.625rem;color:#fff}.blog p.featured-blog-preview{margin-bottom:.75rem}.blog #blogPostFilter .nav-link{opacity:0.53;font-size:1.25rem;color:#000;letter-spacing:0;line-height:2.125rem}.blog .page-link{font-size:1.25rem;letter-spacing:0;line-height:2.125rem;color:#ee4c2c;width:7.5rem;text-align:center}.blog .blog-modal{max-width:75%;top:5rem}.blog .blog-modal:hover{cursor:zoom-out}@media (max-width: 575px){.blog .blog-modal{max-width:100%;top:10rem}}.blog .blog-image{cursor:zoom-in}@media (max-width: 1067px){.blog .jumbotron h1{margin-right:0;margin-top:1.5rem}.blog .jumbotron h1 a{font-size:2.8125rem;line-height:2.5rem}.blog .main-content-wrapper .col-md-4{margin-bottom:4.6875rem}.blog .similar-posts{margin-bottom:3.125rem}}@media (max-width: 1050px){.blog .main-content-wrapper .author-icon{left:-1.875rem}}.blog table tr th{font-weight:600}.blog .pytorch-article .enterprise-azure-logo-container{padding-left:0}.blog .pytorch-article .enterprise-azure-logo-container img{margin-bottom:0}.blog .pytorch-article img{margin-bottom:1.125rem}twitterwidget{margin:0 auto;margin-top:1.125rem !important;margin-bottom:1.125rem !important}.pytorch-article .outlined-code-block{border:1px solid black;padding:1rem;margin-bottom:1rem}.pytorch-article .outlined-code-block pre{margin:0;padding:0;background-color:white}.pytorch-article .reference-list li{overflow-wrap:anywhere}.similar-posts-module{background:#f3f4f7}.similar-posts-module p.blog-date{font-size:1.125rem;color:#CCCDD1;letter-spacing:0;line-height:1.5rem}.similar-posts-module h4 a{font-family:FreightSans;font-size:1.5rem;color:#000;letter-spacing:0;line-height:2rem;font-weight:400}.similar-posts-module .module-content{margin-bottom:2.1875rem}.similar-posts-module .module-content .navbar-nav{margin-top:3.75rem}.similar-posts-module .module-content .module-heading{text-transform:uppercase;color:#000;font-size:1.5rem;letter-spacing:.083125rem;line-height:2rem;font-weight:400}@media screen and (min-width: 768px){.similar-posts-module .module-content .nav-item:last-of-type{position:absolute;right:0}.similar-posts-module .module-content .nav-item:last-of-type a{margin-right:0}}.similar-posts-module 
.see-more-posts{color:#000;font-size:1.125rem;letter-spacing:-0.25px;line-height:1.875rem;top:.125rem}input[type='search']{-moz-appearance:none;-webkit-appearance:none}.navSearchWrapper{align-items:center;align-self:center;display:flex;justify-content:center;position:relative;right:10px;top:15px;margin-left:0;padding-bottom:20px}@media screen and (min-width: 768px){.navSearchWrapper{position:absolute;margin-left:30px;display:block;padding-left:3px;padding-bottom:0}}.tabletSearchWrapper{top:0px}@media (min-width: 768px) and (max-width: 1239px){.tabletSearchWrapper{padding-bottom:20px;position:relative;margin-left:0}}.navSearchWrapper .aa-dropdown-menu{background:#f9f9f9;border:3px solid rgba(57,57,57,0.25);color:#393939;font-size:.875rem;left:auto !important;line-height:1.2em;right:0 !important}.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--category-header{background:#000;color:white;font-size:.875rem;font-weight:400}.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--category-header .algolia-docsearch-suggestion--highlight{background-color:#000;color:#fff}.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--title .algolia-docsearch-suggestion--highlight,.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--subcategory-column .algolia-docsearch-suggestion--highlight{color:#000}.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion__secondary,.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--subcategory-column{border-color:rgba(57,57,57,0.3)}@media screen and (min-width: 768px){.navSearchWrapper .algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column{word-wrap:normal}}input#search-input{background-color:inherit;border:none;border-radius:20px;color:#000;font-size:1.125rem;font-weight:300;line-height:20px;outline:none;padding-left:25px;position:relative;transition:0.5s width ease;display:none;width:220px;background-image:url("/assets/images/search-icon.svg");background-size:12px 15px;background-repeat:no-repeat;background-position:8px 5px}input#search-input:hover{background-image:url("/assets/images/search-icon-orange.svg")}input#mobile-search-input{font-size:2rem;background-color:transparent;color:#fff;border:none;outline:none;padding-left:25px;position:relative;border-top-left-radius:20px;border-bottom-left-radius:20px;width:300px;display:block}input#search-input:focus,input#search-input:active{color:#000}.navigationSlider .slidingNav .navSearchWrapper .algolia-docsearch-footer a{height:auto}@media only screen and (max-width: 735px){.navSearchWrapper{width:100%}}input::-moz-placeholder{color:#e5e5e5}input:-ms-input-placeholder{color:#e5e5e5}input::-ms-input-placeholder{color:#e5e5e5}input::placeholder{color:#e5e5e5}.hljs{padding:1.25rem 1.5rem}@media only screen and (max-width: 1024px){.reactNavSearchWrapper input#search-input{background-color:rgba(242,196,178,0.25);border:none;border-radius:20px;box-sizing:border-box;color:#393939;font-size:.875rem;line-height:20px;outline:none;padding-left:25px;position:relative;transition:background-color 0.2s cubic-bezier(0.68, -0.55, 0.265, 1.55),width 0.2s cubic-bezier(0.68, -0.55, 0.265, 1.55),color 0.2s ease;width:100%}.reactNavSearchWrapper input#search-input:focus,.reactNavSearchWrapper input#search-input:active{background-color:#000;color:#fff}.reactNavSearchWrapper .algolia-docsearch-suggestion--subcategory-inline{display:none}.reactNavSearchWrapper>span{width:100%}.reactNavSearchWrapper 
.aa-dropdown-menu{font-size:.75rem;line-height:2em;padding:0;border-width:1px;min-width:500px}.reactNavSearchWrapper .algolia-docsearch-suggestion__secondary{border-top:none}.aa-suggestions{min-height:140px;max-height:60vh;-webkit-overflow-scrolling:touch;overflow-y:scroll}}@media only screen and (min-width: 1024px){.navSearchWrapper{padding-left:10px;position:relative;right:auto;top:auto}}@media only screen and (min-width: 1024px) and (min-width: 768px){.navSearchWrapper{padding-left:3px;right:10px;margin-left:0}}@media only screen and (min-width: 1024px){.navSearchWrapper .algolia-autocomplete{display:block}.tabletSearchWrapper{right:10px}}@media only screen and (max-width: 735px){.reactNavSearchWrapper .aa-dropdown-menu{min-width:400px}}@media only screen and (max-width: 475px){.reactNavSearchWrapper .aa-dropdown-menu{min-width:300px}}.search-border{display:none;flex-direction:row;border:none;background-color:transparent;border-radius:20px;width:100%;float:right}@media screen and (min-width: 768px){.search-border{display:flex}}.mobile-search-border{flex-direction:row;border:none;background-color:rgba(255,255,255,0.1);border-radius:20px;width:100%;float:right;display:flex}@media (min-width: 768px) and (max-width: 1239px){.mobile-search-border{border-radius:25px}}#close-search{color:#ee4c2c;padding-right:10px;font-size:.99em;display:none;cursor:pointer}.active-header{margin-top:-1px}.active-search-icon{background-image:url("/assets/images/search-icon-orange.svg") !important;display:inline-block !important}.active-background{background-color:#f3f4f7;width:50%;padding:4px}.homepage-header input#search-input{background-image:url("/assets/images/search-icon-white.svg");color:#fff}.homepage-header input#search-input:focus,.homepage-header input#search-input:active{color:#fff}.homepage-header .active-background{background-color:#88888833}.homepage-header #close-search{color:#fff;opacity:0.5}.homepage-header #close-search:hover{color:#ee4c2c}.homepage-header #search-icon{background-image:url(/assets/images/search-icon-white.svg)}.homepage-header #search-icon:hover{background-color:#88888833}#search-icon{background-image:url(/assets/images/search-icon.svg);color:transparent;width:33px;height:33px;background-size:21px 21px;background-repeat:no-repeat;background-position:6px 5px;border-radius:25px;cursor:pointer}#search-icon:hover{background-color:#f3f4f7}#mobile-search-icon{background-image:url(/assets/images/search-icon-white.svg);width:30px;height:38px;background-size:16px 28px;background-repeat:no-repeat;background-position:0px 5px;cursor:pointer;border-top-right-radius:20px;border-bottom-right-radius:20px}@media (min-width: 768px) and (max-width: 1239px){#mobile-search-icon{height:50px;width:35px;background-size:20px 42px}}.navSearchWrapper .algolia-autocomplete .ds-dropdown-menu{min-width:330px;height:500px;overflow-y:scroll}@media screen and (min-width: 768px){.navSearchWrapper .algolia-autocomplete .ds-dropdown-menu{height:auto;min-width:700px;overflow-y:hidden}}@media (min-width: 768px) and (max-width: 1239px){.navSearchWrapper .algolia-autocomplete .ds-dropdown-menu{height:700px;overflow-y:scroll}}@media (min-width: 769px) and (max-width: 1024px){.navSearchWrapper .algolia-autocomplete .ds-dropdown-menu{min-width:950px}}.cookie-banner-wrapper{display:none}.cookie-banner-wrapper.is-visible{display:block;position:fixed;bottom:0;background-color:#f3f4f7;min-height:100px;width:100%;z-index:401;border-top:3px solid #ededee}.cookie-banner-wrapper 
.gdpr-notice{color:#6c6c6d;margin-top:1.5625rem;text-align:left;max-width:1440px}@media screen and (min-width: 768px){.cookie-banner-wrapper .gdpr-notice{width:77%}}@media (min-width: 768px) and (max-width: 1239px){.cookie-banner-wrapper .gdpr-notice{width:inherit}}.cookie-banner-wrapper .gdpr-notice .cookie-policy-link{color:#343434}.cookie-banner-wrapper .close-button{-webkit-appearance:none;-moz-appearance:none;appearance:none;background:transparent;border:1px solid #f3f4f7;height:1.3125rem;position:absolute;bottom:42px;right:0;top:0;cursor:pointer;outline:none}@media screen and (min-width: 768px){.cookie-banner-wrapper .close-button{right:20%;top:inherit}}@media (min-width: 768px) and (max-width: 1239px){.cookie-banner-wrapper .close-button{right:0;top:0}}.hub .jumbotron{height:300px}@media screen and (min-width: 768px){.hub .jumbotron{height:420px}}.hub .jumbotron h1{color:#fff}.hub .jumbotron h1 #hub-header,.hub .jumbotron h1 #hub-sub-header{font-weight:lighter}.hub .jumbotron p.lead,.hub .jumbotron p.hub-release-message{margin-bottom:1.5625rem;padding-top:1.5625rem;color:#fff}@media screen and (min-width: 768px){.hub .jumbotron p.lead,.hub .jumbotron p.hub-release-message{width:77%}}.hub .jumbotron p.hub-release-message{padding-top:0;font-style:italic}.hub .jumbotron svg{margin-bottom:1.25rem}.hub .jumbotron p.detail-lead{padding-top:3.125rem;color:#797676;width:100%;margin-bottom:0px}.hub .jumbotron p.lead-summary{color:#6c6c6d}.hub.hub-index .jumbotron{height:280px}@media screen and (min-width: 768px){.hub.hub-index .jumbotron{height:325px}}.hub .detail-github-link{background:#ee4c2c;color:#fff}.hub .detail-colab-link{background:#ffc107;color:#000}.hub .detail-web-demo-link{background:#4a9fb5;color:#fff}.hub .detail-colab-link,.hub .detail-github-link,.hub .detail-web-demo-link{margin-top:1rem}.hub .detail-button-container{margin-top:2.8125rem}@media (min-width: 768px) and (max-width: 1239px){.hub .detail-button-container{margin-top:1.25rem}}@media (max-width: 320px){.hub .detail-button-container{margin-top:1.25rem}}@media (max-width: 360px){.hub .detail-button-container{margin-top:1.25rem}}.hub a .detail-colab-link,.hub a .detail-github-link{padding-right:3.125rem}.hub .detail-arrow{color:#ee4c2c;font-size:2.5rem}@media screen and (min-width: 768px){.hub .detail-arrow{font-size:4.5rem}}.hub .with-right-white-arrow{padding-right:2rem;position:relative;background-image:url("/assets/images/chevron-right-white.svg");background-size:6px 13px;background-position:top 10px right 11px;background-repeat:no-repeat}@media screen and (min-width: 768px){.hub .with-right-white-arrow{background-size:8px 14px;background-position:top 15px right 12px;padding-right:2rem}}.hub .main-content{padding-top:8.75rem}@media screen and (min-width: 768px){.hub .main-content{padding-top:8.4375rem}}@media (max-width: 320px){.hub .main-content{padding-top:10rem}}.hub.hub-detail .main-content{padding-top:12.5rem}@media screen and (min-width: 768px){.hub.hub-detail .main-content{padding-top:9.375rem}}.hub.hub-detail .jumbotron{height:350px}@media screen and (min-width: 768px){.hub.hub-detail .jumbotron{height:400px}}.hub .main-content-wrapper{background-color:#f3f4f7;margin-top:300px}@media screen and (min-width: 768px){.hub .main-content-wrapper{margin-top:395px}}.hub-feedback-button{border:2px solid #e2e2e2;color:#A0A0A1;padding-left:0;padding-right:5rem;font-size:1rem;width:13rem}.hub-feedback-button:after{bottom:-1px}.hub-flag{background-image:url("/assets/images/feedback-flag.svg");background-size:15px 
20px;background-position:center right 10px;background-repeat:no-repeat}#hub-icons{height:2rem}@media (max-width: 480px){#hub-icons{position:initial;padding-left:0;padding-top:1rem}}.hub.hub-detail .main-content-wrapper{margin-top:305px}@media screen and (min-width: 768px){.hub.hub-detail .main-content-wrapper{margin-top:390px}}@media (min-width: 768px) and (max-width: 1239px){.hub.hub-detail .main-content-wrapper{margin-top:490px}}@media (max-width: 320px){.hub.hub-detail .main-content-wrapper{margin-top:330px}}.hub .hub-cards-wrapper,.hub-cards-wrapper-right{margin-bottom:1.125rem;padding-top:1.25rem}.hub .hub-cards-wrapper .card-body .card-summary,.hub-cards-wrapper-right .card-body .card-summary{width:75%}.hub .hub-cards-wrapper .card-body .hub-image,.hub-cards-wrapper-right .card-body .hub-image{position:absolute;top:0px;right:0px;height:100%;width:25%}.hub .hub-cards-wrapper .card-body .hub-image img,.hub-cards-wrapper-right .card-body .hub-image img{height:100%;width:100%}.hub .hub-cards-wrapper .card-body .hub-image:before,.hub-cards-wrapper-right .card-body .hub-image:before{content:'';position:absolute;top:0;left:0;bottom:0;right:0;z-index:1;background:#000000;opacity:.075}.hub .github-stars-count{color:#797676;position:relative;top:.25rem;font-size:14px}@media screen and (min-width: 768px){.hub .github-stars-count{top:.1875rem;font-size:initial}}.hub .github-stars-count-whole-number{display:none}.hub .github-logo{height:15px;width:13px}.hub .icon-count-container{display:inline-block;vertical-align:text-bottom;margin-left:.5rem}.hub .detail-count{font-size:1.25rem}.hub .main-stars-container{display:flex}.hub .detail-stars-container{display:inline-flex}.hub .detail-stars-container .github-stars-image{margin-left:0}.hub .card-body .hub-card-title-container{width:75%;display:inline-flex;max-width:18.75rem}.hub .card-body .hub-card-title-container .experimental-badge{text-transform:uppercase;margin-left:.9375rem;background-color:#e4e4e4;color:#262626;opacity:0.75;font-size:.625rem;letter-spacing:1px;line-height:1.375rem;height:1.25rem;width:6rem;text-align:center;margin-top:.25rem}.hub .card-body .hub-card-title-container .card-title{padding-left:0;font-size:1.5rem;color:#262626}.hub .card-body .hub-card-title-container .star-list{list-style:none;padding-left:0}.hub .card-body .hub-card-title-container .star-list li{display:inline}.hub .card-body .hub-card-title-container .star-list li.github-stars-count-whole-number{display:none}.hub .hub-filter-menu ul{list-style-type:none;padding-left:1.25rem}.hub .hub-filter-menu ul li{padding-right:1.25rem;word-break:break-all}.hub .hub-filter-menu ul li a{color:#797676}.hub .hub-filter-menu ul li a:hover{color:#ee4c2c}.hub .hub-filter{cursor:pointer}.hub-index #dropdownSortLeft{color:#797676;cursor:pointer;z-index:1;position:absolute;top:inherit;left:23%;max-width:4rem}@media (min-width: 480px) and (max-width: 590px){.hub-index #dropdownSortLeft{left:40%}}.hub #dropdownFilter,#dropdownSort,#dropdownSortLeft{color:#797676;cursor:pointer;z-index:1;position:absolute;top:11rem;right:1rem;left:inherit}@media (min-width: 480px) and (max-width: 590px){.hub #dropdownFilter,#dropdownSort,#dropdownSortLeft{top:7rem}}@media (min-width: 590px){.hub #dropdownFilter,#dropdownSort,#dropdownSortLeft{top:5rem}}@media screen and (min-width: 768px){.hub #dropdownFilter,#dropdownSort,#dropdownSortLeft{top:5rem}}.hub .sort-menu{left:inherit;right:1rem;top:12.5rem;max-width:12rem}@media (min-width: 480px) and (max-width: 590px){.hub .sort-menu{top:8.5rem}}@media 
(min-width: 590px) and (max-width: 900px){.hub .sort-menu{top:6.5rem}}@media (min-width: 900px) and (max-width: 1239px){.hub .sort-menu{top:6.5rem}}@media screen and (min-width: 1240px){.hub .sort-menu{right:0;top:6.5rem}}.hub-index .sort-menu{left:23%;top:inherit;max-width:12rem}.hub .research-hub-title,.research-hub-sub-title{text-transform:uppercase;letter-spacing:1.78px;line-height:2rem}.research-hub-sub-title{padding-bottom:1.25rem}.hub .research-hub-title{color:#ee4c2c}.hub .all-models-button,.full-docs-button{font-size:1.125rem;position:relative;cursor:pointer;outline:none;padding:.625rem 1.875rem .625rem 1.25rem;background-color:#fff;margin-bottom:0.125rem;border:2px solid #f3f4f7;letter-spacing:-0.25px;line-height:1.75rem;color:#6c6c6d;background-image:url("/assets/images/chevron-right-orange.svg");background-size:6px 13px;background-position:center right 10px;background-repeat:no-repeat}.hub .all-models-button a,.full-docs-button a{color:#6c6c6d}@media screen and (min-width: 768px){.hub .all-models-button:after,.full-docs-button:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.hub .all-models-button:hover:after,.full-docs-button:hover:after{width:100%}.hub .all-models-button:hover,.full-docs-button:hover{color:#262626}}.hub .hub-column{padding-bottom:4.6875rem}.hub.hub-index .hub-column{padding-bottom:0}.hub .how-it-works{padding-top:3.125rem;padding-bottom:2.8125rem}.hub .how-it-works .how-it-works-text{color:#6c6c6d;font-size:1.25rem;letter-spacing:0;line-height:1.875rem}.hub .how-it-works .how-it-works-title-col{padding-bottom:3.4375rem}.hub .how-it-works .full-docs-button{margin-top:1.875rem}.hub .hub-code-text{font-size:80%;color:#262626;background-color:#e2e2e2;padding:2px}.hub .hub-code-block{display:block;border-left:3px solid #ee4c2c;padding:1.25rem 1.5625rem 1.25rem 1.5625rem;margin-bottom:3.75rem}.hub pre.highlight{background-color:#e2e2e2;border-left:2px solid #ee4c2c}.hub code.highlighter-rouge{background-color:#e2e2e2}.hub article{padding-top:1.25rem}@media screen and (min-width: 768px){.hub article{padding-top:0}}.hub article p{color:#262626}@media screen and (min-width: 768px){.hub .hub-detail-background{height:515px}}.hub .dropdown-menu{border-radius:0;padding-bottom:0}.hub .card:hover .hub-image:before{bottom:100%}.hub.hub.hub-detail .github-stars-image img{height:9px}@media screen and (min-width: 768px){.hub.hub.hub-detail .github-stars-image img{height:10px}}.hub #development-models-hide,#research-models-hide{display:none}@media (min-width: 768px){.hub .col-md-6.hub-column{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 1240px){.hub .col-md-6.hub-column{flex:0 0 50%;max-width:50%}}@media (min-width: 768px){.hub .col-md-12.hub-column .col-md-6{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 1240px){.hub .col-md-12.hub-column .col-md-6{flex:0 0 100%;max-width:50%}}.hub .featured-image{padding-bottom:1.25rem}.hub .coming-soon{font-weight:300;font-style:italic}@media screen and (min-width: 768px){.hub.hub-index .jumbotron{height:325px}}.hub.hub-index .jumbotron h1{padding-top:0}@media screen and (min-width: 768px){.hub.hub-index .jumbotron h1{padding-top:3.4375rem}}.hub.hub-index .jumbotron p.lead{padding-top:3.4375rem}.hub.hub-index .main-content-wrapper{margin-top:210px}@media screen and (min-width: 768px){.hub.hub-index .main-content-wrapper{margin-top:280px}}.hub 
.page-link{font-size:1.25rem;letter-spacing:0;line-height:2.125rem;color:#ee4c2c;width:7.5rem;text-align:center}.hub .filter-btn{color:#797676;border:1px solid #797676;display:inline-block;text-align:center;white-space:nowrap;vertical-align:middle;padding:0.375rem 0.75rem;font-size:1rem;line-height:1.5;margin-bottom:5px}.hub .filter-btn:hover{border:1px solid #ee4c2c;color:#ee4c2c}.hub .selected{border:1px solid #ee4c2c;background-color:#ee4c2c;color:#fff}.hub .selected:hover{color:#fff}.hub .all-tag-selected{background-color:#797676;color:#fff}.hub .all-tag-selected:hover{border-color:#797676;color:#fff}.hub .pagination .page{border:1px solid #dee2e6;padding:0.5rem 0.75rem}.hub .pagination .active .page{background-color:#dee2e6}.hub .hub-tags-container{width:60%}.hub .hub-tags-container.active{width:0}@media screen and (min-width: 768px){.hub .hub-search-wrapper{top:8px}}.hub .hub-search-wrapper .algolia-autocomplete .ds-dropdown-menu{min-width:100%;max-width:100% !important}.hub .hub-search-wrapper .algolia-autocomplete{width:100%}.hub .hub-search-wrapper.active{width:100%}.hub .hub-search-wrapper span{font-size:1.125rem;text-align:center}@media (max-width: 480px){.hub #hub-search-icon{margin-top:1rem}}#hub-search-icon{background-image:url("/assets/images/search-icon.svg");color:transparent;opacity:0.4;width:25px;height:25px;margin-left:3rem;background-size:15px 20px;background-repeat:no-repeat;right:10px;position:absolute;z-index:1;cursor:pointer}#hub-search-icon:hover{background-image:url("/assets/images/search-icon-orange.svg");opacity:1}#hub-search-input{background-color:#CCCDD1;border:none;color:#000;font-size:1.125rem;font-weight:300;line-height:20px;outline:none;position:relative;display:none;width:100%;border-radius:5px;padding:.875rem 0 .875rem .3125rem}#hub-close-search{display:none;margin-left:20px;opacity:0.4;right:10px;position:absolute;z-index:1;cursor:pointer;font-size:1.125rem}@media screen and (min-width: 768px){#hub-close-search{top:1.125rem}}#hub-close-search:hover{color:#ee4c2c;opacity:1}.hub .hub-divider{margin-bottom:2.2rem;margin-top:1.5rem}.hub .active-hub-divider{border-color:#ee4c2c}.hub .hub-search-border{display:flex;align-items:center;flex-direction:row;border:none;background-color:transparent;border-radius:20px;width:100%}.hub .hub-cards-wrapper{z-index:1000}.hub .nav-container{display:flex;width:100%;position:absolute}.compact-cards{width:100%}.compact-cards a{color:#6C6C6D}.compact-cards a:hover{color:#ee4c2c}.compact-hub-card-wrapper{padding:0}.compact-card-container{display:flex;align-items:center}.compact-card-body{padding-top:8px}.compact-card-body:hover{border-bottom:1px solid #ee4c2c;color:#ee4c2c}.compact-card-body:hover .compact-item-title{color:#ee4c2c}.compact-card-body .compact-hub-card-title-container{width:75%;display:flex}.compact-model-card{height:auto;border-bottom:1px solid #E2E2E2}.compact-item-title{padding-left:0;color:#000}.compact-card-summary{white-space:nowrap;overflow:hidden;text-overflow:ellipsis;top:5px}.compact-hub-divider{padding:0;width:100%}.hub-select-container{position:absolute;right:0;height:2rem}.compact-hub-index-cards{padding-bottom:2rem}.full-hub-icon:hover{cursor:pointer;height:3rem}.compact-hub-icon{margin-left:0.5rem;margin-right:3.125rem}.compact-hub-icon:hover{cursor:pointer}.mobile article{margin-bottom:5rem}.mobile .main-background{height:275px}@media screen and (min-width: 768px){.mobile .main-background{height:380px}}.mobile .main-content-wrapper{margin-top:275px}@media screen and (min-width: 768px){.mobile 
.main-content-wrapper{margin-top:350px}}.mobile .jumbotron{height:190px}@media screen and (min-width: 768px){.mobile .jumbotron{height:260px}}.mobile .main-content .navbar{background-color:#f3f4f7;padding-left:0;padding-bottom:0;padding-top:0}@media (min-width: 992px){.mobile .main-content .navbar li:first-of-type{padding-left:3.4375rem}.mobile .main-content .navbar .nav-item{padding:2rem;cursor:pointer}.mobile .main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.mobile .main-content .navbar .nav-select{background-color:#fff}.mobile .main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.mobile .main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}@media screen and (min-width: 768px){.mobile .main-content .navbar .nav-link{margin-left:1.875rem}}.mobile .main-content .navbar .nav-link:hover{color:#ee4c2c}.mobile .main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.mobile .main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.mobile .main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.mobile .main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.mobile .main-content .navbar .navbar-toggler{margin-left:2.5rem}.mobile .main-content{padding-top:0}@media screen and (min-width: 768px){.mobile .main-content{padding-top:1.9rem}}.mobile .nav-menu-wrapper{background-color:#f3f4f7}.mobile .navbar-nav{flex-direction:row}.mobile .mobile-page-sidebar{padding-top:2.5rem;padding-bottom:2.5rem;top:15%}@media screen and (min-width: 768px){.mobile .mobile-page-sidebar{padding-top:0}}.mobile .mobile-page-sidebar ul{padding-left:0}.mobile .mobile-page-sidebar li{list-style-type:none;line-height:23px;margin-bottom:15px}.mobile .mobile-page-sidebar li a{color:#8c8c8c}.mobile .mobile-page-sidebar li a.active,.mobile .mobile-page-sidebar li a:hover{color:#ee4c2c}@media screen and (min-width: 1240px){.deep-learning .header-container{margin-bottom:1rem}}.deep-learning .jumbotron{height:180px}@media screen and (min-width: 768px){.deep-learning .jumbotron{height:250px}}.deep-learning .jumbotron .thank-you-page-container{margin-top:0}@media (min-width: 768px) and (max-width: 1239px){.deep-learning .jumbotron .thank-you-page-container{margin-top:250px}}@media screen and (min-width: 768px){.deep-learning .jumbotron .deep-learning-jumbotron-text{margin-top:55px}.deep-learning .jumbotron .deep-learning-jumbotron-text h1{padding-top:30px}}@media (min-width: 768px) and (max-width: 1239px){.deep-learning .jumbotron .deep-learning-jumbotron-text{max-width:95%;flex-basis:100%}}.deep-learning .jumbotron .deep-learning-thank-you-text{width:80%}.deep-learning .jumbotron .deep-learning-thank-you-text .download-book-link{display:inline-block}.deep-learning .jumbotron .deep-learning-landing-text{width:100%}@media screen and (min-width: 768px){.deep-learning .jumbotron .deep-learning-landing-text{width:85%}}.deep-learning .jumbotron .deep-learning-book-container{display:none}@media screen and (min-width: 768px){.deep-learning .jumbotron .deep-learning-book-container{display:block}}@media (min-width: 768px) and (max-width: 1239px){.deep-learning .jumbotron .deep-learning-book-container{display:none}}.deep-learning .jumbotron .thank-you-book-container{display:none}@media (min-width: 768px) and (max-width: 1239px){.deep-learning .jumbotron .thank-you-book-container{display:block}}@media screen 
and (min-width: 768px){.deep-learning .jumbotron .thank-you-book-container{display:block}}@media screen and (min-width: 768px){.deep-learning .deep-learning-col{max-width:80%}}@media screen and (min-width: 768px){.deep-learning .deep-learning-background{height:440px}}@media screen and (min-width: 768px){.deep-learning .header-holder{height:90px}}.deep-learning .main-content-wrapper{margin-top:250px}@media screen and (min-width: 768px){.deep-learning .main-content-wrapper{margin-top:480px}}@media screen and (min-width: 768px){.deep-learning .deep-learning-content{padding-top:0}}.deep-learning .main-background{height:250px}@media screen and (min-width: 768px){.deep-learning .main-background{height:440px}}.deep-learning .thank-you-wrapper{margin-top:400px}@media screen and (min-width: 768px){.deep-learning .thank-you-wrapper{margin-top:275px}}.deep-learning .thank-you-background{height:438px}@media screen and (min-width: 768px){.deep-learning .thank-you-background{height:680px}}.deep-learning-container{display:flex;align-items:center}.deep-learning-logo{background-image:url("/assets/images/pytorch-logo.png")}.deep-learning-row{display:flex;align-items:center}.deep-learning-row .lead{margin-top:1rem;margin-bottom:2rem}@media (min-width: 768px) and (max-width: 1239px){.deep-learning-row h1{font-size:3rem}}@media screen and (min-width: 768px){.deep-learning-row h1{margin-top:2rem}}.deep-learning-book{max-width:100%;height:400px}.deep-learning-form{margin-left:-1rem}@media screen and (min-width: 768px){.deep-learning-form{margin-left:0;margin-top:1rem}}#deep-learning-button{margin-top:2rem}.deep-learning-form .email-subscribe-form .deep-learning-input{padding-left:.5rem;background-color:#f3f4f7}.deep-learning-form #mce-error-response{color:#ee4c2c}.video-item{margin-bottom:5rem}.video-item a h5{color:#000;margin-top:1rem}.video-item a:hover h5{color:#ee4c2c}.video-item .image-container{overflow:hidden}.video-item .image-container img{margin:-10% 0;width:100%}.ecosystem .contributor-jumbotron{width:90%}@media screen and (min-width: 768px){.ecosystem .contributor-jumbotron{height:262px}}.ecosystem .contributor-jumbotron .container{max-width:920px}.ecosystem .contributor-jumbotron h1{padding-top:0}.ecosystem .contributor-jumbotron h1 span{font-weight:300;color:#812CE5}.ecosystem .contributor-jumbotron .contributor-jumbo-text h1{color:white}.ecosystem .contributor-jumbotron .contributor-jumbo-text h2{color:white;padding-top:0}.hidden{display:none}.contributor-container-fluid{height:4rem;width:100%}@media screen and (max-width: 767px){.contributor-container-fluid{margin-top:2rem}}@media screen and (min-width: 1200px){.contributor-container-fluid{margin-left:0}}.ecosystem .contributor.main-content{padding-top:0}.ecosystem .contributor.main-content .navbar{padding-left:0;padding-bottom:0;padding-top:0}.ecosystem .contributor.main-content .navbar .nav-item{cursor:pointer}.ecosystem .contributor.main-content .navbar .nav-item:last-of-type{position:relative}@media (min-width: 992px){.ecosystem .contributor.main-content .navbar .nav-item{padding:2rem;cursor:pointer}.ecosystem .contributor.main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.ecosystem .contributor.main-content .navbar .nav-select{background-color:#fff}.ecosystem .contributor.main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.ecosystem .contributor.main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}@media screen and (min-width: 768px){.ecosystem .contributor.main-content 
.navbar .nav-link{margin-left:1.875rem}}.ecosystem .contributor.main-content .navbar .nav-link:hover{color:#ee4c2c}.ecosystem .contributor.main-content .navbar .contributor-nav-link{padding-left:1.25rem;padding-right:1.25rem}@media screen and (min-width: 768px){.ecosystem .contributor.main-content .navbar .contributor-nav-link{padding-left:1.875rem;padding-right:1.875rem}}.ecosystem .contributor.main-content .navbar .contributor-nav{flex-direction:row}.ecosystem .contributor.main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.ecosystem .contributor.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.ecosystem .contributor.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.ecosystem .contributor.main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.ecosystem .contributor.main-content .navbar .navbar-toggler{margin-left:2.5rem}.past-issue-container{display:flex}@media (max-width: 767px){.past-issue-container{display:block}}.past-issue-container .get-started-cloud-sidebar .sticky-top{position:-webkit-sticky;position:sticky;top:15%}@media (max-width: 767px){.past-issue-container .get-started-cloud-sidebar .sticky-top{position:relative;top:0;margin-left:0}}.past-issue-container .get-started-cloud-sidebar .pytorch-article li{list-style:initial}.past-issue-container .get-started-cloud-sidebar li{list-style-type:none;line-height:36px;color:#8c8c8c}.past-issue-container .get-started-cloud-sidebar span{white-space:nowrap}#past-issues{max-width:920px;margin:auto;margin-top:0;margin-bottom:0}.contributor-container{max-width:920px;left:0;right:0;margin-left:auto;margin-right:auto;padding-left:30px;padding-right:30px;width:90%}.past-issue-container.container{padding-left:5px;padding-top:45px}.nav-background{width:100%;background-color:#f3f4f7}#get-started-contributor-sidebar-list{padding-left:0}#get-started-contributor-sidebar-list .active{color:#ee4c2c}#get-started-contributor-sidebar-list li a{color:#8c8c8c}.two-column-row{max-width:920px;margin:0 auto 0 auto;padding:0 30px 43px 30px;width:90%}@media screen and (min-width: 768px){.two-column-row{display:flex}}.two-column-row h2{text-transform:uppercase;font-weight:100;margin-bottom:30px}.two-column-row p{margin-bottom:40px}.two-column-row .content-left{flex:60%;padding-top:76px}@media screen and (min-width: 768px){.two-column-row .content-left{margin-right:62px}}.two-column-row .content-left h2{color:#ee4c2c}.two-column-row .content-left .contributor-consent-check{max-width:400px}.two-column-row .content-left .email-consent{color:#797676;font-size:14px}.two-column-row .content-left .please-accept-terms{display:none;color:#ee4c2c;font-size:14px}.two-column-row .content-right{flex:40%;padding-top:76px}.two-column-row .content-right h2{color:#812CE5}.two-column-row .contributor-form{margin:-8px 0 47px 0}.two-column-row .contributor-form .form-success,.two-column-row .contributor-form .form-fail{color:#ee4c2c;display:none;flex:none;margin:8px 0 12px 0}.two-column-row .contributor-form form{width:100%}.two-column-row .contributor-form form .contributor-form-ui{display:flex;max-width:390px;flex-wrap:wrap}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]{border:1px solid #e6e6e6;border-radius:4px;flex:1 70%;padding:5px 8px 5px 8px;margin-right:10px}.two-column-row .contributor-form form .contributor-form-ui 
input[type="text"]::-moz-placeholder{color:silver}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]:-ms-input-placeholder{color:silver}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]::-ms-input-placeholder{color:silver}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]::placeholder{color:silver}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]:focus{border:1px solid #ee4c2c}.two-column-row .contributor-form form .contributor-form-ui input[type="submit"]{background:#e6e6e6;border:none;border-radius:4px;color:#6d6d6d}.two-column-row .contributor-form form .contributor-form-ui input[type="submit"]:hover{background:silver;color:#3a3a3a}.two-column-row .contributor-form input[type="checkbox"]{margin:1px 6px 0 0}.two-column-row .contributor-form .contributor-consent-check{color:#797676;margin-top:1rem}.two-column-row .contributors-button{background-image:url("/assets/images/chevron-right-orange.svg");background-color:#fff;background-size:6px 13px;background-position:center right 10px;background-repeat:no-repeat;border:2px solid #f3f4f7;color:#6c6c6d;cursor:pointer;font-size:1.125rem;outline:none;letter-spacing:-0.25px;line-height:1.75rem;margin-bottom:0.125rem;padding:.625rem 1.875rem .625rem 1.25rem}.two-column-row .contributors-button a{color:#6c6c6d}@media screen and (min-width: 768px){.two-column-row .contributors-button:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.two-column-row .contributors-button:hover:after{width:100%}.two-column-row .contributors-button:hover{color:#262626}}.mobile .enterprise-jumbotron{height:210px}@media screen and (min-width: 768px){.mobile .enterprise-jumbotron{height:280px}}.enterprise{padding-bottom:0}.enterprise p,.enterprise li{color:#6c6c6d;font-size:18px}.enterprise h2{padding-bottom:1.5rem}.enterprise .container{padding:48px 30px 48px 30px}.enterprise .enterprise-gray-container{background-color:#f3f4f7}.enterprise .pyt-enterprise-logo{background-image:url("/assets/images/PTE_lockup_PRIMARY.svg");background-repeat:no-repeat;height:60px}.enterprise .container{max-width:940px}.enterprise .enterprise-landing-azure-logo-container{float:left;padding:0}.ecosystem .events-wrapper{background-color:white}@media screen and (min-width: 768px){.ecosystem .events-wrapper{margin-top:472px}}.ecosystem .events{padding-top:0}.ecosystem .events .event-info-container{display:flex;flex-flow:column}.ecosystem .events .sticky-top{top:15%}.ecosystem .events .event-label{margin-bottom:2rem}.ecosystem .live-event-container{display:flex}@media (max-width: 767px){.ecosystem .live-event-container{flex-flow:wrap}}.ecosystem .events-section{max-width:920px;margin:0 auto 0 auto;padding:0 30px 43px 30px;width:90%}.ecosystem .events-section .event-item{padding-bottom:3rem;border-bottom:1px solid #D6D7D8}.ecosystem .events-section .event-item h2{padding-bottom:1rem}.ecosystem .community-event{margin:0;padding:3px 10px;border:1px solid #8c8c8c;border-radius:3px;text-transform:uppercase;font-size:14px;font-weight:700;color:#8c8c8c}.ecosystem .event-side-nav-container{padding-left:3rem}.ecosystem .event-side-nav-container ul{list-style:none}.ecosystem .live-events-section p{font-size:18px;margin-top:2rem}@media (min-width: 768px) and (max-width: 1239px){.ecosystem .live-events-section{width:100%;padding-left:5px;padding-right:5px}}@media (max-width: 767px){.ecosystem 
.live-events-section{width:100%;padding-left:5px;padding-right:5px}}.ecosystem .events.main-content{padding-top:0}.events-container-fluid{height:5rem;width:100%;padding-bottom:7rem}@media screen and (max-width: 767px){.events-container-fluid{margin-top:2rem}}@media screen and (min-width: 1200px){.events-container-fluid{margin-left:0}}.events-container{max-width:920px;left:0;right:0;margin-left:auto;margin-right:auto;padding-left:0px;padding-right:0px;width:90%}.ecosystem .events.main-content .navbar{padding-left:0;padding-bottom:0;padding-top:0}.ecosystem .events.main-content .navbar .nav-item{cursor:pointer}.ecosystem .events.main-content .navbar .nav-item:last-of-type{position:relative}@media (min-width: 992px){.ecosystem .events.main-content .navbar .nav-item{padding:.5rem;cursor:pointer}.ecosystem .events.main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.ecosystem .events.main-content .navbar .nav-select{background-color:#fff}.ecosystem .events.main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.ecosystem .events.main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}@media screen and (min-width: 768px){.ecosystem .events.main-content .navbar .nav-link{margin-left:1.875rem}}.ecosystem .events.main-content .navbar .nav-link:hover{color:#ee4c2c}.ecosystem .events.main-content .navbar .events-nav-link{padding-left:.9375rem;padding-right:.3125rem}@media screen and (min-width: 768px){.ecosystem .events.main-content .navbar .events-nav-link{padding-left:1.25rem;padding-right:1.25rem}}.ecosystem .events.main-content .navbar .events-nav{flex-direction:row}.ecosystem .events.main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.ecosystem .events.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.ecosystem .events.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.ecosystem .events.main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.ecosystem .events.main-content .navbar .navbar-toggler{margin-left:2.5rem}.events-video-wrapper{width:100%;border:1px solid #797676;background-color:#f3f4f7;height:21rem;margin-top:2.5rem}.events-video-wrapper .video-container{display:flex;top:12%}.events-video-wrapper .video-tabs{display:flex}.events-video-wrapper .events-video-nav{flex-direction:row;padding-right:0;margin-bottom:1rem}.events-video-wrapper .events-video-nav .nav-item{border-right:1px solid #797676;border-bottom:1px solid #797676}.events-video-wrapper .events-video-nav .nav-select{background-color:#fff;border-bottom:none}.events-video-wrapper .events-video-nav .nav-select .nav-link{color:#ee4c2c}.events-video-wrapper .events-nav-link{text-align:center}.events-video-wrapper .video{position:relative;height:0;padding-bottom:30%;place-self:center}.events-video-wrapper .video-info{margin-left:3rem;max-width:45%}.events-video-wrapper iframe{height:100%;width:100%;position:absolute}.video-links-container{border:1px solid #797676}.video-links-container .video-links{display:flex}.video-links-container .video-links .video-link-item{padding-left:1rem;list-style:none}.episode-header-text{font-size:26px;margin-bottom:2rem}.episode-card-row{display:block}@media screen and (min-width: 908px){.episode-card-row{display:flex;flex-wrap:wrap;margin-bottom:2rem}}.episode-card-row 
.episode-card.resource-card{height:14rem;margin-right:1rem;margin-bottom:1rem;background-color:#f3f4f7;border:none;max-width:31%;flex:auto}.episode-card-row .episode-card.resource-card ul{list-style:none}.episode-card-row .episode-card.resource-card a{color:inherit}.episode-card-row .episode-card.resource-card .episode-body{display:block;position:relative;top:30px;margin-left:20px}.episode-card-row .episode-card.resource-card .episode-title{margin-left:3.2rem;margin-bottom:.5rem;font-size:1.5rem}@media screen and (min-width: 768px){.episode-card-row .episode-card.resource-card .episode-title{margin-left:2.5rem}}.episode-card-row .episode-card.resource-card .guest-name{font-weight:500;font-size:1.25rem;overflow:hidden;white-space:nowrap;text-overflow:ellipsis}.episode-card-row .episode-card.resource-card .episode-info{display:flex;justify-content:space-between}.episode-card-row .episode-card.resource-card .episode-info span{padding-left:5px;padding-right:5px}.episode-card-row .episode-card.resource-card .info-divide{display:block;border-bottom:1px solid #D6D7D8;margin-top:.5rem;margin-bottom:.5rem}.episode-card-row .episode-card.resource-card .episode-poster{color:#ee4c2c}.episode-card-row .episode-card.resource-card .episode-date-time{display:flex;padding-left:0}.episode-card-row .episode-card.resource-card .episode-date-time span{padding-left:5px;padding-right:5px}@media screen and (max-width: 907px){.episode-card-row .episode-card.resource-card{max-width:100%;margin-bottom:1.25rem}}.episode-card-row .episode-card.resource-card.pytorch-resource:before{content:"";background-size:32px 32px;background-repeat:no-repeat;display:block;position:absolute;height:32px;width:32px;top:30px;left:15px}@media screen and (min-width: 768px){.episode-card-row .episode-card.resource-card.pytorch-resource:before{left:30px;top:30px}}.podcast-container{padding-left:0}@media screen and (min-width: 768px){.podcast-container{display:flex}.podcast-container .podcast-card:not(:first-of-type){margin-left:1rem}}.podcast-container .podcast-card{display:flex;align-items:center;justify-content:center;margin-top:2rem;border:1px solid #D6D7D8;height:8.75rem}@media screen and (min-width: 768px){.podcast-container .podcast-card:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.podcast-container .podcast-card:hover:after{width:100%}.podcast-container .podcast-card:hover{color:#262626}}.podcast-container .podcast-title{font-size:24px;font-weight:400}.comm-stories .community-stories-wrapper{background-color:white}.comm-stories .community-stories{padding-top:0}.comm-stories .community-stories .production-info-container,.comm-stories .community-stories .research-info-container{display:flex;flex-flow:column}.comm-stories .community-stories .sticky-top{top:15%}.comm-stories .production-container,.comm-stories .research-container{display:flex;padding-left:0}@media (max-width: 767px){.comm-stories .production-container,.comm-stories .research-container{flex-flow:wrap}}.comm-stories .production-section,.comm-stories .research-section{max-width:920px;margin:0 auto 0 auto;padding:0 30px 43px 30px;width:90%}.comm-stories .production-section .production-item,.comm-stories .production-section .research-item,.comm-stories .research-section .production-item,.comm-stories .research-section .research-item{padding-bottom:2rem;padding-top:2rem;border-bottom:1px solid #d6d7d8}.comm-stories .production-section .production-item h2,.comm-stories 
.production-section .research-item h2,.comm-stories .research-section .production-item h2,.comm-stories .research-section .research-item h2{padding-bottom:1rem}.comm-stories .production-side-nav-container #research-sidebar-list,.comm-stories .production-side-nav-container #production-sidebar-list,.comm-stories .research-side-nav-container #research-sidebar-list,.comm-stories .research-side-nav-container #production-sidebar-list{padding-left:0}.comm-stories .production-side-nav-container #research-sidebar-list .active,.comm-stories .production-side-nav-container #production-sidebar-list .active,.comm-stories .research-side-nav-container #research-sidebar-list .active,.comm-stories .research-side-nav-container #production-sidebar-list .active{color:#ee4c2c}.comm-stories .production-side-nav-container #research-sidebar-list ul,.comm-stories .production-side-nav-container #production-sidebar-list ul,.comm-stories .research-side-nav-container #research-sidebar-list ul,.comm-stories .research-side-nav-container #production-sidebar-list ul{padding-left:3rem;list-style:none}.comm-stories .production-side-nav-container #research-sidebar-list ul li,.comm-stories .production-side-nav-container #production-sidebar-list ul li,.comm-stories .research-side-nav-container #research-sidebar-list ul li,.comm-stories .research-side-nav-container #production-sidebar-list ul li{line-height:36px}.comm-stories .production-side-nav-container #research-sidebar-list ul li a,.comm-stories .production-side-nav-container #production-sidebar-list ul li a,.comm-stories .research-side-nav-container #research-sidebar-list ul li a,.comm-stories .research-side-nav-container #production-sidebar-list ul li a{color:#8c8c8c}.comm-stories .production-section p,.comm-stories .research-section p{font-size:18px;margin-top:2rem}@media (min-width: 768px) and (max-width: 1239px){.comm-stories .production-section,.comm-stories .research-section{width:100%;padding-left:5px;padding-right:5px}}@media (max-width: 767px){.comm-stories .production-section,.comm-stories .research-section{width:100%;padding-left:5px;padding-right:5px}}.comm-stories .main-content-wrapper{margin-top:275px}@media screen and (min-width: 768px){.comm-stories .main-content-wrapper{margin-top:380px}}.comm-stories .jumbotron{color:#fff;height:190px}@media screen and (min-width: 768px){.comm-stories .jumbotron{height:260px}}.ecosystem .community-stories.main-content{padding-top:0}.community-stories-container-fluid{height:5rem;width:100%;padding-bottom:7rem}@media screen and (max-width: 767px){.community-stories-container-fluid{margin-top:2rem}}@media screen and (min-width: 1200px){.community-stories-container-fluid{margin-left:0}}.comm-stories .community-stories.main-content .navbar{padding-left:0;padding-bottom:0;padding-top:0}.comm-stories .community-stories.main-content .navbar .nav-item{cursor:pointer}.comm-stories .community-stories.main-content .navbar .nav-item:last-of-type{position:relative}@media (min-width: 992px){.comm-stories .community-stories.main-content .navbar .nav-item{padding:2rem;cursor:pointer}.comm-stories .community-stories.main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.comm-stories .community-stories.main-content .navbar .nav-select{background-color:#fff}.comm-stories .community-stories.main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.comm-stories .community-stories.main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}@media screen and (min-width: 768px){.comm-stories 
.community-stories.main-content .navbar .nav-link{margin-left:1.875rem}}.comm-stories .community-stories.main-content .navbar .nav-link:hover{color:#ee4c2c}.comm-stories .community-stories.main-content .navbar .community-stories-nav-link{padding-left:1.25rem;padding-right:1.25rem}@media screen and (min-width: 768px){.comm-stories .community-stories.main-content .navbar .community-stories-nav-link{padding-left:1.875rem;padding-right:1.875rem}}.comm-stories .community-stories.main-content .navbar .community-stories-nav{flex-direction:row}.comm-stories .community-stories.main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.comm-stories .community-stories.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.comm-stories .community-stories.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.comm-stories .community-stories.main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.comm-stories .community-stories.main-content .navbar .navbar-toggler{margin-left:2.5rem}.announcement .hero-content{top:148px;height:250px;position:relative;margin-bottom:120px;justify-content:center}@media screen and (min-width: 768px){.announcement .hero-content{top:178px;height:350px}}.announcement .hero-content h1{font-size:3.75rem;text-transform:uppercase;font-weight:lighter;letter-spacing:1.08px;margin-bottom:.625rem;line-height:1.05;color:#fff}@media screen and (min-width: 768px){.announcement .hero-content h1{font-size:4.5rem}}.announcement .hero-content h1.small{font-size:40px}@media screen and (min-width: 768px){.announcement .hero-content h1.small{font-size:58px}}.announcement .hero-content .lead{margin-bottom:1.5625rem;padding-top:1.875rem;color:#fff;width:100%}.announcement .row{justify-content:center}.announcement .main-content{margin-bottom:5rem;padding-bottom:0}.announcement .main-background{height:370px}@media screen and (min-width: 768px){.announcement .main-background{height:450px}}.announcement .card-container{display:grid;grid-template-columns:repeat(2, 1fr);gap:20px;padding-top:3rem}.announcement .card-container .card{border:none;display:block}.announcement .card-container .card a{color:#000}.announcement .card-container .card .card-body{display:flex;flex-direction:column;height:100%;justify-content:space-between;padding:0}.announcement .card-container .card .card-body img{width:100%;height:207px;-o-object-fit:contain;object-fit:contain;padding:20px}@media screen and (min-width: 1000px){.announcement .card-container .card .card-body img{padding:30px}}@media screen and (min-width: 1000px){.announcement .card-container{grid-template-columns:repeat(3, 1fr);gap:36px}}.announcement .contact-us-section{background-color:#f3f4f7;padding:50px 0}.announcement .contact-us-section .row{justify-content:center}.announcement .contact-us-section .row .lead{padding-top:1.5rem}.announcement .contact-us-section .row .hbspt-form{padding:30px 0}.announcement .contact-us-section .row .hbspt-form .hs-button{background-image:url("/assets/images/chevron-right-orange.svg");background-size:6px 13px;background-position:top 16px right 11px;background-repeat:no-repeat;border-radius:0;border:none;background-color:#fff;color:#6c6c6d;font-weight:400;position:relative;letter-spacing:0.25px;padding:.75rem 2rem .75rem .75rem;margin:10px 0}@media screen and (min-width: 768px){.announcement .contact-us-section .row .hbspt-form 
.hs-button:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.announcement .contact-us-section .row .hbspt-form .hs-button:hover:after{width:100%}.announcement .contact-us-section .row .hbspt-form .hs-button:hover{color:#262626}}@media screen and (min-width: 768px){.announcement .contact-us-section .row .hbspt-form .hs-button{background-position:top 19px right 11px}}.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-2,.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-1{max-width:100%}.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-2 .hs-form-field,.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-1 .hs-form-field{max-width:100%;padding:10px 0;width:100%}.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-2 .hs-form-field input,.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-1 .hs-form-field input{border:none;width:100%}.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-2 .hs-form-field textarea,.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-1 .hs-form-field textarea{border:none;width:100%}.announcement .contact-us-section .row .hbspt-form li.hs-form-radio input[type=radio]{width:auto !important}.announcement .contact-us-section .row .hbspt-form li.hs-form-radio span{margin-left:5px}.announcement .contact-us-section .row .hbspt-form ul{list-style-type:none}.announcement .light-background-section{background-color:#fff}.announcement .light-background-section .content{padding:40px 0}.announcement .light-background-section ul li{font-size:1.25rem;font-weight:300}.announcement .darker-background-section{background-color:#f3f4f7}.announcement .darker-background-section .content{padding:40px 0}.announcement .grey-background-section{background-color:#f3f4f7;padding:60px 0}.announcement .grey-background-section img{height:100px}.announcement .grey-background-section p{font-size:14px;line-height:170%}.announcement .color-background-section{background-image:url("/assets/images/pytorch_bg_purple.jpg");background-size:100% 100%;background-repeat:no-repeat;padding:60px 0}.announcement .color-background-section h2{color:white}.announcement .body-side-text .lead{margin-bottom:1.5625rem;padding-top:1.5rem}.announcement img{width:100%}.announcement h2.upper{font-size:25px;line-height:130%;text-align:center;letter-spacing:1.75px;text-transform:uppercase;margin-bottom:30px}.announcement h3.upper{font-size:19px;text-transform:uppercase;letter-spacing:1.75px;line-height:130%;margin:25px 0}.announcement table.benefits{background-color:white;font-size:14px;text-align:center}.announcement table.benefits td.benefit{border-left:none;min-width:300px;text-align:left}@media screen and (min-width: 768px){.announcement table.benefits td.benefit{min-width:520px}}.announcement table.benefits tbody td{border-left:1px solid #812CE5;vertical-align:middle}.announcement table.benefits tbody td.benefit{font-weight:600}.announcement table.benefits thead,.announcement table.benefits tfoot{background-color:#812CE5;color:white;font-size:16px;font-weight:700}@media screen and (min-width: 768px){.announcement table.benefits thead,.announcement table.benefits tfoot{font-size:20px}}.announcement table.benefits thead td,.announcement table.benefits tfoot td{border-left:1px solid #000;vertical-align:middle;border-top:none}.announcement table.benefits thead 
a,.announcement table.benefits tfoot a{text-decoration:underline;color:white}.announcement table.benefits thead td.price,.announcement table.benefits tfoot td.price{font-size:14px;line-height:1.2}@media screen and (min-width: 768px){.announcement table.benefits thead td.price,.announcement table.benefits tfoot td.price{font-size:16px}}.announcement table.benefits img{width:15px}.announcement .modal-header{border-bottom:none;padding-bottom:0}.announcement .consolidated-employees tbody td{font-weight:600}.announcement .consolidated-employees td.no-border{border-left:none}.announcement .member-boxes{gap:20px;margin:0}.announcement .member-boxes div.col-sm{background-color:white}.board-member{margin:35px 0}.board-member img{margin-bottom:15px}.board-member a svg{margin-top:5px;height:25px;max-width:30px;fill:#000;color:#000}.board-member a:hover svg{fill:#ee4c2c;color:#ee4c2c}.announcement .cloud-credits-table{font-size:1.1rem;margin-top:40px}.announcement .cloud-credits-table ul{padding-left:20px}.announcement .cloud-credits-table ul li{margin-top:10px;font-size:1.1rem}.announcement .cloud-credits-table .col-md{border-radius:5px;margin-bottom:40px}.announcement .cloud-credits-table .card{border-radius:6px}.announcement .cloud-credits-table .thead{border-top-left-radius:5px;border-top-right-radius:5px;color:#fff;padding:14px 20px;text-align:center}.announcement .cloud-credits-table .col-md:first-child .thead{background:conic-gradient(from 53deg at 37% 100%, #828282 0, rgba(130,130,130,0.95) 100%)}.announcement .cloud-credits-table .col-md:nth-child(2) .thead{background:conic-gradient(from 53deg at 37% 100%, #ab9344 0, rgba(171,147,68,0.95) 100%)}.announcement .cloud-credits-table .col-md:nth-child(3) .thead{background:conic-gradient(from 53deg at 37% 100%, #293850 0, rgba(41,56,80,0.95) 100%)}.announcement .cloud-credits-table .tbody{border-bottom:1px solid #d0d0d0;border-left:1px solid #d0d0d0;border-right:1px solid #d0d0d0;height:100%;padding:26px 20px}.announcement .cloud-credits-table .tfoot{background-color:#000;border-bottom-left-radius:5px;border-bottom-right-radius:5px;color:#fff;padding:20px;text-align:center}.announcement .steps-columns{background-color:transparent}.announcement .steps-columns .col-md{margin-bottom:20px;padding:20px}.announcement .steps-columns h3{margin-bottom:20px}.announcement .steps-columns .step{font-size:1.5rem;margin-bottom:5px;margin-top:20px}.announcement .steps-columns ul{padding-left:20px}.announcement .steps-columns ul li{margin-top:10px} diff --git a/assets/menu-tab-selection.js b/assets/menu-tab-selection.js new file mode 100644 index 000000000000..04d3a0f8b684 --- /dev/null +++ b/assets/menu-tab-selection.js @@ -0,0 +1,7 @@ +var menuTabScript = $("script[src*=menu-tab-selection]"); +var pageId = menuTabScript.attr("page-id"); + +$(".main-content-menu .nav-item").removeClass("nav-select"); +$(".main-content-menu .nav-link[data-id='" + pageId + "']") + .parent(".nav-item") + .addClass("nav-select"); diff --git a/assets/mobile-menu.js b/assets/mobile-menu.js new file mode 100644 index 000000000000..fab8b2e7af40 --- /dev/null +++ b/assets/mobile-menu.js @@ -0,0 +1,30 @@ +var mobileMenu = { + bind: function() { + $("[data-behavior='open-mobile-menu']").on('click', function(e) { + e.preventDefault(); + $(".mobile-main-menu").addClass("open"); + $("body").addClass('no-scroll'); + + mobileMenu.listenForResize(); + }); + + $("[data-behavior='close-mobile-menu']").on('click', function(e) { + e.preventDefault(); + mobileMenu.close(); + }); + }, + + 
listenForResize: function() { + $(window).on('resize.ForMobileMenu', function() { + if ($(this).width() > 768) { + mobileMenu.close(); + } + }); + }, + + close: function() { + $(".mobile-main-menu").removeClass("open"); + $("body").removeClass('no-scroll'); + $(window).off('resize.ForMobileMenu'); + } +}; diff --git a/assets/mobile-page-sidebar.js b/assets/mobile-page-sidebar.js new file mode 100644 index 000000000000..d90f0495db3b --- /dev/null +++ b/assets/mobile-page-sidebar.js @@ -0,0 +1,26 @@ +$(".pytorch-article h2").each(function() { + $("#mobile-page-sidebar-list").append( + "
  • " + this.textContent + "
  • " + ); +}); + +$(".mobile-page-sidebar li").on("click", function() { + removeActiveClass(); + addActiveClass(this); +}); + +function removeActiveClass() { + $(".mobile-page-sidebar li a").each(function() { + $(this).removeClass("active"); + }); +} + +function addActiveClass(element) { + $(element) + .find("a") + .addClass("active"); +} + +if ($("#mobile-page-sidebar-list").text() == "") { + $("#shortcuts-menu").hide(); +} diff --git a/assets/pte/PyTorch-Enterprise-Participation-Form.pdf b/assets/pte/PyTorch-Enterprise-Participation-Form.pdf new file mode 100644 index 000000000000..527104588550 Binary files /dev/null and b/assets/pte/PyTorch-Enterprise-Participation-Form.pdf differ diff --git a/assets/pte/PyTorch-Enterprise-Support-Program-Brand-Guidelines-May2021.pdf b/assets/pte/PyTorch-Enterprise-Support-Program-Brand-Guidelines-May2021.pdf new file mode 100644 index 000000000000..3a1f29d3b5ca Binary files /dev/null and b/assets/pte/PyTorch-Enterprise-Support-Program-Brand-Guidelines-May2021.pdf differ diff --git a/assets/pte/PyTorch-Enterprise-Support-Program-Certification-Guide.pdf b/assets/pte/PyTorch-Enterprise-Support-Program-Certification-Guide.pdf new file mode 100644 index 000000000000..4c1f1f10f2d0 Binary files /dev/null and b/assets/pte/PyTorch-Enterprise-Support-Program-Certification-Guide.pdf differ diff --git a/assets/pte/PyTorch-Enterprise-Support-Program-Terms.pdf b/assets/pte/PyTorch-Enterprise-Support-Program-Terms.pdf new file mode 100644 index 000000000000..28f1a52f3981 Binary files /dev/null and b/assets/pte/PyTorch-Enterprise-Support-Program-Terms.pdf differ diff --git a/assets/pytorch-foundation-charter.pdf b/assets/pytorch-foundation-charter.pdf new file mode 100644 index 000000000000..7dac6f5ac972 Binary files /dev/null and b/assets/pytorch-foundation-charter.pdf differ diff --git a/assets/pytorch-foundation-principles.pdf b/assets/pytorch-foundation-principles.pdf new file mode 100644 index 000000000000..eef7538c328a Binary files /dev/null and b/assets/pytorch-foundation-principles.pdf differ diff --git a/assets/pytorch-frame-expert-exchange.pdf b/assets/pytorch-frame-expert-exchange.pdf new file mode 100644 index 000000000000..6930f03c3ccb Binary files /dev/null and b/assets/pytorch-frame-expert-exchange.pdf differ diff --git a/assets/pytorch-profiler.gif b/assets/pytorch-profiler.gif new file mode 100644 index 000000000000..8b61b6e3b61e Binary files /dev/null and b/assets/pytorch-profiler.gif differ diff --git a/assets/pytorch2-2.pdf b/assets/pytorch2-2.pdf new file mode 100644 index 000000000000..8669ecd430ce Binary files /dev/null and b/assets/pytorch2-2.pdf differ diff --git a/assets/quick-start-module.js b/assets/quick-start-module.js new file mode 100644 index 000000000000..ab2929175469 --- /dev/null +++ b/assets/quick-start-module.js @@ -0,0 +1,285 @@ +// Keys are Substrings as diplayed by navigator.platform +var supportedOperatingSystems = new Map([ + ['linux', 'linux'], + ['mac', 'macos'], + ['win', 'windows'], +]); + +var archInfoMap = new Map([ + ['cuda', {title: "CUDA", platforms: new Set(['linux', 'windows'])}], + ['rocm', {title: "ROCm", platforms: new Set(['linux'])}], + ['accnone', {title: "CPU", platforms: new Set(['linux', 'macos', 'windows'])}] +]); + +let version_map={"nightly": {"accnone": ["cpu", ""], "cuda.x": ["cuda", "11.8"], "cuda.y": ["cuda", "12.6"], "cuda.z": ["cuda", "12.8"], "rocm5.x": ["rocm", "6.4"]}, "release": {"accnone": ["cpu", ""], "cuda.x": ["cuda", "11.8"], "cuda.y": ["cuda", "12.6"], "cuda.z": ["cuda", "12.8"], 
"rocm5.x": ["rocm", "6.3"]}} +let stable_version="Stable (2.7.1)"; + +var default_selected_os = getAnchorSelectedOS() || getDefaultSelectedOS(); +var opts = { + cuda: getPreferredCuda(default_selected_os), + os: default_selected_os, + pm: 'pip', + language: 'python', + ptbuild: 'stable', +}; + +var supportedCloudPlatforms = [ + 'aws', + 'google-cloud', + 'microsoft-azure', + 'lightning-studios', +]; + +var os = $(".os > .option"); +var package = $(".package > .option"); +var language = $(".language > .option"); +var cuda = $(".cuda > .option"); +var ptbuild = $(".ptbuild > .option"); + +os.on("click", function() { + selectedOption(os, this, "os"); +}); +package.on("click", function() { + selectedOption(package, this, "pm"); +}); +language.on("click", function() { + selectedOption(language, this, "language"); +}); +cuda.on("click", function() { + selectedOption(cuda, this, "cuda"); +}); +ptbuild.on("click", function() { + selectedOption(ptbuild, this, "ptbuild") +}); + +// Pre-select user's operating system +$(function() { + var userOsOption = document.getElementById(opts.os); + var userCudaOption = document.getElementById(opts.cuda); + if (userOsOption) { + $(userOsOption).trigger("click"); + } + if (userCudaOption) { + $(userCudaOption).trigger("click"); + } +}); + + +// determine os (mac, linux, windows) based on user's platform +function getDefaultSelectedOS() { + var platform = navigator.platform.toLowerCase(); + for (var [navPlatformSubstring, os] of supportedOperatingSystems.entries()) { + if (platform.indexOf(navPlatformSubstring) !== -1) { + return os; + } + } + // Just return something if user platform is not in our supported map + return supportedOperatingSystems.values().next().value; +} + +// determine os based on location hash +function getAnchorSelectedOS() { + var anchor = location.hash; + var ANCHOR_REGEX = /^#[^ ]+$/; + // Look for anchor in the href + if (!ANCHOR_REGEX.test(anchor)) { + return false; + } + // Look for anchor with OS in the first portion + var testOS = anchor.slice(1).split("-")[0]; + for (var [navPlatformSubstring, os] of supportedOperatingSystems.entries()) { + if (testOS.indexOf(navPlatformSubstring) !== -1) { + return os; + } + } + return false; +} + +// determine CUDA version based on OS +function getPreferredCuda(os) { + // Only CPU builds are currently available for MacOS + if (os == 'macos') { + return 'accnone'; + } + return 'cuda.x'; +} + +// Disable compute platform not supported on OS +function disableUnsupportedPlatforms(os) { + + if(opts.ptbuild == "preview") + archMap = version_map.nightly + else + archMap = version_map.release + + for (const [arch_key, info] of archInfoMap) { + var elems = document.querySelectorAll('[id^="'+arch_key+'"]'); + if (elems == null) { + console.log("Failed to find element for architecture " + arch_key); + return; + } + for (var i=0; i < elems.length;i++) { + var supported = info.platforms.has(os); + elems[i].style.textDecoration = supported ? 
"" : "line-through"; + + // Officially supported arch but not available + if(!archMap[elems[i].id]) { + elems[i].style.textDecoration = "line-through"; + } + } + } +} + +// Change compute versions depending on build type +function changeVersion(ptbuild) { + + if(ptbuild == "preview") + archMap = version_map.nightly + else + archMap = version_map.release + + for (const [arch_key, info] of archInfoMap) { + var elems = document.querySelectorAll('[id^="'+arch_key+'"]'); + for (var i=0; i < elems.length;i++) { + if(archMap[elems[i].id]) { + elems[i].style.textDecoration = ""; + elems[i].children[0].textContent = info.title + " " + archMap[elems[i].id][1] + } else { + elems[i].style.textDecoration = "line-through"; + } + } + } + var stable_element = document.getElementById("stable"); + stable_element.children[0].textContent = stable_version; +} + + + +// Change accnone name depending on OS type +function changeAccNoneName(osname) { + var accnone_element = document.getElementById("accnone"); + if (accnone_element == null) { + console.log("Failed to find accnone element"); + return; + } + if (osname == "macos") { + accnone_element.children[0].textContent = "Default"; + } else { + accnone_element.children[0].textContent = "CPU"; + } +} + +function selectedOption(option, selection, category) { + $(option).removeClass("selected"); + $(selection).addClass("selected"); + opts[category] = selection.id; + if (category === "pm") { + var elements = document.getElementsByClassName("language")[0].children; + if (selection.id !== "libtorch" && elements["cplusplus"].classList.contains("selected")) { + $(elements["cplusplus"]).removeClass("selected"); + $(elements["python"]).addClass("selected"); + opts["language"] = "python"; + } else if (selection.id == "libtorch") { + for (var i = 0; i < elements.length; i++) { + if (elements[i].id === "cplusplus") { + $(elements[i]).addClass("selected"); + opts["language"] = "cplusplus"; + } else { + $(elements[i]).removeClass("selected"); + } + } + } + } else if (category === "language") { + var elements = document.getElementsByClassName("package")[0].children; + if (selection.id !== "cplusplus" && elements["libtorch"].classList.contains("selected")) { + $(elements["libtorch"]).removeClass("selected"); + $(elements["pip"]).addClass("selected"); + opts["pm"] = "pip"; + } else if (selection.id == "cplusplus") { + for (var i = 0; i < elements.length; i++) { + if (elements[i].id === "libtorch") { + $(elements[i]).addClass("selected"); + opts["pm"] = "libtorch"; + } else { + $(elements[i]).removeClass("selected"); + } + } + } + } else if (category == "ptbuild") { + changeVersion(opts.ptbuild); + //make sure unsupported platforms are disabled + disableUnsupportedPlatforms(opts.os); + } + commandMessage(buildMatcher()); + if (category === "os") { + disableUnsupportedPlatforms(opts.os); + display(opts.os, 'installation', 'os'); + } + changeAccNoneName(opts.os); +} + +function display(selection, id, category) { + var container = document.getElementById(id); + // Check if there's a container to display the selection + if (container === null) { + return; + } + var elements = container.getElementsByClassName(category); + for (var i = 0; i < elements.length; i++) { + if (elements[i].classList.contains(selection)) { + $(elements[i]).addClass("selected"); + } else { + $(elements[i]).removeClass("selected"); + } + } +} + +function buildMatcher() { + return ( + opts.ptbuild.toLowerCase() + + "," + + opts.pm.toLowerCase() + + "," + + opts.os.toLowerCase() + + "," + + opts.cuda.toLowerCase() 
+ + "," + + opts.language.toLowerCase() + ); +} + +// Cloud Partners sub-menu toggle listeners +$("[data-toggle='cloud-dropdown']").on("click", function(e) { + if ($(this).hasClass("open")) { + $(this).removeClass("open"); + // If you deselect a current drop-down item, don't display it's info any longer + display(null, 'cloud', 'platform'); + } else { + $("[data-toggle='cloud-dropdown'].open").removeClass("open"); + $(this).addClass("open"); + var cls = $(this).find(".cloud-option-body")[0].className; + for (var i = 0; i < supportedCloudPlatforms.length; i++) { + if (cls.includes(supportedCloudPlatforms[i])) { + display(supportedCloudPlatforms[i], 'cloud', 'platform'); + } + } + } +}); + +function commandMessage(key) { + var object = {"preview,pip,linux,accnone,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu", "preview,pip,linux,cuda.x,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118", "preview,pip,linux,cuda.y,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126", "preview,pip,linux,cuda.z,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128", "preview,pip,linux,rocm5.x,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4", "preview,libtorch,linux,accnone,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/nightly/cpu/libtorch-cxx11-abi-shared-with-deps-latest.zip", "preview,libtorch,linux,cuda.x,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/nightly/cu118/libtorch-cxx11-abi-shared-with-deps-latest.zip", "preview,libtorch,linux,cuda.y,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/nightly/cu126/libtorch-cxx11-abi-shared-with-deps-latest.zip", "preview,libtorch,linux,cuda.z,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/nightly/cu128/libtorch-cxx11-abi-shared-with-deps-latest.zip", "preview,libtorch,linux,rocm5.x,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/nightly/rocm6.4/libtorch-cxx11-abi-shared-with-deps-latest.zip", "preview,pip,macos,cuda.x,python": "# CUDA is not available on MacOS, please use default package
    pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu", "preview,pip,macos,cuda.y,python": "# CUDA is not available on MacOS, please use default package
    pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu", "preview,pip,macos,cuda.z,python": "# CUDA is not available on MacOS, please use default package
    pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu", "preview,pip,macos,rocm5.x,python": "# ROCm is not available on MacOS, please use default package
    pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu", "preview,pip,macos,accnone,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu", "preview,libtorch,macos,accnone,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/nightly/cpu/libtorch-macos-arm64-latest.zip", "preview,libtorch,macos,cuda.x,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/nightly/cpu/libtorch-macos-arm64-latest.zip", "preview,libtorch,macos,cuda.y,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/nightly/cpu/libtorch-macos-arm64-latest.zip", "preview,libtorch,macos,cuda.z,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/nightly/cpu/libtorch-macos-arm64-latest.zip", "preview,libtorch,macos,rocm5.x,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/nightly/cpu/libtorch-macos-arm64-latest.zip", "preview,pip,windows,accnone,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu", "preview,pip,windows,cuda.x,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118", "preview,pip,windows,cuda.y,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126", "preview,pip,windows,cuda.z,python": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128", "preview,pip,windows,rocm5.x,python": "NOTE: ROCm is not available on Windows", "preview,libtorch,windows,accnone,cplusplus": "Download here (Release version):
    https://download.pytorch.org/libtorch/nightly/cpu/libtorch-win-shared-with-deps-latest.zip
    Download here (Debug version):
    https://download.pytorch.org/libtorch/nightly/cpu/libtorch-win-shared-with-deps-debug-latest.zip", "preview,libtorch,windows,cuda.x,cplusplus": "Download here (Release version):
    https://download.pytorch.org/libtorch/nightly/cu118/libtorch-win-shared-with-deps-latest.zip
    Download here (Debug version):
    https://download.pytorch.org/libtorch/nightly/cu118/libtorch-win-shared-with-deps-debug-latest.zip", "preview,libtorch,windows,cuda.y,cplusplus": "Download here (Release version):
    https://download.pytorch.org/libtorch/nightly/cu126/libtorch-win-shared-with-deps-latest.zip
    Download here (Debug version):
    https://download.pytorch.org/libtorch/nightly/cu126/libtorch-win-shared-with-deps-debug-latest.zip", "preview,libtorch,windows,cuda.z,cplusplus": "Download here (Release version):
    https://download.pytorch.org/libtorch/nightly/cu128/libtorch-win-shared-with-deps-latest.zip
    Download here (Debug version):
    https://download.pytorch.org/libtorch/nightly/cu128/libtorch-win-shared-with-deps-debug-latest.zip", "preview,libtorch,windows,rocm5.x,cplusplus": "NOTE: ROCm is not available on Windows", "stable,pip,linux,accnone,python": "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu", "stable,pip,linux,cuda.x,python": "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118", "stable,pip,linux,cuda.y,python": "pip3 install torch torchvision torchaudio", "stable,pip,linux,cuda.z,python": "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128", "stable,pip,linux,rocm5.x,python": "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3", "stable,libtorch,linux,accnone,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.7.1%2Bcpu.zip", "stable,libtorch,linux,cuda.x,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.7.1%2Bcu118.zip", "stable,libtorch,linux,cuda.y,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/cu126/libtorch-cxx11-abi-shared-with-deps-2.7.1%2Bcu126.zip", "stable,libtorch,linux,cuda.z,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/cu128/libtorch-cxx11-abi-shared-with-deps-2.7.1%2Bcu128.zip", "stable,libtorch,linux,rocm5.x,cplusplus": "Download here (cxx11 ABI):
    https://download.pytorch.org/libtorch/rocm6.3/libtorch-cxx11-abi-shared-with-deps-2.7.1%2Brocm6.3.zip", "stable,pip,macos,cuda.x,python": "# CUDA is not available on MacOS, please use default package
    pip3 install torch torchvision torchaudio", "stable,pip,macos,cuda.y,python": "# CUDA is not available on MacOS, please use default package
    pip3 install torch torchvision torchaudio", "stable,pip,macos,cuda.z,python": "# CUDA is not available on MacOS, please use default package
    pip3 install torch torchvision torchaudio", "stable,pip,macos,rocm5.x,python": "# ROCm is not available on MacOS, please use default package
    pip3 install torch torchvision torchaudio", "stable,pip,macos,accnone,python": "pip3 install torch torchvision torchaudio", "stable,libtorch,macos,accnone,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.7.1.zip", "stable,libtorch,macos,cuda.x,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.7.1.zip", "stable,libtorch,macos,cuda.y,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.7.1.zip", "stable,libtorch,macos,cuda.z,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.7.1.zip", "stable,libtorch,macos,rocm5.x,cplusplus": "Download arm64 libtorch here (ROCm and CUDA are not supported):
    https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.7.1.zip", "stable,pip,windows,accnone,python": "pip3 install torch torchvision torchaudio", "stable,pip,windows,cuda.x,python": "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118", "stable,pip,windows,cuda.y,python": "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126", "stable,pip,windows,cuda.z,python": "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128", "stable,pip,windows,rocm5.x,python": "NOTE: ROCm is not available on Windows", "stable,libtorch,windows,accnone,cplusplus": "Download here (Release version):
    https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-2.7.1%2Bcpu.zip
    Download here (Debug version):
    https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-2.7.1%2Bcpu.zip", "stable,libtorch,windows,cuda.x,cplusplus": "Download here (Release version):
    https://download.pytorch.org/libtorch/cu118/libtorch-win-shared-with-deps-2.7.1%2Bcu118.zip
    Download here (Debug version):
    https://download.pytorch.org/libtorch/cu118/libtorch-win-shared-with-deps-debug-2.7.1%2Bcu118.zip", "stable,libtorch,windows,cuda.y,cplusplus": "Download here (Release version):
    https://download.pytorch.org/libtorch/cu126/libtorch-win-shared-with-deps-2.7.1%2Bcu126.zip
    Download here (Debug version):
    https://download.pytorch.org/libtorch/cu126/libtorch-win-shared-with-deps-debug-2.7.1%2Bcu126.zip", "stable,libtorch,windows,cuda.z,cplusplus": "Download here (Release version):
    https://download.pytorch.org/libtorch/cu128/libtorch-win-shared-with-deps-2.7.1%2Bcu128.zip
    Download here (Debug version):
    https://download.pytorch.org/libtorch/cu128/libtorch-win-shared-with-deps-debug-2.7.1%2Bcu128.zip", "stable,libtorch,windows,rocm5.x,cplusplus": "NOTE: ROCm is not available on Windows"}; + + if (!object.hasOwnProperty(key)) { + $("#command").html( + "
    <pre> # Follow instructions at this URL: https://github.com/pytorch/pytorch#from-source </pre>
    " + ); + } else if (key.indexOf("lts") == 0 && key.indexOf('rocm') < 0) { + $("#command").html("
    " + object[key] + "
    "); + } else { + $("#command").html("
    " + object[key] + "
    "); + } +} + +// Set cuda version right away +changeVersion("stable") + diff --git a/assets/scroll-to-anchor.js b/assets/scroll-to-anchor.js new file mode 100644 index 000000000000..79fee28bb60b --- /dev/null +++ b/assets/scroll-to-anchor.js @@ -0,0 +1,86 @@ +// Modified from https://stackoverflow.com/a/13067009 +// Going for a JS solution to scrolling to an anchor so we can benefit from +// less hacky css and smooth scrolling. + +var scrollToAnchor = { + bind: function() { + var document = window.document; + var history = window.history; + var location = window.location + var HISTORY_SUPPORT = !!(history && history.pushState); + + var anchorScrolls = { + ANCHOR_REGEX: /^#[^ ]+$/, + offsetHeightPx: function() { + return $(".header-holder").height() + 20; + }, + + /** + * Establish events, and fix initial scroll position if a hash is provided. + */ + init: function() { + this.scrollToCurrent(); + $(window).on('hashchange', $.proxy(this, 'scrollToCurrent')); + $('body').on('click', 'a', $.proxy(this, 'delegateAnchors')); + }, + + /** + * Return the offset amount to deduct from the normal scroll position. + * Modify as appropriate to allow for dynamic calculations + */ + getFixedOffset: function() { + return this.offsetHeightPx(); + }, + + /** + * If the provided href is an anchor which resolves to an element on the + * page, scroll to it. + * @param {String} href + * @return {Boolean} - Was the href an anchor. + */ + scrollIfAnchor: function(href, pushToHistory) { + var match, anchorOffset; + + if(!this.ANCHOR_REGEX.test(href)) { + return false; + } + + match = document.getElementById(href.slice(1)); + + if(match) { + anchorOffset = $(match).offset().top - this.getFixedOffset(); + $('html, body').scrollTop(anchorOffset); + + // Add the state to history as-per normal anchor links + if(HISTORY_SUPPORT && pushToHistory) { + history.pushState({}, document.title, location.pathname + href); + } + } + + return !!match; + }, + + /** + * Attempt to scroll to the current location's hash. + */ + scrollToCurrent: function(e) { + if(this.scrollIfAnchor(window.location.hash) && e) { + e.preventDefault(); + } + }, + + /** + * If the click event's target was an anchor, fix the scroll position. 
+ */ + delegateAnchors: function(e) { + var elem = e.target; + + if(this.scrollIfAnchor(elem.getAttribute('href'), true)) { + e.preventDefault(); + } + } + }; + + $(document).ready($.proxy(anchorScrolls, 'init')); + } +}; diff --git a/assets/search-bar.js b/assets/search-bar.js new file mode 100644 index 000000000000..a9128101edda --- /dev/null +++ b/assets/search-bar.js @@ -0,0 +1,42 @@ +docsearch({ + apiKey: "e3b73ac141dff0b0fd27bdae9055bc73", + indexName: "pytorch", + inputSelector: "#search-input", + debug: false // Set debug to true if you want to inspect the dropdown +}); + +docsearch({ + apiKey: 'e3b73ac141dff0b0fd27bdae9055bc73', + indexName: 'pytorch', + inputSelector: '#mobile-search-input', + algoliaOptions: { + hitsPerPage: 5 + }, + debug: false // Set debug to true if you want to inspect the dropdown +}); + +$("#search-icon").on("click", function() { + $(this).hide(); + $("#close-search").show(); + $(".search-border") + .addClass("active-background") + .animate({ width: "100%" }, "slow"); + $("#search-input") + .addClass("active-search-icon") + .focus(); + $(".main-menu-item").hide(); + $(".header-logo").addClass("active-header"); +}); + +$("#close-search").on("click", function() { + $(this).hide(); + $("#search-icon").show(); + $(".search-border") + .attr("style", "") + .removeClass("active-background"); + $("#search-input") + .removeClass("active-search-icon") + .val(""); + $(".main-menu-item").fadeIn("slow"); + $(".header-logo").removeClass("active-header"); +}); diff --git a/assets/show-screencast.js b/assets/show-screencast.js new file mode 100644 index 000000000000..295d88edba67 --- /dev/null +++ b/assets/show-screencast.js @@ -0,0 +1,15 @@ +$('a.show-screencast').one('click', func); + +function func(e) { + e.preventDefault(); + $(this).next('div.screencast').show(); + // Hide the show button + $(this).hide(); +} + +$('div.screencast a:contains(Hide)').click(function (e) { + e.preventDefault(); + // Make the show button visible again + $(this).parent().hide() + .prev().one('click', func).show(); +}); \ No newline at end of file diff --git a/assets/social/github-black.svg b/assets/social/github-black.svg new file mode 100644 index 000000000000..1df775c4d103 --- /dev/null +++ b/assets/social/github-black.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/social/github-white.svg b/assets/social/github-white.svg new file mode 100644 index 000000000000..d077e97c8d86 --- /dev/null +++ b/assets/social/github-white.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/tos-oss-privacy-policy/fb-oss-privacy-policy.pdf b/assets/tos-oss-privacy-policy/fb-oss-privacy-policy.pdf new file mode 100644 index 000000000000..237bb79cf4eb Binary files /dev/null and b/assets/tos-oss-privacy-policy/fb-oss-privacy-policy.pdf differ diff --git a/assets/tos-oss-privacy-policy/fb-tos-privacy-policy.pdf b/assets/tos-oss-privacy-policy/fb-tos-privacy-policy.pdf new file mode 100644 index 000000000000..abacd4fbc48c Binary files /dev/null and b/assets/tos-oss-privacy-policy/fb-tos-privacy-policy.pdf differ diff --git a/assets/track-events.js b/assets/track-events.js new file mode 100644 index 000000000000..82f05b7d35f8 --- /dev/null +++ b/assets/track-events.js @@ -0,0 +1,107 @@ +var trackEvents = { + recordClick: function(eventCategory, eventLabel) { + if (typeof gtag == "function") { + var gaEventObject = { + eventCategory: eventCategory, + eventAction: "click", + eventLabel: eventLabel + }; + + gtag('event', 'click', gaEventObject); + } + + if (typeof fbq === 
"function" && eventLabel !== "Download") { + fbq("trackCustom", eventCategory, { + target: eventLabel + }); + } + else { + // Only call the lead event code when a user signs up + // to download the deep learning book + fbq('track', 'Lead'); + } + }, + + bind: function() { + // Clicks on the main menu + $(".main-menu ul li a").on("click", function() { + trackEvents.recordClick("Global Nav", $(this).text()); + return true; + }); + + // Clicks on GitHub link in main or mobile menu + $("#github-main-menu-link, #github-mobile-menu-link").on( + "click", + function() { + trackEvents.recordClick("Link", $(this).text()); + return true; + } + ); + + // Clicks on Resource cards + $(".resource-card a").on("click", function() { + trackEvents.recordClick("Resource Card", $(this).find("h4").text()); + return true; + }); + + // Clicks on Ecosystem Project cards + $(".ecosystem-card a").on("click", function() { + trackEvents.recordClick("Ecosystem Project Card", $(this).find(".card-title").text()); + return true; + }); + + // Clicks on 'Get Started' call to action buttons + $("[data-cta='get-started']").on("click", function() { + trackEvents.recordClick("Get Started CTA", $(this).text()); + return true; + }); + + // Clicks on Cloud Platforms in Quick Start Module + $(".cloud-option").on("click", function() { + var platformName = $.trim($(this).find(".cloud-option-body").text()); + trackEvents.recordClick("Quick Start Module - Cloud Platforms", platformName); + }); + + // Clicks on Cloud Platform Services in Quick Start Module + $(".cloud-option ul li a").on("click", function() { + var platformName = $.trim( + $(this). + closest("[data-toggle='cloud-dropdown']"). + find(".cloud-option-body"). + text() + ); + + var serviceName = $.trim($(this).text()); + + trackEvents.recordClick( + "Quick Start Module - Cloud Platforms", + platformName + " - " + serviceName + ); + return true; + }); + + // Clicks on options in Quick Start - Locally + $(".quick-start-module .row .option").on("click", function() { + var selectedOption = $.trim($(this).text()); + var rowIndex = $(this).closest(".row").index(); + var selectedCategory = $(".quick-start-module .headings .title-block"). + eq(rowIndex). + find(".option-text"). 
+ text(); + + trackEvents.recordClick( + "Quick Start Module - Local Install", + selectedCategory + ": " + selectedOption + ) + }) + + // Clicks on Deep Learning Download button + $("#deep-learning-button").on( + "click", + function() { + trackEvents.recordClick("Link", "Download"); + return true; + } + ); + } +}; diff --git a/assets/tubi_logo.png b/assets/tubi_logo.png new file mode 100644 index 000000000000..f4d6272887c6 Binary files /dev/null and b/assets/tubi_logo.png differ diff --git a/assets/vendor/anchor.min.js b/assets/vendor/anchor.min.js new file mode 100644 index 000000000000..29a64acae6aa --- /dev/null +++ b/assets/vendor/anchor.min.js @@ -0,0 +1,6 @@ +/** + * AnchorJS - v4.1.1 - 2018-07-01 + * https://github.com/bryanbraun/anchorjs + * Copyright (c) 2018 Bryan Braun; Licensed MIT + */ +!function(A,e){"use strict";"function"==typeof define&&define.amd?define([],e):"object"==typeof module&&module.exports?module.exports=e():(A.AnchorJS=e(),A.anchors=new A.AnchorJS)}(this,function(){"use strict";return function(A){function d(A){A.icon=A.hasOwnProperty("icon")?A.icon:"",A.visible=A.hasOwnProperty("visible")?A.visible:"hover",A.placement=A.hasOwnProperty("placement")?A.placement:"right",A.ariaLabel=A.hasOwnProperty("ariaLabel")?A.ariaLabel:"Anchor",A.class=A.hasOwnProperty("class")?A.class:"",A.truncate=A.hasOwnProperty("truncate")?Math.floor(A.truncate):64}function f(A){var e;if("string"==typeof A||A instanceof String)e=[].slice.call(document.querySelectorAll(A));else{if(!(Array.isArray(A)||A instanceof NodeList))throw new Error("The selector provided to AnchorJS was invalid.");e=[].slice.call(A)}return e}this.options=A||{},this.elements=[],d(this.options),this.isTouchDevice=function(){return!!("ontouchstart"in window||window.DocumentTouch&&document instanceof DocumentTouch)},this.add=function(A){var e,t,i,n,o,s,r,a,c,h,l,u=[];if(d(this.options),"touch"===(l=this.options.visible)&&(l=this.isTouchDevice()?"always":"hover"),A||(A="h2, h3, h4, h5, h6"),0===(e=f(A)).length)return this;for(function(){if(null===document.head.querySelector("style.anchorjs")){var A,e=document.createElement("style");e.className="anchorjs",e.appendChild(document.createTextNode("")),void 0===(A=document.head.querySelector('[rel="stylesheet"], style'))?document.head.appendChild(e):document.head.insertBefore(e,A),e.sheet.insertRule(" .anchorjs-link { opacity: 0; text-decoration: none; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; }",e.sheet.cssRules.length),e.sheet.insertRule(" *:hover > .anchorjs-link, .anchorjs-link:focus { opacity: 1; }",e.sheet.cssRules.length),e.sheet.insertRule(" [data-anchorjs-icon]::after { content: attr(data-anchorjs-icon); }",e.sheet.cssRules.length),e.sheet.insertRule(' @font-face { font-family: "anchorjs-icons"; src: 
url(data:n/a;base64,AAEAAAALAIAAAwAwT1MvMg8yG2cAAAE4AAAAYGNtYXDp3gC3AAABpAAAAExnYXNwAAAAEAAAA9wAAAAIZ2x5ZlQCcfwAAAH4AAABCGhlYWQHFvHyAAAAvAAAADZoaGVhBnACFwAAAPQAAAAkaG10eASAADEAAAGYAAAADGxvY2EACACEAAAB8AAAAAhtYXhwAAYAVwAAARgAAAAgbmFtZQGOH9cAAAMAAAAAunBvc3QAAwAAAAADvAAAACAAAQAAAAEAAHzE2p9fDzz1AAkEAAAAAADRecUWAAAAANQA6R8AAAAAAoACwAAAAAgAAgAAAAAAAAABAAADwP/AAAACgAAA/9MCrQABAAAAAAAAAAAAAAAAAAAAAwABAAAAAwBVAAIAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAMCQAGQAAUAAAKZAswAAACPApkCzAAAAesAMwEJAAAAAAAAAAAAAAAAAAAAARAAAAAAAAAAAAAAAAAAAAAAQAAg//0DwP/AAEADwABAAAAAAQAAAAAAAAAAAAAAIAAAAAAAAAIAAAACgAAxAAAAAwAAAAMAAAAcAAEAAwAAABwAAwABAAAAHAAEADAAAAAIAAgAAgAAACDpy//9//8AAAAg6cv//f///+EWNwADAAEAAAAAAAAAAAAAAAAACACEAAEAAAAAAAAAAAAAAAAxAAACAAQARAKAAsAAKwBUAAABIiYnJjQ3NzY2MzIWFxYUBwcGIicmNDc3NjQnJiYjIgYHBwYUFxYUBwYGIwciJicmNDc3NjIXFhQHBwYUFxYWMzI2Nzc2NCcmNDc2MhcWFAcHBgYjARQGDAUtLXoWOR8fORYtLTgKGwoKCjgaGg0gEhIgDXoaGgkJBQwHdR85Fi0tOAobCgoKOBoaDSASEiANehoaCQkKGwotLXoWOR8BMwUFLYEuehYXFxYugC44CQkKGwo4GkoaDQ0NDXoaShoKGwoFBe8XFi6ALjgJCQobCjgaShoNDQ0NehpKGgobCgoKLYEuehYXAAAADACWAAEAAAAAAAEACAAAAAEAAAAAAAIAAwAIAAEAAAAAAAMACAAAAAEAAAAAAAQACAAAAAEAAAAAAAUAAQALAAEAAAAAAAYACAAAAAMAAQQJAAEAEAAMAAMAAQQJAAIABgAcAAMAAQQJAAMAEAAMAAMAAQQJAAQAEAAMAAMAAQQJAAUAAgAiAAMAAQQJAAYAEAAMYW5jaG9yanM0MDBAAGEAbgBjAGgAbwByAGoAcwA0ADAAMABAAAAAAwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAH//wAP) format("truetype"); }',e.sheet.cssRules.length)}}(),t=document.querySelectorAll("[id]"),i=[].map.call(t,function(A){return A.id}),o=0;o\]\.\/\(\)\*\\\n\t\b\v]/g,"-").replace(/-{2,}/g,"-").substring(0,this.options.truncate).replace(/^-+|-+$/gm,"").toLowerCase()},this.hasAnchorJSLink=function(A){var e=A.firstChild&&-1<(" "+A.firstChild.className+" ").indexOf(" anchorjs-link "),t=A.lastChild&&-1<(" "+A.lastChild.className+" ").indexOf(" anchorjs-link ");return e||t||!1}}}); \ No newline at end of file diff --git a/assets/vendor/bootstrap.min.js b/assets/vendor/bootstrap.min.js new file mode 100644 index 000000000000..c4c0d1f95cd3 --- /dev/null +++ b/assets/vendor/bootstrap.min.js @@ -0,0 +1,7 @@ +/*! + * Bootstrap v4.3.1 (https://getbootstrap.com/) + * Copyright 2011-2019 The Bootstrap Authors (https://github.com/twbs/bootstrap/graphs/contributors) + * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) + */ +!function(t,e){"object"==typeof exports&&"undefined"!=typeof module?e(exports,require("jquery"),require("popper.js")):"function"==typeof define&&define.amd?define(["exports","jquery","popper.js"],e):e((t=t||self).bootstrap={},t.jQuery,t.Popper)}(this,function(t,g,u){"use strict";function i(t,e){for(var n=0;nthis._items.length-1||t<0))if(this._isSliding)g(this._element).one(Q.SLID,function(){return e.to(t)});else{if(n===t)return this.pause(),void this.cycle();var i=ndocument.documentElement.clientHeight;!this._isBodyOverflowing&&t&&(this._element.style.paddingLeft=this._scrollbarWidth+"px"),this._isBodyOverflowing&&!t&&(this._element.style.paddingRight=this._scrollbarWidth+"px")},t._resetAdjustments=function(){this._element.style.paddingLeft="",this._element.style.paddingRight=""},t._checkScrollbar=function(){var t=document.body.getBoundingClientRect();this._isBodyOverflowing=t.left+t.right
    ',trigger:"hover focus",title:"",delay:0,html:!1,selector:!1,placement:"top",offset:0,container:!1,fallbackPlacement:"flip",boundary:"scrollParent",sanitize:!0,sanitizeFn:null,whiteList:Ee},je="show",He="out",Re={HIDE:"hide"+De,HIDDEN:"hidden"+De,SHOW:"show"+De,SHOWN:"shown"+De,INSERTED:"inserted"+De,CLICK:"click"+De,FOCUSIN:"focusin"+De,FOCUSOUT:"focusout"+De,MOUSEENTER:"mouseenter"+De,MOUSELEAVE:"mouseleave"+De},xe="fade",Fe="show",Ue=".tooltip-inner",We=".arrow",qe="hover",Me="focus",Ke="click",Qe="manual",Be=function(){function i(t,e){if("undefined"==typeof u)throw new TypeError("Bootstrap's tooltips require Popper.js (https://popper.js.org/)");this._isEnabled=!0,this._timeout=0,this._hoverState="",this._activeTrigger={},this._popper=null,this.element=t,this.config=this._getConfig(e),this.tip=null,this._setListeners()}var t=i.prototype;return t.enable=function(){this._isEnabled=!0},t.disable=function(){this._isEnabled=!1},t.toggleEnabled=function(){this._isEnabled=!this._isEnabled},t.toggle=function(t){if(this._isEnabled)if(t){var e=this.constructor.DATA_KEY,n=g(t.currentTarget).data(e);n||(n=new this.constructor(t.currentTarget,this._getDelegateConfig()),g(t.currentTarget).data(e,n)),n._activeTrigger.click=!n._activeTrigger.click,n._isWithActiveTrigger()?n._enter(null,n):n._leave(null,n)}else{if(g(this.getTipElement()).hasClass(Fe))return void this._leave(null,this);this._enter(null,this)}},t.dispose=function(){clearTimeout(this._timeout),g.removeData(this.element,this.constructor.DATA_KEY),g(this.element).off(this.constructor.EVENT_KEY),g(this.element).closest(".modal").off("hide.bs.modal"),this.tip&&g(this.tip).remove(),this._isEnabled=null,this._timeout=null,this._hoverState=null,(this._activeTrigger=null)!==this._popper&&this._popper.destroy(),this._popper=null,this.element=null,this.config=null,this.tip=null},t.show=function(){var e=this;if("none"===g(this.element).css("display"))throw new Error("Please use show on visible elements");var t=g.Event(this.constructor.Event.SHOW);if(this.isWithContent()&&this._isEnabled){g(this.element).trigger(t);var n=_.findShadowRoot(this.element),i=g.contains(null!==n?n:this.element.ownerDocument.documentElement,this.element);if(t.isDefaultPrevented()||!i)return;var o=this.getTipElement(),r=_.getUID(this.constructor.NAME);o.setAttribute("id",r),this.element.setAttribute("aria-describedby",r),this.setContent(),this.config.animation&&g(o).addClass(xe);var s="function"==typeof this.config.placement?this.config.placement.call(this,o,this.element):this.config.placement,a=this._getAttachment(s);this.addAttachmentClass(a);var l=this._getContainer();g(o).data(this.constructor.DATA_KEY,this),g.contains(this.element.ownerDocument.documentElement,this.tip)||g(o).appendTo(l),g(this.element).trigger(this.constructor.Event.INSERTED),this._popper=new u(this.element,o,{placement:a,modifiers:{offset:this._getOffset(),flip:{behavior:this.config.fallbackPlacement},arrow:{element:We},preventOverflow:{boundariesElement:this.config.boundary}},onCreate:function(t){t.originalPlacement!==t.placement&&e._handlePopperPlacementChange(t)},onUpdate:function(t){return e._handlePopperPlacementChange(t)}}),g(o).addClass(Fe),"ontouchstart"in document.documentElement&&g(document.body).children().on("mouseover",null,g.noop);var c=function(){e.config.animation&&e._fixTransition();var t=e._hoverState;e._hoverState=null,g(e.element).trigger(e.constructor.Event.SHOWN),t===He&&e._leave(null,e)};if(g(this.tip).hasClass(xe)){var 
h=_.getTransitionDurationFromElement(this.tip);g(this.tip).one(_.TRANSITION_END,c).emulateTransitionEnd(h)}else c()}},t.hide=function(t){var e=this,n=this.getTipElement(),i=g.Event(this.constructor.Event.HIDE),o=function(){e._hoverState!==je&&n.parentNode&&n.parentNode.removeChild(n),e._cleanTipClass(),e.element.removeAttribute("aria-describedby"),g(e.element).trigger(e.constructor.Event.HIDDEN),null!==e._popper&&e._popper.destroy(),t&&t()};if(g(this.element).trigger(i),!i.isDefaultPrevented()){if(g(n).removeClass(Fe),"ontouchstart"in document.documentElement&&g(document.body).children().off("mouseover",null,g.noop),this._activeTrigger[Ke]=!1,this._activeTrigger[Me]=!1,this._activeTrigger[qe]=!1,g(this.tip).hasClass(xe)){var r=_.getTransitionDurationFromElement(n);g(n).one(_.TRANSITION_END,o).emulateTransitionEnd(r)}else o();this._hoverState=""}},t.update=function(){null!==this._popper&&this._popper.scheduleUpdate()},t.isWithContent=function(){return Boolean(this.getTitle())},t.addAttachmentClass=function(t){g(this.getTipElement()).addClass(Ae+"-"+t)},t.getTipElement=function(){return this.tip=this.tip||g(this.config.template)[0],this.tip},t.setContent=function(){var t=this.getTipElement();this.setElementContent(g(t.querySelectorAll(Ue)),this.getTitle()),g(t).removeClass(xe+" "+Fe)},t.setElementContent=function(t,e){"object"!=typeof e||!e.nodeType&&!e.jquery?this.config.html?(this.config.sanitize&&(e=Se(e,this.config.whiteList,this.config.sanitizeFn)),t.html(e)):t.text(e):this.config.html?g(e).parent().is(t)||t.empty().append(e):t.text(g(e).text())},t.getTitle=function(){var t=this.element.getAttribute("data-original-title");return t||(t="function"==typeof this.config.title?this.config.title.call(this.element):this.config.title),t},t._getOffset=function(){var e=this,t={};return"function"==typeof this.config.offset?t.fn=function(t){return t.offsets=l({},t.offsets,e.config.offset(t.offsets,e.element)||{}),t}:t.offset=this.config.offset,t},t._getContainer=function(){return!1===this.config.container?document.body:_.isElement(this.config.container)?g(this.config.container):g(document).find(this.config.container)},t._getAttachment=function(t){return Pe[t.toUpperCase()]},t._setListeners=function(){var i=this;this.config.trigger.split(" ").forEach(function(t){if("click"===t)g(i.element).on(i.constructor.Event.CLICK,i.config.selector,function(t){return i.toggle(t)});else if(t!==Qe){var e=t===qe?i.constructor.Event.MOUSEENTER:i.constructor.Event.FOCUSIN,n=t===qe?i.constructor.Event.MOUSELEAVE:i.constructor.Event.FOCUSOUT;g(i.element).on(e,i.config.selector,function(t){return i._enter(t)}).on(n,i.config.selector,function(t){return i._leave(t)})}}),g(this.element).closest(".modal").on("hide.bs.modal",function(){i.element&&i.hide()}),this.config.selector?this.config=l({},this.config,{trigger:"manual",selector:""}):this._fixTitle()},t._fixTitle=function(){var t=typeof this.element.getAttribute("data-original-title");(this.element.getAttribute("title")||"string"!==t)&&(this.element.setAttribute("data-original-title",this.element.getAttribute("title")||""),this.element.setAttribute("title",""))},t._enter=function(t,e){var n=this.constructor.DATA_KEY;(e=e||g(t.currentTarget).data(n))||(e=new 
this.constructor(t.currentTarget,this._getDelegateConfig()),g(t.currentTarget).data(n,e)),t&&(e._activeTrigger["focusin"===t.type?Me:qe]=!0),g(e.getTipElement()).hasClass(Fe)||e._hoverState===je?e._hoverState=je:(clearTimeout(e._timeout),e._hoverState=je,e.config.delay&&e.config.delay.show?e._timeout=setTimeout(function(){e._hoverState===je&&e.show()},e.config.delay.show):e.show())},t._leave=function(t,e){var n=this.constructor.DATA_KEY;(e=e||g(t.currentTarget).data(n))||(e=new this.constructor(t.currentTarget,this._getDelegateConfig()),g(t.currentTarget).data(n,e)),t&&(e._activeTrigger["focusout"===t.type?Me:qe]=!1),e._isWithActiveTrigger()||(clearTimeout(e._timeout),e._hoverState=He,e.config.delay&&e.config.delay.hide?e._timeout=setTimeout(function(){e._hoverState===He&&e.hide()},e.config.delay.hide):e.hide())},t._isWithActiveTrigger=function(){for(var t in this._activeTrigger)if(this._activeTrigger[t])return!0;return!1},t._getConfig=function(t){var e=g(this.element).data();return Object.keys(e).forEach(function(t){-1!==Oe.indexOf(t)&&delete e[t]}),"number"==typeof(t=l({},this.constructor.Default,e,"object"==typeof t&&t?t:{})).delay&&(t.delay={show:t.delay,hide:t.delay}),"number"==typeof t.title&&(t.title=t.title.toString()),"number"==typeof t.content&&(t.content=t.content.toString()),_.typeCheckConfig(be,t,this.constructor.DefaultType),t.sanitize&&(t.template=Se(t.template,t.whiteList,t.sanitizeFn)),t},t._getDelegateConfig=function(){var t={};if(this.config)for(var e in this.config)this.constructor.Default[e]!==this.config[e]&&(t[e]=this.config[e]);return t},t._cleanTipClass=function(){var t=g(this.getTipElement()),e=t.attr("class").match(Ne);null!==e&&e.length&&t.removeClass(e.join(""))},t._handlePopperPlacementChange=function(t){var e=t.instance;this.tip=e.popper,this._cleanTipClass(),this.addAttachmentClass(this._getAttachment(t.placement))},t._fixTransition=function(){var t=this.getTipElement(),e=this.config.animation;null===t.getAttribute("x-placement")&&(g(t).removeClass(xe),this.config.animation=!1,this.hide(),this.show(),this.config.animation=e)},i._jQueryInterface=function(n){return this.each(function(){var t=g(this).data(Ie),e="object"==typeof n&&n;if((t||!/dispose|hide/.test(n))&&(t||(t=new i(this,e),g(this).data(Ie,t)),"string"==typeof n)){if("undefined"==typeof t[n])throw new TypeError('No method named "'+n+'"');t[n]()}})},s(i,null,[{key:"VERSION",get:function(){return"4.3.1"}},{key:"Default",get:function(){return Le}},{key:"NAME",get:function(){return be}},{key:"DATA_KEY",get:function(){return Ie}},{key:"Event",get:function(){return Re}},{key:"EVENT_KEY",get:function(){return De}},{key:"DefaultType",get:function(){return ke}}]),i}();g.fn[be]=Be._jQueryInterface,g.fn[be].Constructor=Be,g.fn[be].noConflict=function(){return g.fn[be]=we,Be._jQueryInterface};var Ve="popover",Ye="bs.popover",ze="."+Ye,Xe=g.fn[Ve],$e="bs-popover",Ge=new RegExp("(^|\\s)"+$e+"\\S+","g"),Je=l({},Be.Default,{placement:"right",trigger:"click",content:"",template:''}),Ze=l({},Be.DefaultType,{content:"(string|element|function)"}),tn="fade",en="show",nn=".popover-header",on=".popover-body",rn={HIDE:"hide"+ze,HIDDEN:"hidden"+ze,SHOW:"show"+ze,SHOWN:"shown"+ze,INSERTED:"inserted"+ze,CLICK:"click"+ze,FOCUSIN:"focusin"+ze,FOCUSOUT:"focusout"+ze,MOUSEENTER:"mouseenter"+ze,MOUSELEAVE:"mouseleave"+ze},sn=function(t){var e,n;function i(){return t.apply(this,arguments)||this}n=t,(e=i).prototype=Object.create(n.prototype),(e.prototype.constructor=e).__proto__=n;var o=i.prototype;return 
o.isWithContent=function(){return this.getTitle()||this._getContent()},o.addAttachmentClass=function(t){g(this.getTipElement()).addClass($e+"-"+t)},o.getTipElement=function(){return this.tip=this.tip||g(this.config.template)[0],this.tip},o.setContent=function(){var t=g(this.getTipElement());this.setElementContent(t.find(nn),this.getTitle());var e=this._getContent();"function"==typeof e&&(e=e.call(this.element)),this.setElementContent(t.find(on),e),t.removeClass(tn+" "+en)},o._getContent=function(){return this.element.getAttribute("data-content")||this.config.content},o._cleanTipClass=function(){var t=g(this.getTipElement()),e=t.attr("class").match(Ge);null!==e&&0=this._offsets[o]&&("undefined"==typeof this._offsets[o+1]||t+~]|"+M+")"+M+"*"),U=new RegExp(M+"|>"),X=new RegExp(F),V=new RegExp("^"+I+"$"),G={ID:new RegExp("^#("+I+")"),CLASS:new RegExp("^\\.("+I+")"),TAG:new RegExp("^("+I+"|[*])"),ATTR:new RegExp("^"+W),PSEUDO:new RegExp("^"+F),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+M+"*(even|odd|(([+-]|)(\\d*)n|)"+M+"*(?:([+-]|)"+M+"*(\\d+)|))"+M+"*\\)|)","i"),bool:new RegExp("^(?:"+R+")$","i"),needsContext:new RegExp("^"+M+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+M+"*((?:-\\d)?\\d*)"+M+"*\\)|)(?=[^-]|$)","i")},Y=/HTML$/i,Q=/^(?:input|select|textarea|button)$/i,J=/^h\d$/i,K=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,ee=/[+~]/,te=new RegExp("\\\\[\\da-fA-F]{1,6}"+M+"?|\\\\([^\\r\\n\\f])","g"),ne=function(e,t){var n="0x"+e.slice(1)-65536;return t||(n<0?String.fromCharCode(n+65536):String.fromCharCode(n>>10|55296,1023&n|56320))},re=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ie=function(e,t){return t?"\0"===e?"\ufffd":e.slice(0,-1)+"\\"+e.charCodeAt(e.length-1).toString(16)+" ":"\\"+e},oe=function(){T()},ae=be(function(e){return!0===e.disabled&&"fieldset"===e.nodeName.toLowerCase()},{dir:"parentNode",next:"legend"});try{H.apply(t=O.call(p.childNodes),p.childNodes),t[p.childNodes.length].nodeType}catch(e){H={apply:t.length?function(e,t){L.apply(e,O.call(t))}:function(e,t){var n=e.length,r=0;while(e[n++]=t[r++]);e.length=n-1}}}function se(t,e,n,r){var i,o,a,s,u,l,c,f=e&&e.ownerDocument,p=e?e.nodeType:9;if(n=n||[],"string"!=typeof t||!t||1!==p&&9!==p&&11!==p)return n;if(!r&&(T(e),e=e||C,E)){if(11!==p&&(u=Z.exec(t)))if(i=u[1]){if(9===p){if(!(a=e.getElementById(i)))return n;if(a.id===i)return n.push(a),n}else if(f&&(a=f.getElementById(i))&&y(e,a)&&a.id===i)return n.push(a),n}else{if(u[2])return H.apply(n,e.getElementsByTagName(t)),n;if((i=u[3])&&d.getElementsByClassName&&e.getElementsByClassName)return H.apply(n,e.getElementsByClassName(i)),n}if(d.qsa&&!N[t+" "]&&(!v||!v.test(t))&&(1!==p||"object"!==e.nodeName.toLowerCase())){if(c=t,f=e,1===p&&(U.test(t)||z.test(t))){(f=ee.test(t)&&ye(e.parentNode)||e)===e&&d.scope||((s=e.getAttribute("id"))?s=s.replace(re,ie):e.setAttribute("id",s=S)),o=(l=h(t)).length;while(o--)l[o]=(s?"#"+s:":scope")+" "+xe(l[o]);c=l.join(",")}try{return H.apply(n,f.querySelectorAll(c)),n}catch(e){N(t,!0)}finally{s===S&&e.removeAttribute("id")}}}return g(t.replace($,"$1"),e,n,r)}function ue(){var r=[];return function e(t,n){return r.push(t+" ")>b.cacheLength&&delete e[r.shift()],e[t+" "]=n}}function le(e){return e[S]=!0,e}function ce(e){var t=C.createElement("fieldset");try{return!!e(t)}catch(e){return!1}finally{t.parentNode&&t.parentNode.removeChild(t),t=null}}function fe(e,t){var n=e.split("|"),r=n.length;while(r--)b.attrHandle[n[r]]=t}function pe(e,t){var 
n=t&&e,r=n&&1===e.nodeType&&1===t.nodeType&&e.sourceIndex-t.sourceIndex;if(r)return r;if(n)while(n=n.nextSibling)if(n===t)return-1;return e?1:-1}function de(t){return function(e){return"input"===e.nodeName.toLowerCase()&&e.type===t}}function he(n){return function(e){var t=e.nodeName.toLowerCase();return("input"===t||"button"===t)&&e.type===n}}function ge(t){return function(e){return"form"in e?e.parentNode&&!1===e.disabled?"label"in e?"label"in e.parentNode?e.parentNode.disabled===t:e.disabled===t:e.isDisabled===t||e.isDisabled!==!t&&ae(e)===t:e.disabled===t:"label"in e&&e.disabled===t}}function ve(a){return le(function(o){return o=+o,le(function(e,t){var n,r=a([],e.length,o),i=r.length;while(i--)e[n=r[i]]&&(e[n]=!(t[n]=e[n]))})})}function ye(e){return e&&"undefined"!=typeof e.getElementsByTagName&&e}for(e in d=se.support={},i=se.isXML=function(e){var t=e.namespaceURI,n=(e.ownerDocument||e).documentElement;return!Y.test(t||n&&n.nodeName||"HTML")},T=se.setDocument=function(e){var t,n,r=e?e.ownerDocument||e:p;return r!=C&&9===r.nodeType&&r.documentElement&&(a=(C=r).documentElement,E=!i(C),p!=C&&(n=C.defaultView)&&n.top!==n&&(n.addEventListener?n.addEventListener("unload",oe,!1):n.attachEvent&&n.attachEvent("onunload",oe)),d.scope=ce(function(e){return a.appendChild(e).appendChild(C.createElement("div")),"undefined"!=typeof e.querySelectorAll&&!e.querySelectorAll(":scope fieldset div").length}),d.attributes=ce(function(e){return e.className="i",!e.getAttribute("className")}),d.getElementsByTagName=ce(function(e){return e.appendChild(C.createComment("")),!e.getElementsByTagName("*").length}),d.getElementsByClassName=K.test(C.getElementsByClassName),d.getById=ce(function(e){return a.appendChild(e).id=S,!C.getElementsByName||!C.getElementsByName(S).length}),d.getById?(b.filter.ID=function(e){var t=e.replace(te,ne);return function(e){return e.getAttribute("id")===t}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n=t.getElementById(e);return n?[n]:[]}}):(b.filter.ID=function(e){var n=e.replace(te,ne);return function(e){var t="undefined"!=typeof e.getAttributeNode&&e.getAttributeNode("id");return t&&t.value===n}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n,r,i,o=t.getElementById(e);if(o){if((n=o.getAttributeNode("id"))&&n.value===e)return[o];i=t.getElementsByName(e),r=0;while(o=i[r++])if((n=o.getAttributeNode("id"))&&n.value===e)return[o]}return[]}}),b.find.TAG=d.getElementsByTagName?function(e,t){return"undefined"!=typeof t.getElementsByTagName?t.getElementsByTagName(e):d.qsa?t.querySelectorAll(e):void 0}:function(e,t){var n,r=[],i=0,o=t.getElementsByTagName(e);if("*"===e){while(n=o[i++])1===n.nodeType&&r.push(n);return r}return o},b.find.CLASS=d.getElementsByClassName&&function(e,t){if("undefined"!=typeof t.getElementsByClassName&&E)return t.getElementsByClassName(e)},s=[],v=[],(d.qsa=K.test(C.querySelectorAll))&&(ce(function(e){var t;a.appendChild(e).innerHTML="",e.querySelectorAll("[msallowcapture^='']").length&&v.push("[*^$]="+M+"*(?:''|\"\")"),e.querySelectorAll("[selected]").length||v.push("\\["+M+"*(?:value|"+R+")"),e.querySelectorAll("[id~="+S+"-]").length||v.push("~="),(t=C.createElement("input")).setAttribute("name",""),e.appendChild(t),e.querySelectorAll("[name='']").length||v.push("\\["+M+"*name"+M+"*="+M+"*(?:''|\"\")"),e.querySelectorAll(":checked").length||v.push(":checked"),e.querySelectorAll("a#"+S+"+*").length||v.push(".#.+[+~]"),e.querySelectorAll("\\\f"),v.push("[\\r\\n\\f]")}),ce(function(e){e.innerHTML="";var 
t=C.createElement("input");t.setAttribute("type","hidden"),e.appendChild(t).setAttribute("name","D"),e.querySelectorAll("[name=d]").length&&v.push("name"+M+"*[*^$|!~]?="),2!==e.querySelectorAll(":enabled").length&&v.push(":enabled",":disabled"),a.appendChild(e).disabled=!0,2!==e.querySelectorAll(":disabled").length&&v.push(":enabled",":disabled"),e.querySelectorAll("*,:x"),v.push(",.*:")})),(d.matchesSelector=K.test(c=a.matches||a.webkitMatchesSelector||a.mozMatchesSelector||a.oMatchesSelector||a.msMatchesSelector))&&ce(function(e){d.disconnectedMatch=c.call(e,"*"),c.call(e,"[s!='']:x"),s.push("!=",F)}),v=v.length&&new RegExp(v.join("|")),s=s.length&&new RegExp(s.join("|")),t=K.test(a.compareDocumentPosition),y=t||K.test(a.contains)?function(e,t){var n=9===e.nodeType?e.documentElement:e,r=t&&t.parentNode;return e===r||!(!r||1!==r.nodeType||!(n.contains?n.contains(r):e.compareDocumentPosition&&16&e.compareDocumentPosition(r)))}:function(e,t){if(t)while(t=t.parentNode)if(t===e)return!0;return!1},D=t?function(e,t){if(e===t)return l=!0,0;var n=!e.compareDocumentPosition-!t.compareDocumentPosition;return n||(1&(n=(e.ownerDocument||e)==(t.ownerDocument||t)?e.compareDocumentPosition(t):1)||!d.sortDetached&&t.compareDocumentPosition(e)===n?e==C||e.ownerDocument==p&&y(p,e)?-1:t==C||t.ownerDocument==p&&y(p,t)?1:u?P(u,e)-P(u,t):0:4&n?-1:1)}:function(e,t){if(e===t)return l=!0,0;var n,r=0,i=e.parentNode,o=t.parentNode,a=[e],s=[t];if(!i||!o)return e==C?-1:t==C?1:i?-1:o?1:u?P(u,e)-P(u,t):0;if(i===o)return pe(e,t);n=e;while(n=n.parentNode)a.unshift(n);n=t;while(n=n.parentNode)s.unshift(n);while(a[r]===s[r])r++;return r?pe(a[r],s[r]):a[r]==p?-1:s[r]==p?1:0}),C},se.matches=function(e,t){return se(e,null,null,t)},se.matchesSelector=function(e,t){if(T(e),d.matchesSelector&&E&&!N[t+" "]&&(!s||!s.test(t))&&(!v||!v.test(t)))try{var n=c.call(e,t);if(n||d.disconnectedMatch||e.document&&11!==e.document.nodeType)return n}catch(e){N(t,!0)}return 0":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(e){return e[1]=e[1].replace(te,ne),e[3]=(e[3]||e[4]||e[5]||"").replace(te,ne),"~="===e[2]&&(e[3]=" "+e[3]+" "),e.slice(0,4)},CHILD:function(e){return e[1]=e[1].toLowerCase(),"nth"===e[1].slice(0,3)?(e[3]||se.error(e[0]),e[4]=+(e[4]?e[5]+(e[6]||1):2*("even"===e[3]||"odd"===e[3])),e[5]=+(e[7]+e[8]||"odd"===e[3])):e[3]&&se.error(e[0]),e},PSEUDO:function(e){var t,n=!e[6]&&e[2];return G.CHILD.test(e[0])?null:(e[3]?e[2]=e[4]||e[5]||"":n&&X.test(n)&&(t=h(n,!0))&&(t=n.indexOf(")",n.length-t)-n.length)&&(e[0]=e[0].slice(0,t),e[2]=n.slice(0,t)),e.slice(0,3))}},filter:{TAG:function(e){var t=e.replace(te,ne).toLowerCase();return"*"===e?function(){return!0}:function(e){return e.nodeName&&e.nodeName.toLowerCase()===t}},CLASS:function(e){var t=m[e+" "];return t||(t=new RegExp("(^|"+M+")"+e+"("+M+"|$)"))&&m(e,function(e){return t.test("string"==typeof e.className&&e.className||"undefined"!=typeof e.getAttribute&&e.getAttribute("class")||"")})},ATTR:function(n,r,i){return function(e){var t=se.attr(e,n);return null==t?"!="===r:!r||(t+="","="===r?t===i:"!="===r?t!==i:"^="===r?i&&0===t.indexOf(i):"*="===r?i&&-1:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function D(e,n,r){return m(n)?S.grep(e,function(e,t){return!!n.call(e,t,e)!==r}):n.nodeType?S.grep(e,function(e){return e===n!==r}):"string"!=typeof n?S.grep(e,function(e){return-1)[^>]*|#([\w-]+))$/;(S.fn.init=function(e,t,n){var r,i;if(!e)return this;if(n=n||j,"string"==typeof 
e){if(!(r="<"===e[0]&&">"===e[e.length-1]&&3<=e.length?[null,e,null]:q.exec(e))||!r[1]&&t)return!t||t.jquery?(t||n).find(e):this.constructor(t).find(e);if(r[1]){if(t=t instanceof S?t[0]:t,S.merge(this,S.parseHTML(r[1],t&&t.nodeType?t.ownerDocument||t:E,!0)),N.test(r[1])&&S.isPlainObject(t))for(r in t)m(this[r])?this[r](t[r]):this.attr(r,t[r]);return this}return(i=E.getElementById(r[2]))&&(this[0]=i,this.length=1),this}return e.nodeType?(this[0]=e,this.length=1,this):m(e)?void 0!==n.ready?n.ready(e):e(S):S.makeArray(e,this)}).prototype=S.fn,j=S(E);var L=/^(?:parents|prev(?:Until|All))/,H={children:!0,contents:!0,next:!0,prev:!0};function O(e,t){while((e=e[t])&&1!==e.nodeType);return e}S.fn.extend({has:function(e){var t=S(e,this),n=t.length;return this.filter(function(){for(var e=0;e\x20\t\r\n\f]*)/i,he=/^$|^module$|\/(?:java|ecma)script/i;ce=E.createDocumentFragment().appendChild(E.createElement("div")),(fe=E.createElement("input")).setAttribute("type","radio"),fe.setAttribute("checked","checked"),fe.setAttribute("name","t"),ce.appendChild(fe),y.checkClone=ce.cloneNode(!0).cloneNode(!0).lastChild.checked,ce.innerHTML="",y.noCloneChecked=!!ce.cloneNode(!0).lastChild.defaultValue,ce.innerHTML="",y.option=!!ce.lastChild;var ge={thead:[1,"","
    "],col:[2,"","
    "],tr:[2,"","
    "],td:[3,"","
    "],_default:[0,"",""]};function ve(e,t){var n;return n="undefined"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||"*"):"undefined"!=typeof e.querySelectorAll?e.querySelectorAll(t||"*"):[],void 0===t||t&&A(e,t)?S.merge([e],n):n}function ye(e,t){for(var n=0,r=e.length;n",""]);var me=/<|&#?\w+;/;function xe(e,t,n,r,i){for(var o,a,s,u,l,c,f=t.createDocumentFragment(),p=[],d=0,h=e.length;d\s*$/g;function qe(e,t){return A(e,"table")&&A(11!==t.nodeType?t:t.firstChild,"tr")&&S(e).children("tbody")[0]||e}function Le(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function He(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function Oe(e,t){var n,r,i,o,a,s;if(1===t.nodeType){if(Y.hasData(e)&&(s=Y.get(e).events))for(i in Y.remove(t,"handle events"),s)for(n=0,r=s[i].length;n").attr(n.scriptAttrs||{}).prop({charset:n.scriptCharset,src:n.url}).on("load error",i=function(e){r.remove(),i=null,e&&t("error"===e.type?404:200,e.type)}),E.head.appendChild(r[0])},abort:function(){i&&i()}}});var Ut,Xt=[],Vt=/(=)\?(?=&|$)|\?\?/;S.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var e=Xt.pop()||S.expando+"_"+Ct.guid++;return this[e]=!0,e}}),S.ajaxPrefilter("json jsonp",function(e,t,n){var r,i,o,a=!1!==e.jsonp&&(Vt.test(e.url)?"url":"string"==typeof e.data&&0===(e.contentType||"").indexOf("application/x-www-form-urlencoded")&&Vt.test(e.data)&&"data");if(a||"jsonp"===e.dataTypes[0])return r=e.jsonpCallback=m(e.jsonpCallback)?e.jsonpCallback():e.jsonpCallback,a?e[a]=e[a].replace(Vt,"$1"+r):!1!==e.jsonp&&(e.url+=(Et.test(e.url)?"&":"?")+e.jsonp+"="+r),e.converters["script json"]=function(){return o||S.error(r+" was not called"),o[0]},e.dataTypes[0]="json",i=C[r],C[r]=function(){o=arguments},n.always(function(){void 0===i?S(C).removeProp(r):C[r]=i,e[r]&&(e.jsonpCallback=t.jsonpCallback,Xt.push(r)),o&&m(i)&&i(o[0]),o=i=void 0}),"script"}),y.createHTMLDocument=((Ut=E.implementation.createHTMLDocument("").body).innerHTML="
    ",2===Ut.childNodes.length),S.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(y.createHTMLDocument?((r=(t=E.implementation.createHTMLDocument("")).createElement("base")).href=E.location.href,t.head.appendChild(r)):t=E),o=!n&&[],(i=N.exec(e))?[t.createElement(i[1])]:(i=xe([e],t,o),o&&o.length&&S(o).remove(),S.merge([],i.childNodes)));var r,i,o},S.fn.load=function(e,t,n){var r,i,o,a=this,s=e.indexOf(" ");return-1").append(S.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},S.expr.pseudos.animated=function(t){return S.grep(S.timers,function(e){return t===e.elem}).length},S.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=S.css(e,"position"),c=S(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=S.css(e,"top"),u=S.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,S.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):("number"==typeof f.top&&(f.top+="px"),"number"==typeof f.left&&(f.left+="px"),c.css(f))}},S.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){S.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var e,t,n,r=this[0],i={top:0,left:0};if("fixed"===S.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===S.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=S(e).offset()).top+=S.css(e,"borderTopWidth",!0),i.left+=S.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-S.css(r,"marginTop",!0),left:t.left-i.left-S.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===S.css(e,"position"))e=e.offsetParent;return e||re})}}),S.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;S.fn[t]=function(e){return $(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),S.each(["top","left"],function(e,n){S.cssHooks[n]=$e(y.pixelPosition,function(e,t){if(t)return t=Be(e,n),Me.test(t)?S(e).position()[n]+"px":t})}),S.each({Height:"height",Width:"width"},function(a,s){S.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){S.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return $(this,function(e,t,n){var r;return x(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?S.css(e,t,i):S.style(e,t,n,i)},s,n?e:void 0,n)}})}),S.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){S.fn[t]=function(e){return this.on(t,e)}}),S.fn.extend({bind:function(e,t,n){return this.on(e,null,t,n)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,n,r){return this.on(t,e,n,r)},undelegate:function(e,t,n){return 
1===arguments.length?this.off(e,"**"):this.off(t,e||"**",n)},hover:function(e,t){return this.mouseenter(e).mouseleave(t||e)}}),S.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){S.fn[n]=function(e,t){return 0=o.clientWidth&&n>=o.clientHeight}),l=0a[e]&&!t.escapeWithReference&&(n=J(f[o],a[e]-('right'===e?f.width:f.height))),ae({},o,n)}};return l.forEach(function(e){var t=-1===['left','top'].indexOf(e)?'secondary':'primary';f=le({},f,m[t](e))}),e.offsets.popper=f,e},priority:['left','right','top','bottom'],padding:5,boundariesElement:'scrollParent'},keepTogether:{order:400,enabled:!0,fn:function(e){var t=e.offsets,o=t.popper,n=t.reference,i=e.placement.split('-')[0],r=Z,p=-1!==['top','bottom'].indexOf(i),s=p?'right':'bottom',d=p?'left':'top',a=p?'width':'height';return o[s]r(n[s])&&(e.offsets.popper[d]=r(n[s])),e}},arrow:{order:500,enabled:!0,fn:function(e,o){var n;if(!q(e.instance.modifiers,'arrow','keepTogether'))return e;var i=o.element;if('string'==typeof i){if(i=e.instance.popper.querySelector(i),!i)return e;}else if(!e.instance.popper.contains(i))return console.warn('WARNING: `arrow.element` must be child of its popper element!'),e;var r=e.placement.split('-')[0],p=e.offsets,s=p.popper,d=p.reference,a=-1!==['left','right'].indexOf(r),l=a?'height':'width',f=a?'Top':'Left',m=f.toLowerCase(),h=a?'left':'top',c=a?'bottom':'right',u=S(i)[l];d[c]-us[c]&&(e.offsets.popper[m]+=d[m]+u-s[c]),e.offsets.popper=g(e.offsets.popper);var b=d[m]+d[l]/2-u/2,y=t(e.instance.popper),w=parseFloat(y['margin'+f],10),E=parseFloat(y['border'+f+'Width'],10),v=b-e.offsets.popper[m]-w-E;return v=$(J(s[l]-u,v),0),e.arrowElement=i,e.offsets.arrow=(n={},ae(n,m,Q(v)),ae(n,h,''),n),e},element:'[x-arrow]'},flip:{order:600,enabled:!0,fn:function(e,t){if(W(e.instance.modifiers,'inner'))return e;if(e.flipped&&e.placement===e.originalPlacement)return e;var o=v(e.instance.popper,e.instance.reference,t.padding,t.boundariesElement,e.positionFixed),n=e.placement.split('-')[0],i=T(n),r=e.placement.split('-')[1]||'',p=[];switch(t.behavior){case he.FLIP:p=[n,i];break;case he.CLOCKWISE:p=z(n);break;case he.COUNTERCLOCKWISE:p=z(n,!0);break;default:p=t.behavior;}return p.forEach(function(s,d){if(n!==s||p.length===d+1)return e;n=e.placement.split('-')[0],i=T(n);var a=e.offsets.popper,l=e.offsets.reference,f=Z,m='left'===n&&f(a.right)>f(l.left)||'right'===n&&f(a.left)f(l.top)||'bottom'===n&&f(a.top)f(o.right),g=f(a.top)f(o.bottom),b='left'===n&&h||'right'===n&&c||'top'===n&&g||'bottom'===n&&u,y=-1!==['top','bottom'].indexOf(n),w=!!t.flipVariations&&(y&&'start'===r&&h||y&&'end'===r&&c||!y&&'start'===r&&g||!y&&'end'===r&&u);(m||b||w)&&(e.flipped=!0,(m||b)&&(n=p[d+1]),w&&(r=G(r)),e.placement=n+(r?'-'+r:''),e.offsets.popper=le({},e.offsets.popper,C(e.instance.popper,e.offsets.reference,e.placement)),e=P(e.instance.modifiers,e,'flip'))}),e},behavior:'flip',padding:5,boundariesElement:'viewport'},inner:{order:700,enabled:!1,fn:function(e){var t=e.placement,o=t.split('-')[0],n=e.offsets,i=n.popper,r=n.reference,p=-1!==['left','right'].indexOf(o),s=-1===['top','left'].indexOf(o);return i[p?'left':'top']=r[o]-(s?i[p?'width':'height']:0),e.placement=T(t),e.offsets.popper=g(i),e}},hide:{order:800,enabled:!0,fn:function(e){if(!q(e.instance.modifiers,'hide','preventOverflow'))return e;var 
t=e.offsets.reference,o=D(e.instance.modifiers,function(e){return'preventOverflow'===e.name}).boundaries;if(t.bottomo.right||t.top>o.bottom||t.right + + + + + + + + + + + + Towards Autonomous Language Model Systems | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + +
    +
    +
    +

    PyTorch Webinars

    +
    +
    +
    + +
    +
    +
    +
    + Towards Autonomous Language Model Systems +

    Towards Autonomous Language Model Systems

    +

    + Date: May 21, 2025, 11AM PT / 2PM ET +
    + Speaker: Ofir Press +
    +
+ Language models (LMs) are increasingly used to assist users in day-to-day tasks such as programming (GitHub Copilot) or search (Google's AI Overviews). But can we build language model systems that can autonomously complete entire tasks end-to-end? +

+ +In this talk, Ofir Press will discuss efforts to build autonomous LM systems, focusing on the software engineering domain. Ofir will present SWE-bench, a benchmark for measuring AI systems' ability to fix real issues in popular software libraries, and will then discuss SWE-agent, a system for solving SWE-bench tasks. +

    + +SWE-bench and SWE-agent are used by many leading AI organizations in academia and industry, including OpenAI, Anthropic, Meta, and Google, and SWE-bench has been downloaded over 2 million times. These projects show that academics on tight budgets can have a substantial impact in steering the research community toward building autonomous systems that can complete challenging tasks. +

+ +Ofir is a postdoc at Princeton University, where they work primarily with Karthik Narasimhan's lab. They completed their PhD at the University of Washington in Seattle, advised by Noah Smith, and spent two years of their PhD at Facebook AI Research Labs on Luke Zettlemoyer's team. +

    +

    Register now to attend this event

    +
    +

    +
    +
    +
    +
    + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/10/index.html b/blog/10/index.html new file mode 100644 index 000000000000..0e7df860fedc --- /dev/null +++ b/blog/10/index.html @@ -0,0 +1,991 @@ + + + + + + + + + + + + + Blog | 10 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Speeding up ViTs using Block Sparsity +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    May 02, 2024

    +

    + A Hitchhiker’s Guide to Speculative Decoding +

    +

    Speculative decoding is an optimization technique for inference that makes educated guesses about future tokens while generating the current token, all within a single forward pass. It incorporates a verification mechanism to ensure the correctness of these speculated tokens, thereby guaranteeing that the overall output of speculative decoding is identical to that of vanilla decoding. Optimizing the cost of inference of large language models (LLMs) is arguably one of the most critical factor...
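As a rough illustration of the idea in this excerpt, here is a minimal, greedy-only sketch of one speculative-decoding step in plain PyTorch. It is not the implementation from the post: `draft_model` and `target_model` are assumed to be Hugging Face-style causal LMs that return `.logits`, batch size 1 is assumed, and the sampling-based acceptance rule of full speculative decoding is replaced by exact greedy agreement.

```python
import torch

@torch.no_grad()
def speculative_step(target_model, draft_model, input_ids, k=4):
    # 1) Draft k candidate tokens cheaply with the small model (greedy for simplicity).
    draft_ids = input_ids
    for _ in range(k):
        next_tok = draft_model(draft_ids).logits[:, -1].argmax(-1, keepdim=True)
        draft_ids = torch.cat([draft_ids, next_tok], dim=-1)

    # 2) Verify all k drafted tokens with a single forward pass of the large model.
    target_logits = target_model(draft_ids).logits
    target_pred = target_logits[:, input_ids.shape[1] - 1 :].argmax(-1)  # k+1 greedy predictions
    drafted = draft_ids[:, input_ids.shape[1] :]                          # the k drafted tokens

    # 3) Accept the longest prefix where draft and target agree, then append the
    #    target model's own token at the first disagreement (or its bonus token).
    agree = (target_pred[:, :-1] == drafted)[0]          # assumes batch size 1
    n_accept = int(agree.long().cumprod(0).sum())
    accepted = drafted[:, :n_accept]
    correction = target_pred[:, n_accept : n_accept + 1]
    return torch.cat([input_ids, accepted, correction], dim=-1)
```

Because every emitted token is either verified against or produced by the target model, the output of this greedy variant matches plain greedy decoding of the target model.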

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 02, 2024

    +

    + Announcing PyTorch Docathon June, 2024 +

    +

    We are thrilled to announce the upcoming PyTorch Docathon in June! The Docathon, akin to a hackathon, is an event dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Documentation is a vital component of any technology. By refining it, we can simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine l...

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 01, 2024

    +

    + Accelerating Llama3 FP8 Inference with Triton Kernels +

    +

    1.0 Summary + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 30, 2024

    +

    + ExecuTorch Alpha: Taking LLMs and AI to the Edge with Our Community and Partners +

    +

    We are excited to announce the release of ExecuTorch alpha, focused on deploying large language models (LLMs) and large ML models to the edge, stabilizing the API surface, and improving our installation processes. It has been an exciting few months from our 0.1 (preview) release in collaboration with our partners at Arm, Apple, and Qualcomm Technologies, Inc. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 24, 2024

    +

    + PyTorch 2.3 Release Blog +

    +

    We are excited to announce the release of PyTorch® 2.3 (release note)! PyTorch 2.3 offers support for user-defined Triton kernels in torch.compile, allowing for users to migrate their own Triton kernels from eager without experiencing performance regressions or graph breaks. Tensor Parallelism improves the experience for training Large Language Models using native PyTorch functions, which has been validated on training runs for 100B parameter models. As well, semi-structured sparsity implemen...
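As a hedged sketch of what the user-defined Triton kernel support can look like (not taken from the release notes), the snippet below wraps the standard Triton vector-add example in a `torch.compile`d function. It assumes a CUDA GPU and the `triton` package; names such as `add_kernel` and `compiled_add` are illustrative.

```python
import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    tl.store(out_ptr + offsets,
             tl.load(x_ptr + offsets, mask=mask) + tl.load(y_ptr + offsets, mask=mask),
             mask=mask)

@torch.compile(fullgraph=True)  # the user-defined kernel is captured into the compiled graph
def compiled_add(x, y):
    out = torch.empty_like(x)
    n = out.numel()
    add_kernel[(triton.cdiv(n, 1024),)](x, y, out, n, BLOCK_SIZE=1024)
    return out

x = torch.randn(4096, device="cuda")
y = torch.randn(4096, device="cuda")
torch.testing.assert_close(compiled_add(x, y), x + y)
```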

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 16, 2024

    +

    + torchtune: Easily fine-tune LLMs using PyTorch +

    +

    We’re pleased to announce the alpha release of torchtune, a PyTorch-native library for easily fine-tuning large language models. + +

    + +
    + + Read More + +
    + + + + + + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/11/index.html b/blog/11/index.html new file mode 100644 index 000000000000..2e1cac4b1293 --- /dev/null +++ b/blog/11/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 11 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Maximizing training throughput using PyTorch FSDP +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    February 06, 2024

    +

    + PyTorch 2 paper and tutorial @ ASPLOS 2024 +

    +

    The PyTorch team is excited to share that our paper on PyTorch 2 has been accepted for presentation at the ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), scheduled to take place from April 27 to May 1, 2024, in San Diego, CA, USA. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    February 01, 2024

    +

    + What's New in PyTorch Documentation +

    +

    Greetings to the PyTorch community! Here is a quick update on PyTorch docs. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 30, 2024

    +

    + PyTorch 2.2: FlashAttention-v2 integration, AOTInductor +

    +

    We are excited to announce the release of PyTorch® 2.2 (release note)! PyTorch 2.2 offers ~2x performance improvements to scaled_dot_product_attention via FlashAttention-v2 integration, as well as AOTInductor, a new ahead-of-time compilation and deployment tool built for non-python server-side deployments. + +
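For readers who have not used the fused attention path, the following minimal sketch (illustrative, not from the release post) calls `torch.nn.functional.scaled_dot_product_attention`, which can dispatch to FlashAttention-v2 when the dtype, shapes, and hardware allow; the shapes below are arbitrary.

```python
import torch
import torch.nn.functional as F

# (batch, num_heads, seq_len, head_dim) in fp16 on CUDA, a layout eligible for fused kernels
q, k, v = (torch.randn(2, 16, 1024, 64, device="cuda", dtype=torch.float16) for _ in range(3))

# Dispatches to a fused backend (e.g. FlashAttention-v2) when available, otherwise falls back to math.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
```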

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 30, 2024

    +

    + New Library Updates in PyTorch 2.2 +

    +

    Summary + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 23, 2024

    +

    + Accelerating Generative AI with PyTorch IV: Seamless M4T, fast +

    +

    This post is the fourth part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. To skip to the code, check out our github (seamless_communication, fairseq2). We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In part two, we showed how...

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    January 16, 2024

    +

    + Accelerating Triton Dequantization Kernels for GPTQ +

    +

    TL;DR + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/12/index.html b/blog/12/index.html new file mode 100644 index 000000000000..1f1cee3f0ee3 --- /dev/null +++ b/blog/12/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 12 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Finetune LLMs on your own consumer hardware using tools from PyTorch and Hugging Face ecosystem +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    January 09, 2024

    +

    + Accelerate AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe, saving up to 75% on inference costs +

    +

    Multi-model endpoints (MMEs) are a powerful feature of Amazon SageMaker designed to simplify the deployment and operation of machine learning (ML) models. With MMEs, you can host multiple models on a single serving container and host all the models behind a single endpoint. The SageMaker platform automatically manages the loading and unloading of models and scales resources based on traffic patterns, reducing the operational burden of managing a large quantity of models. This feature is parti...

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 03, 2024

    +

    + Accelerating Generative AI Part III: Diffusion, Fast +

    +

    This post is the third part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In part two, we showed how to accelerate Llama-7B by almost 10x using only native PyTorch optimizations. ...

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 19, 2023

    +

    + Understanding GPU Memory 2: Finding and Removing Reference Cycles +

    +

    This is part 2 of the Understanding GPU Memory blog series. Our first post Understanding GPU Memory 1: Visualizing All Allocations over Time shows how to use the memory snapshot tool. In this part, we will use the Memory Snapshot to visualize a GPU memory leak caused by reference cycles, and then locate and remove them in our code using the Reference Cycle Detector. + +
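For orientation, here is a minimal sketch of capturing a memory snapshot with the underscore-prefixed (semi-private) recorder APIs that this blog series is built around; the toy training loop is only a placeholder, and the Reference Cycle Detector itself is not shown.

```python
import torch

# Start recording allocation history (stack traces included by default).
torch.cuda.memory._record_memory_history(max_entries=100000)

# ... run the code under investigation; a toy stand-in below ...
model = torch.nn.Linear(4096, 4096, device="cuda")
for _ in range(10):
    out = model(torch.randn(64, 4096, device="cuda"))
    out.sum().backward()

# Dump a snapshot to inspect at pytorch.org/memory_viz, then stop recording.
torch.cuda.memory._dump_snapshot("snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)
```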

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 18, 2023

    +

    + Training Production AI Models with PyTorch 2.0 +

    +

    1. Introduction + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    December 14, 2023

    +

    + Understanding GPU Memory 1: Visualizing All Allocations over Time +

    +

    During your time with PyTorch on GPUs, you may be familiar with this common error message: + +

    + +
    + + Read More + +
    + + + + + + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/13/index.html b/blog/13/index.html new file mode 100644 index 000000000000..781cf915730f --- /dev/null +++ b/blog/13/index.html @@ -0,0 +1,999 @@ + + + + + + + + + + + + + Blog | 13 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + + + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    November 30, 2023

    +

    + Accelerating Generative AI with PyTorch II: GPT, Fast +

    +

    This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In this blog we’ll focus on LLM optimization. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 29, 2023

    +

    + PyTorch 2.1 Contains New Performance Features for AI Developers +

    +

    We are excited to see the release of PyTorch 2.1. In this blog, we discuss the five features for which Intel made significant contributions to PyTorch 2.1: + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 16, 2023

    +

    + 🎉 PyTorch Docathon H2 2023 Wrap-up 🎉 +

    +

We are thrilled to announce the successful completion of the Fall 2023 PyTorch Docathon! The event was a resounding success, and we want to extend our heartfelt gratitude to all the participants who made it possible. The dedication, expertise, and tireless efforts of our open-source contributors have once again helped us improve the PyTorch documentation. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 16, 2023

    +

    + Accelerating Generative AI with PyTorch: Segment Anything, Fast +

    +

    This post is the first part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples of how these features can be combined to see how far we can push PyTorch native performance. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 07, 2023

    +

    + PyTorch compile to speed up inference on Llama 2 +

    +

    In this blog, we discuss how to improve the inference latencies of the Llama 2 family of models using PyTorch native optimizations such as native fast kernels, compile transformations from torch compile, and tensor parallel for distributed inference. Our approach results in 29ms/token latency for single user requests on the 70B LLaMa model (as measured on 8 A100 GPUs). We are excited to share our findings with the community and make our code available here. + +
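The post's full recipe (fused kernels, tensor parallelism) is not reproduced here, but the core `torch.compile` step looks roughly like the hedged sketch below; `TinyDecoder` is a toy stand-in rather than Llama 2, and `mode="reduce-overhead"` is one reasonable choice for small-batch decoding, not necessarily the setting used in the post.

```python
import torch

class TinyDecoder(torch.nn.Module):
    """Toy stand-in for a decoder-only LM; illustrative only."""
    def __init__(self, vocab=32000, dim=256):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab, dim)
        self.blocks = torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(dim, 4, batch_first=True), num_layers=2)
        self.head = torch.nn.Linear(dim, vocab)
    def forward(self, ids):
        return self.head(self.blocks(self.emb(ids)))

model = TinyDecoder().eval().cuda()
compiled = torch.compile(model, mode="reduce-overhead")  # reduces per-token launch overhead

ids = torch.randint(0, 32000, (1, 16), device="cuda")
with torch.inference_mode():
    for _ in range(8):  # naive greedy decode loop
        next_id = compiled(ids)[:, -1].argmax(-1, keepdim=True)
        ids = torch.cat([ids, next_id], dim=-1)
```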

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 06, 2023

    +

    + High-Performance Llama 2 Training and Inference with PyTorch/XLA on Cloud TPUs +

    +

    In a landscape where AI innovation is accelerating at an unprecedented pace, Meta’s Llama family of open sourced large language models (LLMs) stands out as a notable breakthrough. Llama marked a significant step forward for LLMs, demonstrating the power of pre-trained architectures for a wide range of applications. Llama 2 further pushed the boundaries of scale and capabilities, inspiring advancements in language understanding, generation, and beyond. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 02, 2023

    +

    + Accelerating Inference on x86-64 Machines with oneDNN Graph +

    +

    Supported in PyTorch 2.0 as a beta feature, oneDNN Graph leverages aggressive fusion patterns to accelerate inference on x86-64 machines, especially Intel® Xeon® Scalable processors. + +
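A hedged sketch of how the oneDNN Graph path is typically enabled for float32 TorchScript inference on CPU is shown below; the conv-bn-relu block is a toy stand-in, and the exact configuration used in the post may differ.

```python
import torch

torch.jit.enable_onednn_fusion(True)  # opt in to the oneDNN Graph fusion pass (beta)

# Toy fp32 conv-bn-relu block standing in for a real CNN; runs on an x86-64 CPU.
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 64, kernel_size=3),
    torch.nn.BatchNorm2d(64),
    torch.nn.ReLU(),
).eval()
example = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    traced = torch.jit.trace(model, example)
    traced = torch.jit.freeze(traced)
    traced(example)  # warm-up iterations let the fuser rewrite the graph
    traced(example)
    out = traced(example)
```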

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/14/index.html b/blog/14/index.html new file mode 100644 index 000000000000..91d330af4bf9 --- /dev/null +++ b/blog/14/index.html @@ -0,0 +1,1001 @@ + + + + + + + + + + + + + Blog | 14 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + AMD Extends Support for PyTorch Machine Learning Development on Select RDNA™ 3 GPUs with ROCm™ 5.7 +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    October 17, 2023

    +

    + PyTorch Edge: Enabling On-Device Inference Across Mobile and Edge Devices with ExecuTorch +

    +

    We are excited to announce ExecuTorch, our all-new solution for enabling on-device inference capabilities across mobile and edge devices with the backing of industry leaders like Arm, Apple, and Qualcomm Innovation Center. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 17, 2023

    +

    + Lightning AI Joins the PyTorch Foundation as a Premier Member +

    +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Lightning AI has joined as a premier member. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 17, 2023

    +

    + Huawei Joins the PyTorch Foundation as a Premier Member +

    +

    Today, the PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, announced that Huawei has joined as a premier member. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 17, 2023

    +

    + Compiling NumPy code into C++ or CUDA via torch.compile +

    +

    Quansight engineers have implemented support for tracing through NumPy code via +torch.compile in PyTorch 2.1. This feature leverages PyTorch’s compiler to +generate efficient fused vectorized code without having to modify your original +NumPy code. Even more, it also allows for executing NumPy code on CUDA +just by running it through torch.compile under torch.device("cuda")! + +
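A minimal sketch of the workflow described above (with made-up shapes): compile a plain NumPy function with `torch.compile`, then run the same function under `torch.device("cuda")` to execute it on the GPU.

```python
import numpy as np
import torch

def numpy_mul_sum(x, y):
    return np.sum(x * y, axis=-1)

compiled = torch.compile(numpy_mul_sum)

x = np.random.randn(1024, 64).astype(np.float32)
y = np.random.randn(1024, 64).astype(np.float32)

out = compiled(x, y)        # traced NumPy ops run through Inductor; output is a NumPy array

with torch.device("cuda"):  # same NumPy code, executed on the GPU
    out_gpu = compiled(x, y)
```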

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 13, 2023

    +

    + Flash-Decoding for long-context inference +

    +

    Motivation + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 11, 2023

    +

    + ML Model Server Resource Saving - Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance +

    +

Reviewers: Yunsang Ju (Naver GplaceAI Leader), Min Jean Cho (Intel), Jing Xu (Intel), Mark Saroufim (Meta) + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 10, 2023

    +

    + Real-time Audio-visual Speech Recognition +

    +

    Audio-Visual Speech Recognition (AV-ASR, or AVSR) is the task of transcribing text from audio and visual streams, which has recently attracted a lot of research attention due to its robustness to noise. The vast majority of work to date has focused on developing AV-ASR models for non-streaming recognition; studies on streaming AV-ASR are very limited. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/15/index.html b/blog/15/index.html new file mode 100644 index 000000000000..4ab53dfea986 --- /dev/null +++ b/blog/15/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 15 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + PyTorch 2.1: automatic dynamic shape compilation, distributed checkpointing +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    October 04, 2023

    +

    + New Library Updates in PyTorch 2.1 +

    +

    Summary + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 04, 2023

    +

    + High performance Llama 2 deployments with AWS Inferentia2 using TorchServe +

    +

    Recently, Llama 2 was released and has attracted a lot of interest from the machine learning community. Amazon EC2 Inf2 instances, powered by AWS Inferentia2, now support training and inference of Llama 2 models. In this post, we show low-latency and cost-effective inference of Llama-2 models on Amazon EC2 Inf2 instances using the latest AWS Neuron SDK release.  We first introduce how to create, compile and deploy the Llama-2 model and explain the optimization techniques introduced by AWS Neu...

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 03, 2023

    +

    + How to Build an Interactive Chat-Generation Model using DialoGPT and PyTorch +

    +

The focus on interactive chat-generation (or conversational response-generation) models has greatly increased in the past several months. Conversational response-generation models such as ChatGPT and Google Bard have taken the AI world by storm. The purpose of interactive chat generation is to answer various questions posed by humans, and these AI-based models use natural language processing (NLP) to generate conversations almost indistinguishable from those generated by humans. + +
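As a concrete (and heavily simplified) illustration of interactive chat generation with DialoGPT, the sketch below follows the widely used Hugging Face pattern of concatenating the chat history with each new user turn; the canned user inputs are placeholders for an interactive prompt.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

chat_history_ids = None
for user_input in ["Hello, how are you?", "Any plans for the weekend?"]:
    # Encode the new user turn and append it to the running chat history.
    new_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")
    bot_input_ids = new_ids if chat_history_ids is None else torch.cat([chat_history_ids, new_ids], dim=-1)

    # Generate a response conditioned on the whole conversation so far.
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    reply = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"User: {user_input}\nDialoGPT: {reply}")
```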

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 02, 2023

    +

    + Announcing PyTorch Docathon H2 2023 +

    +

    We are excited to announce that we will be holding a Docathon for PyTorch on November 1, 2023! This event is an opportunity for our community to come together and improve the quality of our documentation. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    September 25, 2023

    +

    + Inside the Matrix: Visualizing Matrix Multiplication, Attention and Beyond +

    +

    Use 3D to visualize matrix multiplication expressions, attention heads with real weights, and more. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    September 13, 2023

    +

    + Accelerated CPU Inference with PyTorch Inductor using torch.compile +

    +

    Story at a Glance + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    September 12, 2023

    +

    + One Year of PyTorch Foundation +

    +

    It’s been one year since we announced the formation of the PyTorch Foundation! 🎉 + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also install previous versions of PyTorch. Note that LibTorch is only available for C++.

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
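The selector above fills in the final command; as a hedged example only, a pip install pinned to a specific CUDA build typically adds an extra index URL (adjust the cuXXX suffix to the compute platform you selected):

# Example command, not the output of the selector: CUDA 11.8 wheels via pip.
pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu118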
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/16/index.html b/blog/16/index.html new file mode 100644 index 000000000000..fadfe3d07151 --- /dev/null +++ b/blog/16/index.html @@ -0,0 +1,997 @@ + + + + + + + + + + + + + Blog | 16 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + + + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    September 05, 2023

    +

    + Automated trace collection and analysis +

    +

    In this blog, we share how we enabled the collection and analysis of PyTorch Profiler traces for training workloads without any user side code instrumentation. We leveraged Dynolog - an open source daemon for CPU and GPU telemetry to collect PyTorch Profiler traces, and analyzed the collected traces using Holistic Trace Analysis - an open source library for analyzing PyTorch Profiler traces. This toolchain has allowed engineers at Meta to accelerate their performance optimization workflows. T...
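For readers who want to reproduce the analysis half of this workflow, a minimal sketch with Holistic Trace Analysis could look like the following; the trace directory is a placeholder and the exact API surface may differ between HTA releases.

from hta.trace_analysis import TraceAnalysis

# Placeholder path: a directory of PyTorch Profiler (Kineto) trace files, typically one per rank.
analyzer = TraceAnalysis(trace_dir="/path/to/collected/traces")

# Break down where time goes (computation, communication, memory, idle) per rank.
time_breakdown = analyzer.get_temporal_breakdown()
print(time_breakdown.head())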

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 31, 2023

    +

    + PyTorch/XLA SPMD: Scale Up Model Training and Serving with Automatic Parallelization +

    +

    Today, we are delighted to announce PyTorch/XLA SPMD: the integration of GSPMD into PyTorch with an easy to use API. PyTorch developers seeking superior performance and scale can train and serve the largest neural networks while maximizing utilization of AI accelerators, such as Google Cloud TPUs. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 24, 2023

    +

    + Large Scale Training of Hugging Face Transformers on TPUs With PyTorch/XLA FSDP +

    +

    AI is transforming many industries through advanced capabilities such as understanding and generating language, answering questions, and delivering accurate recommendations. These capabilities are fueled by ever-increasing size and complexity of AI models, which require vast amounts of computing power to train. + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    August 07, 2023

    +

    + INT8 Quantization for x86 CPU in PyTorch +

    +

    Overview + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    August 01, 2023

    +

    + AMD's Journey to Openness and Performance +

    +

AMD has made progress in building a robust software stack that supports an open ecosystem of models, libraries, frameworks, and tools. As proven platforms gain momentum, a leading software stack and an optimized ecosystem become increasingly important for achieving application performance. PyTorch is a key part of AMD’s AI journey, and Victor Peng, AMD President, and Soumith Chintala, founder of PyTorch, discussed the latest progress at the DC & AI Keynote on June 12. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/17/index.html b/blog/17/index.html new file mode 100644 index 000000000000..2f78d308fdd6 --- /dev/null +++ b/blog/17/index.html @@ -0,0 +1,999 @@ + + + + + + + + + + + + + Blog | 17 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + + + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    July 27, 2023

    +

    + IBM Joins the PyTorch Foundation as a Premier Member +

    +

    The PyTorch Foundation, part of The Linux Foundation, is pleased to announce that IBM has joined as a premier member. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 25, 2023

    +

    + Announcing CPP-based S3 IO DataPipes +

    +

    Training large deep learning models requires large datasets. Amazon Simple Storage Service (Amazon S3) is a scalable cloud object store service used for storing large training datasets. Machine learning (ML) practitioners need an efficient data pipe that can download data from Amazon S3, transform the data, and feed the data to GPUs for training models with high throughput and low latency. + +In this post, we introduce the new S3 IO DataPipes for PyTorch, S3FileLister and S3FileLoader. For memo...
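A minimal sketch of chaining the two DataPipes named above follows; the bucket and prefix are hypothetical, and it assumes torchdata is installed with its S3 extension and that AWS credentials are configured.

from torchdata.datapipes.iter import IterableWrapper, S3FileLister, S3FileLoader

# Hypothetical bucket/prefix; S3FileLister expands it into individual object URLs.
prefixes = IterableWrapper(["s3://my-training-bucket/dataset/"])
file_urls = S3FileLister(prefixes)
file_streams = S3FileLoader(file_urls)  # yields (url, stream) pairs

for url, stream in file_streams:
    payload = stream.read()  # raw bytes, ready for decoding/transforms
    break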

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 10, 2023

    +

    + How to Accelerate PyTorch Geometric on Intel® CPUs +

    +

    Overview + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    June 28, 2023

    +

    + The Path to Achieve Ultra-Low Inference Latency With LLaMA 65B on PyTorch/XLA +

    +

    Background & State of the Art + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 22, 2023

    +

    + Optimized PyTorch 2.0 Inference with AWS Graviton processors +

    +

    New generations of CPUs offer significant performance improvement in machine learning (ML) inference due to specialized built-in instructions. Combined with their flexibility, high speed of development, and low operating cost, these general-purpose processors offer an alternative ML inference solution to other existing hardware solutions. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 16, 2023

    +

    + 🎉 PyTorch Docathon H1 2023 Wrap-up 🎉 +

    +

    Thank you to all who participated in our first ever PyTorch Docathon, the results have been nothing short of amazing! We want to extend our sincerest gratitude to all the participants who made this event a resounding success. Your passion, talent, and hard work have left an indelible mark on the PyTorch documentation. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/18/index.html b/blog/18/index.html new file mode 100644 index 000000000000..f1dba17fb881 --- /dev/null +++ b/blog/18/index.html @@ -0,0 +1,991 @@ + + + + + + + + + + + + + Blog | 18 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Join the PyTorch Foundation: Membership Now Open +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    May 22, 2023

    +

    + Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0 +

    +

    As part of PyTorch 2.0 release, an accelerated implementation of the attention mechanism as part of the “Better Transformer” project (and known in PyTorch as Accelerated Transformers) has been added natively into PyTorch as torch.nn.functional.scaled_dot_product_attention. This implementation leverages fused kernels from FlashAttention and Memory-efficient attention, and supports both training and inference. + +
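A minimal sketch of calling the new operator directly is below; the tensor shapes are arbitrary, and the fused FlashAttention/memory-efficient kernels are picked automatically when the device and dtype allow it.

import torch
import torch.nn.functional as F

# (batch, num_heads, seq_len, head_dim) query/key/value tensors.
q = torch.randn(2, 8, 1024, 64)
k = torch.randn(2, 8, 1024, 64)
v = torch.randn(2, 8, 1024, 64)

# Dispatches to a fused kernel (e.g. FlashAttention) when one is available for the inputs.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)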

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    May 12, 2023

    +

    + Language Identification: Building an End-to-End AI Solution using PyTorch +

    +

    Language Identification is the process of identifying the primary language from multiple audio input samples. In natural language processing (NLP), language identification is an important problem and a challenging issue. There are many language-related tasks such as entering text on your phone, finding news articles you enjoy, or discovering answers to questions that you may have. All these tasks are powered by NLP models. To decide which model to invoke at a particular point in time, we must...

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 03, 2023

    +

    + Announcing PyTorch Docathon 2023 +

    +

    + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 02, 2023

    +

    + Accelerated Image Segmentation using PyTorch +

    +

    Using Intel® Extension for PyTorch to Boost Image Processing Performance + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 27, 2023

    +

    + Introducing Hidet: A Deep Learning Compiler for Efficient Model Serving +

    +

    Hidet is a powerful deep learning compiler that simplifies the process of implementing high-performing deep learning operators on modern accelerators (e.g., NVIDIA GPUs). With the new feature of torch.compile(...) in PyTorch 2.0, integrating a novel compiler into PyTorch is easier than ever - Hidet now can be used as a torch.compile(...) backend to accelerate PyTorch models, making it an attractive option for PyTorch users who want to improve the inference performance of their models, especia...
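Based on that description, switching backends is a one-argument change; the sketch below assumes the hidet package is installed and registers itself as a torch.compile backend, and uses a toy model for illustration.

import torch

# Toy model; any inference-mode nn.Module works the same way.
model = torch.nn.Sequential(torch.nn.Linear(256, 256), torch.nn.ReLU()).cuda().eval()
x = torch.randn(16, 256, device="cuda")

# Assumption: installing the `hidet` package makes the "hidet" backend available to torch.compile.
compiled = torch.compile(model, backend="hidet")

with torch.no_grad():
    y = compiled(x)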

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 19, 2023

    +

    + Accelerating Large Language Models with Accelerated Transformers +

    +

    TL;DR. We show how to use Accelerated PyTorch 2.0 Transformers and the newly introduced torch.compile() method to accelerate Large Language Models on the example of nanoGPT, a compact open-source implementation of the GPT model from Andrej Karpathy. Using the new scaled dot product attention operator introduced with Accelerated PT2 Transformers, we select the flash_attention custom kernel and achieve faster training time per batch (measured with Nvidia A100 GPUs), going from a ~143ms/batch ba...

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/19/index.html b/blog/19/index.html new file mode 100644 index 000000000000..8dd787fd998b --- /dev/null +++ b/blog/19/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 19 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Experience the power of PyTorch 2.0 on AMD Solutions +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    April 14, 2023

    +

    + Accelerated Generative Diffusion Models with PyTorch 2 +

    +

    TL;DR: PyTorch 2.0 nightly offers out-of-the-box performance improvement for Generative Diffusion models by using the new torch.compile() compiler and optimized implementations of Multihead Attention integrated with PyTorch 2. + +
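As a hedged sketch of the kind of one-line change the post is describing, compiling the UNet of a diffusers pipeline looks roughly like this; the checkpoint id and dtype are illustrative, not the post's benchmark setup.

import torch
from diffusers import StableDiffusionPipeline

# Illustrative checkpoint; any diffusers pipeline with a UNet follows the same pattern.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Compile the most compute-heavy submodule; the first call pays the compilation cost.
pipe.unet = torch.compile(pipe.unet)

image = pipe("a photograph of an astronaut riding a horse").images[0]
image.save("astronaut.png")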

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 07, 2023

    +

    + Straggler Mitigation On PyTorch DDP By Hierarchical SGD +

    +

PyTorch DDP has been widely adopted across the industry for distributed training, which by default runs synchronous SGD to synchronize gradients across model replicas at every step. The performance of this technique is critical for fast iteration during model exploration as well as for resource and cost savings. To resolve a ubiquitous performance bottleneck introduced by slow nodes in large-scale ...

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 07, 2023

    +

    + Celebrate PyTorch 2.0 with New Performance Features for AI Developers +

    +

    Congratulations to the PyTorch Foundation for its release of PyTorch 2.0! In this blog, I discuss the four features for which Intel made significant contributions to PyTorch 2.0: + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 03, 2023

    +

    + PyTorch & OpenXLA: The Path Forward +

    +

    As we celebrate the release of OpenXLA, PyTorch 2.0, and PyTorch/XLA 2.0, it’s worth taking a step back and sharing where we see it all going in the short to medium term. With PyTorch adoption leading in the AI space and XLA supporting best-in-class compiler features, PyTorch/XLA is well positioned to provide a cutting edge development stack for both model training and inference. To achieve this, we see investments in three main areas: + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 28, 2023

    +

    + Accelerated PyTorch 2 Transformers +

    +

The PyTorch 2.0 release includes a new high-performance implementation of the PyTorch Transformer API with the goal of making training and deployment of state-of-the-art Transformer models affordable. Following the successful release of “fastpath” inference execution (“Better Transformer”), this release introduces high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SDPA). + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 22, 2023

    +

    + PyTorch 2.0 & XLA—The Latest Cutting Edge Features +

    +

    Today, we are excited to share our latest work for PyTorch/XLA 2.0. The release of PyTorch 2.0 is yet another major milestone for this storied community and we are excited to continue to be part of it. When the PyTorch/XLA project started in 2018 between Google and Meta, the focus was on bringing cutting edge Cloud TPUs to help support the PyTorch community. Along the way, others in the community such as Amazon joined the project and very quickly the community expanded. We are excited about X...

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 16, 2023

    +

    + Accelerated Diffusers with PyTorch 2.0 +

    +

    PyTorch 2.0 has just been released. Its flagship new feature is torch.compile(), a one-line code change that promises to automatically improve performance across codebases. We have previously checked on that promise in Hugging Face Transformers and TIMM models, and delved deep into its motivation, architecture and the road ahead. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/2/index.html b/blog/2/index.html new file mode 100644 index 000000000000..9c2749687733 --- /dev/null +++ b/blog/2/index.html @@ -0,0 +1,1000 @@ + + + + + + + + + + + + + Blog | 2 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + PyTorch 2.7 Release +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    April 08, 2025

    +

    + Accelerating Whisper on Arm with PyTorch and Hugging Face Transformers +

    +

    Automatic speech recognition (ASR) has revolutionized how we interact with technology, clearing the way for applications like real-time audio transcription, voice assistants, and accessibility tools. OpenAI Whisper is a powerful model for ASR, capable of multilingual speech recognition and translation. + +
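A minimal sketch of Whisper transcription through the Transformers pipeline API is shown below; the checkpoint size and audio path are placeholders, and the Arm-specific tuning from the post is not reflected here.

from transformers import pipeline

# Placeholder checkpoint and audio file; larger Whisper variants trade speed for accuracy.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
result = asr("sample_speech.wav")
print(result["text"])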

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 03, 2025

    +

    + PyTorch Day France 2025: Call For Proposals Open +

    +

    We’re pleased to announce PyTorch Day France 2025, a dedicated gathering of the PyTorch community held 7 May 2025 in Paris, France. Proudly hosted by the PyTorch Foundation and co-located with GOSIM AI Paris 2025, this event will bring together developers, researchers, and practitioners driving innovation in open source AI and machine learning. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 19, 2025

    +

    + PyTorch Day China 2025 Call for Proposals Open +

    +

    We’re excited to announce the first-ever PyTorch Day China! This new event, hosted by the PyTorch Foundation, will take place on June 7 in Beijing, China, bringing together AI practitioners, researchers, and industry professionals to explore the latest advancements in open source AI and machine learning. Co-located with the BAAI Conference, PyTorch Day China is a chance to connect with the community, share knowledge, and help shape the future of deep learning. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 13, 2025

    +

    + Introducing the New PyTorch Landscape: Your Guide to the PyTorch Ecosystem +

    +

    We’re excited to reveal our brand new PyTorch Landscape. The PyTorch Landscape helps researchers, developers, and organizations easily locate useful, curated, community-built tools that augment the PyTorch core framework. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 11, 2025

    +

    + Scaling Recommendation Systems Training to Thousands of GPUs with 2D Sparse Parallelism +

    +

    At Meta, recommendation systems are the cornerstone of delivering relevant and personalized ads to billions of users globally. Through technologies like PyTorch’s TorchRec, we’ve successfully developed solutions that enable model training across hundreds of GPUs. While these systems have served us well, recent research on scaling laws has revealed a compelling opportunity: we can achieve significantly better model performance by training dramatically larger neural networks. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 06, 2025

    +

    + Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel +

    +

LinkedIn: Shivam Sahni, Byron Hsu, Yanning Chen; Meta: Ankith Gunapal, Evan Smothers + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 05, 2025

    +

    + Current and New Activation Checkpointing Techniques in PyTorch +

    +

    As models scale in depth, batch size, and sequence length, etc, activation memory becomes an increasingly significant contributor to the overall memory usage. To help address this, PyTorch provides utilities for activation checkpointing, which reduce the number of saved tensors by recomputing them when needed, trading off memory usage for additional compute. + +
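For reference, the basic utility this post builds on can be sketched in a few lines; the post itself covers more advanced variants beyond this minimal example.

import torch
from torch.utils.checkpoint import checkpoint

block = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1024),
)
x = torch.randn(8, 1024, requires_grad=True)

# Activations inside `block` are recomputed during backward instead of being stored.
y = checkpoint(block, x, use_reentrant=False)
y.sum().backward()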

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/20/index.html b/blog/20/index.html new file mode 100644 index 000000000000..4de7cdad8d20 --- /dev/null +++ b/blog/20/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 20 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + PyTorch 2.0: Our next generation release that is faster, more Pythonic and Dynamic as ever +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    March 15, 2023

    +

    + New Library Updates in PyTorch 2.0 +

    +

    Summary + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    February 02, 2023

    +

    + Deprecation of CUDA 11.6 and Python 3.7 Support +

    +

    For the upcoming PyTorch 2.0 feature release (target March 2023), we will target CUDA 11.7 as the stable version and CUDA 11.8 as the experimental version of CUDA and Python >=3.8, <=3.11. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 09, 2023

    +

    + PyTorch Trace Analysis for the Masses +

    +

    We are excited to announce the public release of Holistic Trace Analysis (HTA), an open source performance analysis and visualization Python library for PyTorch users. HTA takes as input Kineto traces collected by the PyTorch profiler, which are complex and challenging to interpret, and up-levels the performance information contained in these traces. It was initially developed internally at Meta to understand and debug performance problems for large-scale distributed training jobs on GPUs. Th...

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 31, 2022

    +

    + Compromised PyTorch-nightly dependency chain between December 25th and December 30th, 2022. +

    +

    If you installed PyTorch-nightly on Linux via pip between December 25, 2022 and December 30, 2022, please uninstall it and torchtriton immediately, and use the latest nightly binaries (newer than Dec 30th 2022). + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 28, 2022

    +

    + Torchserve Performance Tuning, Animated Drawings Case-Study +

    +

    In this post we discuss performance tuning of Torchserve for serving your models in production. One of the biggest challenges in the life cycle of a ML project is deploying models in production. This requires a reliable serving solution along with solutions that address the MLOps needs. A robust serving solution needs to provide support for multi model serving, model versioning, metric logging, monitoring and scaling to serve the peak traffic. In this post, we will have an overview of Torchs...

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 22, 2022

    +

    + Scaling Vision Model Training Platforms with PyTorch +

    +

    TL;DR: We demonstrate the use of PyTorch with FairScale’s FullyShardedDataParallel (FSDP) API in writing large vision transformer models. We discuss our techniques for scaling and optimizing these models on a GPU cluster. The goal of this platform scaling effort is to enable research at scale. This blog does not discuss model accuracy, new model architectures, or new training recipes. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/2024-year-in-review/index.html b/blog/2024-year-in-review/index.html new file mode 100644 index 000000000000..ed442c8e5796 --- /dev/null +++ b/blog/2024-year-in-review/index.html @@ -0,0 +1,713 @@ + + + + + + + + + + + + + PyTorch Grows as the Dominant Open Source Framework for AI and ML: 2024 Year in Review | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Eli Uriegas, Meta and Jennifer Bly, PyTorch Foundation + +

    +

    This past year was a monumental year for PyTorch from major releases to the flagship PyTorch Conference. We’ve seen incredible growth in contributions from more than 3,500 individuals and 3,000 organizations. It’s safe to say PyTorch has now become the dominant deep learning framework for AI/ML. PyTorch leads the model training space with a 63% adoption rate according to the recent Shaping the Future of Generative AI Report from the Linux Foundation.

    + +

    group at a conference

    + +

    The PyTorch Foundation was formed in 2022 with the goal to drive the adoption of AI tooling by fostering and sustaining an ecosystem of open source, vendor-neutral projects centered around PyTorch and today remains a vibrant, collaborative hub created for and by the deep learning community. As we wrap up the year, let’s take a look back at a few highlights and how this year has been one of growth, collaboration, innovation, and community.

    + +

    2024 Highlights: A Year of Growth and Impact

    + +

PyTorch accelerated its growth this year. Contributions are up 133%, coming from double the number of organizations worldwide compared to last year.

    + +

    The project has seen 20% year-over-year growth in new repositories using PyTorch, and a 30% increase in forks and users this past year.

    + +

    Over 70% of AI research implementations are now using PyTorch.

    + +

    Statistics based on the 2024 Linux Foundation Annual Report.

    + +

    people at a conference

    + +

The PyTorch tools ecosystem grew by over 25%, enhancing both software and hardware capabilities. Working with all major cloud service providers, dozens of major software vendors, and industry partners, PyTorch is setting a new bar for the pace and breadth of AI innovation.

    + +

    people at a conference

    + +

This year featured four milestone PyTorch releases: 2.2, 2.3, 2.4, and 2.5. These releases introduced hallmark features like AOTInductor, FlashAttention-2 support, Tensor Parallelism, a new Python Custom Operator API, and FlexAttention. Engineers from across PyTorch Foundation member companies have also come together to introduce support and optimizations for platforms like Intel GPUs (XPU) and AWS Graviton processors, along with Inductor performance improvements.

    + +

    Throughout the year the PyTorch Team has been working hard to introduce a number of new PyTorch-native libraries! The ExecuTorch team released their alpha in collaboration with partners from Arm, Apple, and Qualcomm Technologies, Inc. then quickly followed with a beta focused on stability and adding MediaTek. TorchTune established a PyTorch-native library for easily fine-tuning large language models. TorchAO introduced a PyTorch native library that makes models faster and smaller by leveraging low bit dtypes, quantization and sparsity. TorchCodec was launched to give developers a simple, performant, and PyTorch native way to decode videos into tensors. TorchRec 1.0 was released, the first stable release of the PyTorch native recommendation systems library.

    + +

    We’ve also had a number of strong technical showcases throughout the year to highlight how PyTorch can be used! TorchTitan exhibited what an open source, PyTorch-native distributed training system could look like for training large language models (LLMs). TorchChat showcased how to seamlessly and performantly run LLMs across laptop, desktop, and mobile devices.

    + +

We were also very excited to welcome multiple new projects into the PyTorch Ecosystem throughout 2024, including vLLM, a state-of-the-art inference engine that gives machine learning engineers an easy, fast, and cheap way of serving LLMs. If you are interested in joining the PyTorch Ecosystem, please join!

    + +

    people at a conference

    + +

    In June in Paris, France we premiered the official PyTorch documentary on powering the AI Revolution that spotlights PyTorch’s vibrant ecosystem and its role in advancing AI innovation. The film unveiled the authentic narrative of PyTorch’s inception, attributing its existence to a dedicated group of unsung heroes driving technological innovation.

    + +

    people at a conference

    + +

    The PyTorch Conference 2024, brought in triple the registrations compared to 2023, reflecting the rapid growth of AI and machine learning communities around open source technologies. The two day event included insightful talks, hands-on sessions, and lively discussions about the future of AI, covering everything from generative AI to large language models.

    + +

    A brand new Startup Showcase featured early-stage founders pitching their AI startups to a panel of top venture capitalists, a DL Compiler Mini-Summit took a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads, and a Fine-Tuning Mini-Summit brought together a thriving community of researchers, developers, practitioners and hobbyists to discuss topics like memory efficiency, parameter-efficient fine-tuning, and performance at scale.

    + +

    speaking on stage at a conference

    + +

    Outstanding contributors were honored with PyTorch Contributor Awards. Congratulations to this year’s nominees and recipients for the outstanding individuals and teams who have played a pivotal role in PyTorch’s journey this year.

    + +

    people at a conference

    + +

    PyTorch Foundation membership is growing with the addition of Arm and Rebellions this year. At the year-end mark, Premier Members include: AMD, Arm, AWS, Google Cloud, Huawei, Hugging Face, IBM, Intel, Lightning AI, Meta, Microsoft Azure, and NVIDIA. General Members include: Graphcore, Rebellions, and Snowflake. If your organization is interested in joining, find out how you can become a member of the PyTorch Foundation.

    + +

PyTorch hosted numerous in-person and virtual events: the PyTorch Docathon, where contributors worked to improve PyTorch documentation and foster collaboration; local meetups around the world, which brought together interested parties in locations from Shanghai to Seoul; and more than a dozen webinars that drew attendees from everywhere during our Summer Webinar Series, live Q&As, and Expert Exchanges.

    + +

    Matt speaking at a conference

    + +

    PyTorch Foundation welcomed new leadership this year. Executive Director Matt White took the reins in April and immediately began raising the profile of PyTorch across the AI landscape. The Technical Advisory Council (TAC) also elected new leadership with Luca Antiga, Lightning AI as the Chair and Jiong Gong, Intel as Vice Chair.

    + +

    The PyTorch Governing Board continued to set the direction and lead the Foundation in accomplishing its mission. The PyTorch Marketing and Outreach Committee developed programs to maximize the visibility of PyTorch and advance the interests of the community. The PyTorch CI Working Group assembled to successfully migrate the PyTorch CI pipeline to the Linux Foundation.

    + +

    Our community joined us on social media with 775 thousand followers strong across X, LinkedIn, Facebook, and YouTube with more than 12 million impressions of PyTorch content throughout the year. The PyTorch Ecosystem also grew, adding many new projects to leverage PyTorch deep learning across many vertical domains.

    + +

    people at a conference

    + +

    PyTorch was mentioned in the media in top technology publications such as The New Stack’s article on Why PyTorch Gets All the Love and InfoWorld’s article on how the TorchAO PyTorch library makes models faster and smaller.

    + +

    We published 74 technical and community blogs, and nearly ten million people visited the PyTorch website throughout the year.

    + +

    fire dancers at a conference

    + +

    Thanks to each of you who helped make this year an outstanding success! The evolution and growth we’ve seen PyTorch undergo over the past year is driven by the passion, dedication, and ingenuity of this amazing community. Looking ahead to next year, we’re excited to build on this momentum as we continue to push the boundaries of AI.

    + +

    Save the date for the PyTorch Conference which will be held October 22-23, 2025 in San Francisco. 2025 promises even greater innovation and stronger community collaboration.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/2025-priorities-for-tac/index.html b/blog/2025-priorities-for-tac/index.html new file mode 100644 index 000000000000..58d2bb2c0553 --- /dev/null +++ b/blog/2025-priorities-for-tac/index.html @@ -0,0 +1,664 @@ + + + + + + + + + + + + + 2025 Priorities for the PyTorch Technical Advisory Council (TAC) | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Luca Antiga, PyTorch TAC Chair + +

    +

    social share

    + +

2024 has been a year of incredible growth for PyTorch. As that growth continues in 2025, the PyTorch Foundation has taken important steps towards evolving the governance of the project under the Linux Foundation’s vendor-neutral umbrella.

    + +

    An important piece of governance for PyTorch is represented by the Technical Advisory Council (TAC). The TAC acts as a bridge between the industry, including but not limited to the PyTorch Foundation members, the community, and the PyTorch core development team.

    + +

    Operating with transparency and inclusivity, the TAC gathers input, facilitates collaboration, and drives initiatives that enhance the experience for everyone who relies on PyTorch.

    + +

    In 2025, the TAC will focus on four key areas:

    + +
      +
1. Build Open, Multi-Cloud Continuous Integration (CI): Building on the groundwork from 2024, the TAC will oversee the transition to an open, community-driven CI infrastructure. In addition to ensuring the extremely high bar for correctness that PyTorch has, PyTorch’s CI is complex with a high-quality bar including many automated functional and performance daily test runs. In 2025, PyTorch’s CI infrastructure will be fully open sourced and extended to support multiple compute providers, enabling broader contribution and participation to the effort from organizations benefitting from PyTorch.
2. Support more Accelerators: The TAC is committed to creating a level playing field for the growing landscape of AI accelerators. By gathering industry players and PyTorch developers, the TAC will facilitate efforts towards third-party device support and provide levels of integration of external CI systems with the main PyTorch CI. This will make it easier for emerging hardware to gain adoption within the PyTorch ecosystem, and for users to experiment with diverse compute options for training and inference.
3. Create a High-Quality, User-Centric Ecosystem: A big focus for the TAC in early 2025 is on improving the experience and discoverability of the PyTorch ecosystem. With many projects growing organically, users often face challenges navigating projects of different scope and quality within the rapidly changing AI landscape. To solve this, a newly curated ecosystem landscape tool will be launched soon on the PyTorch website. We will also introduce lightweight, open processes to improve projects and ensure users a predictable, high-quality experience. In many ways, the experience with PyTorch is as good as its ecosystem.
4. Gather Feedback from Industry and the Community: PyTorch has widespread adoption across research labs, startups, and enterprises. Striking the right balance between expressiveness and performance across the board is a very challenging task, so the TAC set out to be one of the several ways the Core development team receives signals. During our monthly TAC meetings, we provide the opportunity to PyTorch Foundation members from industry and academia, as well as non-member organizations, to present their use case and their challenges and discuss them directly with appropriate members of the Core team. This feedback loop helps prioritize improvements, ensuring the framework stays relevant in a fast-evolving AI landscape.
    + +

    By focusing on these priorities, the TAC aims to maintain PyTorch’s position as the leading deep learning framework, while ensuring it remains open, accessible, and responsive to the needs of its diverse community.

    + +

    As members of the TAC, we’re extremely excited to contribute to the success of PyTorch and to the impact it’s having in the real world. If you are a PyTorch user or developer, consider participating in our monthly calls (they are open to everyone, and the recordings are available here). Also, if you develop or maintain a project based on PyTorch, consider contributing it to the new PyTorch ecosystem (instructions).

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/21/index.html b/blog/21/index.html new file mode 100644 index 000000000000..63c53e856beb --- /dev/null +++ b/blog/21/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 21 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Efficient Large-Scale Training with Pytorch FSDP and AWS +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    December 15, 2022

    +

    + Scaling PyTorch FSDP for Training Foundation Models on IBM Cloud +

    +

    Large model training using a cloud-native approach is of growing interest for many enterprises given the emergence and success of foundation models. Some AI practitioners may assume that the only way to achieve high GPU utilization for distributed training jobs is to run them on HPC systems, such as those interconnected with InfiniBand, and may not consider Ethernet-connected systems. We demonstrate how the latest distributed training technique, Fully Sharded Data Parallel (FSDP) from P...

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 02, 2022

    +

    + Get Started with PyTorch 2.0 Summary and Overview +

    +

    Introducing PyTorch 2.0, our first steps toward the next generation 2-series release of PyTorch. Over the last few years we have innovated and iterated from PyTorch 1.0 to the most recent 1.13 and moved to the newly formed PyTorch Foundation, part of the Linux Foundation. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 02, 2022

    +

    + Accelerating Hugging Face and TIMM models with PyTorch 2.0 +

    +

    torch.compile() makes it easy to experiment with different compiler backends to make PyTorch code faster with a single-line decorator, torch.compile(). It works directly over an nn.Module as a drop-in replacement for torch.jit.script(), but without requiring you to make any source code changes. We expect this one-line code change to provide 30%–2x training-time speedups on the vast majority of models that you’re already running.
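    To make the one-line change concrete, here is a minimal, hedged sketch (the small nn.Sequential model and random input are placeholders; PyTorch 2.0 or later is assumed):

    import torch
    import torch.nn as nn

    # Placeholder model; any nn.Module or plain Python function works the same way.
    model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))

    # The single-line change: wrap the module with torch.compile().
    compiled_model = torch.compile(model)

    x = torch.randn(8, 64)
    out = compiled_model(x)  # first call compiles; later calls reuse the compiled code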

    + +
    + + Read More + +
    + + + + + + + + + + + + + + + + + + + +
    +
    +

    November 17, 2022

    +

    + Introducing TorchMultimodal - a library for accelerating exploration in Multimodal AI +

    +

    We are announcing TorchMultimodal Beta, a PyTorch domain library for training SoTA multi-task multimodal models at scale. The library provides composable building blocks (modules, transforms, loss functions) to accelerate model development, SoTA model architectures (FLAVA, MDETR, Omnivore) from published research, training and evaluation scripts, as well as notebooks for exploring these models. The library is under active development, and we’d love to hear your feedback! You can find more det...

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/22/index.html b/blog/22/index.html new file mode 100644 index 000000000000..270922c8f32a --- /dev/null +++ b/blog/22/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 22 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + PyTorch Enterprise Support Program Update +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    November 03, 2022

    +

    + Extending TorchVision’s Transforms to Object Detection, Segmentation & Video tasks +

    +

    Note: A previous version of this post was published in November 2022. We have updated this post with the most up-to-date info, in view of the upcoming 0.15 release of torchvision in March 2023, jointly with PyTorch 2.0. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 28, 2022

    +

    + New Library Updates in PyTorch 1.13 +

    +

    Summary + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 28, 2022

    +

    + PyTorch 1.13 release, including beta versions of functorch and improved support for Apple’s new M1 chips. +

    +

    We are excited to announce the release of PyTorch® 1.13 (release note)! This includes Stable versions of BetterTransformer. We deprecated CUDA 10.2 and 11.3 and completed migration of CUDA 11.6 and 11.7. Beta includes improved support for Apple M1 chips and functorch, a library that offers composable vmap (vectorization) and autodiff transforms, which is now included in-tree with the PyTorch release. This release is composed of over 3,749 commits made by 467 contributors since 1.12.1. We want to sincere...

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 17, 2022

    +

    + PyTorch’s Tracing Based Selective Build +

    +

    Introduction + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 13, 2022

    +

    + Scaling PyTorch models on Cloud TPUs with FSDP +

    +

    Introduction + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    September 29, 2022

    +

    + Performance Debugging of Production PyTorch Models at Meta +

    +

    1. Meta’s AI Performance Profiling (MAIProf) + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    September 26, 2022

    +

    + Announcing PyTorch Conference 2022 +

    +

    We are excited to announce that the PyTorch Conference returns in person as a satellite event to NeurIPS (Neural Information Processing Systems) in New Orleans on Dec. 2nd.

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/23/index.html b/blog/23/index.html new file mode 100644 index 000000000000..6fe200decb71 --- /dev/null +++ b/blog/23/index.html @@ -0,0 +1,997 @@ + + + + + + + + + + + + + Blog | 23 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + PyTorch strengthens its governance by joining the Linux Foundation +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    August 29, 2022

    +

    + Fast Beam Search Decoding in PyTorch with TorchAudio and Flashlight Text +

    +

    Beam search decoding with industry-leading speed from Flashlight Text (part of the Flashlight ML framework) is now available with official support in TorchAudio, bringing high-performance beam search and text utilities for speech and text applications built on top of PyTorch. The current integration supports CTC-style decoding, but it can be used for any modeling setting that outputs token-level probability distributions over time steps. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 26, 2022

    +

    + Introducing nvFuser, a deep learning compiler for PyTorch +

    +

    nvFuser is a Deep Learning Compiler for NVIDIA GPUs that automatically just-in-time compiles fast and flexible kernels to reliably accelerate users’ networks. It provides significant speedups for deep learning networks running on Volta and later CUDA accelerators by generating fast custom “fusion” kernels at runtime. nvFuser is specifically designed to meet the unique requirements of the PyTorch community, and it supports diverse network architectures and programs with dynamic inputs of varyi...

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 24, 2022

    +

    + Accelerating PyTorch Vision Models with Channels Last on CPU +

    +

    Overview + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 18, 2022

    +

    + Easily list and initialize models with new APIs in TorchVision +

    +

    TorchVision now supports listing and initializing all available built-in models and weights by name. This new API builds upon the recently introduced Multi-weight support API, is currently in Beta, and addresses a long-standing request from the community.
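    A minimal sketch of the name-based workflow, assuming a torchvision release (0.14+) in which list_models(), get_model_weights(), and get_model() are available:

    import torchvision.models as models

    # Enumerate the registered built-in models by name.
    available = models.list_models()
    print(len(available), available[:5])

    # Initialize a model purely from its name, with its default pre-trained weights.
    weights = models.get_model_weights("resnet50").DEFAULT
    resnet = models.get_model("resnet50", weights=weights)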

    + +
    + + Read More + +
    + + + + + + + + + + + + + + +
    +
    +

    July 19, 2022

    +

    + What Every User Should Know About Mixed Precision Training in PyTorch +

    +

    Efficient training of modern neural networks often relies on using lower-precision data types. Peak float16 matrix multiplication and convolution performance is 16x faster than peak float32 performance on A100 GPUs. And since the float16 and bfloat16 data types are only half the size of float32, they can double the performance of bandwidth-bound kernels and reduce the memory required to train a network, allowing for larger models, larger batches, or larger inputs. Using a module like torch.amp...
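    As a hedged illustration of the torch.amp pattern described above (a toy linear model, random data, and an available CUDA device are assumptions):

    import torch

    model = torch.nn.Linear(512, 512).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    scaler = torch.cuda.amp.GradScaler()  # scales the loss to avoid fp16 gradient underflow

    for _ in range(10):
        x = torch.randn(32, 512, device="cuda")
        optimizer.zero_grad()
        # Ops inside autocast run in float16 where it is numerically safe to do so.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            loss = model(x).square().mean()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()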

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/24/index.html b/blog/24/index.html new file mode 100644 index 000000000000..778bd83ec28b --- /dev/null +++ b/blog/24/index.html @@ -0,0 +1,996 @@ + + + + + + + + + + + + + Blog | 24 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Case Study: PathAI Uses PyTorch to Improve Patient Outcomes with AI-powered Pathology +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    July 12, 2022

    +

    + A BetterTransformer for Fast Transformer Inference +

    +

    tl;dr Transformers achieve state-of-the-art performance for NLP, and are becoming popular for a myriad of other tasks. They are computationally expensive, which has been a blocker to their widespread productionisation. Launching with PyTorch 1.12, BetterTransformer implements a backwards-compatible fast path of torch.nn.TransformerEncoder for Transformer Encoder Inference and does not require model authors to modify their models. BetterTransformer improvements can exceed 2x in speedup and thro...
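    Because the fast path is transparent, inference code stays the ordinary nn.TransformerEncoder pattern. A minimal sketch, assuming PyTorch 1.12+, eval mode, and no autograd (the conditions under which the fast path can engage):

    import torch
    import torch.nn as nn

    layer = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
    encoder = nn.TransformerEncoder(layer, num_layers=6).eval()

    src = torch.randn(4, 128, 256)  # (batch, sequence, embedding)
    with torch.inference_mode():
        out = encoder(src)  # fused fastpath kernels are used when the inputs are eligible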

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 28, 2022

    +

    + PyTorch 1.12: TorchArrow, Functional API for Modules and nvFuser, are now available +

    +

    We are excited to announce the release of PyTorch 1.12 (release note)! This release is composed of over 3,124 commits from 433 contributors. Along with 1.12, we are releasing beta versions of AWS S3 Integration, PyTorch Vision Models on Channels Last on CPU, Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16 and FSDP API. We want to sincerely thank our dedicated community for your contributions.

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 28, 2022

    +

    + New library updates in PyTorch 1.12 +

    +

    We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 1.12 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 27, 2022

    +

    + How Computational Graphs are Executed in PyTorch +

    +

    Welcome to the final entry in our series on understanding the autograd engine of PyTorch! If you haven’t read parts 1 & 2, check them out now to understand how PyTorch creates the computational graph for the backward pass!

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 23, 2022

    +

    + Geospatial deep learning with TorchGeo +

    +

    TorchGeo is a PyTorch domain library providing datasets, samplers, transforms, and pre-trained models specific to geospatial data. + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    May 18, 2022

    +

    + Introducing Accelerated PyTorch Training on Mac +

    +

    In collaboration with the Metal engineering team at Apple, we are excited to announce support for GPU-accelerated PyTorch training on Mac. Until now, PyTorch training on Mac only leveraged the CPU, but with the upcoming PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. + +
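    A minimal sketch of opting into the Apple silicon backend, assuming a PyTorch build (1.12+) with MPS support; the tiny linear model is a placeholder:

    import torch

    # Fall back to CPU when the MPS backend is not available on this machine.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    model = torch.nn.Linear(128, 10).to(device)
    x = torch.randn(32, 128, device=device)
    print(model(x).shape, device)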

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/25/index.html b/blog/25/index.html new file mode 100644 index 000000000000..8307aefa4a0e --- /dev/null +++ b/blog/25/index.html @@ -0,0 +1,1000 @@ + + + + + + + + + + + + + Blog | 25 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    + +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    March 16, 2022

    +

    + Running PyTorch Models on Jetson Nano +

    +

    Overview: NVIDIA Jetson Nano, part of the Jetson family of products or Jetson modules, is a small yet powerful Linux (Ubuntu) based embedded computer with 2/4 GB of GPU memory. With it, you can run many PyTorch models efficiently. This document summarizes our experience of running different deep learning models using 3 different mechanisms on Jetson Nano:

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 14, 2022

    +

    + Introducing PyTorch Fully Sharded Data Parallel (FSDP) API +

    +

    Recent studies have shown that large model training will be beneficial for improving model quality. During the last 3 years, model size grew 10,000 times from BERT with 110M parameters to Megatron-2 with one trillion. However, training large AI models is not easy—aside from the need for large amounts of computing resources, software engineering complexity is also challenging. PyTorch has been working on building tools and infrastructure to make it easier. + +
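    For orientation, a minimal FSDP sketch; it assumes the script is launched with torchrun so the process-group environment variables are set, with one GPU per process, and uses a toy model in place of a real network:

    import torch
    import torch.distributed as dist
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

    dist.init_process_group("nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    model = torch.nn.Sequential(
        torch.nn.Linear(1024, 1024), torch.nn.ReLU(), torch.nn.Linear(1024, 1024)
    ).cuda()

    # FSDP shards parameters, gradients, and optimizer state across the ranks.
    model = FSDP(model)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    loss = model(torch.randn(8, 1024, device="cuda")).sum()
    loss.backward()
    optimizer.step()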

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 10, 2022

    +

    + PyTorch 1.11, TorchData, and functorch are now available +

    +

    We are excited to announce the release of PyTorch 1.11 (release notes). This release is composed of over 3,300 commits since 1.10, made by 434 contributors. Along with 1.11, we are releasing beta versions of TorchData and functorch. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 10, 2022

    +

    + Introducing TorchRec, and other domain library updates in PyTorch 1.11 +

    +

    We are introducing the beta release of TorchRec and a number of improvements to the current PyTorch domain libraries, alongside the PyTorch 1.11 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. Highlights include: + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    February 24, 2022

    +

    + Case Study: Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing +

    +

    Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    February 23, 2022

    +

    + Introducing TorchRec, a library for modern production recommendation systems +

    +

    We are excited to announce TorchRec, a PyTorch domain library for Recommendation Systems. This new library provides common sparsity and parallelism primitives, enabling researchers to build state-of-the-art personalization models and deploy them in production. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/26/index.html b/blog/26/index.html new file mode 100644 index 000000000000..25031f242d9d --- /dev/null +++ b/blog/26/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 26 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Practical Quantization in PyTorch +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    December 22, 2021

    +

    + Introducing TorchVision’s New Multi-Weight Support API +

    +

    TorchVision has a new backwards compatible API for building models with multi-weight support. The new API allows loading different pre-trained weights on the same model variant, keeps track of vital meta-data such as the classification labels, and includes the preprocessing transforms necessary for using the models. In this blog post, we plan to review the prototype API, showcase its features, and highlight key differences from the existing one.
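    A minimal sketch using the form of the API that later stabilized in torchvision, where weight enums bundle their preprocessing transforms and label metadata:

    from torchvision.models import resnet50, ResNet50_Weights

    # Choose a specific set of pre-trained weights for the same architecture.
    weights = ResNet50_Weights.IMAGENET1K_V2
    model = resnet50(weights=weights).eval()

    # The weights object carries its own preprocessing and class labels.
    preprocess = weights.transforms()
    categories = weights.meta["categories"]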

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 15, 2021

    +

    + Efficient PyTorch: Tensor Memory Format Matters +

    +

    Ensuring the right memory format for your inputs can significantly impact the running time of your PyTorch vision models. When in doubt, choose a Channels Last memory format. + +
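    Switching a vision model and its inputs to channels last is a two-line change; a minimal sketch with a single convolution standing in for a full model:

    import torch

    model = torch.nn.Conv2d(3, 64, kernel_size=3).to(memory_format=torch.channels_last)
    x = torch.randn(8, 3, 224, 224).to(memory_format=torch.channels_last)

    out = model(x)  # same shapes and results; only the memory layout (strides) changes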

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 08, 2021

    +

    + Announcing the Winners of the 2021 PyTorch Annual Hackathon +

    +

    More than 1,900 people worked hard in this year’s PyTorch Annual Hackathon to create unique tools and applications for PyTorch developers and researchers. + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    October 29, 2021

    +

    + Feature Extraction in TorchVision using Torch FX +

    +

    + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 26, 2021

    +

    + Accelerating PyTorch with CUDA Graphs +

    +

    Today, we are pleased to announce that a new advanced CUDA feature, CUDA Graphs, has been brought to PyTorch. Modern DL frameworks have complicated software stacks that incur significant overheads associated with the submission of each operation to the GPU. When DL workloads are strong-scaled to many GPUs for performance, the time taken by each GPU operation diminishes to just a few microseconds and, in these cases, the high work submission latencies of frameworks often lead to low utilization of ...
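    A minimal capture-and-replay sketch with the torch.cuda.graph API; it assumes a CUDA device and that tensor shapes stay fixed between replays:

    import torch

    model = torch.nn.Linear(1024, 1024).cuda()
    static_input = torch.randn(64, 1024, device="cuda")

    # Warm up on a side stream before capture, as the capture API expects.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            model(static_input)
    torch.cuda.current_stream().wait_stream(s)

    # Capture one forward pass, then replay it cheaply with new data.
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        static_output = model(static_input)

    static_input.copy_(torch.randn(64, 1024, device="cuda"))
    g.replay()  # static_output now holds the result for the updated input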

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 21, 2021

    +

    + PyTorch 1.10 Release, including CUDA Graphs APIs, Frontend and Compiler Improvements +

    +

    We are excited to announce the release of PyTorch 1.10. This release is composed of over 3,400 commits since 1.9, made by 426 contributors. We want to sincerely thank our community for continuously improving PyTorch. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/27/index.html b/blog/27/index.html new file mode 100644 index 000000000000..2807203041ab --- /dev/null +++ b/blog/27/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 27 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + New Library Releases in PyTorch 1.10, including TorchX, TorchAudio, TorchVision +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    September 08, 2021

    +

    + Announcing PyTorch Annual Hackathon 2021 +

    +

    We’re excited to announce the PyTorch Annual Hackathon 2021! This year, we’re looking to support the community in creating innovative PyTorch tools, libraries, and applications. 2021 is the third year we’re hosting this Hackathon, and we welcome you to join the PyTorch community and put your machine learning skills into action. Submissions start on September 8 and end on November 3. Good luck to everyone! + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 31, 2021

    +

    + How Computational Graphs are Constructed in PyTorch +

    +

    In the previous post we went over the theoretical foundations of automatic differentiation and reviewed the implementation in PyTorch. In this post, we will be showing the parts of PyTorch involved in creating the graph and executing it. In order to understand the following contents, please read @ezyang’s wonderful blog post about PyTorch internals. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 23, 2021

    +

    + Announcing PyTorch Developer Day 2021 +

    +

    We are excited to announce PyTorch Developer Day (#PTD2), taking place virtually from December 1 & 2, 2021. Developer Day is designed for developers and users to discuss core technical developments, ideas, and roadmaps. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 18, 2021

    +

    + PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models +

    +

    In this blog post, we describe the first peer-reviewed research paper that explores accelerating the hybrid of PyTorch DDP (torch.nn.parallel.DistributedDataParallel) [1] and Pipeline (torch.distributed.pipeline) - PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models (Transformers such as BERT [2] and ViT [3]), published at ICML 2021. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 03, 2021

    +

    + What’s New in PyTorch Profiler 1.9? +

    +

    PyTorch Profiler v1.9 has been released! The goal of this new release (previous PyTorch Profiler release) is to provide you with new state-of-the-art tools to help diagnose and fix machine learning performance issues regardless of whether you are working on one or numerous machines. The objective is to target the execution steps that are the most costly in time and/or memory, and visualize the work load distribution between GPUs and CPUs. + +
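    A minimal profiling sketch with the torch.profiler API the post describes; it is CPU-only here (add ProfilerActivity.CUDA when profiling GPU work), and the linear model is a placeholder:

    import torch
    from torch.profiler import profile, ProfilerActivity

    model = torch.nn.Linear(512, 512)
    x = torch.randn(128, 512)

    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        for _ in range(10):
            model(x)

    # Print the most expensive operators by total CPU time.
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))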

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 27, 2021

    +

    + Everything You Need To Know About Torchvision’s SSDlite Implementation +

    +

    In the previous article, we’ve discussed how the SSD algorithm works, covered its implementation details and presented its training process. If you have not read the previous blog post, I encourage you to check it out before continuing. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 23, 2021

    +

    + The torch.linalg module: Accelerated Linear Algebra with Autograd in PyTorch +

    +

    Linear algebra is essential to deep learning and scientific computing, and it’s always been a core part of PyTorch. PyTorch 1.9 extends PyTorch’s support for linear algebra operations with the torch.linalg module. This module, documented here, has 26 operators, including faster and easier to use versions of older PyTorch operators, every function from NumPy’s linear algebra module extended with accelerator and autograd support, and a few operators that are completely new. This makes the torch...
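    A small sketch of the NumPy-style naming with autograd support (random matrices as stand-ins for real data):

    import torch

    A = torch.randn(4, 4, dtype=torch.float64, requires_grad=True)
    b = torch.randn(4, dtype=torch.float64)

    x = torch.linalg.solve(A, b)   # mirrors numpy.linalg.solve
    x.sum().backward()             # gradients flow back to A

    Q, R = torch.linalg.qr(torch.randn(6, 4))  # decompositions work on CPU or accelerators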

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/28/index.html b/blog/28/index.html new file mode 100644 index 000000000000..0203ea43473c --- /dev/null +++ b/blog/28/index.html @@ -0,0 +1,999 @@ + + + + + + + + + + + + + Blog | 28 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + An Overview of the PyTorch Mobile Demo Apps +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    June 16, 2021

    +

    + Everything You Need To Know About Torchvision’s SSD Implementation +

    +

    In TorchVision v0.10, we’ve released two new Object Detection models based on the SSD architecture. Our plan is to cover the key implementation details of the algorithms along with information on how they were trained in a two-part article. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 15, 2021

    +

    + PyTorch 1.9 Release, including torch.linalg and Mobile Interpreter +

    +

    We are excited to announce the release of PyTorch 1.9. The release is composed of more than 3,400 commits since 1.8, made by 398 contributors. The release notes are available here. Highlights include: major improvements to support scientific computing, including torch.linalg, torch.special, and Complex Autograd; major improvements in on-device binary size with the Mobile Interpreter; and native support for elastic fault-tolerance training through the upstreaming of TorchElastic into PyTorch Core...

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 15, 2021

    +

    + New PyTorch Library Releases in PyTorch 1.9, including TorchVision, TorchAudio, and more +

    +

    Today, we are announcing updates to a number of PyTorch libraries, alongside the PyTorch 1.9 release. The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio. These releases, along with the PyTorch 1.9 release, include a number of new features and improvements that will provide a broad set of updates for the PyTorch community. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 08, 2021

    +

    + Overview of PyTorch Autograd Engine +

    +

    This blog post is based on PyTorch version 1.8, although it should apply for older versions too, since most of the mechanics have remained constant. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 26, 2021

    +

    + Everything you need to know about TorchVision’s MobileNetV3 implementation +

    +

    In TorchVision v0.9, we released a series of new mobile-friendly models that can be used for Classification, Object Detection and Semantic Segmentation. In this article, we will dig deep into the code of the models, share notable implementation details, explain how we configured and trained them, and highlight important tradeoffs we made during their tuning. Our goal is to disclose technical details that typically remain undocumented in the original papers and repos of the models. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 25, 2021

    +

    + Announcing the PyTorch Enterprise Support Program +

    +

    Today, we are excited to announce the PyTorch Enterprise Support Program, a participatory program that enables service providers to develop and offer tailored enterprise-grade support to their customers. This new offering, built in collaboration between Facebook and Microsoft, was created in direct response to feedback from PyTorch enterprise users who are developing models in production at scale for mission-critical applications. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 10, 2021

    +

    + PyTorch Ecosystem Day 2021 Recap and New Contributor Resources +

    +

    Thank you to our incredible community for making the first ever PyTorch Ecosystem Day a success! The day was filled with discussions on new developments, trends and challenges showcased through 71 posters, 32 breakout sessions and 6 keynote speakers. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/29/index.html b/blog/29/index.html new file mode 100644 index 000000000000..2f6193ec60e1 --- /dev/null +++ b/blog/29/index.html @@ -0,0 +1,1000 @@ + + + + + + + + + + + + + Blog | 29 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + An overview of the ML models introduced in TorchVision v0.9 +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    March 25, 2021

    +

    + Introducing PyTorch Profiler - the new and improved performance tool +

    +

    Along with PyTorch 1.8.1 release, we are excited to announce PyTorch Profiler – the new and improved performance debugging profiler for PyTorch. Developed as part of a collaboration between Microsoft and Facebook, the PyTorch Profiler is an open-source tool that enables accurate and efficient performance analysis and troubleshooting for large-scale deep learning models. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 24, 2021

    +

    + PyTorch for AMD ROCm™ Platform now available as Python package +

    +

    With the PyTorch 1.8 release, we are delighted to announce a new installation option for users of PyTorch on the ROCm™ open software platform. An installable Python package is now hosted on pytorch.org, along with instructions for local installation in the same simple, selectable format as PyTorch packages for CPU-only configurations and other GPU platforms. PyTorch on ROCm includes full capability for mixed-precision and large-scale training using AMD’s MIOpen & RCCL libraries. This prov...

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 09, 2021

    +

    + Announcing PyTorch Ecosystem Day +

    +

    We’re proud to announce our first PyTorch Ecosystem Day. The virtual, one-day event will focus completely on our Ecosystem and Industry PyTorch communities! + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 04, 2021

    +

    + PyTorch 1.8 Release, including Compiler and Distributed Training updates, and New Mobile Tutorials +

    +

    We are excited to announce the availability of PyTorch 1.8. This release is composed of more than 3,000 commits since 1.7. It includes major updates and new features for compilation, code optimization, frontend APIs for scientific computing, and AMD ROCm support through binaries that are available via pytorch.org. It also provides improved features for large-scale training for pipeline and model parallelism, and gradient compression. A few of the highlights include: support for doing pytho...

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 04, 2021

    +

    + New PyTorch library releases including TorchVision Mobile, TorchAudio I/O, and more +

    +

    Today, we are announcing updates to a number of PyTorch libraries, alongside the PyTorch 1.8 release. The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio as well as new version of TorchCSPRNG. These releases include a number of new features and improvements and, along with the PyTorch 1.8 release, provide a broad set of updates for the PyTorch community to build on and leverage. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 03, 2021

    +

    + The torch.fft module: Accelerated Fast Fourier Transforms with Autograd in PyTorch +

    +

    The Fast Fourier Transform (FFT) calculates the Discrete Fourier Transform in O(n log n) time. It is foundational to a wide variety of numerical algorithms and signal processing techniques since it makes working in signals’ “frequency domains” as tractable as working in their spatial or temporal domains. + +
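    A short sketch of recovering a known frequency with torch.fft, using a synthetic sine wave as the signal:

    import math
    import torch

    n = 256
    t = torch.arange(n, dtype=torch.float32)
    signal = torch.sin(2 * math.pi * 8 * t / n)      # 8 cycles over the window

    spectrum = torch.fft.rfft(signal)                # one-sided FFT of a real signal
    freqs = torch.fft.rfftfreq(n)                    # matching frequency bins (cycles per sample)

    peak = freqs[spectrum.abs().argmax()]            # recovers the injected frequency, 8 / n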

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 12, 2020

    +

    + Prototype Features Now Available - APIs for Hardware Accelerated Mobile and ARM64 Builds +

    +

    Today, we are announcing four PyTorch prototype features. The first three of these will enable Mobile machine-learning developers to execute models on the full set of hardware (HW) engines making up a system-on-chip (SOC). This gives developers options to optimize their model execution for unique performance, power, and system-level concurrency. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/3/index.html b/blog/3/index.html new file mode 100644 index 000000000000..1f14b89b6cb2 --- /dev/null +++ b/blog/3/index.html @@ -0,0 +1,1000 @@ + + + + + + + + + + + + + Blog | 3 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + 📣 Submit to Speak at PyTorch Conference + Save on Registration +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    February 26, 2025

    +

    + Accelerating Generative AI with PyTorch: Segment Anything 2 - Fast and furious inference with low latency and fast cold starts +

    +

    This post is a follow-up to our first entry in the multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch and a focus on latency and elastic scalability. We use torch.compile and torch.export to create highly optimized low latency versions of SAM2 that can be quickly scaled up on new instances. + +
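    As a rough illustration of that approach (not the SAM2-specific pipeline from the post), a model can be compiled and exported in a few lines; the model and example input below are placeholders:

    import torch

    # Placeholder model and input standing in for the SAM2 components discussed in the post.
    model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).eval()
    example = torch.randn(1, 16)

    # torch.compile returns an optimized callable with the same interface as the model.
    compiled = torch.compile(model)
    with torch.no_grad():
        out = compiled(example)

    # torch.export captures a standalone graph that can be serialized and reloaded elsewhere.
    exported = torch.export.export(model, (example,))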

    + +
    + + Read More + +
    + + + + +
    +
    +

    February 11, 2025

    +

    + Unlocking the Latest Features in PyTorch 2.6 for Intel Platforms +

    +

    PyTorch* 2.6 has just been released with a set of exciting new features including torch.compile compatibility with Python 3.13, new security and performance enhancements, and a change in the default parameter for torch.load. PyTorch also announced the deprecation of its official Anaconda channel. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    February 05, 2025

    +

    + Enabling advanced GPU features in PyTorch - Warp Specialization +

    +

    Meta: Hongtao Yu, Manman Ren, Bert Maher, Shane Nay +NVIDIA: Gustav Zhu, Shuhao Jiang + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 29, 2025

    +

    + PyTorch 2.6 Release Blog +

    +

    We are excited to announce the release of PyTorch® 2.6 (release notes)! This release features multiple improvements for PT2: torch.compile can now be used with Python 3.13; new performance-related knob torch.compiler.set_stance; several AOTInductor enhancements. Besides the PT2 improvements, another highlight is FP16 support on X86 CPUs. + +
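    A small sketch of the new torch.compiler.set_stance knob mentioned above (stance names as documented for 2.6; treat the exact values as subject to the release notes):

    import torch

    @torch.compile
    def f(x):
        return x.sin() + x.cos()

    x = torch.randn(8)
    f(x)  # compiled on first call as usual

    # Temporarily run eagerly (e.g. while debugging) without removing torch.compile decorations.
    torch.compiler.set_stance("force_eager")
    f(x)  # executes eagerly
    torch.compiler.set_stance("default")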

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    January 24, 2025

    +

    + How Intel Uses PyTorch to Empower Generative AI through Intel Arc GPUs +

    +

    Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 21, 2025

    +

    + Accelerating LLM Inference with GemLite, TorchAO and SGLang +

    +

    Large Language Models (LLMs) are typically very resource-intensive, requiring significant amounts of memory, compute, and power to operate effectively. Quantization provides a solution by reducing weights and activations from 16-bit floats to lower bit widths (e.g., 8-bit, 4-bit, 2-bit), achieving significant speedups and memory savings while also enabling support for larger batch sizes. + +
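    As a back-of-the-envelope illustration of the memory side of that claim (weights only, ignoring activations and runtime overhead), with an assumed 8B-parameter model:

    params = 8e9  # assumed parameter count, for illustration only
    for bits in (16, 8, 4, 2):
        print(f"{bits}-bit weights: ~{params * bits / 8 / 2**30:.1f} GiB")
    # Prints roughly: 16-bit ~14.9 GiB, 8-bit ~7.5 GiB, 4-bit ~3.7 GiB, 2-bit ~1.9 GiB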

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/30/index.html b/blog/30/index.html new file mode 100644 index 000000000000..39179ec7c4a7 --- /dev/null +++ b/blog/30/index.html @@ -0,0 +1,989 @@ + + + + + + + + + + + + + Blog | 30 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Announcing PyTorch Developer Day 2020 +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    October 27, 2020

    +

    + PyTorch 1.7 released w/ CUDA 11, New APIs for FFTs, Windows support for Distributed training and more +

    +

    Today, we’re announcing the availability of PyTorch 1.7, along with updated domain libraries. The PyTorch 1.7 release includes a number of new APIs including support for NumPy-Compatible FFT operations, profiling tools and major updates to both distributed data parallel (DDP) and remote procedure call (RPC) based distributed training. In addition, several features moved to stable including custom C++ Classes, the memory profiler, extensions via custom tensor-like objects, user async functions...
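    The NumPy-compatible FFT support mentioned above lives in the torch.fft module; a minimal sketch of a round trip:

    import torch
    import torch.fft  # explicit module import was needed in the 1.7 era

    signal = torch.arange(4, dtype=torch.float32)
    spectrum = torch.fft.fft(signal)       # complex-valued spectrum
    recovered = torch.fft.ifft(spectrum)   # real part matches the input up to numerical error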

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 01, 2020

    +

    + Announcing the Winners of the 2020 Global PyTorch Summer Hackathon +

    +

    More than 2,500 participants in this year’s Global PyTorch Summer Hackathon pushed the envelope to create unique new tools and applications for PyTorch developers and researchers. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 24, 2020

    +

    + PyTorch framework for cryptographically secure random number generation, torchcsprng, now available +

    +

    One of the key components of modern cryptography is the pseudorandom number generator. Katz and Lindell stated, “The use of badly designed or inappropriate random number generators can often leave a good cryptosystem vulnerable to attack. Particular care must be taken to use a random number generator that is designed for cryptographic use, rather than a ‘general-purpose’ random number generator which may be fine for some applications but not ones that are required to be cryptographically secu...

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 18, 2020

    +

    + PyTorch 1.6 now includes Stochastic Weight Averaging +

    +

    Do you use stochastic gradient descent (SGD) or Adam? Regardless of the procedure you use to train your neural network, you can likely achieve significantly better generalization at virtually no additional cost with a simple new technique now natively supported in PyTorch 1.6, Stochastic Weight Averaging (SWA) [1]. Even if you have already trained your model, it’s easy to realize the benefits of SWA by running SWA for a small number of epochs starting with a pre-trained model. Again and again...
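    A condensed sketch of that workflow using the torch.optim.swa_utils helpers added in 1.6; model, loader, loss_fn, epochs, and swa_start are assumed to exist:

    import torch
    from torch.optim.swa_utils import AveragedModel, SWALR, update_bn

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    swa_model = AveragedModel(model)             # maintains the running average of weights
    swa_scheduler = SWALR(optimizer, swa_lr=0.05)

    for epoch in range(epochs):
        for x, y in loader:
            optimizer.zero_grad()
            loss_fn(model(x), y).backward()
            optimizer.step()
        if epoch >= swa_start:                   # start averaging after a warm-up phase
            swa_model.update_parameters(model)
            swa_scheduler.step()

    update_bn(loader, swa_model)                 # recompute BatchNorm statistics for the averaged weights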

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 11, 2020

    +

    + Efficient PyTorch I/O library for Large Datasets, Many Files, Many GPUs +

    +

    Data sets are growing bigger every day and GPUs are getting faster. This means there are more data sets for deep learning researchers and engineers to train and validate their models. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 28, 2020

    +

    + PyTorch 1.6 released w/ Native AMP Support, Microsoft joins as maintainers for Windows +

    +

    Today, we’re announcing the availability of PyTorch 1.6, along with updated domain libraries. We are also excited to announce the team at Microsoft is now maintaining Windows builds and binaries and will also be supporting the community on GitHub as well as the PyTorch Windows discussion forums. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 28, 2020

    +

    + PyTorch feature classification changes +

    +

    Traditionally features in PyTorch were classified as either stable or experimental with an implicit third option of testing bleeding edge features by building master or through installing nightly builds (available via prebuilt whls). This has, in a few cases, caused some confusion around the level of readiness, commitment to the feature and backward compatibility that can be expected from a user perspective. Moving forward, we’d like to better classify the 3 types of features as well as defin...

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/31/index.html b/blog/31/index.html new file mode 100644 index 000000000000..884555a7cc0c --- /dev/null +++ b/blog/31/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 31 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Microsoft becomes maintainer of the Windows version of PyTorch +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    July 28, 2020

    +

    + Introducing native PyTorch automatic mixed precision for faster training on NVIDIA GPUs +

    +

    Most deep learning frameworks, including PyTorch, train with 32-bit floating point (FP32) arithmetic by default. However, this is not essential to achieve full accuracy for many deep learning models. In 2017, NVIDIA researchers developed a methodology for mixed-precision training, which combined single-precision (FP32) with half-precision (e.g. FP16) format when training a network, and achieved the same accuracy as FP32 training using the same hyperparameters, with additional performance benef...
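    A minimal sketch of the native AMP pattern the post introduces; model, optimizer, loader, and loss_fn are assumed to already exist on a CUDA device:

    import torch

    scaler = torch.cuda.amp.GradScaler()

    for x, y in loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():     # ops run in FP16 or FP32 as appropriate
            loss = loss_fn(model(x), y)
        scaler.scale(loss).backward()       # scale the loss to avoid FP16 gradient underflow
        scaler.step(optimizer)
        scaler.update()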

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 05, 2020

    +

    + Updates & Improvements to PyTorch Tutorials +

    +

    PyTorch.org provides researchers and developers with documentation, installation instructions, latest news, community projects, tutorials, and more. Today, we are introducing usability and content improvements including tutorials in additional categories, a new recipe format for quickly referencing common topics, sorting using tags, and an updated homepage. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 21, 2020

    +

    + PyTorch library updates including new model serving library +

    +

    Along with the PyTorch 1.5 release, we are announcing new libraries for high-performance PyTorch model serving and tight integration with TorchElastic and Kubernetes. Additionally, we are releasing updated packages for torch_xla (Google Cloud TPUs), torchaudio, torchvision, and torchtext. All of these new libraries and enhanced capabilities are available today and accompany all of the core features released in PyTorch 1.5. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 21, 2020

    +

    + PyTorch 1.5 released, new and updated APIs including C++ frontend API parity with Python +

    +

    Today, we’re announcing the availability of PyTorch 1.5, along with new and updated libraries. This release includes several major new API additions and improvements. PyTorch now includes a significant update to the C++ frontend, ‘channels last’ memory format for computer vision models, and a stable release of the distributed RPC framework used for model-parallel training. The release also has new APIs for autograd for hessians and jacobians, and an API that allows the creation of Custom C++ ...

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 26, 2020

    +

    + Introduction to Quantization on PyTorch +

    +

    It’s important to make efficient use of both server-side and on-device compute resources when developing machine learning applications. To support more efficient deployment on servers and edge devices, PyTorch added a support for model quantization using the familiar eager mode Python API. + +
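    As one concrete instance of that eager-mode API, dynamic quantization of a model's linear layers is a one-liner; the toy model below is just for illustration:

    import torch
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))
    quantized = torch.quantization.quantize_dynamic(
        model, {nn.Linear}, dtype=torch.qint8   # int8 weights, dequantized on the fly
    )
    out = quantized(torch.randn(1, 128))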

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 15, 2020

    +

    + PyTorch 1.4 released, domain libraries updated +

    +

    Today, we’re announcing the availability of PyTorch 1.4, along with updates to the PyTorch domain libraries. These releases build on top of the announcements from NeurIPS 2019, where we shared the availability of PyTorch Elastic, a new classification framework for image and video, and the addition of Preferred Networks to the PyTorch community. For those that attended the workshops at NeurIPS, the content can be found here. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 06, 2019

    +

    + PyTorch adds new tools and libraries, welcomes Preferred Networks to its community +

    +

    PyTorch continues to be used for the latest state-of-the-art research on display at the NeurIPS conference next week, making up nearly 70% of papers that cite a framework. In addition, we’re excited to welcome Preferred Networks, the maintainers of the Chainer framework, to the PyTorch community. Their teams are moving fully over to PyTorch for developing their ML capabilities and services. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/32/index.html b/blog/32/index.html new file mode 100644 index 000000000000..14621617ac2d --- /dev/null +++ b/blog/32/index.html @@ -0,0 +1,992 @@ + + + + + + + + + + + + + Blog | 32 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + OpenMined and PyTorch partner to launch fellowship funding for privacy-preserving ML community +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    October 10, 2019

    +

    + PyTorch 1.3 adds mobile, privacy, quantization, and named tensors +

    +

    PyTorch continues to gain momentum because of its focus on meeting the needs of researchers, its streamlined workflow for production use, and most of all because of the enthusiastic support it has received from the AI community. PyTorch citations in papers on ArXiv grew 194 percent in the first half of 2019 alone, as noted by O’Reilly, and the number of contributors to the platform has grown more than 50 percent over the last year, to nearly 1,200. Facebook, Microsoft, Uber, and other organiz...

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 08, 2019

    +

    + New Releases: PyTorch 1.2, torchtext 0.4, torchaudio 0.3, and torchvision 0.4 +

    +

    Since the release of PyTorch 1.0, we’ve seen the community expand to add new tools, contribute to a growing set of models available in the PyTorch Hub, and continually increase usage in both research and production. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 23, 2019

    +

    + Mapillary Research: Seamless Scene Segmentation and In-Place Activated BatchNorm +

    +

    With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 18, 2019

    +

    + PyTorch Adds New Ecosystem Projects for Encrypted AI and Quantum Computing, Expands PyTorch Hub +

    +

    The PyTorch ecosystem includes projects, tools, models and libraries from a broad community of researchers in academia and industry, application developers, and ML engineers. The goal of this ecosystem is to support, accelerate, and aid in your exploration with PyTorch and help you push the state of the art, no matter what field you are exploring. Similarly, we are expanding the recently launched PyTorch Hub to further help you discover and reproduce the latest research. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 10, 2019

    +

    + Towards Reproducible Research with PyTorch Hub +

    +

    Reproducibility is an essential requirement for many fields of research including those based on machine learning techniques. However, many machine learning publications are either not reproducible or are difficult to reproduce. With the continued growth in the number of research publications, including tens of thousands of papers now hosted on arXiv and submissions to conferences at an all time high, research reproducibility is more important than ever. While many of these publications are a...
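    The workflow above centers on torch.hub; loading a published model takes a couple of lines (a sketch using the torchvision entrypoints as an example):

    import torch

    print(torch.hub.list("pytorch/vision"))    # entrypoints declared in the repo's hubconf.py
    model = torch.hub.load("pytorch/vision", "resnet18", pretrained=True)
    model.eval()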

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 22, 2019

    +

    + torchvision 0.3: segmentation, detection models, new datasets and more.. +

    +

    PyTorch domain libraries like torchvision provide convenient access to common datasets and models that can be used to quickly create a state-of-the-art baseline. Moreover, they also provide common abstractions to reduce boilerplate code that users might have to otherwise repeatedly write. The torchvision 0.3 release brings several new features including models for semantic segmentation, object detection, instance segmentation, and person keypoint detection, as well as custom C++ / CUDA ops sp...
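    For example, the new detection models can be instantiated directly from torchvision.models (a sketch with a random image tensor as input):

    import torch
    import torchvision

    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    model.eval()
    with torch.no_grad():
        predictions = model([torch.rand(3, 480, 640)])   # list of dicts: boxes, labels, scores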

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 08, 2019

    +

    + Model Serving in PyTorch +

    +

    PyTorch has seen a lot of adoption in research, but people can get confused about how well PyTorch models can be taken into production. This blog post is meant to clear up any confusion people might have about the road to production in PyTorch. +When people talk about taking a model “to production,” they usually mean performing inference, sometimes called model evaluation or prediction or serving. At the level of a function call, in PyTorch, inference looks something like this: + +
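    A representative minimal inference call (a sketch, not necessarily the exact snippet from the post; model and input_tensor are assumed to exist):

    import torch

    model.eval()                       # inference behavior for dropout, batch norm, etc.
    with torch.no_grad():              # skip autograd bookkeeping
        output = model(input_tensor)   # input_tensor is a placeholder for real input data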

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/33/index.html b/blog/33/index.html new file mode 100644 index 000000000000..1b2af7d09b40 --- /dev/null +++ b/blog/33/index.html @@ -0,0 +1,997 @@ + + + + + + + + + + + + + Blog | 33 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Optimizing CUDA Recurrent Neural Networks with TorchScript +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    May 01, 2019

    +

    + PyTorch adds new dev tools as it hits production scale +

    +

    This is a partial re-post of the original blog post on the Facebook AI Blog. The full post can be viewed here + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 29, 2019

    +

    + Stochastic Weight Averaging in PyTorch +

    +

    In this blogpost we describe the recently proposed Stochastic Weight Averaging (SWA) technique [1, 2], and its new implementation in torchcontrib. SWA is a simple procedure that improves generalization in deep learning over Stochastic Gradient Descent (SGD) at no additional cost, and can be used as a drop-in replacement for any other optimizer in PyTorch. SWA has a wide range of applications and features: + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 02, 2018

    +

    + The road to 1.0: production ready PyTorch +

    +

    We would like to give you a preview of the roadmap for PyTorch 1.0, the next release of PyTorch. Over the last year, we’ve had 0.2, 0.3 and 0.4 transform PyTorch from a [Torch+Chainer]-like interface into something cleaner, adding double-backwards, numpy-like functions, advanced indexing and removing Variable boilerplate. At this time, we’re confident that the API is in a reasonable and stable state to release 1.0. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    April 22, 2018

    +

    + PyTorch 0.4.0 Migration Guide +

    +

    Welcome to the migration guide for PyTorch 0.4.0. In this release we introduced many exciting new features and critical bug fixes, with the goal of providing users a better and cleaner interface. In this guide, we will cover the most important changes in migrating existing code from previous versions: + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    March 05, 2018

    +

    + Tensor Comprehensions in PyTorch +

    +

    Tensor Comprehensions (TC) is a tool that lowers the barrier for writing high-performance code. It generates GPU code from a simple high-level language and autotunes the code for specific input sizes. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 19, 2018

    +

    + PyTorch, a year in.... +

    +

    Today marks 1 year since PyTorch was released publicly. It’s been a wild ride — our quest to build a flexible deep learning research platform. Over the last year, we’ve seen an amazing community of people using, contributing to and evangelizing PyTorch — thank you for the love. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 27, 2017

    +

    + PyTorch Internals Part II - The Build System +

    +

    In the first post I explained how we generate a torch.Tensor object that you can use in your Python interpreter. Next, I will explore the build system for PyTorch. The PyTorch codebase has a variety of components: + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/34/index.html b/blog/34/index.html new file mode 100644 index 000000000000..972cd1030a1e --- /dev/null +++ b/blog/34/index.html @@ -0,0 +1,863 @@ + + + + + + + + + + + + + Blog | 34 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + A Tour of PyTorch Internals (Part I) +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/4/index.html b/blog/4/index.html new file mode 100644 index 000000000000..e61998f1368f --- /dev/null +++ b/blog/4/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 4 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + GenAI Acceleration for PyTorch 2.5 on Intel® Xeon®Processors +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    January 09, 2025

    +

    + Integrating Ascend Backend with Torchtune through PyTorch Multi-Device Support +

    +

    In this blog, we will briefly introduce torchtune, the Ascend backend, and demonstrate how torchtune can be used to fine-tune models with Ascend. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    January 06, 2025

    +

    + High-Performance Low-Bit Operators for PyTorch +

    +

    We are excited to announce the addition of embedding operators with low-bit weights (1-8 bit) and linear operators with 8-bit dynamically quantized activations and low-bit weights (1-8 bit) for Arm CPUs in TorchAO, PyTorch’s native low-precision library. These operators work seamlessly across all PyTorch surfaces, including eager, torch.compile, AOTI, and ExecuTorch, and are available to use in torchchat. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 23, 2024

    +

    + PyTorch Grows as the Dominant Open Source Framework for AI and ML: 2024 Year in Review +

    +

    This past year was a monumental year for PyTorch from major releases to the flagship PyTorch Conference. We’ve seen incredible growth in contributions from more than 3,500 individuals and 3,000 organizations. It’s safe to say PyTorch has now become the dominant deep learning framework for AI/ML. PyTorch leads the model training space with a 63% adoption rate according to the recent Shaping the Future of Generative AI Report from the Linux Foundation. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 20, 2024

    +

    + Improve RAG performance with torch.compile on AWS Graviton Processors +

    +

    Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to support tasks like answering questions, translating languages, and completing sentences. There are a few challenges when working with LLMs such as domain knowledge gaps, factuality issues, and hallucination, which affect their reliability especially for the fields that require high levels of accuracy, such as healthcare, law, or engineering. Retrieval Augmented Generation (RAG) provides a soluti...

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 11, 2024

    +

    + torchcodec: Easy and Efficient Video Decoding for PyTorch +

    +

    We are pleased to officially announce torchcodec, a library for decoding videos into PyTorch tensors. It is fast, accurate, and easy to use. When running PyTorch models on videos, torchcodec is our recommended way to turn those videos into data your model can use. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 06, 2024

    +

    + Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton +

    +

    2D block quantization for Float8 (FP8) holds the promise of improving the accuracy of Float8 quantization while also accelerating GEMM’s for both inference and training. In this blog, we showcase advances using Triton for the two main phases involved in doing block quantized Float8 GEMMs. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    December 02, 2024

    +

    + HadaCore: Tensor Core Accelerated Hadamard Transform Kernel +

    +

    Quantization is a method for improving model inference speeds by compressing model weights and performing (faster) computation in lower precision data types. However, quantization can result in accuracy loss due to the presence of outliers.

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/5/index.html b/blog/5/index.html new file mode 100644 index 000000000000..d6517d93c52d --- /dev/null +++ b/blog/5/index.html @@ -0,0 +1,991 @@ + + + + + + + + + + + + + Blog | 5 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Supercharging Training using float8 and FSDP2 +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    November 21, 2024

    +

    + Rebellions Joins the PyTorch Foundation as a General Member +

    +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Rebellions has joined as a general member.

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 18, 2024

    +

    + Distilling Llama3.1 8B into 1B in torchtune +

    +

    In this blog, we present a case study on distilling a Llama 3.1 8B model into Llama 3.2 1B using torchtune’s knowledge distillation recipe. We demonstrate how knowledge distillation (KD) can be used in post-training to improve instruction-following task performance and showcase how users can leverage the recipe. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    November 01, 2024

    +

    + Deep Dive on CUTLASS Ping-Pong GEMM Kernel +

    +

    In this post, we provide an overview, with relevant FP8 inference kernel benchmarking, of the CUTLASS Ping-Pong GEMM kernel.

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 31, 2024

    +

    + Deploying LLMs with TorchServe + vLLM +

    +

    The vLLM engine is currently one of the top-performing ways to execute large language models (LLM). It provides the vllm serve command as an easy option to deploy a model on a single machine. While this is convenient, to serve these LLMs in production and at scale some advanced features are necessary. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 30, 2024

    +

    + Triton Kernel Compilation Stages +

    +

    The Triton open-source programming language and compiler offers a high-level, python-based approach to create efficient GPU code. In this blog, we highlight the underlying details of how a triton program is compiled and the intermediate representations. For an introduction to Triton, we refer readers to this blog. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 28, 2024

    +

    + Unleashing the Power of AI on Mobile: LLM Inference for Llama 3.2 Quantized Models with ExecuTorch and KleidiAI +

    +

    At the recent PyTorch Conference, Arm highlighted the widespread impact of its technology, spanning from cloud to edge, emphasizing its commitment to delivering its advanced AI computing capabilities seamlessly to millions of developers worldwide.

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 28, 2024

    +

    + Getting started with PyTorch, ExecuTorch, and Ethos-U85 in three easy steps +

    +

    ExecuTorch support for Ethos-U85 + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    Install PyTorch

    + +

    Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

    + +

    NOTE: Latest PyTorch requires Python 3.9 or later.

    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Your OS
    +
    +
    +
    Package
    +
    +
    +
    Language
    +
    +
    +
    Compute Platform
    +
    +
    +
    Run this Command:
    +
    +
    + +
    +
    +
    +
    PyTorch Build
    +
    +
    +
    Stable (1.13.0)
    +
    +
    +
    Preview (Nightly)
    +
    +
    +
    +
    +
    Your OS
    +
    +
    +
    Linux
    +
    +
    +
    Mac
    +
    +
    +
    Windows
    +
    +
    +
    +
    +
    Package
    +
    +
    +
    Pip
    +
    +
    +
    LibTorch
    +
    +
    +
    Source
    +
    +
    +
    +
    +
    Language
    +
    +
    +
    Python
    +
    +
    +
    C++ / Java
    +
    +
    +
    +
    +
    Compute Platform
    +
    +
    +
    CUDA 11.8
    +
    +
    +
    CUDA 12.1
    +
    +
    +
    CUDA 12.4
    +
    +
    +
    ROCm 5.2
    +
    +
    +
    CPU
    +
    +
    +
    +
    +
    Run this Command:
    +
    +
    +
    pip install torch torchvision
    +
    +
    +
    +
    +
    + + + + Previous versions of PyTorch + +
    + +
    +

    Quick Start With
    Cloud Partners

    + +

    Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

    + +
    + + +
    +
    +
    + Google Cloud Platform +
    + + + + + +
    +
    + +
    +
    +
    +

    Microsoft Azure

    +
    + + +
    +
    + +
    +
    +
    + Lightning Studios +
    + +
    +
    +
    + +
    +
    +
    +
    + + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/6/index.html b/blog/6/index.html new file mode 100644 index 000000000000..e7a393e27ff7 --- /dev/null +++ b/blog/6/index.html @@ -0,0 +1,998 @@ + + + + + + + + + + + + + Blog | 6 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + Intel GPU Support Now Available in PyTorch 2.5 +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    October 24, 2024

    +

    + ExecuTorch Beta: On-Device AI and LLMs, Stability, and Acceleration with Partners +

    +

    + ExecuTorch has achieved Beta status with the release of v0.4, providing stable APIs and runtime, as well as extensive kernel coverage. + ExecuTorch is the recommended on-device inference engine for Llama 3.2 1B/3B models, offering enhanced performance and memory efficiency for both original and quantized models. + There has been a significant increase in adoption and ecosystem growth for ExecuTorch, and the focus is now on improving reliability, performance, and coverage for non-CPU backen...

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 23, 2024

    +

    + TorchRec and FBGEMM 1.0 Stable Release +

    +

    We are happy to announce the stable release, 1.0, for TorchRec and FBGEMM. TorchRec is the PyTorch native recommendation systems library, powered by FBGEMM’s (Facebook GEneral Matrix Multiplication) efficient, low-level kernels. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 17, 2024

    +

    + PyTorch 2.5 Release Blog +

    +

    We are excited to announce the release of PyTorch® 2.5 (release note)! This release features a new cuDNN backend for SDPA, enabling speedups by default for users of SDPA on H100s or newer GPUs. As well, regional compilation of torch.compile offers a way to reduce the cold start up time for torch.compile by allowing users to compile a repeated nn.Module (e.g. a transformer layer in LLM) without recompilations. Finally, TorchInductor CPP backend offers solid performance speedup with numerous en...

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 15, 2024

    +

    + The Path to Achieve PyTorch Performance Boost on Windows CPU +

    +

    The challenge of PyTorch’s lower CPU performance on Windows compared to Linux has been a significant issue. There are multiple factors leading to this performance disparity. Through our investigation, we’ve identified several reasons for poor CPU performance on Windows, two primary issues have been pinpointed: the inefficiency of the Windows default malloc memory allocator and the absence of SIMD for vectorization optimizations on the Windows platform. In this article, we show how PyTorch CPU...

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 08, 2024

    +

    + PyTorch Foundation Technical Advisory Council Elects New Leadership +

    +

    We are pleased to announce the first-ever Chair and Vice Chair of the PyTorch Foundation’s Technical Advisory Council (TAC): Luca Antiga as the Chair and Jiong Gong as Vice Chair. Both leaders bring extensive experience and deep commitment to the PyTorch community, and they are set to guide the TAC in its mission to foster an open, diverse, and innovative PyTorch technical community. + +Meet the New Leadership + + + +Luca Antiga is the CTO at Lightning AI since 2022. He is an early contributor to P...

    + +
    + + Read More + +
    + + + + +
    +
    +

    October 02, 2024

    +

    + PyTorch Conference 2024 Recap: On Fire 🔥 +

    +

    + +The 2024 PyTorch Conference in San Francisco gathered nearly 1,500 AI researchers, developers, and enthusiasts. Over two days, the event featured engaging discussions, insightful keynotes, and hands-on sessions focused on artificial intelligence (AI) and advancements in PyTorch, the leading open-source machine learning framework. Attendees delved into the future of generative AI, Large Language Models (LLMs), and the crucial role open-source technology plays in driving AI innovation. Here’s...

    + +
    + + Read More + +
    + + + + +
    +
    +

    September 26, 2024

    +

    + PyTorch Native Architecture Optimization: torchao +

    +

    We’re happy to officially launch torchao, a PyTorch native library that makes models faster and smaller by leveraging low bit dtypes, quantization and sparsity. torchao is an accessible toolkit of techniques written (mostly) in easy to read PyTorch code spanning both inference and training. This blog will help you pick which techniques matter for your workloads. + +We benchmarked our techniques on popular GenAI models like LLama 3 and Diffusion models and saw minimal drops in accuracy. Unless o...

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/6x-faster-async-checkpointing/index.html b/blog/6x-faster-async-checkpointing/index.html new file mode 100644 index 000000000000..608d029a9f64 --- /dev/null +++ b/blog/6x-faster-async-checkpointing/index.html @@ -0,0 +1,746 @@ + + + + + + + + + + + + + 6x faster Async Checkpointing in PyTorch, using Cached Plans, no GIL contention | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Meta and Crusoe + +

    +

    Meta: Less Wright, Meet Vadakkanchery, Saurabh Mishra, Ela Krepska, Hamid Shojanazeri, Pradeep Fernando
    +Crusoe: Ethan Petersen, Martin Cala, Chip Smith

    + +

    PyTorch DCP (Distributed Checkpointing) has recently enabled new optimizations in asynchronous checkpointing to reduce GPU utilization drop by minimizing collective overhead and improving overall checkpointing efficiency.

    + +

Using Crusoe’s 2K H200 cluster, with TorchTitan training a Llama3-70B model, we were able to verify that these new features deliver substantial speedups at 1856-GPU scale, reducing the background processing time for async DCP checkpoints from ~436 seconds to ~67 seconds.

    + +

    This is roughly a 6.5x reduction in background checkpoint processing time, enabling even more total training time to proceed at full training throughput.

    + +

    chart

    + +

Fig 1: 1856-GPU training run with high-frequency checkpointing. The first checkpoint (the drop in tps) does not have a cached save plan, so its background processing takes far longer than the rest, where the cached plan is used.

    + +

    Background: What is Asynchronous Checkpointing?

    + +

    In a standard checkpointing workflow, GPUs are blocked while the checkpointing data is offloaded from GPU to CPU and then written to storage. After the save to physical media is complete, training can resume.

    + +

Asynchronous checkpointing greatly reduces this downtime by enabling the actual saving to storage to be done via CPU threads, allowing GPU-based training to continue while the checkpoint data is persisted in parallel. It is used primarily for intermediate/fault-tolerant checkpoints, as it unblocks the GPUs much faster than synchronous checkpoints.
+For example, in our large-scale experiment, GPU training was blocked for less than a second (0.78 seconds at 1856-GPU scale) while checkpoint data was moved from GPU to CPU (staging). At that point, GPU training immediately continues, which is a substantial training-time improvement over traditional checkpointing. For reference, Async Checkpointing is covered in more detail here.
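To make the flow concrete, here is a minimal sketch of an asynchronous save with DCP (assuming a recent PyTorch that exposes torch.distributed.checkpoint.async_save; the tiny model, the gloo backend, and the /tmp path are placeholders, not the TorchTitan setup used in these experiments):

import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dcp

# Sketch only: a single-node run launched with torchrun, so the default env://
# rendezvous works. The Linear model stands in for a real training setup.
dist.init_process_group("gloo")
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters())

state_dict = {"model": model.state_dict(), "optim": optimizer.state_dict()}

# async_save returns once staging is done; the write to storage continues in the
# background while training carries on.
future = dcp.async_save(state_dict, checkpoint_id="/tmp/ckpt_step_100")

# ... training steps continue here at full speed ...

future.result()  # optionally block later, e.g. before taking the next checkpoint
dist.destroy_process_group()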

    + +

    Challenges with Asynchronous Checkpointing

    + +

    However, the background processing inherent in Asynchronous Checkpointing has additional challenges that result in a temporary reduction of training throughput while the storage phase is being completed. These are highlighted below.

    + +

    GPU utilization drop from GIL contention:

    + +

    The Global Interpreter Lock (GIL) in Python is a mechanism that prevents multiple native threads from executing Python bytecode at the same time. This lock is necessary mainly because CPython’s memory management is not thread-safe.

    + +

    DCP currently uses background threads for metadata collectives and uploading to storage. Although these expensive steps are done asynchronously, it leads to contention for the GIL with the trainer threads. This causes the GPU utilization (QPS) to suffer significantly and also increases the e2e upload latency. For large-scale checkpoints, the overhead of the CPU parallel processing has a suppressive effect on net GPU training speed since CPUs also drive the training process via GPU kernel launches.

    + +

    Please refer to the following figure from our experiments:

    + +

    chart

    + +

    Fig 2: One can see a sustained drop in training QPS even after staging (i.e. blocking operation to trainer) is complete.

    + +

    The first dip in Figure 2 (marked by the purple line) indicates that staging is complete, and training can continue. However, a second drop is evident (marked by the area between the purple and yellow lines) which is due to trainer thread and checkpointing threads contending for the Python GIL, leading to degraded training QPS until the checkpoint thread completes execution.

    + +

    Collective communications cost:

    + +

    DCP performs multiple collectives today for various reasons: dedupe, global metadata for the checkpoint, resharding, and distributed exception handling. Collectives are costly as these require network I/O and pickling/unpickling of the large metadata being sent across the GPU network. These collectives become extremely expensive as the job scale grows, leading to significantly higher e2e latency and potential for collective timeouts.

    + +

    Solutions

    + +

    Process based async checkpointing

    + +

    DCP now supports async checkpoint save via a background process. This helps avoid the training QPS drop by eliminating the python GIL contention with the trainer threads. Please see Fig 2 for checkpointing via threads and Fig 3 for checkpointing via background process.

    + +

    Caching of the save plans

    + +

    DCP has a clear boundary between the planning and storage I/O steps. SavePlanner in DCP is a stateful component which acts as an access proxy to the state_dict. Planner manages save plans prepared by individual ranks, which carry metadata information necessary to do the write I/O. The planning step involves a collective operation to gather a comprehensive view of the checkpoint on the coordinator rank. The coordinator rank is responsible for de-duplicating parameters/weights to eliminate redundancies, validating the global plan to ensure accuracy and consistency, and creating the global metadata structs. This is followed by a scatter collective where the coordinator rank assigns I/O tasks to each rank. Any transformations done on the plans affect how the storage components finally write the data.

    + +

    During the course of a training job, multiple checkpoints are saved. In the majority of these cases, only the checkpoint data changes between different save instances, and thus, the plan remains the same. This presented an opportunity for us to cache the plans, pay the planning cost only on the first save, and then amortize that cost across all the subsequent attempts. Only the updated plans (plans which changed in the next attempt) are sent via collective, thus reducing the collective overhead significantly.

    + +

    Experiment Results

    + +

    Set up: 1856 H200 GPUs, Llama3-70B, HSDP2 with TorchTitan

    + +

    After deploying both the solutions above, the following are the key results:

    + +
      +
• TPS drop has significantly narrowed, with throughput now dipping only to 372 tps (vs 315 tps previously), and for a greatly reduced time window (~67 seconds vs ~437 seconds). This time window is now mostly attributed to the blocking for CPU processing.
    • +
    • Subsequent checkpoint save attempts also continue to be much faster due to very low overhead at the planning stage. E2E latency is thus improved by over 6.5x. This will allow our partners to increase the checkpointing frequency and reduce the lost training progress (i.e. wasted training time).
    • +
    + +

    If you look at the very first downspike in Figure 1, this drawdown in GPU processing time takes training throughput from 700 down to 320 tps, and suppresses it for roughly 7 minutes (467 seconds). Once the CPUs have finished processing, training continues again at full speed.

    + +

    Previously, this ~7 minute suppression would be repeated at every checkpoint. However, with the new process-based checkpointing feature, only the first checkpoint has the full drawdown time (mainly due to overhead from daemon process initialization), as all future checkpoints are executed via the background process, mitigating GIL contention with the trainer threads.

    + +

    This is visually shown in all the subsequent checkpoints where the average MFU suppression time drops to just over a minute, reflected by the sharp spikes that almost immediately revert to full MFU throughput.

    + +

    chart

    + +

    Fig 3: The red box shows the non-cached plan checkpoint, which also includes Checkpoint Background Init process overhead, while the purple box highlights the first checkpoint to run with the cached plan.

    + +

    This means that even large-scale checkpointing, such as shown in Fig 2 at 1856 GPU scale, can be done with ~6x reduced training throughput impact. This enables Asynchronous DCP checkpointing to be run more frequently (thus better rollback protection) while enhancing total training throughput relative to previous Async Checkpointing overhead.

    + +

    Using DCP’s cached checkpointing:

    + +

This feature is already available as part of the PyTorch nightly builds, and you can test out PyTorch’s Asynchronous DCP checkpointing directly in TorchTitan. Following are the instructions to enable these features (a combined sketch is shown after the list):

    + +
      +
    • Process-based asynchronous checkpointing: +
        +
      • Set the async_checkpointer_type to AsyncCheckpointerType.PROCESS in the async_save API. (file: pytorch/torch/distributed/checkpoint/state_dict_saver.py)
      • +
      +
    • +
    • Save plan caching: +
        +
      • Set the enable_plan_caching flag to true in the DefaultSavePlanner. (file: pytorch/torch/distributed/checkpoint/default_planner.py)
      • +
      +
    • +
    + +
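Putting the two flags above together, a minimal sketch might look like the following (the parameter names are taken from the list above and, at the time of writing, are available in nightlies; state_dict and the checkpoint path are placeholders):

import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint import DefaultSavePlanner
from torch.distributed.checkpoint.state_dict_saver import AsyncCheckpointerType

# Save via a background process (avoids GIL contention with trainer threads)
# and cache the save plan so later checkpoints skip most planning collectives.
future = dcp.async_save(
    state_dict,
    checkpoint_id="/tmp/ckpt_step_200",
    async_checkpointer_type=AsyncCheckpointerType.PROCESS,
    planner=DefaultSavePlanner(enable_plan_caching=True),
)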

    Future work

    + +

DCP will be rolling out additional optimizations to further improve the checkpointing cost. Currently, even though the save plans are cached, the coordinator rank still prepares the metadata. For larger jobs and models with many tensors, this overhead is non-trivial. In the next iteration, DCP will eliminate the metadata overhead and improve the e2e latency further. DCP will also introduce additional optimizations, such as zero-overhead checkpointing, to enable efficient checkpointing in large-scale jobs.

    + +

    Stay tuned!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/7/index.html b/blog/7/index.html new file mode 100644 index 000000000000..3ac8d9817833 --- /dev/null +++ b/blog/7/index.html @@ -0,0 +1,998 @@ + + + + + + + + + + + + + Blog | 7 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + + + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    September 12, 2024

    +

    + Arm Joins the PyTorch Foundation as a Premier Member +

    +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Arm has joined as a premier member. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    September 04, 2024

    +

    + CUDA-Free Inference for LLMs +

    +

    In this blog, we discuss the methods we used to achieve FP16 inference with popular LLM models such as Meta’s Llama3-8B and IBM’s Granite-8B Code, where 100% of the computation is performed using OpenAI’s Triton Language. +For single token generation times using our Triton kernel based models, we were able to approach 0.76-0.78x performance relative to the CUDA kernel dominant workflows for both Llama and Granite on Nvidia H100 GPUs, and 0.62-0.82x on Nvidia A100 GPUs. + +Why explore using 100%...

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 29, 2024

    +

    + Accelerate Your AI: PyTorch 2.4 Now Supports Intel GPUs for Faster Workloads +

    +

We have exciting news! PyTorch 2.4 now supports Intel® Data Center GPU Max Series and the SYCL software stack, making it easier to speed up your AI workflows for both training and inference. This update allows you to have a consistent programming experience with minimal coding effort and extends PyTorch’s device and runtime capabilities, including device, stream, event, generator, allocator, and guard, to seamlessly support streaming devices. This enhancement simplifies deploying PyTorch ...

    + +
    + + Read More + +
    + + + + +
    +
    +

    August 20, 2024

    +

    + Enabling Fast Gradient Clipping and Ghost Clipping in Opacus +

    +

    Introduction and Context + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    July 30, 2024

    +

    + Introducing torchchat: Accelerating Local LLM Inference on Laptop, Desktop and Mobile +

    +

    Today, we’re releasing torchchat, a library showcasing how to seamlessly and performantly run Llama 3, 3.1, and other large language models across laptop, desktop, and mobile. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 30, 2024

    +

    + Quantization-Aware Training for Large Language Models with PyTorch +

    +

    In this blog, we present an end-to-end Quantization-Aware Training (QAT) flow for large language models in PyTorch. We demonstrate how QAT in PyTorch can recover up to 96% of the accuracy degradation on hellaswag and 68% of the perplexity degradation on wikitext for Llama3 compared to post-training quantization (PTQ). We present the QAT APIs in torchao and showcase how users can leverage them for fine-tuning in torchtune. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/8/index.html b/blog/8/index.html new file mode 100644 index 000000000000..51e4c2782883 --- /dev/null +++ b/blog/8/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 8 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + PyTorch 2.4 Release Blog +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    July 22, 2024

    +

    + Deep Dive on the Hopper TMA Unit for FP8 GEMMs +

    +

    Abstract + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 11, 2024

    +

    + FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision +

    +

    Attention, as a core layer of the ubiquitous Transformer architecture, is a bottleneck for large language models and long-context applications. FlashAttention (and FlashAttention-2) pioneered an approach to speed up attention on GPUs by minimizing memory reads/writes, and is now used by most libraries to accelerate Transformer training and inference. This has contributed to a massive increase in LLM context length in the last two years, from 2-4K (GPT-3, OPT) to 128K (GPT-4), or even 1M (Llam...

    + +
    + + Read More + +
    + + + + +
    +
    +

    July 10, 2024

    +

    + Learn how to develop Android applications with ExecuTorch and Llama models +

    +

    This blog is courtesy of the PyTorch team at Arm. More details can be found here. + +

    + +
    + + Read More + +
    + + + + + + + + + +
    +
    +

    July 03, 2024

    +

    + Announcing Hacker Cup AI Track at NeurIPS 2024 +

    +

The PyTorch team, in partnership with Meta Hacker Cup and Microsoft Research, is excited to announce the Hacker Cup AI Track at NeurIPS 2024. This will be the first AI track for the popular Meta Hacker Cup programming competition, designed to assess the capabilities of generative AI in performing autonomous code generation tasks. We aim to test the limits of AI in complex coding challenges and measure the performance gap between AI systems and human programmers. We will provide access to all ...

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 25, 2024

    +

    + Powering the AI Revolution: The PyTorch Documentary +

    +

    Now live: The official PyTorch Documentary! This film unveils the authentic narrative of PyTorch’s inception, attributing its existence to a dedicated group of unsung heroes driving technological innovation. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 23, 2024

    +

    + Training MoEs at Scale with PyTorch +

    +

    Over the past year, Mixture of Experts (MoE) models have surged in popularity, fueled by powerful open-source models like DBRX, Mixtral, DeepSeek, and many more. At Databricks, we’ve worked closely with the PyTorch team to scale training of MoE models. In this blog post, we’ll talk about how we scale to over three thousand GPUs using PyTorch Distributed and MegaBlocks, an efficient open-source MoE implementation in PyTorch. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/9/index.html b/blog/9/index.html new file mode 100644 index 000000000000..c392e3e592f4 --- /dev/null +++ b/blog/9/index.html @@ -0,0 +1,998 @@ + + + + + + + + + + + + + Blog | 9 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + + + + +
    +
    + +

    Featured Post

    +

    + 🎉 PyTorch Docathon H1 2024 Wrap-up 🎉 +

    + + + + Read More + + + +
    +
    + +
    +
    +
    +
    + + + + + + + + +
    +
    +

    June 20, 2024

    +

    + Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity +

    +

    Over the past year, we’ve added support for semi-structured (2:4) sparsity into PyTorch. With just a few lines of code, we were able to show a 10% end-to-end inference speedup on segment-anything by replacing dense matrix multiplications with sparse matrix multiplications. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 12, 2024

    +

    + Reducing Model Checkpointing Times by Over 10x with PyTorch Distributed Asynchronous Checkpointing +

    +

    Summary: With PyTorch distributed’s new asynchronous checkpointing feature, developed with feedback from IBM, we show how IBM Research Team is able to implement and reduce effective checkpointing time by a factor of 10-20x. Example: 7B model ‘down time’ for a checkpoint goes from an average of 148.8 seconds to 6.3 seconds, or 23.62x faster. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 11, 2024

    +

    + PyTorch Foundation Welcomes New Executive Director +

    +

    +The PyTorch Foundation is excited to welcome Matt White, our new executive director. The PyTorch Foundation formed in 2022 with the goal to drive adoption of AI tooling by fostering and sustaining an ecosystem of open source, vendor-neutral projects with PyTorch. Over the past 2 years, we’ve seen excellent growth across the project – with both contributor and member growth. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 06, 2024

    +

    + INT4 Decoding GQA CUDA Optimizations for LLM Inference +

    +

    An efficient decoding Grouped-Query Attention with low-precision KV cache + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    June 04, 2024

    +

    + Ready, Set, Contribute: PyTorch Docathon Kickoff H1 2024 +

    +

    The PyTorch Docathon is now live! This event is dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Our hope with this Docathon is to simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine learning. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 21, 2024

    +

    + Maximizing Training Throughput Using PyTorch FSDP and Torch.compile +

    +

    Recently, we demonstrated how FSDP and selective activation checkpointing can be used to achieve 57% MFU (Model Flops Utilization) for training a 7B model on A100 GPUs. We also demonstrated how it can train a high quality model, which we open sourced as Granite 7B base model on Hugging Face Hub under the Apache v2.0 license. + +

    + +
    + + Read More + +
    + + + + +
    +
    +

    May 15, 2024

    +

    + Achieving Sustainability Goals with PyTorch and Intel AI +

    +

    This post was contributed by Intel AI in partnership with the PyTorch Foundation. + +

    + +
    + + Read More + +
    + +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    +
    + +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/Accelerating-Hugging-Face-and-TIMM-models/index.html b/blog/Accelerating-Hugging-Face-and-TIMM-models/index.html new file mode 100644 index 000000000000..4b27f246f81e --- /dev/null +++ b/blog/Accelerating-Hugging-Face-and-TIMM-models/index.html @@ -0,0 +1,809 @@ + + + + + + + + + + + + + Accelerating Hugging Face and TIMM models with PyTorch 2.0 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Mark Saroufim + +

    +

torch.compile() makes it easy to experiment with different compiler backends to make PyTorch code faster with a single-line decorator, torch.compile(). It works directly over an nn.Module as a drop-in replacement for torch.jit.script(), but without requiring you to make any source code changes. We expect this one-line code change to provide you with 30%–2x training time speedups on the vast majority of models that you’re already running.

    + +
    
    +opt_module = torch.compile(module)
    +
    +
    + +

    torch.compile supports arbitrary PyTorch code, control flow, mutation and comes with experimental support for dynamic shapes. We’re so excited about this development that we call it PyTorch 2.0.

    + +

    What makes this announcement different for us is we’ve already benchmarked some of the most popular open source PyTorch models and gotten substantial speedups ranging from 30% to 2x https://github.com/pytorch/torchdynamo/issues/681.

    + +

There are no tricks here: we pip installed popular libraries like https://github.com/huggingface/transformers, https://github.com/huggingface/accelerate and https://github.com/rwightman/pytorch-image-models, then ran torch.compile() on them, and that’s it.

    + +

    It’s rare to get both performance and convenience, but this is why the core team finds PyTorch 2.0 so exciting. The Hugging Face team is also excited, in their words:

    + +

Ross Wightman, the primary maintainer of TIMM: “PT 2.0 works out of the box with majority of timm models for inference and train workloads and no code changes”

    + +

Sylvain Gugger, the primary maintainer of transformers and accelerate: “With just one line of code to add, PyTorch 2.0 gives a speedup between 1.5x and 2x in training Transformers models. This is the most exciting thing since mixed precision training was introduced!”

    + +

This tutorial will show you exactly how to replicate those speedups so you can be as excited about PyTorch 2.0 as we are.

    + +

    Requirements and Setup

    + +

    For GPU (newer generation GPUs will see drastically better performance)

    + +
    pip3 install numpy --pre torch --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
    +
    +
    + +

    For CPU

    + +
    pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
    +
    +
    + +

    Optional: Verify Installation

    + +
    git clone https://github.com/pytorch/pytorch
+cd pytorch/tools/dynamo
    +python verify_dynamo.py
    +
    + +

    Optional: Docker installation

    + +

    We also provide all the required dependencies in the PyTorch nightly +binaries which you can download with

    + +
    docker pull ghcr.io/pytorch/pytorch-nightly
    +
    +
    + +

    And for ad hoc experiments just make sure that your container has access +to all your GPUs

    + +
    docker run --gpus all -it ghcr.io/pytorch/pytorch-nightly:latest /bin/bash
    +
    +
    + +

    Getting started

    + +

a toy example

    + +

    Let’s start with a simple example and make things more complicated step +by step. Please note that you’re likely to see more significant speedups the newer your GPU is.

    + +
    import torch
    +def fn(x, y):
    +    a = torch.sin(x).cuda()
    +    b = torch.sin(y).cuda()
    +    return a + b
    +new_fn = torch.compile(fn, backend="inductor")
    +input_tensor = torch.randn(10000).to(device="cuda:0")
    +a = new_fn(input_tensor, input_tensor)
    +
    + +

    This example won’t actually run faster but it’s educational.

    + +

It is an example that features torch.sin() (and ops like torch.cos()), which are pointwise ops: they operate element by element on a vector. A more famous pointwise op you might actually want to use would be something like torch.relu().

    + +

    Pointwise ops in eager mode are suboptimal because each one would need to read a tensor from memory, make some changes and then write back those changes.

    + +

    The single most important optimization that PyTorch 2.0 does for you is fusion.

    + +

So, back to our example, we can turn 2 reads and 2 writes into 1 read and 1 write, which is crucial especially for newer GPUs where the bottleneck is memory bandwidth (how quickly you can send data to the GPU) rather than compute (how quickly your GPU can crunch floating point operations).

    + +

The second most important optimization that PyTorch 2.0 does for you is CUDA graphs.

    + +

    CUDA graphs help eliminate the overhead from launching individual kernels from a python program.

    + +

torch.compile() supports many different backends, but one that we’re particularly excited about is Inductor, which generates Triton kernels (https://github.com/openai/triton) that are written in Python yet outperform the vast majority of handwritten CUDA kernels. Supposing our example above was saved as trig.py, we can inspect the generated Triton kernels by running:

    + +
    TORCH_COMPILE_DEBUG=1 python trig.py
    +
    + +
    
    +@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
    +@triton.jit
    +def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    +    xnumel = 10000
    +    xoffset = tl.program_id(0) * XBLOCK
    +    xindex = xoffset + tl.reshape(tl.arange(0, XBLOCK), [XBLOCK])
    +    xmask = xindex < xnumel
    +    x0 = xindex
    +    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    +    tmp1 = tl.sin(tmp0)
    +    tmp2 = tl.sin(tmp1)
    +    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    +
    +
    + +

    And you can verify that fusing the two sins did actually occur because the two sin operations occur within a single Triton kernel and the temporary variables are held in registers with very fast access.

    + +

    a real model

    + +

As a next step, let’s try a real model like ResNet-18 from the PyTorch Hub.

    + +
    import torch
    +model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
    +opt_model = torch.compile(model, backend="inductor")
+opt_model(torch.randn(1,3,64,64))
    +
    +
    + +

If you actually run it, you may be surprised that the first run is slow; that’s because the model is being compiled. Subsequent runs will be faster, so it’s common practice to warm up your model before you start benchmarking it.

    + +

You may have noticed that we also passed in the name of a compiler explicitly here, “inductor”, but it’s not the only available backend; you can run torch._dynamo.list_backends() in a REPL to see the full list of available backends. For fun, you should try out aot_cudagraphs or nvfuser.
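As a quick sanity check, the snippet below simply prints whatever backends your installed version exposes (the exact set of names varies across PyTorch versions):

import torch._dynamo as dynamo

# List the compiler backends registered with this PyTorch build.
print(dynamo.list_backends())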

    + +

    Hugging Face models

    + +

Let’s do something a bit more interesting now. Our community frequently uses pretrained models from transformers (https://github.com/huggingface/transformers) or TIMM (https://github.com/rwightman/pytorch-image-models), and one of our design goals for PyTorch 2.0 was that any new compiler stack needs to work out of the box with the vast majority of models people actually run.

    + +

    So we’re going to directly download a pretrained model from the Hugging Face hub and optimize it

    + +
    
    +import torch
    +from transformers import BertTokenizer, BertModel
    +# Copy pasted from here https://huggingface.co/bert-base-uncased
    +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    +model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
    +model = torch.compile(model) # This is the only line of code that we changed
    +text = "Replace me by any text you'd like."
    +encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
    +output = model(**encoded_input)
    +
    +
    + +

If you remove the to(device="cuda:0") from the model and encoded_input, then PyTorch 2.0 will generate C++ kernels that are optimized for running on your CPU. You can inspect either the Triton or the C++ kernels for BERT; they’re obviously more complex than the trigonometry example we had above, but you can similarly skim them and understand them if you understand PyTorch.
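As a minimal sketch of that CPU path (no new API here, it simply drops the .to(device=...) calls from the snippet above):

import torch
from transformers import BertTokenizer, BertModel

# CPU-only variant: with nothing moved to CUDA, Inductor emits C++ kernels instead of Triton ones.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = torch.compile(BertModel.from_pretrained("bert-base-uncased"))
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)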

    + +

    The same code also works just fine if used with https://github.com/huggingface/accelerate and DDP

    + +

    Similarly let’s try out a TIMM example

    + +
    import timm
    +import torch
    +model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
    +opt_model = torch.compile(model, backend="inductor")
    +opt_model(torch.randn(64,3,7,7))
    +
    + +

    Our goal with PyTorch was to build a breadth-first compiler that would speed up the vast majority of actual models people run in open source. The Hugging Face Hub ended up being an extremely valuable benchmarking tool for us, ensuring that any optimization we work on actually helps accelerate models people want to run.

    + +

    So please try out PyTorch 2.0, enjoy the free perf and if you’re not seeing it then please open an issue and we will make sure your model is supported https://github.com/pytorch/torchdynamo/issues

    + +

After all, we can’t claim we’ve created a breadth-first compiler unless YOUR models actually run faster.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/FX-feature-extraction-torchvision/index.html b/blog/FX-feature-extraction-torchvision/index.html new file mode 100644 index 000000000000..b264bd7bcd75 --- /dev/null +++ b/blog/FX-feature-extraction-torchvision/index.html @@ -0,0 +1,1096 @@ + + + + + + + + + + + + + Feature Extraction in TorchVision using Torch FX | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Alexander Soare and Francisco Massa + +

    + + +

    Introduction

    + +

    FX based feature extraction is a new TorchVision utility that lets us access intermediate transformations of an input during the forward pass of a PyTorch Module. It does so by symbolically tracing the forward method to produce a graph where each node represents a single operation. Nodes are named in a human-readable manner such that one may easily specify which nodes they want to access.

    + +

    Did that all sound a little complicated? Not to worry as there’s a little in this article for everyone. Whether you’re a beginner or an advanced deep-vision practitioner, chances are you will want to know about FX feature extraction. If you still want more background on feature extraction in general, read on. If you’re already comfortable with that and want to know how to do it in PyTorch, skim ahead to Existing Methods in PyTorch: Pros and Cons. And if you already know about the challenges of doing feature extraction in PyTorch, feel free to skim forward to FX to The Rescue.

    + +

    A Recap On Feature Extraction

    + +

    We’re all used to the idea of having a deep neural network (DNN) that takes inputs and produces outputs, and we don’t necessarily think of what happens in between. Let’s just consider a ResNet-50 classification model as an example:

    + +

+ Figure 1: ResNet-50 takes an image of a bird and transforms that into the abstract concept "bird". Source: Bird image from ImageNet. +

    + +

    We know though, that there are many sequential “layers” within the ResNet-50 architecture that transform the input step-by-step. In Figure 2 below, we peek under the hood to show the layers within ResNet-50, and we also show the intermediate transformations of the input as it passes through those layers.

    + +

+ Figure 2: ResNet-50 transforms the input image in multiple steps. Conceptually, we may access the intermediate transformation of the image after each one of these steps. Source: Bird image from ImageNet. +

    + +

    Existing Methods In PyTorch: Pros and Cons

    + +

    There were already a few ways of doing feature extraction in PyTorch prior to FX based feature extraction being introduced.

    + +

    To illustrate these, let’s consider a simple convolutional neural network that does the following

    + +
      +
    • Applies several “blocks” each with several convolution layers within.
    • +
    • After several blocks, it uses a global average pool and flatten operation.
    • +
    • Finally it uses a single output classification layer.
    • +
    + +
    import torch
    +from torch import nn
    +
    +
    +class ConvBlock(nn.Module):
    +   """
    +   Applies `num_layers` 3x3 convolutions each followed by ReLU then downsamples
    +   via 2x2 max pool.
    +   """
    +
    +   def __init__(self, num_layers, in_channels, out_channels):
    +       super().__init__()
    +       self.convs = nn.ModuleList(
    +           [nn.Sequential(
    +               nn.Conv2d(in_channels if i==0 else out_channels, out_channels, 3, padding=1),
    +               nn.ReLU()
    +            )
    +            for i in range(num_layers)]
    +       )
    +       self.downsample = nn.MaxPool2d(kernel_size=2, stride=2)
    +      
    +   def forward(self, x):
    +       for conv in self.convs:
    +           x = conv(x)
    +       x = self.downsample(x)
    +       return x
    +      
    +
    +class CNN(nn.Module):
    +   """
    +   Applies several ConvBlocks each doubling the number of channels, and
    +   halving the feature map size, before taking a global average and classifying.
    +   """
    +
    +   def __init__(self, in_channels, num_blocks, num_classes):
    +       super().__init__()
    +       first_channels = 64
    +       self.blocks = nn.ModuleList(
    +           [ConvBlock(
    +               2 if i==0 else 3,
    +               in_channels=(in_channels if i == 0 else first_channels*(2**(i-1))),
    +               out_channels=first_channels*(2**i))
    +            for i in range(num_blocks)]
    +       )
    +       self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
    +       self.cls = nn.Linear(first_channels*(2**(num_blocks-1)), num_classes)
    +
    +   def forward(self, x):
    +       for block in self.blocks:
    +           x = block(x)
    +       x = self.global_pool(x)
    +       x = x.flatten(1)
    +       x = self.cls(x)
    +       return x
    +
    +
    +model = CNN(3, 4, 10)
    +out = model(torch.zeros(1, 3, 32, 32))  # This will be the final logits over classes
    +
    +
    + +

    Let’s say we want to get the final feature map before global average pooling. We could do the following:

    + +

    Modify the forward method

    + +
    def forward(self, x):
    +   for block in self.blocks:
    +       x = block(x)
    +   self.final_feature_map = x
    +   x = self.global_pool(x)
    +   x = x.flatten(1)
    +   x = self.cls(x)
    +   return x
    +
    + +

    Or return it directly:

    + +
    def forward(self, x):
    +   for block in self.blocks:
    +       x = block(x)
    +   final_feature_map = x
    +   x = self.global_pool(x)
    +   x = x.flatten(1)
    +   x = self.cls(x)
    +   return x, final_feature_map
    +
    +

That looks pretty easy. But there are some downsides here, and they all stem from the same underlying issue, namely that modifying the source code is not ideal:

    + +
      +
    • It’s not always easy to access and change given the practical considerations of a project.
    • +
    • If we want flexibility (switching feature extraction on or off, or having variations on it), we need to further adapt the source code to support that.
    • +
    • It’s not always just a question of inserting a single line of code. Think about how you would go about getting the feature map from one of the intermediate blocks with the way I’ve written this module.
    • +
    • Overall, we’d rather avoid the overhead of maintaining source code for a model, when we actually don’t need to change anything about how it works.
    • +
    + +

    One can see how this downside can start to get a lot more thorny when dealing with larger, more complicated models, and trying to get at features from within nested submodules.

    + +

    Write a new module using the parameters from the original one

    + +

    Following on the example from above, say we want to get a feature map from each block. We could write a new module like so:

    + +
    class CNNFeatures(nn.Module):
    +   def __init__(self, backbone):
    +       super().__init__()
    +       self.blocks = backbone.blocks
    +
    +   def forward(self, x):
    +       feature_maps = []
    +       for block in self.blocks:
    +           x = block(x)
    +           feature_maps.append(x)
    +       return feature_maps
    +
    +
    +backbone = CNN(3, 4, 10)
    +model = CNNFeatures(backbone)
    +out = model(torch.zeros(1, 3, 32, 32))  # This is now a list of Tensors, each representing a feature map
    +
    + +

    In fact, this is much like the method that TorchVision used internally to make many of its detection models.

    + +

    Although this approach solves some of the issues with modifying the source code directly, there are still some major downsides:

    + +
      +
    • It’s only really straight-forward to access the outputs of top-level submodules. Dealing with nested submodules rapidly becomes complicated.
    • +
    • We have to be careful not to miss any important operations in between the input and the output. We introduce potential for errors in transcribing the exact functionality of the original module to the new module.
    • +
    + +

    Overall, this method and the last both have the complication of tying in feature extraction with the model’s source code itself. Indeed, if we examine the source code for TorchVision models we might suspect that some of the design choices were influenced by the desire to use them in this way for downstream tasks.

    + +

    Use hooks

    + +

    Hooks move us away from the paradigm of writing source code, towards one of specifying outputs. Considering our toy CNN example above, and the goal of getting feature maps for each layer, we could use hooks like this:

    + +
    model = CNN(3, 4, 10)
    +feature_maps = []  # This will be a list of Tensors, each representing a feature map
    +
    +def hook_feat_map(mod, inp, out):
    +	feature_maps.append(out)
    +
    +for block in model.blocks:
    +	block.register_forward_hook(hook_feat_map)
    +
    +out = model(torch.zeros(1, 3, 32, 32))  # This will be the final logits over classes
    +
    + +

    Now we have full flexibility in terms of accessing nested submodules, and we free ourselves of the responsibilities of fiddling with the source code. But this approach comes with its own downsides:

    + +
      +
    • We can only apply hooks to modules. If we have functional operations (reshape, view, functional non-linearities, etc) for which we want the outputs, hooks won’t work directly on them.
    • +
    • We have not modified anything about the source code, so the whole forward pass is executed, regardless of the hooks. If we only need to access early features without any need for the final output, this could result in a lot of useless computation.
    • +
    • Hooks are not TorchScript friendly.
    • +
    + +

    Here’s a summary of the different methods and their pros/cons:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Method | Can use source code as is without any modifications or rewriting | Full flexibility in accessing features | Drops unnecessary computational steps | TorchScript friendly |
| Modify forward method | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES |
| New module that reuses submodules / parameters of original module | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES |
| Hooks | YES | Mostly YES. Only outputs of submodules | NO | NO |
    + +

    Table 1: The pros (or cons) of some of the existing methods for feature extraction with PyTorch

    + +

    In the next section of this article, let’s see how we can get YES across the board.

    + +

    FX to The Rescue

    + +

    The natural question for some new-starters in Python and coding at this point might be: “Can’t we just point to a line of code and tell Python or PyTorch that we want the result of that line?” For those who have spent more time coding, the reason this can’t be done is clear: multiple operations can happen in one line of code, whether they are explicitly written there, or they are implicit as sub-operations. Just take this simple module as an example:

    + +
    class MyModule(torch.nn.Module):
    +    def __init__(self):
    +        super().__init__()
    +        self.param = torch.nn.Parameter(torch.rand(3, 4))
    +        self.submodule = MySubModule()
    +
    +    def forward(self, x):
    +        return self.submodule(x + self.param).clamp(min=0.0, max=1.0)
    +
    + +

    The forward method has a single line of code which we can unravel as:

    + +
      +
    1. Add self.param to x
    2. +
    3. Pass x through self.submodule. Here we would need to consider the steps happening in that submodule. I’m just going to use dummy operation names for illustration: + I. submodule.op_1 + II. submodule.op_2
    4. +
    5. Apply the clamp operation
    6. +
    + +

    So even if we point at this one line, the question then is: “For which step do we want to extract the output?”.

    + +

    FX is a core PyTorch toolkit that (oversimplifying) does the unravelling I just mentioned. It does something called “symbolic tracing”, which means the Python code is interpreted and stepped through, operation-by-operation, using some dummy proxy for a real input. Introducing some nomenclature, each step as described above is considered a “node”, and consecutive nodes are connected to one another to form a “graph” (not unlike the common mathematical notion of a graph). Here are the “steps” above translated to this concept of a graph.
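To see symbolic tracing in action outside of TorchVision, here is a minimal, self-contained sketch (it uses a tiny stand-in module, since MySubModule above is only a placeholder):

import torch
from torch import fx, nn

# Tiny stand-in module; tracing it turns each operation into a node of a graph.
class TinyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.param = nn.Parameter(torch.rand(3, 4))
        self.linear = nn.Linear(4, 4)

    def forward(self, x):
        return self.linear(x + self.param).clamp(min=0.0, max=1.0)

traced = fx.symbolic_trace(TinyModule())
for node in traced.graph.nodes:
    # e.g. placeholder x, get_attr param, call_function add, call_module linear, call_method clamp, output
    print(node.op, node.name)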

    + +

+ Figure 3: Graphical representation of the result of symbolically tracing our example of a simple forward method. +

    + +

    Note that we call this a graph, and not just a set of steps, because it’s possible for the graph to branch off and recombine. Think of the skip connection in a residual block. This would look something like:

    + +

+ Figure 4: Graphical representation of a residual skip connection. The middle node is like the main branch of a residual block, and the final node represents the sum of the input and output of the main branch. +

    + +

    Now, TorchVision’s get_graph_node_names function applies FX as described above, and in the process of doing so, tags each node with a human readable name. Let’s try this with our toy CNN model from the previous section:

    + +
    model = CNN(3, 4, 10)
    +from torchvision.models.feature_extraction import get_graph_node_names
    +nodes, _ = get_graph_node_names(model)
    +print(nodes)
    +
    +

    which will result in:

    +
    ['x', 'blocks.0.convs.0.0', 'blocks.0.convs.0.1', 'blocks.0.convs.1.0', 'blocks.0.convs.1.1', 'blocks.0.downsample', 'blocks.1.convs.0.0', 'blocks.1.convs.0.1', 'blocks.1.convs.1.0', 'blocks.1.convs.1.1', 'blocks.1.convs.2.0', 'blocks.1.convs.2.1', 'blocks.1.downsample', 'blocks.2.convs.0.0', 'blocks.2.convs.0.1', 'blocks.2.convs.1.0', 'blocks.2.convs.1.1', 'blocks.2.convs.2.0', 'blocks.2.convs.2.1', 'blocks.2.downsample', 'blocks.3.convs.0.0', 'blocks.3.convs.0.1', 'blocks.3.convs.1.0', 'blocks.3.convs.1.1', 'blocks.3.convs.2.0', 'blocks.3.convs.2.1', 'blocks.3.downsample', 'global_pool', 'flatten', 'cls']
    +
    + +

    We can read these node names as hierarchically organised “addresses” for the operations of interest. For example ‘blocks.1.downsample’ refers to the MaxPool2d layer in the second ConvBlock.
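
Because the names are plain strings with this hierarchical structure, finding the operations inside a particular block is just string matching. A small illustrative sketch, reusing the nodes list from above:

# All operations recorded inside the second ConvBlock
block1_nodes = [name for name in nodes if name.startswith('blocks.1')]
print(block1_nodes)  # ['blocks.1.convs.0.0', ..., 'blocks.1.downsample']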

    + +

    create_feature_extractor, which is where all the magic happens, goes a few steps further than get_graph_node_names. It takes desired node names as one of the input arguments, and then uses more FX core functionality to:

    + +
  1. Assign the desired nodes as outputs.
  2. Prune unnecessary downstream nodes and their associated parameters.
  3. Translate the resulting graph back into Python code.
  4. Return another PyTorch Module to the user. This has the python code from step 3 as the forward method.

    As a demonstration, here’s how we would apply create_feature_extractor to get the 4 feature maps from our toy CNN model

    + +
    from torchvision.models.feature_extraction import create_feature_extractor
    +# Confused about the node specification here?
    +# We are allowed to provide truncated node names, and `create_feature_extractor`
    +# will choose the last node with that prefix.
    +feature_extractor = create_feature_extractor(
    +	model, return_nodes=['blocks.0', 'blocks.1', 'blocks.2', 'blocks.3'])
    +# `out` will be a dict of Tensors, each representing a feature map
    +out = feature_extractor(torch.zeros(1, 3, 32, 32))
    +
    + +
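
To sanity-check what came back, you can print the shape of each returned feature map (a small sketch; the exact shapes depend on the CNN defined earlier):

for name, feature_map in out.items():
    print(name, tuple(feature_map.shape))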

    It’s as simple as that. When it comes down to it, FX feature extraction is just a way of making it possible to do what some of us would have naively hoped for when we first started programming: “just give me the output of this code (points finger at screen)”*.

    + +
  • … does not require us to fiddle with source code.
  • … provides full flexibility in terms of accessing any intermediate transformation of our inputs, whether they are the results of a module or a functional operation.
  • … does drop unnecessary computational steps once features have been extracted.
  • … and I didn’t mention this before, but it’s also TorchScript friendly!

    Here’s that table again with another row added for FX feature extraction

 | Can use source code as is without any modifications or rewriting | Full flexibility in accessing features | Drops unnecessary computational steps | TorchScript friendly
Modify forward method | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES
New module that reuses submodules / parameters of original module | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES
Hooks | YES | Mostly YES. Only outputs of submodules | NO | NO
FX | YES | YES | YES | YES
    + +

    Table 2: A copy of Table 1 with an added row for FX feature extraction. FX feature extraction gets YES across the board!

    + +

    Current FX Limitations

    + +

    Although I would have loved to end the post there, FX does have some of its own limitations which boil down to:

    + +
  1. There may be some Python code that isn’t yet handled by FX when it comes to the step of interpretation and translation into a graph.
  2. Dynamic control flow can’t be represented in terms of a static graph.

    The easiest thing to do when these problems crop up is to bundle the underlying code into a “leaf node”. Recall the example graph from Figure 3? Conceptually, we may agree that the submodule should be treated as a node in itself rather than a set of nodes representing the underlying operations. If we do so, we can redraw the graph as:

    + +

    + The individual operations within `submodule` (left, within the red box) may be consolidated into one node (right, node #2) if we consider the `submodule` as a 'leaf' node. +
    + Figure 5: The individual operations within `submodule` (left, within the red box) may be consolidated into one node (right, node #2) if we consider the `submodule` as a "leaf" node. +

    + +

    We would want to do so if there is some problematic code within the submodule, but we don’t have any need for extracting any intermediate transformations from within it. In practice, this is easily achievable by providing a keyword argument to create_feature_extractor or get_graph_node_names.

    + +
    model = CNN(3, 4, 10)
    +nodes, _ = get_graph_node_names(model, tracer_kwargs={'leaf_modules': [ConvBlock]})
    +print(nodes)
    +
    + +

    for which the output will be:

    + +
    ['x', 'blocks.0', 'blocks.1', 'blocks.2', 'blocks.3', 'global_pool', 'flatten', 'cls']
    +
    + +

    Notice how, as compared to previously, all the nodes for any given ConvBlock are consolidated into a single node.
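
The same tracer_kwargs can be passed to create_feature_extractor, so a problematic block can be treated as a leaf while still returning its output. A sketch reusing the model and ConvBlock defined earlier:

from torchvision.models.feature_extraction import create_feature_extractor

feature_extractor = create_feature_extractor(
    model,
    return_nodes=['blocks.0', 'blocks.1', 'blocks.2', 'blocks.3'],
    tracer_kwargs={'leaf_modules': [ConvBlock]})
out = feature_extractor(torch.zeros(1, 3, 32, 32))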

    + +

    We could do something similar with functions. For example, Python’s inbuilt len needs to be wrapped and the result should be treated as a leaf node. Here’s how you can do that with core FX functionality:

    + +
from torch import nn

torch.fx.wrap('len')

class MyModule(nn.Module):
    def forward(self, x):
        x += 1
        len(x)

model = MyModule()
# create_feature_extractor was imported from torchvision.models.feature_extraction above
feature_extractor = create_feature_extractor(model, return_nodes=['add'])
    + +

    For functions you define, you may instead use another keyword argument to create_feature_extractor (minor detail: here’s why you might want to do it this way instead):

    + +
    def myfunc(x):
    +   return len(x)
    +
    +class MyModule(nn.Module):
    +   def forward(self, x):
    +       x += 1
    +       myfunc(x)
    +
    +model = MyModule()
    +feature_extractor = create_feature_extractor(
    +   model, return_nodes=['add'], tracer_kwargs={'autowrap_functions': [myfunc]})
    +
    + +

    Notice that none of the fixes above involved modifying source code.

    + +

    Of course, there may be times when the very intermediate transformation one is trying to get access to is within the same forward method or function that is causing problems. Here, we can’t just treat that module or function as a leaf node, because then we can’t access the intermediate transformations within. In these cases, some rewriting of the source code will be needed. Here are some examples (not exhaustive)

    + +
  • FX will raise an error when trying to trace through code with an assert statement. In this case you may need to remove that assertion or switch it with torch._assert (this is not a public function - so consider it a bandaid and use with caution).
  • Symbolically tracing in-place changes to slices of tensors is not supported. You will need to make a new variable for the slice, apply the operation, then reconstruct the original tensor using concatenation or stacking (see the sketch after this list).
  • Representing dynamic control flow in a static graph is just not logically possible. See if you can distill the coded logic down to something that is not dynamic - see FX documentation for tips.
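
As an illustration of the second point, here is a hedged sketch of rewriting an in-place slice update into a traceable form:

import torch

def forward_inplace(x):
    # Not symbolically traceable: in-place assignment into a slice
    x[:, 0] = x[:, 0] * 2
    return x

def forward_traceable(x):
    # Traceable rewrite: compute the modified slice separately, then rebuild the tensor
    scaled = x[:, 0:1] * 2
    return torch.cat([scaled, x[:, 1:]], dim=1)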

    In general, you may consult the FX documentation for more detail on the limitations of symbolic tracing and the possible workarounds.

    + +

    Conclusion

    + +

    We did a quick recap on feature extraction and why one might want to do it. Although there are existing methods for doing feature extraction in PyTorch they all have rather significant shortcomings. We learned how TorchVision’s FX feature extraction utility works and what makes it so versatile compared to the existing methods. While there are still some minor kinks to iron out for the latter, we understand the limitations, and can trade them off against the limitations of other methods depending on our use case. Hopefully by adding this new utility to your PyTorch toolkit, you’re now equipped to handle the vast majority of feature extraction requirements you may come across.

    + +

    Happy coding!

    + +
diff --git a/blog/PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology/index.html b/blog/PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology/index.html new file mode 100644 index 000000000000..f8d9f320fcfd --- /dev/null +++ b/blog/PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology/index.html @@ -0,0 +1,757 @@

Case Study: PathAI Uses PyTorch to Improve Patient Outcomes with AI-powered Pathology | PyTorch

    + by + + Logan Kilpatrick - Sr. Technology Advocate, Harshith Padigela - ML Engineer, Syed Ashar Javed - ML Technical Lead, Robert Egger - Biomedical Data Scientist + +

    +

    ​PathAI is the leading provider of AI-powered technology tools and services for pathology (the study of disease). Our platform was built to enable substantial improvements to the accuracy of diagnosis and the measurement of therapeutic efficacy for complex diseases, leveraging modern approaches in machine learning like image segmentation, graph neural networks, and multiple instance learning.

    + +

    + +

    + +

    Traditional manual pathology is prone to subjectivity and observer variability that can negatively affect diagnoses and drug development trials. Before we dive into how we use PyTorch to improve our diagnosis workflow, let us first lay out the traditional analog Pathology workflow without machine learning.

    + +

    How Traditional Biopharma Works

    + +

    There are many avenues that biopharma companies take to discover novel therapeutics or diagnostics. One of those avenues relies heavily on the analysis of pathology slides to answer a variety of questions: how does a particular cellular communication pathway work? Can a specific disease state be linked to the presence or lack of a particular protein? Why did a particular drug in a clinical trial work for some patients but not others? Might there be an association between patient outcomes and a novel biomarker?

    + +

    To help answer these questions, biopharma companies rely on expert pathologists to analyze slides and help evaluate the questions they might have. 

    + +

    As you might imagine, it takes an expert board-certified pathologist to make accurate interpretations and diagnoses. In one study, a single biopsy result was given to 36 different pathologists and the outcome was 18 different diagnoses varying in severity from no treatment to aggressive treatment necessary. Pathologists also often solicit feedback from colleagues in difficult edge cases. Given the complexity of the problem, even with expert training and collaboration, pathologists can still have a hard time making a correct diagnosis. This potential variance can be the difference between a drug being approved and failing the clinical trial.

    + +

    How PathAI utilizes machine learning to power drug development

    + +

    PathAI develops machine learning models which provide insights for drug development R&D, for powering clinical trials, and for making diagnoses. To this end, PathAI leverages PyTorch for slide level inference using a variety of methods including graph neural networks (GNN) as well as multiple instance learning. In this context, “slides” refers to full size scanned images of glass slides, which are pieces of glass with a thin slice of tissue between them, stained to show various cell formations. PyTorch enables our teams using these different methodologies to share a common framework which is robust enough to work in all the conditions we need. PyTorch’s high level, imperative, and pythonic syntax allows us to prototype models quickly and then take those models to scale once we have the results we want. 

    + +

    Multi-instance learning on gigabyte images

    + +

    One of the uniquely challenging aspects of applying ML to pathology is the immense size of the images. These digital slides can often be 100,000 x 100,000 pixels or more in resolution and gigabytes in size. Loading the full image in GPU memory and applying traditional computer vision algorithms on them is an almost impossible task. It also takes both a considerable amount of time and resources to have a full slide image (100k x 100k) annotated, especially when annotators need to be domain experts (board-certified pathologists). We often build models to predict image-level labels, like the presence of cancer, on a patient slide which covers a few thousand pixels in the whole image. The cancerous area is sometimes a tiny fraction of the entire slide, which makes the ML problem similar to finding a needle in a haystack. On the other hand, some problems like the prediction of certain histological biomarkers require an aggregation of information from the whole slide which is again hard due to the size of the images. All these factors add significant algorithmic, computational, and logistical complexity when applying ML techniques to pathology problems.

    + +

    Breaking down the image into smaller patches, learning patch representations, and then pooling those representations to predict an image-level label is one way to solve this problem as is depicted in the image below. One popular method for doing this is called Multiple Instance Learning (MIL). Each patch is considered an ‘instance’ and a set of patches forms a ‘bag’. The individual patch representations are pooled together to predict a final bag-level label. Algorithmically, the individual patch instances in the bag do not require labels and hence allow us to learn bag-level labels in a weakly-supervised way. They also use permutation invariant pooling functions which make the prediction independent of the order of patches and allows for an efficient aggregation of information. Typically, attention based pooling functions are used which not only allow for efficient aggregation but also provide attention values for each patch in the bag. These values indicate the importance of the corresponding patch in the prediction and can be visualized to better understand the model predictions. This element of interpretability can be very important to drive adoption of these models in the real world and we use variations like Additive MIL models to enable such spatial explainability. Computationally, MIL models circumvent the problem of applying neural networks to large image sizes since patch representations are obtained independently of the size of the image.
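
To make the attention-pooling idea concrete, here is a minimal sketch (illustrative only, not PathAI’s implementation) of pooling a bag of patch embeddings with learned attention weights; the dimensions are arbitrary:

import torch
from torch import nn

class AttentionMILPooling(nn.Module):
    """Weight each patch embedding in a bag by a learned attention score and sum."""
    def __init__(self, embed_dim=1024, attn_hidden=256):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(embed_dim, attn_hidden),
            nn.Tanh(),
            nn.Linear(attn_hidden, 1),
        )

    def forward(self, patch_embeddings):           # (bag_size, embed_dim)
        scores = self.attention(patch_embeddings)  # (bag_size, 1)
        weights = torch.softmax(scores, dim=0)     # attention per patch, sums to 1
        bag_embedding = (weights * patch_embeddings).sum(dim=0)
        return bag_embedding, weights.squeeze(-1)  # weights can be visualized per patch

bag = torch.randn(12, 1024)                        # 12 patch embeddings in one bag
pooling = AttentionMILPooling()
bag_embedding, attention = pooling(bag)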

    + +

    + +

    + +

    At PathAI, we use custom MIL models based on deep nets to predict image-level labels. The overview of this process is as follows:

    + +
  1. Select patches from a slide using different sampling approaches.
  2. Construct a bag of patches based on random sampling or heuristic rules.
  3. Generate patch representations for each instance based on pre-trained models or large-scale representation learning models.
  4. Apply permutation invariant pooling functions to get the final slide-level score.

    Now that we have walked through some of the high-level details around MIL in PyTorch, let’s look at some code to see how simple it is to go from ideation to code in production with PyTorch. We begin by defining a sampler, transformations, and our MIL dataset:

    + +
    # Create a bag sampler which randomly samples patches from a slide
    +bag_sampler = RandomBagSampler(bag_size=12)
    +
    +# Setup the transformations
    +crop_transform = FlipRotateCenterCrop(use_flips=True)
    +
    +# Create the dataset which loads patches for each bag
    +train_dataset = MILDataset(
    +  bag_sampler=bag_sampler,
    +  samples_loader=sample_loader,
    +  transform=crop_transform,
    +)
    +
    + +

    After we have defined our sampler and dataset, we need to define the model we will actually train with said dataset. PyTorch’s familiar model definition syntax makes this easy to do while also allowing us to create bespoke models at the same time.

    + +
    classifier = DefaultPooledClassifier(hidden_dims=[256, 256], input_dims=1024, output_dims=1)
    +
    +pooling = DefaultAttentionModule(
    +  input_dims=1024,
    +  hidden_dims=[256, 256],
    +  output_activation=StableSoftmax()
    +)
    +
    +# Define the model which is a composition of the featurizer, pooling module and a classifier
    +model = DefaultMILGraph(featurizer=ShuffleNetV2(), classifier=classifier, pooling = pooling)
    +
    + +

    Since these models are trained end-to-end, they offer a powerful way to go directly from a gigapixel whole slide image to a single label. Due to their wide applicability to different biological problems, two aspects of their implementation and deployment are important:

    + +
  1. Configurable control over each part of the pipeline including the data loaders, the modular parts of the model, and their interaction with each other.
  2. Ability to rapidly iterate through the ideate-implement-experiment-productionize loop.

    PyTorch has various advantages when it comes to MIL modeling. It offers an intuitive way to create dynamic computational graphs with flexible control flow which is great for rapid research experimentation. The map-style datasets, configurable sampler and batch-samplers allow us to customize how we construct bags of patches, enabling faster experimentation. Since MIL models are IO heavy, data parallelism and pythonic data loaders make the task very efficient and user friendly. Lastly, the object-oriented nature of PyTorch enables building of reusable modules which aid in the rapid experimentation, maintainable implementation and ease of building compositional components of the pipeline.
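
As a toy illustration of that last point (the names and shapes here are hypothetical, not PathAI’s code), a map-style dataset that yields one bag of patches per slide composes directly with a standard DataLoader:

import torch
from torch.utils.data import Dataset, DataLoader

class ToyBagDataset(Dataset):
    """Each item is a bag of randomly sampled patches plus a slide-level label."""
    def __init__(self, num_slides=100, bag_size=12, patch_shape=(3, 224, 224)):
        self.num_slides = num_slides
        self.bag_size = bag_size
        self.patch_shape = patch_shape

    def __len__(self):
        return self.num_slides

    def __getitem__(self, idx):
        patches = torch.rand(self.bag_size, *self.patch_shape)  # stand-in for real patches
        label = torch.randint(0, 2, (1,)).float()                # stand-in slide label
        return patches, label

loader = DataLoader(ToyBagDataset(), batch_size=4, num_workers=2)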

    + +

    Exploring spatial tissue organization with GNNs in PyTorch

    + +

    + +

    + +

    In both healthy and diseased tissue, the spatial arrangement and structure of cells can oftentimes be as important as the cells themselves. For example, when assessing lung cancers, pathologists try to look at the overall grouping and structure of tumor cells (do they form solid sheets? Or do they occur in smaller, localized clusters?) to determine if the cancer belongs to specific subtypes which can have vastly different prognosis. Such spatial relationships between cells and other tissue structures can be modeled using graphs to capture tissue topology and cellular composition at the same time. Graph Neural Networks (GNNs) allow learning spatial patterns within these graphs that relate to other clinical variables, for example overexpression of genes in certain cancers.

    + +

    In late 2020, when PathAI started using GNNs on tissue samples, PyTorch had the best and most mature support for GNN functionality via the PyG package. This made PyTorch the natural choice for our team given that GNN models were something that we knew would be an important ML concept we wanted to explore. 

    + +

    One of the main value-adds of GNN’s in the context of tissue samples is that the graph itself can uncover spatial relationships that would otherwise be very difficult to find by visual inspection alone. In our recent AACR publication, we showed that by using GNNs, we can better understand the way the presence of immune cell aggregates (specifically tertiary lymphoid structures, or TLS) in the tumor microenvironment can influence patient prognosis. In this case, the GNN approach was used to predict expression of genes associated with the presence of TLS, and identify histological features beyond the TLS region itself that are relevant to TLS. Such insights into gene expression are difficult to identify from tissue sample images when unassisted by ML models. 

    + +

    One of the most promising GNN variations we have had success with is self attention graph pooling. Let’s take a look at how we define our Self Attention Graph Pooling (SAGPool) model using PyTorch and PyG:

    + +
    class SAGPool(torch.nn.Module):
    +  def __init__(self, ...):
    +    super().__init__()
    +    self.conv1 = GraphConv(in_features, hidden_features, aggr='mean')
    +    self.convs = torch.nn.ModuleList()
    +    self.pools = torch.nn.ModuleList()
    +    self.convs.extend([GraphConv(hidden_features, hidden_features, aggr='mean') for i in range(num_layers - 1)])
    +    self.pools.extend([SAGPooling(hidden_features, ratio, GNN=GraphConv, min_score=min_score) for i in range((num_layers) // 2)])
    +    self.jump = JumpingKnowledge(mode='cat')
    +    self.lin1 = Linear(num_layers * hidden_features, hidden_features)
    +    self.lin2 = Linear(hidden_features, out_features)
    +    self.out_activation = out_activation
    +    self.dropout = dropout
    +
    + +

    In the above code, we begin by defining a single convolutional graph layer and then add two module list layers which allow us to pass in a variable number of layers. We then take our empty module list and append a variable number of GraphConv layers followed by a variable number of SAGPooling layers. We finish up our SAGPool definition by adding a JumpingKnowledge Layer, two linear layers, our activation function, and our dropout value. PyTorch’s intuitive syntax allows us to abstract away the complexity of working with state of the art methods like SAG Poolings while also maintaining the common approach to model development we are familiar with.
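
The post only shows the constructor; as a rough sketch (our guess, not PathAI’s actual code), a forward pass for a module like this could follow the usual PyG conv-pool-readout pattern:

import torch.nn.functional as F
from torch_geometric.nn import global_mean_pool

# Intended to live inside the SAGPool class defined above
def forward(self, data):
    x, edge_index, batch = data.x, data.edge_index, data.batch
    x = F.relu(self.conv1(x, edge_index))
    xs = [global_mean_pool(x, batch)]                  # graph-level readout per layer
    for i, conv in enumerate(self.convs):
        x = F.relu(conv(x, edge_index))
        xs.append(global_mean_pool(x, batch))
        if i % 2 == 0 and i // 2 < len(self.pools):    # pool every other layer
            x, edge_index, _, batch, _, _ = self.pools[i // 2](x, edge_index, batch=batch)
    x = self.jump(xs)                                  # concatenate per-layer readouts
    x = F.relu(self.lin1(x))
    x = F.dropout(x, p=self.dropout, training=self.training)
    return self.out_activation(self.lin2(x))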

    + +

    Models like our SAG Pool one described above are just one example of how GNNs with PyTorch are allowing us to explore new and novel ideas. We also recently explored multimodal CNN - GNN hybrid models which ended up being 20% more accurate than traditional Pathologist consensus scores. These innovations and interplay between traditional CNNs and GNNs are again enabled by the short research to production model development loop.

    + +

    Improving Patient Outcomes

    +

    In order to achieve our mission of improving patient outcomes with AI-powered pathology, PathAI needs to rely on an ML development framework that (1) facilitates quick iteration and easy extension (i.e. Model configuration as code) during initial phases of development and exploration (2) scales model training and inference to massive images (3) easily and robustly serves models for production uses of our products (in clinical trials and beyond). As we’ve demonstrated, PyTorch offers us all of these capabilities and more. We are incredibly excited about the future of PyTorch and cannot wait to see what other impactful challenges we can solve using the framework.

    + +
diff --git a/blog/PyTorch-1.13-release/index.html b/blog/PyTorch-1.13-release/index.html new file mode 100644 index 000000000000..d597cecad1db --- /dev/null +++ b/blog/PyTorch-1.13-release/index.html @@ -0,0 +1,812 @@

PyTorch 1.13 release, including beta versions of functorch and improved support for Apple’s new M1 chips. | PyTorch

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch® 1.13 (release note)! This includes Stable versions of BetterTransformer. We deprecated CUDA 10.2 and 11.3 and completed migration of CUDA 11.6 and 11.7. Beta includes improved support for Apple M1 chips and functorch, a library that offers composable vmap (vectorization) and autodiff transforms, being included in-tree with the PyTorch release. This release is composed of over 3,749 commits and 467 contributors since 1.12.1. We want to sincerely thank our dedicated community for your contributions.

    + +

    Summary:

    + +
      +
    • +

      The BetterTransformer feature set supports fastpath execution for common Transformer models during Inference out-of-the-box, without the need to modify the model. Additional improvements include accelerated add+matmul linear algebra kernels for sizes commonly used in Transformer models and Nested Tensors is now enabled by default.

      +
    • +
    • +

      Timely deprecating older CUDA versions allows us to proceed with introducing the latest CUDA version as they are introduced by Nvidia®, and hence allows support for C++17 in PyTorch and new NVIDIA Open GPU Kernel Modules.

      +
    • +
    • +

      Previously, functorch was released out-of-tree in a separate package. After installing PyTorch, a user will be able to import functorch and use functorch without needing to install another package.

      +
    • +
    • +

      PyTorch is offering native builds for Apple® silicon machines that use Apple’s new M1 chip as a beta feature, providing improved support across PyTorch’s APIs.

      +
    • +
    + + + +

    Along with 1.13, we are also releasing major updates to the PyTorch libraries, more details can be found in this blog.

    + +

    Stable Features

    + +

    (Stable) BetterTransformer API

    + +

    The BetterTransformer feature set, first released in PyTorch 1.12, is stable. PyTorch BetterTransformer supports fastpath execution for common Transformer models during Inference out-of-the-box, without the need to modify the model. To complement the improvements in Better Transformer, we have also accelerated add+matmul linear algebra kernels for sizes commonly used in Transformer models.

    + +

    Reflecting the performance benefits for many NLP users, Nested Tensors use for Better Transformer is now enabled by default. To ensure compatibility, a mask check is performed to ensure a contiguous mask is supplied. In Transformer Encoder, the mask check for src_key_padding_mask may be suppressed by setting mask_check=False. This accelerates processing for users that can guarantee that only aligned masks are provided. Finally, better error messages are provided to diagnose incorrect inputs, together with improved diagnostics explaining why fastpath execution cannot be used.
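
As a rough sketch of what this looks like in user code (sizes here are arbitrary, not from the release notes), the relevant knobs are enable_nested_tensor on the encoder and the padding mask at call time; mask_check=False is only for callers that can guarantee aligned masks:

import torch
from torch import nn

layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
# enable_nested_tensor exploits padding sparsity; mask_check=False skips the
# contiguity check for callers that guarantee aligned padding masks
encoder = nn.TransformerEncoder(layer, num_layers=6, enable_nested_tensor=True,
                                mask_check=False).eval()

src = torch.rand(32, 128, 512)                         # (batch, seq, feature)
padding_mask = torch.zeros(32, 128, dtype=torch.bool)
padding_mask[:, 100:] = True                           # trailing positions are padding

with torch.no_grad():
    out = encoder(src, src_key_padding_mask=padding_mask)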

    + +

    Better Transformer is directly integrated into the PyTorch TorchText library, enabling TorchText users to transparently and automatically take advantage of BetterTransformer speed and efficiency performance. (Tutorial)

    + +

    + +

    + +

    + +

    +Figure: BetterTransformer fastpath execution is now stable and enables sparsity optimization using Nested Tensor representation as default +

    + +

    Introduction of CUDA 11.6 and 11.7 and deprecation of CUDA 10.2 and 11.3

    + +

    Timely deprecating older CUDA versions allows us to proceed with introducing the latest CUDA version as they are introduced by Nvidia®, and hence allows developers to use the latest features of CUDA and benefit from correctness fixes provided by the latest version.

    + +

    Decommissioning of CUDA 10.2. CUDA 11 is the first CUDA version to support C++17. Hence decommissioning legacy CUDA 10.2 was a major step in adding support for C++17 in PyTorch. It also helps to improve PyTorch code by eliminating legacy CUDA 10.2 specific instructions.

    + +

    Decommissioning of CUDA 11.3 and introduction of CUDA 11.7 brings compatibility support for the new NVIDIA Open GPU Kernel Modules; another significant highlight is lazy loading support. CUDA 11.7 is shipped with cuDNN 8.5.0, which contains a number of optimizations accelerating transformer-based models, a 30% reduction in library size, and various improvements in the runtime fusion engine. Learn more on CUDA 11.7 with our release notes.

    + +

    Beta Features

    + +

    (Beta) functorch

    + +

    Inspired by Google® JAX, functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch. Examples include:
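
One concrete composition (a minimal sketch of our own, not taken from the release notes) is computing per-sample gradients by wrapping grad in vmap:

import torch
from functorch import grad, vmap

def loss_fn(weight, x, y):
    return ((x @ weight - y) ** 2).mean()

weight = torch.randn(3, 1)
xs = torch.randn(8, 5, 3)   # a batch of 8 samples, each a 5x3 input
ys = torch.randn(8, 5, 1)

# grad differentiates w.r.t. the first argument; vmap maps over the batch dim of x and y
per_sample_grads = vmap(grad(loss_fn), in_dims=(None, 0, 0))(weight, xs, ys)
print(per_sample_grads.shape)  # torch.Size([8, 3, 1])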

    + + + +

We’re excited to announce that, as a first step towards closer integration with PyTorch, functorch has moved to inside the PyTorch library and no longer requires the installation of a separate functorch package. After installing PyTorch via conda or pip, you’ll be able to `import functorch` in your program. Learn more with our detailed instructions, nightly and release notes.

    + +

    (Beta) Intel® VTune™ Profiler’s Instrumentation and Tracing Technology APIs (ITT) integration

    + +

    PyTorch users are able to visualize op-level timeline of PyTorch scripts execution in Intel® VTune™ Profiler when they need to analyze per-op performance with low-level performance metrics on Intel platforms.

    + +
    with torch.autograd.profiler.emit_itt():
    +    for i in range(10):
    +        torch.itt.range_push('step_{}'.format(i))
    +        model(input)
    +        torch.itt.range_pop()
    +
    + +

    +Learn more with our tutorial.

    + +

    (Beta) NNC: Add BF16 and Channels last support

    + +

    TorchScript graph-mode inference performance on x86 CPU is boosted by adding channels last and BF16 support to NNC. PyTorch users may benefit from channels last optimization on most popular x86 CPUs and benefit from BF16 optimization on Intel Cooper Lake Processor and Sapphire Rapids Processor. >2X geomean performance boost is observed on broad vision models with these two optimizations on Intel Cooper Lake Processor.

    + +

    The performance benefit can be obtained with existing TorchScript, channels last and BF16 Autocast APIs. See code snippet below. We will migrate the optimizations in NNC to the new PyTorch DL Compiler TorchInductor.

    + +

    + +
import torch
import torchvision.models as models

model = models.resnet50(pretrained=True)
# Convert the model to channels-last
model = model.to(memory_format=torch.channels_last)
model.eval()
data = torch.rand(1, 3, 224, 224)
# Convert the data to channels-last
data = data.to(memory_format=torch.channels_last)
# Enable autocast to run with BF16
with torch.cpu.amp.autocast(), torch.no_grad():
    # Trace the model
    model = torch.jit.trace(model, torch.rand(1, 3, 224, 224))
    model = torch.jit.freeze(model)
    # Run the traced model
    model(data)
    + +

    (Beta) Support for M1 Devices

    + +

    Since v1.12, PyTorch has been offering native builds for Apple® silicon machines that use Apple’s new M1 chip as a prototype feature. In this release, we bring this feature to beta, providing improved support across PyTorch’s APIs.

    + +

    We now run tests for all submodules except torch.distributed on M1 macOS 12.6 instances. With this improved testing, we were able to fix features such as cpp extension and convolution correctness for certain inputs.

    + +

    To get started, just install PyTorch v1.13 on your Apple silicon Mac running macOS 12 or later with a native version (arm64) of Python. Learn more with our release notes.
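
A quick way to confirm the install picked up Apple-silicon GPU support (a small sketch; the MPS backend itself was introduced in v1.12) is to check for the mps device:

import torch

print(torch.backends.mps.is_available())  # True on a supported Apple silicon Mac
if torch.backends.mps.is_available():
    x = torch.rand(3, 3, device="mps")
    print(x.device)                        # mps:0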

    + +

    Prototype Features

    + +

    + +

    (Prototype) Arm® Compute Library (ACL) backend support for AWS Graviton

    + +

    We achieved substantial improvements for CV and NLP inference on aarch64 CPUs by enabling an Arm Compute Library (ACL) backend for the PyTorch and torch-xla modules. Highlights include:

    + +
  • Enabled mkldnn + acl as the default backend for aarch64 torch wheel.
  • Enabled mkldnn matmul operator for aarch64 bf16 device.
  • Brought TensorFlow xla+acl feature into torch-xla. We enhanced the TensorFlow xla with Arm Compute Library runtime for aarch64 cpu. These changes are included in TensorFlow master and then the upcoming TF 2.10. Once the torch-xla repo is updated for the tensorflow commit, it will have compiling support for torch-xla. We observed ~2.5-3x improvement for MLPerf Bert inference compared to the torch 1.12 wheel on Graviton3.

    (Prototype) CUDA Sanitizer

    + +

    When enabled, the sanitizer begins to analyze low-level CUDA operations invoked as a result of the user’s PyTorch code to detect data race errors caused by unsynchronized data access from different CUDA streams. The errors found are then printed along with stack traces of faulty accesses, much like Thread Sanitizer does. An example of a simple error and the output produced by the sanitizer can be viewed here. It will be especially useful for machine learning applications, where corrupted data can be easy to miss for a human and the errors may not always manifest themselves; the sanitizer will always be able to detect them.

    + +

    (Prototype) Limited Python 3.11 support

    + +

    Binaries for Linux with Python 3.11 support are available to download via pip. Please follow the instructions on the get started page. Please note that Python 3.11 support is only a preview. In particular, features including Distributed, Profiler, FX and JIT might not be fully functional yet.

    + +
diff --git a/blog/PyTorchfoundation/index.html b/blog/PyTorchfoundation/index.html new file mode 100644 index 000000000000..11412fcaa8cf --- /dev/null +++ b/blog/PyTorchfoundation/index.html @@ -0,0 +1,658 @@

PyTorch strengthens its governance by joining the Linux Foundation | PyTorch

    + by + + Soumith Chintala + +

    +

    Today, I am proud to announce that PyTorch is moving to the Linux Foundation (LF) as a top-level project under the name PyTorch Foundation. The core mission of the Linux Foundation is the collaborative development of open source software. With a governing board of leaders from AMD, Amazon Web Services (AWS), Google Cloud, Meta, Microsoft Azure and NVIDIA, this model aligns with where PyTorch stands today and what it needs to travel forward. The creation of the PyTorch Foundation will ensure business decisions are being made in a transparent and open manner by a diverse group of members for years to come. The technical decisions remain in control of individual maintainers. I’m excited that the Linux Foundation will be our new home as they have notable experience supporting large open-source projects like ours such as Kubernetes and NodeJS. At this pivotal moment, I want to take a look back at how we started, share why we are moving, and what’s ahead.

    + +

    This January, PyTorch celebrated its 5 year anniversary! I reflected on what it meant to me in this tweet thread, and this conversation with my colleagues Mike Schroepfer, Lin Qiao, and Yann LeCun. When we started PyTorch development in 2016, it was a collective effort by a band of people from the [Lua]Torch community with a big chunk of people and funding from Meta and individuals contributing from NVIDIA, Twitter and other entities.

    + +

    Since 2017, PyTorch has grown far beyond our initial vision. With over 2,400 contributors who have built nearly 154,000 projects using PyTorch as a foundation, PyTorch has become one of the primary platforms for AI research, as well as commercial production use. We’ve seen its impact across industry and academia, from large companies to numerous university courses at Stanford, NYU, EPFL, Oxford, and other academic institutions. As a maintainer of PyTorch, the journey has been extremely fulfilling, with the impact of the project seen in various fields from self-driving cars to healthcare to aerospace.

    + +

    As PyTorch grew, many companies have made foundational investments around it. While Meta remains the largest contributor to PyTorch, companies such as AMD, Amazon Web Services (AWS), Google Cloud, HuggingFace, Lightning AI, Microsoft Azure, Nvidia, and many others have made significant investments, including both technical contributions and community building efforts. They’ve established teams around PyTorch or filled significant voids within the PyTorch community and sent countless contributions to the PyTorch core and to the ecosystem around it — PyTorch is an important part of their future. With PyTorch continuing to grow as a multi-stakeholder project, it’s time to move to a broader open-source foundation.

    + +

    The business governance of PyTorch was fairly unstructured for quite some time since launch – we operated like a scrappy startup. Team members at Meta spent the time and energy to structure this properly and organize PyTorch into an organizationally more healthy entity. Meta helped PyTorch with introducing many structures, such as Contributor License Agreements, Branding Guidelines, and Trademark registration. Keeping PyTorch’s organizational health up to check is essential and beneficial for the community. The next stage of our organizational progress is to support the interests of multiple stakeholders, hence moving to a foundation is good. We chose the Linux Foundation as it has vast organization experience hosting large multi-stakeholder open-source projects with the right balance of organizational structure and finding specific solutions for these projects.

    + +

    Simultaneously, the technical governance of PyTorch has been a loosely structured community model of open-source development — A set of people maintaining PyTorch by area with their responsibility often tied to their individual identity rather than their employment. While we kept a codified list at the PyTorch - Maintainers page, the technical governance was not formalized nor codified. As PyTorch scales as a community, the next step is to structure and codify. The PyTorch Technical Governance now supports a hierarchical maintainer structure and clear outlining of processes around day to day work and escalations. This doesn’t change how we run things, but it does add discipline and openness that at our scale feels essential and timely.

    + +

    It’s been an exciting journey since 2016. I am grateful for the experiences and people I’ve met along the way. PyTorch started with a small group of contributors which have grown and diversified over the years, all bringing in new ideas and innovations that would not have been possible without our community. We want to continue the open-source spirit – for the community and by the community. Thank you to our contributors, maintainers, users, supporters and new foundation members. We look forward to the next chapter of PyTorch with the PyTorch Foundation.

    + + +
diff --git a/blog/a-better-transformer-for-fast-transformer-encoder-inference/index.html b/blog/a-better-transformer-for-fast-transformer-encoder-inference/index.html new file mode 100644 index 000000000000..ac13b2e4aa8c --- /dev/null +++ b/blog/a-better-transformer-for-fast-transformer-encoder-inference/index.html @@ -0,0 +1,715 @@

A BetterTransformer for Fast Transformer Inference | PyTorch

    + by + + Michael Gschwind, Eric Han, Scott Wolchok, Rui Zhu, Christian Puhrsch + +

    +

    tl;dr Transformers achieve state-of-the-art performance for NLP, and are becoming popular for a myriad of other tasks. They are computationally expensive which has been a blocker to their widespread productionisation. Launching with PyTorch 1.12, BetterTransformer implements a backwards-compatible fast path of torch.nn.TransformerEncoder for Transformer Encoder Inference and does not require model authors to modify their models. BetterTransformer improvements can exceed 2x in speedup and throughput for many common execution scenarios. To use BetterTransformer, install PyTorch 1.12 and start using high-quality, high-performance Transformer models with the PyTorch API today.

    + +

    + +

    + +

    +Diagram of the Transformer Encoder Architecture (from "Attention Is All You Need"). During Inference, the entire module will execute as a single PyTorch-native function. +

    + +

    In this blog post, we share the following topics — Performance Improvements, Backwards compatibility, and Taking advantage of the FastPath. Learn more about these topics below.

    + +

    Performance Improvements

    + +

    BetterTransformer launches with accelerated native implementations of MultiHeadAttention and TransformerEncoderLayer for CPUs and GPUs. These fast paths are integrated in the standard PyTorch Transformer APIs, and will accelerate TransformerEncoder, TransformerEncoderLayer and MultiHeadAttention nn.modules. These new modules implement two types of optimizations: (1) fused kernels combine multiple individual operators normally used to implement Transformers to provide a more efficient implementation, and (2) take advantage of sparsity in the inputs to avoid performing unnecessary operations on padding tokens. Padding tokens frequently account for a large fraction of input batches in many Transformer models used for Natural Language Processing.

    + +

    Backwards compatibility

    + +

    Advantageously, no model changes are necessary to benefit from the performance boost offered by BetterTransformer. To benefit from fast path execution, inputs and operating conditions must satisfy some access conditions (see below). While the internal implementation of Transformer APIs has changed, PyTorch 1.12 maintains strict compatibility with Transformer modules shipped in previous versions, enabling PyTorch users to use models created and trained with previous PyTorch releases while benefiting from BetterTransformer improvements.

    + +

    In addition to enabling the PyTorch nn.Modules, BetterTransformer provides improvements for PyTorch libraries. Performance benefits will become available through two different enablement paths:

    + +
  1. Transparent acceleration: Current users of PyTorch nn.Modules such as MultiHeadAttention as well as higher-level Transformer components will benefit from the improved performance of the new nn.Modules automatically. An example of this is the visual transformer (ViT) implementation used in the torchvision library (code link).
  2. Torchtext library acceleration: As part of this project, we have optimized Torchtext to build on the PyTorch core API to benefit from BetterTransformer enhancements while maintaining strict and transparent compatibility with previous library versions and models trained with previous Torchtext versions. Using PyTorch Transformers in Torchtext also ensures that Torchtext will benefit from expected future enhancements to the PyTorch Transformer implementation.

    Taking advantage of the Fastpath

    + +

    BetterTransformer is a fastpath for the PyTorch Transformer API. The fastpath is a native, specialized implementation of key Transformer functions for CPU and GPU that applies to common Transformer use cases.

    + +

    To take advantage of input sparsity (i.e. padding) in accelerating your model (see Figure 2), set the keyword argument enable_nested_tensor=True when instantiating a TransformerEncoder and pass in the src_key_padding_mask argument (which denotes padding tokens) during inference. This requires the padding mask to be contiguous, which is the typical case.

    + +

    Currently, the BetterTransformer speedup only applies to transformer encoder models used in inference. To benefit from fastpath execution, models must be composed of any of the following components: TransformerEncoder, TransformerEncoderLayer or MultiheadAttention (MHA). Fastpath execution is also subject to some criteria. Most importantly, the model must be executed in inference mode and operate on input tensors that do not collect gradient tape information (e.g., running with torch.no_grad). The full list of conditions can be found at these links for nn.MultiHeadAttention and nn.TransformerEncoder, respectively. If the criteria are not met, control flows to the legacy PyTorch 1.11 Transformer implementation which has the same API, but lacks the fastpath performance boost.
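
Putting those conditions together, here is a minimal sketch (arbitrary sizes, not from the post) of an encoder set up so the fastpath can be taken: an encoder-only stack, in eval mode, run under torch.no_grad with a padding mask supplied:

import torch
from torch import nn

encoder_layer = nn.TransformerEncoderLayer(d_model=256, nhead=4, batch_first=True)
model = nn.TransformerEncoder(encoder_layer, num_layers=2,
                              enable_nested_tensor=True).eval()  # inference mode

tokens = torch.rand(8, 64, 256)                 # (batch, seq, feature)
padding = torch.zeros(8, 64, dtype=torch.bool)
padding[:, 48:] = True                          # True marks padding tokens

with torch.no_grad():                           # no gradient tape -> fastpath eligible
    out = model(tokens, src_key_padding_mask=padding)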

    + +

    Other transformer models (such as decoder models) which use the PyTorch MultiheadAttention module will benefit from the BetterTransformer fastpath. Planned future work is to expand the end-to-end BetterTransformer fastpath to models based on TransformerDecoder to support popular seq2seq and decoder-only (e.g., OPT) model architectures, and to training.

    + +

    Speedups

    + +

    The following graphs show the performance achieved for the BERT-base model with small and large-scale inputs:

    + +

    + +

    + +

    +Figure 1: PyTorch 1.12 Improvements with BetterTransformer fastpath execution +

    + +

    + +

    + +

    +Figure 2: PyTorch 1.12 Improvements with BetterTransformer fastpath execution
    +with sparsity optimization enabled by enable_nested_tensor=True
    +

    + +

    BetterTransformer includes two types of optimization: (1) fused kernels implementing multiple operations more efficiently in a single kernel, and (2) exploiting sparsity by avoiding unnecessary processing on padding tokens. Enhanced performance for small input sizes benefits primarily from the fused kernel implementations, and shows a constant performance improvement regardless of padding amount. While large inputs still benefit from fused kernels, the computation heavy processing limits the benefits that may be obtained by the fused kernels as baseline performance is already closer to the theoretical peak. However, as we increase the amount of padding, performance increases dramatically as increasingly large amounts of computation can be avoided by exploiting the sparsity introduced by padding in NLP workloads.

    + +

    Future Work

    + +

    As part of our ongoing work on PyTorch BetterTransformer, we are working on extending BetterTransformer improvements to Transformer Decoders. We aim to expand beyond inference to training as well.

    + +

    We are partnering to enable BetterTransformer on additional libraries such as FairSeq, MetaSeq, and HuggingFace to benefit all Transformer-based PyTorch models. We’ll provide future updates on the progress of BetterTransformer accelerations for the larger PyTorch ecosystem as part of this blog series.

    + +

    Acknowledgements: The authors would like to thank Lin Qiao, Ajit Mathews, Andrew Tulloch, Dmytro Dzhulgakov, Natalia Gimelshein, Emad El-Haraty, Mark Saroufim, Adnan Aziz, Geeta Chauhan, and Hamid Shojanazeri for their support, contributions and many helpful suggestions throughout the course of this project, and in the preparation of this blog.

    + +
diff --git a/blog/a-tour-of-pytorch-internals-1/index.html b/blog/a-tour-of-pytorch-internals-1/index.html new file mode 100644 index 000000000000..1f387b1550bf --- /dev/null +++ b/blog/a-tour-of-pytorch-internals-1/index.html @@ -0,0 +1,957 @@

A Tour of PyTorch Internals (Part I) | PyTorch

    May 11, 2017

    +

    + A Tour of PyTorch Internals (Part I) +


    + by + + Trevor Killeen + +

    +

    The fundamental unit in PyTorch is the Tensor. This post will serve as an overview for how we implement Tensors in PyTorch, such that the user can interact with it from the Python shell. In particular, we want to answer four main questions:

    + +
  • How does PyTorch extend the Python interpreter to define a Tensor type that can be manipulated from Python code?
  • How does PyTorch wrap the C libraries that actually define the Tensor’s properties and methods?
  • How does PyTorch cwrap work to generate code for Tensor methods?
  • How does PyTorch’s build system take all of these components to compile and generate a workable application?

    Extending the Python Interpreter

    + +

    PyTorch defines a new package torch. In this post we will consider the ._C module. This module is known as an “extension module” - a Python module written in C. Such modules allow us to define new built-in object types (e.g. the Tensor) and to call C/C++ functions.
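
You can see this from an ordinary Python session (a small illustrative check, nothing PyTorch-specific about it): the module object looks like any other module, but it is backed by a compiled shared library rather than a .py file.

import torch._C

print(type(torch._C))     # <class 'module'>
print(torch._C.__file__)  # points at a compiled extension, e.g. _C.cpython-*.so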

    + +

    The ._C module is defined in torch/csrc/Module.cpp. The init_C() / PyInit__C() function creates the module and adds the method definitions as appropriate. This module is passed around to a number of different __init() functions that add further objects to the module, register new types, etc.

    + +

    One collection of these __init() calls is the following:

    + +
    ASSERT_TRUE(THPDoubleTensor_init(module));
    +ASSERT_TRUE(THPFloatTensor_init(module));
    +ASSERT_TRUE(THPHalfTensor_init(module));
    +ASSERT_TRUE(THPLongTensor_init(module));
    +ASSERT_TRUE(THPIntTensor_init(module));
    +ASSERT_TRUE(THPShortTensor_init(module));
    +ASSERT_TRUE(THPCharTensor_init(module));
    +ASSERT_TRUE(THPByteTensor_init(module));
    +
    + +

    These __init() functions add the Tensor object for each type to the ._C module so that they can be used in the module. Let’s learn how these methods work.

    + +

    The THPTensor Type

    + +

    Much like the underlying TH and THC libraries, PyTorch defines a “generic” Tensor which is then specialized to a number of different types. Before considering how this specialization works, let’s first consider how defining a new type in Python works, and how we create the generic THPTensor type.

    + +

    The Python runtime sees all Python objects as variables of type PyObject *, which serves as a “base type” for all Python objects. Every Python type contains the refcount for the object, and a pointer to the object’s type object. The type object determines the properties of the type. For example, it might contain a list of methods associated with the type, and which C functions get called to implement those methods. The object also contains any fields necessary to represent its state.

    + +

    The formula for defining a new type is as follows:

    + +
  • Create a struct that defines what the new object will contain
  • Define the type object for the type

    The struct itself could be very simple. In Python, all floating point types are actually objects on the heap. The Python float struct is defined as:

    +
    typedef struct {
    +    PyObject_HEAD
    +    double ob_fval;
    +} PyFloatObject;
    +
    +

    The PyObject_HEAD is a macro that brings in the code that implements an object’s reference counting, and a pointer to the corresponding type object. So in this case, to implement a float, the only other “state” needed is the floating point value itself.

    + +

    Now, let’s see the struct for our THPTensor type:

    +
    struct THPTensor {
    +    PyObject_HEAD
    +    THTensor *cdata;
    +};
    +
    +

    Pretty simple, right? We are just wrapping the underlying TH tensor by storing a pointer to it.

    + +

    The key part is defining the “type object” for a new type. An example definition of a type object for our Python float takes the form:

    +
    static PyTypeObject py_FloatType = {
    +    PyVarObject_HEAD_INIT(NULL, 0)
    +    "py.FloatObject",          /* tp_name */
    +    sizeof(PyFloatObject),     /* tp_basicsize */
    +    0,                         /* tp_itemsize */
    +    0,                         /* tp_dealloc */
    +    0,                         /* tp_print */
    +    0,                         /* tp_getattr */
    +    0,                         /* tp_setattr */
    +    0,                         /* tp_as_async */
    +    0,                         /* tp_repr */
    +    0,                         /* tp_as_number */
    +    0,                         /* tp_as_sequence */
    +    0,                         /* tp_as_mapping */
    +    0,                         /* tp_hash  */
    +    0,                         /* tp_call */
    +    0,                         /* tp_str */
    +    0,                         /* tp_getattro */
    +    0,                         /* tp_setattro */
    +    0,                         /* tp_as_buffer */
    +    Py_TPFLAGS_DEFAULT,        /* tp_flags */
    +    "A floating point number", /* tp_doc */
    +};
    +
    +

    The easiest way to think of a type object is as a set of fields which define the properties of the object. For example, the tp_basicsize field is set to sizeof(PyFloatObject). This is so that Python knows how much memory to allocate when calling PyObject_New() for a PyFloatObject. The full list of fields you can set is defined in object.h in the CPython backend: https://github.com/python/cpython/blob/master/Include/object.h.

    + +

    The type object for our THPTensor is THPTensorType, defined in csrc/generic/Tensor.cpp. This object defines the name, size, mapping methods, etc. for a THPTensor.

    + +

    As an example, let’s take a look at the tp_new function we set in the PyTypeObject:

    + +
    PyTypeObject THPTensorType = {
    +  PyVarObject_HEAD_INIT(NULL, 0)
    +  ...
    +  THPTensor_(pynew), /* tp_new */
    +};
    +
    +

    The tp_new function enables object creation. It is responsible for creating (as opposed to initializing) objects of that type and is equivalent to the __new__() method at the Python level. The C implementation is a static method that is passed the type being instantiated and any arguments, and returns a newly created object.

    + +
    static PyObject * THPTensor_(pynew)(PyTypeObject *type, PyObject *args, PyObject *kwargs)
    +{
    +  HANDLE_TH_ERRORS
    +  Py_ssize_t num_args = args ? PyTuple_Size(args) : 0;
    +
    +  THPTensorPtr self = (THPTensor *)type->tp_alloc(type, 0);
    +// more code below
    +
    +

The first thing our new function does is allocate the THPTensor. It then runs through a series of initializations based on the args passed to the function. For example, when creating a THPTensor x from another THPTensor y, we set the newly created THPTensor’s cdata field to be the result of calling THTensor_(newWithTensor) with y’s underlying TH Tensor as an argument. Similar constructors exist for sizes, storages, NumPy arrays, and sequences.

    + +

    ** Note that we solely use tp_new, and not a combination of tp_new and tp_init (which corresponds to the __init__() function).
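For readers who think of this from the Python side, here is a rough pure-Python analogy (not PyTorch code) of the division of labor between tp_new and tp_init:

class Wrapper:
    def __new__(cls, *args, **kwargs):
        # creation: analogous to tp_new / type->tp_alloc in the C code above
        obj = super().__new__(cls)
        return obj

    def __init__(self, cdata=None):
        # initialization: analogous to tp_init, which THPTensor does not use
        self.cdata = cdata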

    + +

    The other important thing defined in Tensor.cpp is how indexing works. PyTorch Tensors support Python’s Mapping Protocol. This allows us to do things like:

    +
    x = torch.Tensor(10).fill_(1)
+y = x[3]  # y == 1
+x[4] = 2
+# etc.
    +
    +

** Note that this indexing extends to Tensors with more than one dimension.

    + +

We are able to use the []-style notation by defining the three methods of CPython’s mapping protocol (mp_length, mp_subscript, and mp_ass_subscript).

    + +

The most important methods are THPTensor_(getValue) and THPTensor_(setValue), which describe how to index a Tensor, either returning a new Tensor/Scalar or updating the values of an existing Tensor in place. Read through these implementations to better understand how PyTorch supports basic tensor indexing.
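As a rough pure-Python analogy (not the actual THPTensor implementation), the three mapping-protocol slots correspond to __len__, __getitem__, and __setitem__:

class FakeTensor:
    def __init__(self, data):
        self._data = list(data)

    def __len__(self):                    # mp_length
        return len(self._data)

    def __getitem__(self, index):         # mp_subscript, like THPTensor_(getValue)
        return self._data[index]

    def __setitem__(self, index, value):  # mp_ass_subscript, like THPTensor_(setValue)
        self._data[index] = value

t = FakeTensor([1] * 10)
print(t[3])   # 1
t[4] = 2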

    + +

    Generic Builds (Part One)

    + +

    We could spend a ton of time exploring various aspects of the THPTensor and how it relates to defining a new Python object. But we still need to see how the THPTensor_(init)() function is translated to the THPIntTensor_init() we used in our module initialization. How do we take our Tensor.cpp file that defines a “generic” Tensor and use it to generate Python objects for all the permutations of types? To put it another way, Tensor.cpp is littered with lines of code like:

    +
    return THPTensor_(New)(THTensor_(new)(LIBRARY_STATE_NOARGS));
    +
    +

    This illustrates both cases we need to make type-specific:

    + +
      +
    • Our output code will call THP<Type>Tensor_New(...) in place of THPTensor_(New)
    • +
    • Our output code will call TH<Type>Tensor_new(...) in place of THTensor_(new)
    • +
    + +

    In other words, for all supported Tensor types, we need to “generate” source code that has done the above substitutions. This is part of the “build” process for PyTorch. PyTorch relies on Setuptools (https://setuptools.readthedocs.io/en/latest/) for building the package, and we define a setup.py file in the top-level directory to customize the build process.

    + +

One component of building an Extension module using Setuptools is listing the source files involved in the compilation. However, our csrc/generic/Tensor.cpp file is not listed! So how does the code in this file end up being a part of the end product?

    + +

    Recall that we are calling the THPTensor* functions (such as init) from the directory above generic. If we take a look in this directory, there is another file Tensor.cpp defined. The last line of this file is important:

    +
    //generic_include TH torch/csrc/generic/Tensor.cpp
    +
    +

    Note that this Tensor.cpp file is included in setup.py, but it is wrapped in a call to a Python helper function called split_types. This function takes as input a file, and looks for the “//generic_include” string in the file contents. If it is found, it generates a new output file for each Tensor type, with the following changes:

    + +
      +
    • The output file is renamed to Tensor<Type>.cpp
    • +
    • The output file is slightly modified as follows:
    • +
    + +
    # Before:
    +//generic_include TH torch/csrc/generic/Tensor.cpp
    +
    +# After:
+#define TH_GENERIC_FILE "torch/csrc/generic/Tensor.cpp"
    +#include "TH/THGenerate<Type>Type.h"
    +
    +

    Including the header file on the second line has the side effect of including the source code in Tensor.cpp with some additional context defined. Let’s take a look at one of the headers:

    + +
    #ifndef TH_GENERIC_FILE
    +#error "You must define TH_GENERIC_FILE before including THGenerateFloatType.h"
    +#endif
    +
    +#define real float
    +#define accreal double
    +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
    +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
    +#define Real Float
    +#define THInf FLT_MAX
    +#define TH_REAL_IS_FLOAT
    +#line 1 TH_GENERIC_FILE
    +#include TH_GENERIC_FILE
    +#undef accreal
    +#undef real
    +#undef Real
    +#undef THInf
    +#undef TH_REAL_IS_FLOAT
    +#undef TH_CONVERT_REAL_TO_ACCREAL
    +#undef TH_CONVERT_ACCREAL_TO_REAL
    +
    +#ifndef THGenerateManyTypes
    +#undef TH_GENERIC_FILE
    +#endif
    +
    + +

What this is doing is bringing in the code from the generic Tensor.cpp file and surrounding it with the macro definitions shown above. For example, we define real as a float, so any code in the generic Tensor implementation that refers to something as a real will have that real replaced with a float. In the corresponding file THGenerateIntType.h, the same macro would replace real with int.

    + +

    These output files are returned from split_types and added to the list of source files, so we can see how the .cpp code for different types is created.
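To make the idea concrete, here is a highly simplified, hypothetical sketch of what a split_types-style helper might do; the real implementation lives in PyTorch's build tooling and differs in detail:

import os

TYPES = ["Float", "Double", "Half", "Byte", "Char", "Short", "Int", "Long"]

def split_types_sketch(path, out_dir):
    """Emit one type-specialized copy of `path` per Tensor type."""
    with open(path) as f:
        lines = f.readlines()
    outputs = []
    for t in TYPES:
        out_path = os.path.join(out_dir, "Tensor%s.cpp" % t)
        with open(out_path, "w") as out:
            for line in lines:
                if line.startswith("//generic_include"):
                    # replace the marker with the #define/#include pair shown above
                    generic_file = line.split()[-1].strip()
                    out.write('#define TH_GENERIC_FILE "%s"\n' % generic_file)
                    out.write('#include "TH/THGenerate%sType.h"\n' % t)
                else:
                    out.write(line)
        outputs.append(out_path)
    # the returned paths get appended to the extension's list of source files
    return outputs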

    + +

    There are a few things to note here: First, the split_types function is not strictly necessary. We could wrap the code in Tensor.cpp in a single file, repeating it for each type. The reason we split the code into separate files is to speed up compilation. Second, what we mean when we talk about the type replacement (e.g. replace real with a float) is that the C preprocessor will perform these substitutions during compilation. Merely surrounding the source code with these macros has no side effects until preprocessing.

    + +

    Generic Builds (Part Two)

    + +

    Now that we have source files for all the Tensor types, we need to consider how the corresponding header declarations are created, and also how the conversions from THTensor_(method) and THPTensor_(method) to TH<Type>Tensor_method and THP<Type>Tensor_method work. For example, csrc/generic/Tensor.h has declarations like:

    +
    THP_API PyObject * THPTensor_(New)(THTensor *ptr);
    +
    +

    We use the same strategy for generating code in the source files for the headers. In csrc/Tensor.h, we do the following:

    +
    #include "generic/Tensor.h"
    +#include <TH/THGenerateAllTypes.h>
    +
    +#include "generic/Tensor.h"
    +#include <TH/THGenerateHalfType.h>
    +
    +

    This has the same effect, where we draw in the code from the generic header, wrapped with the same macro definitions, for each type. The only difference is that the resulting code is contained all within the same header file, as opposed to being split into multiple source files.

    + +

    Lastly, we need to consider how we “convert” or “substitute” the function types. If we look in the same header file, we see a bunch of #define statements, including:

    +
    #define THPTensor_(NAME)            TH_CONCAT_4(THP,Real,Tensor_,NAME)
    +
    +

This macro says that any string in the source code matching the format THPTensor_(NAME) should be replaced with THPRealTensor_NAME, where Real is derived from whatever the symbol Real is #define’d to be at the time. Because our header code and source code are surrounded by macro definitions for all the types as seen above, after the preprocessor has run, the resulting code is what we would expect. The code in the TH library defines the same macro for THTensor_(NAME), supporting the translation of those functions as well. In this way, we end up with header and source files with specialized code.

    + +

    Module Objects and Type Methods

    + +

We have now seen how we wrap TH’s Tensor definition in THP, and how we generate THP methods such as THPFloatTensor_init(...). Next, let’s explore what the above code actually does in terms of the module we are creating. The key line in THPTensor_(init) is:

    +
    # THPTensorBaseStr, THPTensorType are also macros that are specific
    +# to each type
    +PyModule_AddObject(module, THPTensorBaseStr, (PyObject *)&THPTensorType);
    +
    +

    This function registers our Tensor objects to the extension module, so we can use THPFloatTensor, THPIntTensor, etc. in our Python code.

    + +

    Just being able to create Tensors isn’t very useful - we need to be able to call all the methods that TH defines. A simple example shows calling the in-place zero_ method on a Tensor.

    +
    x = torch.FloatTensor(10)
    +x.zero_()
    +
    +

    Let’s start by seeing how we add methods to newly defined types. One of the fields in the “type object” is tp_methods. This field holds an array of method definitions (PyMethodDefs) and is used to associate methods (and their underlying C/C++ implementations) with a type. Suppose we wanted to define a new method on our PyFloatObject that replaces the value. We could implement this as follows:

    +
    static PyObject * replace(PyFloatObject *self, PyObject *args) {
    +	double val;
    +	if (!PyArg_ParseTuple(args, "d", &val))
    +		return NULL;
    +	self->ob_fval = val;
+	Py_RETURN_NONE;
    +}
    +
    +

    This is equivalent to the Python method:

    +
    def replace(self, val):
    +	self.ob_fval = val
    +
    +

    It is instructive to read more about how defining methods works in CPython. In general, methods take as the first parameter the instance of the object, and optionally parameters for the positional arguments and keyword arguments. This static function is registered as a method on our float:

    +
    static PyMethodDef float_methods[] = {
    +	{"replace", (PyCFunction)replace, METH_VARARGS,
    +	"replace the value in the float"
    +	},
    +	{NULL} /* Sentinel */
+};
    +
    +

    This registers a method called replace, which is implemented by the C function of the same name. The METH_VARARGS flag indicates that the method takes a tuple of arguments representing all the arguments to the function. This array is set to the tp_methods field of the type object, and then we can use the replace method on objects of that type.

    + +

    We would like to be able to call all of the methods for TH tensors on our THP tensor equivalents. However, writing wrappers for all of the TH methods would be time-consuming and error prone. We need a better way to do this.

    + +

    PyTorch cwrap

    + +

PyTorch implements its own cwrap tool to wrap the TH Tensor methods for use in the Python backend. We define a .cwrap file containing a series of C method declarations in our custom YAML format. The cwrap tool takes this file and outputs .cpp source files containing the wrapped methods in a format that is compatible with our THPTensor Python object and the Python C extension method calling format. This tool is used to generate code to wrap not only TH, but also CuDNN. It is designed to be extensible.

    + +

    An example YAML “declaration” for the in-place addmv_ function is as follows:

    +
    [[
    +  name: addmv_
    +  cname: addmv
    +  return: self
    +  arguments:
    +    - THTensor* self
    +    - arg: real beta
    +      default: AS_REAL(1)
    +    - THTensor* self
    +    - arg: real alpha
    +      default: AS_REAL(1)
    +    - THTensor* mat
    +    - THTensor* vec
    +]]
    +
    +

    The architecture of the cwrap tool is very simple. It reads in a file, and then processes it with a series of plugins. See tools/cwrap/plugins/__init__.py for documentation on all the ways a plugin can alter the code.

    + +

The source code generation occurs in a series of passes. First, the YAML “declaration” is parsed and processed. Then the source code is generated piece by piece - adding things like argument checks and extractions, defining the method header, and the actual call to the underlying library such as TH. Finally, the cwrap tool allows plugins to process the entire file at once. The resulting output for addmv_ can be explored here.
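The plugin architecture can be pictured with a toy sketch like the following (hypothetical code, not the actual cwrap implementation): each plugin is given a chance to rewrite every parsed declaration, and then to post-process the generated file as a whole.

class AddCommentPlugin:
    def process_declaration(self, decl):
        # per-declaration pass: annotate with the underlying C name
        decl["comment"] = "// wraps " + decl.get("cname", decl["name"])
        return decl

    def process_full_file(self, code):
        # whole-file pass: runs once after all methods are generated
        return "// auto-generated, do not edit\n" + code

def generate(declarations, plugins):
    for plugin in plugins:
        declarations = [plugin.process_declaration(d) for d in declarations]
    code = "\n".join("%s\nPyObject * THPTensor_(%s)(...) { ... }" % (d["comment"], d["name"])
                     for d in declarations)
    for plugin in plugins:
        code = plugin.process_full_file(code)
    return code

print(generate([{"name": "addmv_", "cname": "addmv"}], [AddCommentPlugin()]))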

    + +

    In order to interface with the CPython backend, the tool generates an array of PyMethodDefs that can be stored or appended to the THPTensor’s tp_methods field.

    + +

    In the specific case of wrapping Tensor methods, the build process first generates the output source file from TensorMethods.cwrap. This source file is #include‘d in the generic Tensor source file. This all occurs before the preprocessor does its magic. As a result, all of the method wrappers that are generated undergo the same pass as the THPTensor code above. Thus a single generic declaration and definition is specialized for each type as well.

    + +

    Putting It All Together

    + +

    So far, we have shown how we extend the Python interpreter to create a new extension module, how such a module defines our new THPTensor type, and how we can generate source code for Tensors of all types that interface with TH. Briefly, we will touch on compilation.

    + +

    Setuptools allows us to define an Extension for compilation. The entire torch._C extension is compiled by collecting all of the source files, header files, libraries, etc. and creating a setuptools Extension. Then setuptools handles building the extension itself. I will explore the build process more in a subsequent post.

    + +

    To summarize, let’s revisit our four questions:

    + +
      +
    • How does PyTorch extend the Python interpreter to define a Tensor type that can be manipulated from Python code?
    • +
    + +

    It uses CPython’s framework for extending the Python interpreter and defining new types, while taking special care to generate code for all types.

    + +
      +
    • How does PyTorch wrap the C libraries that actually define the Tensor’s properties and methods?
    • +
    + +

    It does so by defining a new type, THPTensor, that is backed by a TH Tensor. Function calls are forwarded to this tensor via the CPython backend’s conventions.

    + +
      +
    • How does PyTorch cwrap work to generate code for Tensor methods?
    • +
    + +

    It takes our custom YAML-formatted code and generates source code for each method by processing it through a series of steps using a number of plugins.

    + +
      +
    • How does PyTorch’s build system take all of these components to compile and generate a workable application?
    • +
    + +

    It takes a bunch of source/header files, libraries, and compilation directives to build an extension using Setuptools.

    + +

This is just a snapshot of parts of the build system for PyTorch. There is more nuance and detail, but I hope this serves as a gentle introduction to many of the components of our Tensor library.

    + +



PyTorch Internals Part II - The Build System

    + by + + Trevor Killeen + +

    +

    In the first post I explained how we generate a torch.Tensor object that you can use in your Python interpreter. Next, I will explore the build system for PyTorch. The PyTorch codebase has a variety of components:

    + +
      +
    • The core Torch libraries: TH, THC, THNN, THCUNN
    • +
    • Vendor libraries: CuDNN, NCCL
    • +
    • Python Extension libraries
    • +
    • Additional third-party libraries: NumPy, MKL, LAPACK
    • +
    + +

    How does a simple invocation of python setup.py install do the work that allows you to call import torch and use the PyTorch library in your code?

    + +

The first part of this document will explain the build process from an end-user point of view. This will explain how we take the components above and combine them to build the library. The second part of the document will be important for PyTorch developers. It will document ways to improve your iteration speed by building only a subset of the code that you are working on.

    + +

Setuptools and PyTorch’s setup() function

    + +

PyTorch uses Setuptools to build the library. Setuptools is an extension to the original distutils system from the core Python library. The core component of Setuptools is the setup.py file which contains all the information needed to build the project. The most important function is the setup() function which serves as the main entry point. Let’s take a look at the one in PyTorch:

    + +
    setup(name="torch", version=version,
    +      description="Tensors and Dynamic neural networks in Python with strong GPU acceleration",
    +      ext_modules=extensions,
    +      cmdclass={
    +          'build': build,
    +          'build_py': build_py,
    +          'build_ext': build_ext,
    +          'build_deps': build_deps,
    +          'build_module': build_module,
    +          'develop': develop,
    +          'install': install,
    +          'clean': clean,
    +      },
    +      packages=packages,
    +      package_data={'torch': [
    +          'lib/*.so*', 'lib/*.dylib*',
    +          'lib/torch_shm_manager',
    +          'lib/*.h',
    +          'lib/include/TH/*.h', 'lib/include/TH/generic/*.h',
    +          'lib/include/THC/*.h', 'lib/include/THC/generic/*.h']},
    +      install_requires=['pyyaml'],
    +      )
    +
    + +

    The function is composed entirely of keyword arguments, which serve two purposes:

    + +
      +
    • Metadata (e.g. name, description, version)
    • +
    • The contents of the package
    • +
    + +

    We are concerned with #2. Let’s break down the individual components:

    + +
      +
    • ext_modules: Python modules are either “pure” modules, containing only Python code, or “extension” modules written in the low-level language of the Python implementation. Here we are listing the extension modules in the build, including the main torch._C library that contains our Python Tensor
    • +
    • cmdclass: When using the setup.py script from the command line, the user must specify one or more “commands”, code snippets that perform a specific action. For example, the “install” command builds and installs the package. This mapping routes specific commands to functions in setup.py that implement them
    • +
    • packages: The list of packages in the project. These are “pure” - i.e. they only contain Python code. These are defined elsewhere in setup.py
    • +
    • package_data: Additional files that need to be installed into a package: in this case the header files and shared libraries that the build will generate must be included in our installation
    • +
    • install_requires: In order to build PyTorch, we need pyyaml. Setuptools will handle making sure that pyyaml will be available, downloading and installing it if necessary
    • +
    + +

    We will consider these components in more detail, but for now it is instructive to look at the end product of an installation – i.e. what Setuptools does after building the code.

    + +

    site_packages

    + +

Third party packages are by default installed into the lib/<version>/site_packages directory associated with your Python binary. For example, because I am using a Miniconda environment, my Python binary is found at:

    + +
    (p3) killeent@devgpu047:pytorch (master)$ which python
    +~/local/miniconda2/envs/p3/bin/python
    +
    +

    And thus packages are installed into:

    + +
    /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages
    +
    +

Having installed PyTorch, let’s take a look at the torch folder in site-packages:

    + +
    (p3) killeent@devgpu047:site-packages$ cd torch
    +(p3) killeent@devgpu047:torch$ ls
    +autograd  backends  _C.cpython-36m-x86_64-linux-gnu.so  cuda  distributed  _dl.cpython-36m-x86_64-linux-gnu.so  functional.py  __init__.py  legacy  lib  multiprocessing  nn  optim  __pycache__  serialization.py  _six.py  sparse  storage.py  _tensor_docs.py  tensor.py  _tensor_str.py  _thnn  _torch_docs.py  utils  _utils.py  version.py
    +
    + +

    Note that everything we would expect to be here is here:

    + +
      +
• All the “pure” packages are here
    • +
    • The extension libraries are here - the ._C* and ._dl* shared libraries
    • +
    • The package_data is here: the contents of lib/ match exactly what we described in the setup function:
    • +
    + +
    (p3) killeent@devgpu047:torch$ ls lib/
    +include     libnccl.so.1  libTHC.so.1   libTHCUNN.so.1  libTHNN.so.1  libTH.so.1   THCUNN.h  torch_shm_manager libnccl.so  libshm.so     libTHCS.so.1  libTHD.so.1     libTHPP.so.1  libTHS.so.1  THNN.h
    +
    + +

    The Python interpreter looks into site_packages during an import. If we call import torch in our Python code it will find the module here and initialize and import it. You can read more about the import system here.

    + +

    Building Individual Parts

    + +

    Next, we will look at the various individual components of the build from start to finish. This will illustrate how we combine all the code we mentioned in the introduction.

    + +

    Backend Torch and Vendor Libraries

    + +

    Let’s take a look at the install cmd override in PyTorch’s setup.py:

    + +
    class install(setuptools.command.install.install):
    +
    +    def run(self):
    +        if not self.skip_build:
    +            self.run_command('build_deps')
    +        setuptools.command.install.install.run(self)
    +
    + +

We note the first thing it does is run a command called “build_deps” - let’s take a look at its run() method:

    + +
    def run(self):
    +        from tools.nnwrap import generate_wrappers as generate_nn_wrappers
    +        build_all_cmd = ['bash', 'torch/lib/build_all.sh']
    +        if WITH_CUDA:
    +            build_all_cmd += ['--with-cuda']
    +        if WITH_NCCL and not SYSTEM_NCCL:
    +            build_all_cmd += ['--with-nccl']
    +        if WITH_DISTRIBUTED:
    +            build_all_cmd += ['--with-distributed']
    +        if subprocess.call(build_all_cmd) != 0:
    +            sys.exit(1)
    +        generate_nn_wrappers()
    +
    + +

Here we note that we have a shell script build_all.sh in the torch/lib/ directory. This script is configured based on whether we are on a system with CUDA enabled, the NCCL library enabled, and PyTorch’s distributed library enabled.

    + +

    Let’s take a look in torch/lib:

    + +
    (p3) killeent@devgpu047:lib (master)$ ls
    +build_all.sh  libshm  nccl  README.md  TH  THC  THCS  THCUNN  THD  THNN  THPP  THS
    +
    + +

    Here we see the directories for all the backend libraries. TH, THC, THNN, THCUNN, and nccl are git subtrees that are in sync with the libraries in e.g. github.com/torch. THS, THCS, THD, THPP and libshm are libraries specific to PyTorch. All of the libraries contain CMakeLists.txt - indicating they are built with CMake.

    + +

    The build_all.sh is essentially a script that runs the CMake configure step on all of these libraries, and then make install. Let’s run ./build_all.sh and see what we are left with:

    + +
    (p3) killeent@devgpu047:lib (master)$ ./build_all.sh --with-cuda --with-nccl --with-distributed
    +[various CMake output logs]
    +(p3) killeent@devgpu047:lib (master)$ ls
    +build  build_all.sh  include  libnccl.so  libnccl.so.1  libshm  libshm.so  libTHC.so.1  libTHCS.so.1  libTHCUNN.so.1  libTHD.so.1  libTHNN.so.1  libTHPP.so.1  libTH.so.1  libTHS.so.1  nccl  README.md  TH  THC  THCS  THCUNN  THCUNN.h  THD  THNN  THNN.h  THPP  THS  tmp_install  torch_shm_manager
    +
    + +

    Now there are a number of extra things in the directory:

    + +
      +
    • Shared library files for each library
    • +
    • Headers for THNN and THCUNN
    • +
    • build and tmp_install directories
    • +
    • The torch_shm_manager executable
    • +
    + +

    Let’s explore further. In the shell script, we create the build directory and a subdir for each library to build:

    + +
    # We create a build directory for the library, which will
    +# contain the cmake output. $1 is the library to be built
    +  mkdir -p build/$1
    +  cd build/$1
    +
    + +

    Thus e.g. build/TH contains the CMake configuration output including the Makefile for building TH, and also the result of running make install in this directory.

    + +

    Let’s also look at tmp_install:

    + +
    (p3) killeent@devgpu047:lib (master)$ ls tmp_install/
    +bin  include  lib  share
    +
    + +

    tmp_install looks like a standard install directory containing binaries, header files and library files. For example, tmp_install/include/TH contains all the TH headers, and tmp_install/lib/ contains the libTH.so.1 file.

    + +

    So why have this directory? It is used to compile the libraries that depend on each other. For example, the THC library depends on the TH library and its headers. This is referenced in the build shell script as arguments to the cmake command:

    + +
    # install_dir is tmp_install
    +cmake ...
    +	-DTH_INCLUDE_PATH="$INSTALL_DIR/include" \
    +	-DTH_LIB_PATH="$INSTALL_DIR/lib" \
    +
    + +

    And indeed if we look at the THC library we built:

    + +
    (p3) killeent@devgpu047:lib (master)$ ldd libTHC.so.1
    +	...
    +	libTH.so.1 => /home/killeent/github/pytorch/torch/lib/tmp_install/lib/./libTH.so.1 (0x00007f84478b7000)
    +
    + +

    The way the build_all.sh specifies the include and library paths is a little messy but this is representative of the overall idea. Finally, at the end of the script:

    + +
    # If all the builds succeed we copy the libraries, headers,
    +# binaries to torch/lib
    +cp $INSTALL_DIR/lib/* .
    +cp THNN/generic/THNN.h .
    +cp THCUNN/generic/THCUNN.h .
    +cp -r $INSTALL_DIR/include .
    +cp $INSTALL_DIR/bin/* .
    +
    + +

    As we can see, at the end, we copy everything to the top-level torch/lib directory - explaining the contents we saw above. We’ll see why we do this next:

    + +

    NN Wrappers

    + +

    Briefly, let’s touch on the last part of the build_deps command: generate_nn_wrappers(). We bind into the backend libraries using PyTorch’s custom cwrap tooling, which we touched upon in a previous post. For binding TH and THC we manually write the YAML declarations for each function. However, due to the relative simplicity of the THNN and THCUNN libraries, we auto-generate both the cwrap declarations and the resulting C++ code.

    + +

    The reason we copy the THNN.h and THCUNN.h header files into torch/lib is that this is where the generate_nn_wrappers() code expects these files to be located. generate_nn_wrappers() does a few things:

    + +
      +
    1. Parses the header files, generating cwrap YAML declarations and writing them to output .cwrap files
    2. +
    3. Calls cwrap with the appropriate plugins on these .cwrap files to generate source code for each
    4. +
    5. Parses the headers a second time to generate THNN_generic.h - a library that takes THPP Tensors, PyTorch’s “generic” C++ Tensor Library, and calls into the appropriate THNN/THCUNN library function based on the dynamic type of the Tensor
    6. +
    + +

    If we take a look into torch/csrc/nn after running generate_nn_wrappers() we can see the output:

    + +
    (p3) killeent@devgpu047:nn (master)$ ls
    +THCUNN.cpp  THCUNN.cwrap  THNN.cpp  THNN.cwrap  THNN_generic.cpp  THNN_generic.cwrap  THNN_generic.h  THNN_generic.inc.h
    +
    + +

For example, the code generates cwrap declarations like:

    + +
    [[
    +  name: FloatBatchNormalization_updateOutput
    +  return: void
    +  cname: THNN_FloatBatchNormalization_updateOutput
    +  arguments:
    +    - void* state
    +    - THFloatTensor* input
    +    - THFloatTensor* output
    +    - type: THFloatTensor*
    +      name: weight
    +      nullable: True
    +    - type: THFloatTensor*
    +      name: bias
    +      nullable: True
    +    - THFloatTensor* running_mean
    +    - THFloatTensor* running_var
    +    - THFloatTensor* save_mean
    +    - THFloatTensor* save_std
    +    - bool train
    +    - double momentum
    +    - double eps
    +]]
    +
    + +

    with corresponding .cpp:

    + +
    extern "C" void THNN_FloatBatchNormalization_updateOutput(void*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, bool, double, double);
    +
    +PyObject * FloatBatchNormalization_updateOutput(PyObject *_unused, PyObject *args) {
    +	// argument checking, unpacking
    +	 PyThreadState *_save = NULL;
    +      try {
    +        Py_UNBLOCK_THREADS;
    +        THNN_FloatBatchNormalization_updateOutput(arg_state, arg_input, arg_output, arg_weight, arg_bias, arg_running_mean, arg_running_var, arg_save_mean, arg_save_std, arg_train, arg_momentum, arg_eps);
    +        Py_BLOCK_THREADS;
    +        Py_RETURN_NONE;
    +      } catch (...) {
    +        if (_save) {
    +          Py_BLOCK_THREADS;
    +        }
    +        throw;
    +      }
    +
    +    ...
    +}
    +
    + +

    In the THPP generated code, the function looks like this:

    + +
    void BatchNormalization_updateOutput(thpp::Tensor* input, thpp::Tensor* output, thpp::Tensor* weight, thpp::Tensor* bias, thpp::Tensor* running_mean, thpp::Tensor* running_var, thpp::Tensor* save_mean, thpp::Tensor* save_std, bool train, double momentum, double eps) {
+	// Call the appropriate THNN function based on tensor type, whether it's on CUDA, etc.
    +}
    +
    + +

    We will look a little more at how these source files are used later.

    + +

    “Building” the Pure Python Modules

    + +

    Now that we have built the backend libraries (the “dependencies”) we can move forward with building the actual PyTorch code. The next Setuptools command that runs is build_py, which is used to build all the “Pure” python modules in our library. These are the “packages” passed to setup.py.

    + +

    The packages are found using the Setuptools’ utility function find_packages():

    + +
    packages = find_packages(exclude=('tools.*',))
    +['torch', 'torch._thnn', 'torch.autograd', 'torch.backends', 'torch.cuda', 'torch.distributed', 'torch.legacy', 'torch.multiprocessing', 'torch.nn', 'torch.optim', 'torch.sparse', 'torch.utils', 'torch.autograd._functions', 'torch.backends.cudnn', 'torch.legacy.nn', 'torch.legacy.optim', 'torch.nn._functions', 'torch.nn.backends', 'torch.nn.modules', 'torch.nn.parallel', 'torch.nn.utils', 'torch.nn._functions.thnn', 'torch.utils.data', 'torch.utils.ffi', 'torch.utils.serialization', 'torch.utils.trainer', 'torch.utils.backcompat', 'torch.utils.trainer.plugins']
    +
    + +

As we can see, find_packages has recursively traversed the torch directory, finding all the directory paths that have an __init__.py file.

    + +

    When building with Setuptools, the tool creates a build directory in the distribution root, i.e. the same location as the setup.py file. Because PyTorch is composed of both “Pure” python modules and Extension Modules, we need to preserve information about the Operating System and Python version used when performing the build. So if we look in my build directory, we see:

    + +
    (p3) killeent@devgpu047:pytorch (master)$ ls build
    +lib.linux-x86_64-3.6  temp.linux-x86_64-3.6
    +
    + +

    This indicates that I’ve built the project on linux-x86-64 using Python 3.6. The lib directory contains the library files, while the temp directory contains files generated during the build that aren’t needed in the final installation.

    + +

    Because “Pure” python modules are just Python code, and don’t need to be “compiled”, the build_py process simply copies files from their locations as found by find_packages to the equivalent location in build/. So our build output is littered with lines like:

    + +
    copying torch/autograd/_functions/blas.py -> build/lib.linux-x86_64-3.6/torch/autograd/_functions
    +
    + +

    We also noted earlier that we could pass files and directories to the package_data keyword argument to the main setup() function, and that Setuptools would handle copying those files to the installation location. During build_py, these files are copied to the build/ directory, so we also see lines like:

    + +
    copying torch/lib/libTH.so.1 -> build/lib.linux-x86_64-3.6/torch/lib
    +...
    +copying torch/lib/include/THC/generic/THCTensor.h -> build/lib.linux-x86_64-3.6/torch/lib/include/THC/generic
    +
    + +

    Building the Extension Modules

    + +

    Finally, we need to build the Extension Modules, i.e. the PyTorch modules written in C++ using the CPython backend. This also constitutes the majority of the code logic in setup.py. Our overridden build_ext Command has some special logic before the extensions themselves are actually built:

    + +
    from tools.cwrap import cwrap
    +from tools.cwrap.plugins.THPPlugin import THPPlugin
    +from tools.cwrap.plugins.ArgcountSortPlugin import ArgcountSortPlugin
    +from tools.cwrap.plugins.AutoGPU import AutoGPU
    +from tools.cwrap.plugins.BoolOption import BoolOption
    +from tools.cwrap.plugins.KwargsPlugin import KwargsPlugin
    +from tools.cwrap.plugins.NullableArguments import NullableArguments
    +from tools.cwrap.plugins.CuDNNPlugin import CuDNNPlugin
    +from tools.cwrap.plugins.WrapDim import WrapDim
    +from tools.cwrap.plugins.AssertNDim import AssertNDim
    +from tools.cwrap.plugins.Broadcast import Broadcast
    +from tools.cwrap.plugins.ProcessorSpecificPlugin import ProcessorSpecificPlugin
    +        thp_plugin = THPPlugin()
    +        cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[
    +            ProcessorSpecificPlugin(), BoolOption(), thp_plugin,
    +            AutoGPU(condition='IS_CUDA'), ArgcountSortPlugin(), KwargsPlugin(),
    +            AssertNDim(), WrapDim(), Broadcast()
    +        ])
    +        cwrap('torch/csrc/cudnn/cuDNN.cwrap', plugins=[
    +            CuDNNPlugin(), NullableArguments()
    +        ])
    +
    + +

    Recall above that I documented that we auto-generated C++ code for calling into the THNN etc. libraries. Here is where we bind TH, THC and CuDNN. We take the YAML declarations in TensorMethods.cwrap, and use them to generate output C++ source files that contain implementations that work within PyTorch’s C++ Ecosystem. For example, a simple declaration like zero_:

    + +
    [[
    +  name: zero_
    +  cname: zero
    +  return: self
    +  arguments:
    +    - THTensor* self
    +]]
    +
    + +

    Generates code like:

    + +
     PyObject * THPTensor_(zero_)(PyObject *self, PyObject *args, PyObject *kwargs) {
    +	...
    +	THTensor_(zero)(LIBRARY_STATE arg_self);
    +	...
    +}
    +
    + +

In the previous post we documented how these functions are tied to specific Tensor types, so I won’t expand on that here. For the build process, it’s enough to know that these C++ files are generated prior to the extension being built, because these source files are used during Extension compilation.

    + +

    Specifying the Extensions

    + +

Unlike pure modules, it’s not enough just to list modules or packages and expect Setuptools to go out and find the right files; you have to specify the extension name, source file(s), and any compile/link requirements (include directories, libraries to link with, etc.).

    + +

The bulk (~200 LOC at the time of this writing) of setup.py goes into specifying how to build these Extensions. Here, some of the choices we make in build_all.sh begin to make sense. For example, we saw that our build script specified a tmp_install directory where we installed our backend libraries. In our setup.py code, we reference this directory when adding to the list of directories containing header files to include:

    + +
    # tmp_install_path is torch/lib/tmp_install
    +include_dirs += [
    +    cwd,
    +    os.path.join(cwd, "torch", "csrc"),
    +    tmp_install_path + "/include",
    +    tmp_install_path + "/include/TH",
    +    tmp_install_path + "/include/THPP",
    +    tmp_install_path + "/include/THNN",
    +
    + +

Similarly, we copied the shared object libraries to torch/lib at the end of the build_all.sh script. We reference these locations directly in our setup.py code when identifying libraries that we may link against:

    + +
    # lib_path is torch/lib
    +TH_LIB = os.path.join(lib_path, 'libTH.so.1')
    +THS_LIB = os.path.join(lib_path, 'libTHS.so.1')
    +THC_LIB = os.path.join(lib_path, 'libTHC.so.1')
    +THCS_LIB = os.path.join(lib_path, 'libTHCS.so.1')
    +THNN_LIB = os.path.join(lib_path, 'libTHNN.so.1')
    +# ...
    +
    + +

    Let’s consider how we build the main torch._C Extension Module:

    + +
    C = Extension("torch._C",
    +              libraries=main_libraries,
    +              sources=main_sources,
    +              language='c++',
    +              extra_compile_args=main_compile_args + extra_compile_args,
    +              include_dirs=include_dirs,
    +              library_dirs=library_dirs,
    +              extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('lib')],
    +              )
    +
    + +
      +
    • The main libraries are all the libraries we link against. This includes things like shm, PyTorch’s shared memory management library, and also system libraries like cudart and cudnn. Note that the TH libraries are not listed here
    • +
    • The main sources are the C++ files that make up the C++ backend for PyTorch
    • +
    • The compile args are various flags that configure compilation. For example, we might want to add debug flags when compiling in debug mode
    • +
    • The include dirs are the paths to all the directories containing header files. This is also another example where the build_all.sh script is important - for example, we look for the TH header files in torch/lib/tmp_install/include/TH - which is the install location we specified with our CMake configuration
    • +
    • The library dirs are directories to search for shared libraries at link time. For example, we include torch/lib - the location we copied our .so files to at the end of build_all.sh, but also the paths to the CUDA and CuDNN directories
    • +
• The link arguments are used when linking object files together to create the extension. In PyTorch, this includes standard options like deciding to link libstdc++ statically. However, there is one key component: this is where we link the backend TH libraries. Note that we have lines like:
    • +
    + +
    # The explicit paths to .so files we described above
    +main_link_args = [TH_LIB, THS_LIB, THPP_LIB, THNN_LIB]
    +
    + +

    You might be wondering why we do this as opposed to adding these libraries to the list we pass to the libraries keyword argument. After all, that is a list of libraries to link against. The issue is that Lua Torch installs often set the LD_LIBRARY_PATH variable, and thus we could mistakenly link against a TH library built for Lua Torch, instead of the library we have built locally. This would be problematic because the code could be out of date, and also there are various configuration options for Lua Torch’s TH that would not play nicely with PyTorch.

    + +

    As such, we manually specify the paths to the shared libraries we generated directly to the linker.

    + +

    There are other extensions needed to power PyTorch and they are built in a similar way. The Setuptools library invokes the C++ compiler and linker to build all of these extensions. If the builds succeed, we have successfully built the PyTorch library and we can move on to installation.

    + +

    Installation

    + +

    After building has finished, installation is quite simple. We simply have to copy everything from our build/lib.linux-x86_64-3.6 directory to the appropriate installation directory. Recall that we noted above that this directory is the site_packages directory associated with our Python binary. As a result, we see lines like:

    + +
    running install_lib
    +creating /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch
    +copying build/lib.linux-x86_64-3.6/torch/_C.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch
    +copying build/lib.linux-x86_64-3.6/torch/_dl.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch
    +creating /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn
    +copying build/lib.linux-x86_64-3.6/torch/_thnn/_THNN.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn
    +copying build/lib.linux-x86_64-3.6/torch/_thnn/_THCUNN.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn
    +
    + +

Finally, let’s power up the Python interpreter. When the Python interpreter executes an import statement, it searches for Python code and extension modules along a search path. A default value for the path is configured into the Python binary when the interpreter is built.

    + +
    # note we are now in my home directory
    +(p3) killeent@devgpu047:~$ python
    +Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23)
    +[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux
    +Type "help", "copyright", "credits" or "license" for more information.
    +>>> import sys
    +>>> sys.path
    +['', '/home/killeent/local/miniconda2/envs/p3/lib/python36.zip', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/lib-dynload', '/home/killeent/.local/lib/python3.6/site-packages', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages', '/home/killeent/github/pytorch', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/setuptools-27.2.0-py3.6.egg']
    +
    + +

As we can see, the site-packages directory we copied our PyTorch installation to is part of the search path. Now let’s load the torch module and see its location:

    + +
    >>> import torch
    +>>> import inspect
    +>>> inspect.getfile(torch)
    +'/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/__init__.py'
    +
    + +

    As we can see, we have loaded the module from site_packages as expected - and our build and installation is successful!

    + +

    Note: Python prepends the empty string to sys.path to represent the current working directory - making it the first place we search for a module. So if we run Python from the pytorch directory, we would accidentally load the local version of PyTorch rather than our installed version. This is something to watch out for.
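A quick way to check which copy you actually imported (assuming a standard interactive session):

import sys
print(sys.path[0])      # '' -> the current working directory is searched first
import torch
print(torch.__file__)   # a site-packages path vs. a path inside the local checkout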

    + +

    Addendum - Developer Efficiency, 3rd Party Libraries, Things I Didn’t Cover

    + +

The entire installation loop for PyTorch can be quite time-consuming. On my devserver, it takes around 5 minutes for an installation from source. Often, when developing PyTorch, we only want to work on a subset of the entire project, and re-build only that subset in order to test changes. Fortunately, our build system enables this.

    + +

    Setuptools Develop Mode

    + +

The main tool that supports this is Setuptools’ develop command. The documentation states that:

    + +
    +

    This command allows you to deploy your project’s source for use in one or more “staging areas” where it will be available for importing. This deployment is done in such a way that changes to the project source are immediately available in the staging area(s), without needing to run a build or install step after each change.

    +
    + +

    But how does it work? Suppose we run python setup.py build develop in the PyTorch directory. The build command is run, building our dependencies (TH, THPP, etc.) and the extension libraries. However, if we look inside site-packages:

    + +
    (p3) killeent@devgpu047:site-packages$ ls -la torch*
    +-rw-r--r--. 1 killeent users 31 Jun 27 08:02 torch.egg-link
    +
    + +

    Looking at the contents of the torch.egg-link file, it simply references the PyTorch directory:

    + +
    (p3) killeent@devgpu047:site-packages$ cat torch.egg-link
    +/home/killeent/github/pytorch
    +
    + +

    If we navigate back to the PyTorch directory, we see there is a new directory torch.egg-info:

    + +
    (p3) killeent@devgpu047:pytorch (master)$ ls -la torch.egg-info/
    +total 28
    +drwxr-xr-x.  2 killeent users  4096 Jun 27 08:09 .
    +drwxr-xr-x. 10 killeent users  4096 Jun 27 08:01 ..
    +-rw-r--r--.  1 killeent users     1 Jun 27 08:01 dependency_links.txt
    +-rw-r--r--.  1 killeent users   255 Jun 27 08:01 PKG-INFO
    +-rw-r--r--.  1 killeent users     7 Jun 27 08:01 requires.txt
    +-rw-r--r--.  1 killeent users 16080 Jun 27 08:01 SOURCES.txt
    +-rw-r--r--.  1 killeent users    12 Jun 27 08:01 top_level.txt
    +
    + +

This directory contains metadata about the PyTorch project. For example, requires.txt lists all of the dependencies for setting up PyTorch:

    + +
    (p3) killeent@devgpu047:pytorch (master)$ cat torch.egg-info/requires.txt
    +pyyaml
    +
    + +

    Without going into too much detail, develop allows us to essentially treat the PyTorch repo itself as if it were in site-packages, so we can import the module and it just works:

    + +
    (p3) killeent@devgpu047:~$ python
    +Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23)
    +[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux
    +Type "help", "copyright", "credits" or "license" for more information.
    +>>> import torch
    +>>> torch.__file__
    +'/home/killeent/github/pytorch/torch/__init__.py'
    +
    + +

    As a result, the following consequences hold:

    + +
      +
    • If we change a Python source file, the changes are automatically picked up, and we don’t have to run any commands to let the Python interpreter see this change
    • +
• If we change a C++ source file in one of the extension libraries, we can re-run the develop command, and it will re-build the extension
    • +
    + +

Thus we can develop the PyTorch codebase seamlessly, and test our changes in an easy way.

    + +

    Working on the Dependency Libraries

    + +

    If we are working on the dependencies (e.g. TH, THPP, etc.) we can re-build our changes more quickly by simply running the build_deps command directly. This will automatically call into build_all.sh to re-build our libraries, and copy the generated libraries appropriately. If we are using Setuptools develop mode, we will be using the local extension library built in the PyTorch directory. Because we have specified the paths to the shared libraries when compiling our Extension Libraries, the changes will be picked up:

    + +
    # we are using the local extension
    +(p3) killeent@devgpu047:~$ python
    +Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23)
    +[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux
    +Type "help", "copyright", "credits" or "license" for more information.
    +>>> import torch
    +>>> torch._C.__file__
    +'/home/killeent/github/pytorch/torch/_C.cpython-36m-x86_64-linux-gnu.so'
    +
    +# it references the local shared object library we just re-built
    +(p3) killeent@devgpu047:~$ ldd /home/killeent/github/pytorch/torch/_C.cpython-36m-x86_64-linux-gnu.so
    +# ...
    +libTH.so.1 => /home/killeent/github/pytorch/torch/lib/libTH.so.1 (0x00007f543d0e2000)
    +# ...
    +
    + +

    As such, we can test any changes here without having to do a full rebuild.

    + +

    3rd Party Libraries

    + +

    PyTorch has dependencies on some 3rd party libraries. The usual mechanism for using these libraries is to install them via Anaconda, and then link against them. For example, we can use the mkl library with PyTorch by doing:

    + +
    # installed to miniconda2/envs/p3/lib/libmkl_intel_lp64.so
    +conda install mkl
    +
    + +

    And then as long as we have the path to this lib directory on our $CMAKE_PREFIX_PATH, it will successfully find this library when compiling:

    + +
    # in the site-packages dir
    +(p3) killeent@devgpu047:torch$ ldd _C.cpython-36m-x86_64-linux-gnu.so
    +# ...
    +libmkl_intel_lp64.so => /home/killeent/local/miniconda2/envs/p3/lib/libmkl_intel_lp64.so (0x00007f3450bba000)
    +# ...
    +
    + +

    Not Covered, But Also Relevant

    + +
      +
    • How ccache is used to speed up build times
    • +
    • How PyTorch’s top-level __init__.py file handles the initial module import and pulling together all the various modules and extension libraries
    • +
    • The CMake build system, how the backend libraries are configured and built with CMake
    • +
    + +

    January 19, 2018

    +

    + PyTorch, a year in.... +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + The PyTorch Team + +

    +

    Today marks 1 year since PyTorch was released publicly. It’s been a wild ride — our quest to build a flexible deep learning research platform. Over the last year, we’ve seen an amazing community of people using, contributing to and evangelizing PyTorch — thank you for the love.

    + +

    Looking back, we wanted to summarize PyTorch over the past year: the progress, the news and highlights from the community.

    + +

    Community

    + +

    We’ve been blessed with a strong organic community of researchers and engineers who fell in love with PyTorch. The core team has engineers and researchers from multiple countries, companies and universities, and we couldn’t have made PyTorch what it is without each contribution.

    + +

    Research papers, packages and Github

    + +

    Within days of release, users from the community started to implement their favorite research papers in PyTorch and release the code on Github. Open-source code is a primary and essential tool for researchers today.

    + +

    Folks came together to create torchtext, torchvision and torchaudio packages to help facilitate and democratize research in different domains.

    + +

    The first community package based on PyTorch came from Brandon Amos, titled Block, and helped with easier manipulation of block matrices. The Locus Lab at CMU subsequently went on to publish PyTorch packages and implementations for most of their research. The first research paper code came from Sergey Zagoruyko titled Paying more attention to attention.

    + +

Jun-Yan Zhu, Taesung Park, Phillip Isola, Alyosha Efros and team from U.C. Berkeley released the hugely popular Cycle-GAN and pix2pix, which do image-to-image transforms.

    + +
    + +
    + +

    The researchers at HarvardNLP and Systran started developing and improving OpenNMT in PyTorch, seeded by initial reimplementation of the [Lua]Torch code from Adam Lerer at Facebook.

    + +

    The MagicPony team at Twitter contributed implementations of their Super-resolution work early on into PyTorch’s examples.

    + +

    Salesforce Research released several packages, including their highlight release of PyTorch-QRNN, a type of RNN that is 2x to 17x faster than standard LSTMs optimized by CuDNN. James Bradbury and team form one of the most active and engaging forces in the PyTorch community.

    + + + + +

Researchers from Uber, Northeastern and Stanford came together to form an active probabilistic programming community around their packages Pyro and ProbTorch. They are actively developing the torch.distributions core package. This community is so active and fast-moving that we had our first pytorch-probabilistic-programming meetup at NIPS 2017 with Fritz Obermeyer, Noah Goodman, Jan-Willem van de Meent, Brooks Paige, Dustin Tran and 22 additional attendees discussing how to make the world Bayesian.

    + +
    + +
    + +

    NVIDIA Researchers released three high-quality repositories that implemented pix2pix-HD, Sentiment Neuron and FlowNet2 papers. Their analysis of scalability of different Data Parallel models in PyTorch was helpful to the community.

    + +
    + +
    + +

    The Allen Institute for AI released AllenNLP which includes several state-of-the-art models in NLP — reference implementations and easy to use web demos for standard NLP tasks.

    + +
    + +
    + +

    We also had our first Kaggle winning team grt123 in July. They won the DataScience Bowl 2017 on Lung Cancer detection and subsequently released their PyTorch implementations.

    + +

    On the visualization front, Tzu-Wei Huang implemented a TensorBoard-PyTorch plugin and Facebook AI Research released PyTorch compatibility for their visdom visualization package.

    + +
    + + +
    + +

    Lastly, Facebook AI Research released several projects such as ParlAI, fairseq-py, VoiceLoop and FaderNetworks that implemented cutting-edge models and interfaced datasets in multiple domains.

    + +

There are countless good projects that we haven’t highlighted for lack of space; you can find a curated list here.

    + +

    We would also like to give a huge shout-out to folks who actively help others out on the Forums, especially ptrblck, jpeg729, QuantScientist, albanD, Thomas Viehmann and chenyuntc. You are providing an invaluable service, thank you so much!

    + +

    Metrics

    + +

    In terms of sheer numbers,

    + +
    • 87,769 lines of Python code on GitHub that import torch
    • 3,983 repositories on GitHub that mention PyTorch in their name or description
    • More than half a million downloads of PyTorch binaries (651,916 to be precise)
    • 5,400 users wrote 21,500 posts discussing 5,200 topics on our forums, discuss.pytorch.org (http://discuss.pytorch.org/)
    • 131 mentions of PyTorch on Reddit’s /r/machinelearning since the day of release. In the same period, TensorFlow was mentioned 255 times.
    + +

    Research Metrics

    + +

    PyTorch is a research-focused framework. So one of the metrics of interest is to see the usage of PyTorch in machine learning research papers.

    + +
    • In the recent ICLR 2018 conference submissions, PyTorch was mentioned in 87 papers, compared to TensorFlow at 228 papers, Keras at 42 papers, and Theano and Matlab at 32 papers.
    • Monthly arxiv.org mentions for frameworks had PyTorch at 72 mentions, with TensorFlow at 273 mentions, Keras at 100 mentions, Caffe at 94 mentions and Theano at 53 mentions.
    + +

    Courses, Tutorials and Books

    + +

    When we released PyTorch, we had good API documentation, but our tutorials were limited to a few ipython notebooks — helpful, but not good enough.

    + +

    Sasank Chilamkurthy took it upon himself to revamp the tutorials into the beautiful website that it is today.

    + +
    + +
    + +

    Sean Robertson and Justin Johnson wrote great new tutorials — on NLP, and on learning by example. Yunjey Choi wrote a beautiful tutorial where most models were implemented in 30 lines or less. Each new tutorial helped users find their way faster, with different approaches to learning.

    + +

    Goku Mohandas and Delip Rao switched the code content of their book-in-progress to use PyTorch.

    + +

    We’ve seen quite a few university machine learning courses being taught with PyTorch as the primary tool, such as Harvard’s CS287. Taking it one step further and democratizing learning, we had three online courses pop up that teach using PyTorch.

    + + + +

    Engineering

    + +

    Over the last year we implemented multiple features, improved performance across the board, and fixed lots of bugs. A full list of the work we’ve done can be found in our release notes. Here are highlights from our work over the last year:

    + +

    Higher-order gradients

    + +

    With the release of several papers that implement penalties of gradients and with ongoing research in 2nd order gradient methods, this was an essential and sought-after feature. In August, we implemented a generalized interface that can take n-th order derivatives and increased the coverage of functions that support higher-order gradients over time, such that at the moment of writing almost all ops support this.
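    As a quick illustration of the interface (a minimal sketch with placeholder tensors, not taken from any particular paper), a gradient-norm penalty can be written by asking autograd to keep the graph of the first derivative:

    import torch

    x = torch.randn(8, 3, requires_grad=True)
    y = (x ** 2).sum()

    # create_graph=True keeps the graph of this gradient so it can be differentiated again
    grad_x, = torch.autograd.grad(y, x, create_graph=True)

    # a simple gradient-norm penalty; backward() through it computes second-order terms
    penalty = ((grad_x.norm(2, dim=1) - 1) ** 2).mean()
    penalty.backward()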

    + +

    Distributed PyTorch

    + +

    In August, we released a small distributed package that followed the highly popular MPI-collective approach. The package has multiple backends such as TCP, MPI, Gloo and NCCL2 to support various types of CPU/GPU collective operations and use-cases, and integrates distributed technologies such as Infiniband and RoCE. Distributed is hard, and we had bugs in the initial iteration. Over subsequent releases, we made the package more stable and improved performance.
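    A minimal sketch of the collective-style API looks like the following; it assumes the usual rendezvous environment variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE) are set by a launcher:

    import torch
    import torch.distributed as dist

    # available backends (Gloo, MPI, NCCL, ...) depend on the build and hardware
    dist.init_process_group(backend="gloo")

    t = torch.ones(4)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)   # t now holds the element-wise sum across ranks
    print(dist.get_rank(), t)

    dist.destroy_process_group()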

    + +

    Closer to NumPy

    + +

    One of the biggest demands from users was for the NumPy features they were familiar with. Features such as broadcasting and advanced indexing are convenient and save users a lot of verbosity. We implemented these features and started to align our API more closely with NumPy's. Over time, we expect to get closer and closer to NumPy’s API where appropriate.
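    For example, broadcasting and advanced indexing now work the way NumPy users expect (a small sketch):

    import torch

    a = torch.randn(4, 1, 3)
    b = torch.randn(5, 3)
    c = a + b                       # broadcasting: result has shape (4, 5, 3)

    x = torch.arange(12).reshape(3, 4)
    rows = torch.tensor([0, 2])
    cols = torch.tensor([1, 3])
    print(x[rows, cols])            # advanced indexing: elements (0, 1) and (2, 3)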

    + +

    Sparse Tensors

    + +

    In March, we released a small package supporting sparse Tensors, and in May we released CUDA support for the sparse package. The package is small and limited in functionality, and is used for implementing sparse embeddings and commonly used sparse paradigms in deep learning. This package is still small in scope and there’s demand to expand it — if you are interested in working on expanding the sparse package, reach out to us on our Discussion Boards.
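    A small sketch of the sparse API in its current form (the constructor name has evolved since the original 0.x releases):

    import torch

    # COO layout: a 3x3 tensor with two non-zero entries
    indices = torch.tensor([[0, 2],    # row indices
                            [1, 0]])   # column indices
    values = torch.tensor([3.0, 4.0])
    s = torch.sparse_coo_tensor(indices, values, (3, 3))

    dense = torch.randn(3, 3)
    out = torch.sparse.mm(s, dense)    # sparse x dense matrix multiply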

    + +

    Performance

    + +

    Performance is always an ongoing battle, especially for PyTorch, which is a dynamic framework that wants to maximize flexibility. Over the last year, we’ve improved performance across the board, from our core Tensor library to the neural network operators, writing faster micro-optimized code throughout.

    + +
    • We’ve added specialized AVX and AVX2 intrinsics for Tensor operations
    • Wrote faster GPU kernels for frequent workloads like concatenation and Softmax (among many other things)
    • Rewrote the code for several neural network operators (too many to list), most notably nn.Embedding and group convolutions.
    + +

    Reducing framework overhead by 10x across the board

    + +

    Since PyTorch is a dynamic graph framework, we create a new graph on the fly at every iteration of a training loop. Hence, the framework overhead has to be low, or the workload has to be large enough that the framework overhead is hidden. In August, the authors of DyNet (Graham Neubig and team) showcased that DyNet is much faster than PyTorch on small NLP models. This was an interesting challenge; we didn’t realize that models of those sizes were being trained. In a multi-month (and ongoing) effort, we embarked upon a significant rewrite of PyTorch internals that reduced the framework overhead from more than 10 microseconds per operator execution to as little as 1 microsecond.

    + +

    ATen

    + +

    As we embarked upon a redesign of the PyTorch internals, we built the ATen C++11 library that now powers all of the PyTorch backend. ATen has an API that mirrors PyTorch’s Python API, which makes it a convenient C++ library for Tensor computation. ATen can be built and used independently of PyTorch.

    + +

    Exporting models to production — ONNX Support and the JIT compiler

    + +

    One of the common requests we’ve received was to export PyTorch models to another framework. Users engaged in a rapid research cycle in PyTorch and when they were done, they wanted to ship it to larger projects with C++ only requirements.

    + +

    With this in mind, we built a tracer for PyTorch, which can export PyTorch models into an intermediate representation. The subsequent trace can either be used to run the current PyTorch model more efficiently (by running optimization passes on it), or be converted to the ONNX format and shipped to other frameworks such as Caffe2, MXNet and TensorFlow, or directly to hardware-accelerated libraries like CoreML or TensorRT. Over the next year, you will hear more about the JIT compiler for performance improvements.
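    A minimal sketch of both paths, using a torchvision model purely as a placeholder:

    import torch
    import torchvision

    model = torchvision.models.resnet18().eval()
    example = torch.randn(1, 3, 224, 224)

    # trace the model into an intermediate representation that can run without Python
    traced = torch.jit.trace(model, example)
    traced.save("resnet18_traced.pt")

    # or export the same model to the ONNX format for other frameworks and runtimes
    torch.onnx.export(model, example, "resnet18.onnx")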

    + +

    Users being funny :)

    + +

    Our users express their support in funny ways that made us laugh; thanks for this :)

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerate-pytorch-models/index.html b/blog/accelerate-pytorch-models/index.html new file mode 100644 index 000000000000..478015b029f5 --- /dev/null +++ b/blog/accelerate-pytorch-models/index.html @@ -0,0 +1,877 @@ + + + + + + + + + + + + + Accelerate PyTorch Models Using Quantization Techniques with Intel Extension for PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    by Intel

    +

    Overview

    + +

    PyTorch is a Python-based framework for developing deep learning models. It is one of the most popular industry-standard AI frameworks and is used for a wide variety of computer vision and natural language processing applications. PyTorch was developed by Meta and is now part of The Linux Foundation. Intel works with the open source PyTorch project to optimize the PyTorch framework for Intel® hardware. The newest optimizations and features are first released in Intel® Extension for PyTorch before upstreaming them into PyTorch. The Intel extension provides quantization features to deliver good accuracy results for large deep learning models.

    + +

    This article introduces quantization, types of quantization, and demonstrates a code sample on how to accelerate PyTorch-based models by applying Intel Extension for PyTorch quantization.

    + +

    What Is Quantization?

    + +

    Quantization is a systematic reduction of the precision of all or several layers within the model. This means a higher-precision type (like single precision floating-point (FP32) that is mostly used in deep learning) is converted into a lower-precision type, such as FP16 (16 bits) or int8 (8 bits).
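    As a schematic illustration of the idea (not the exact scheme the extension uses), an 8-bit affine quantization maps a floating-point value x to an integer q via q = round(x / scale) + zero_point:

    import torch

    x = torch.randn(4) * 10                      # FP32 values

    scale = (x.max() - x.min()) / 255.0          # spread the observed range over 256 levels
    zero_point = int((-x.min() / scale).round())

    q = torch.clamp((x / scale).round() + zero_point, 0, 255).to(torch.uint8)
    x_hat = (q.float() - zero_point) * scale     # dequantized approximation of x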

    + +

    This helps to achieve:

    + +
    • Lower memory bandwidth
    • Lower storage
    • Higher performance with minimum to zero accuracy loss
    + +

    Quantization is especially important with large models such as those based on the Transformer architecture (like BERT or GPT).

    + +

    There are two types of quantization:

    + +
    • Static: This quantizes the weights and activations of the model, and is used when memory bandwidth and compute savings are important.
    • Dynamic: The weights are quantized ahead of time, but the activations are dynamically quantized during inference.
    + +

    How to Perform Static Quantization and Dynamic Quantization

    + +

    The Intel extension extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware.

    + +

    Installation Instructions for Intel Extension for PyTorch

    + +

    The extension can be loaded as a Python module or linked as a C++ library. Python users can enable it dynamically by importing intel_extension_for_pytorch. The extension provides built-in quantization to deliver good statistical accuracy for most popular deep learning workloads including convolutional neural networks (CNN), natural language processing (NLP), and recommendation models. The quantization functionality in the Intel extension currently supports post-training quantization.
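    A minimal sketch of loading the extension as a Python module (the ipex.optimize call here is the extension's commonly used inference optimization entry point; the quantization flow is covered in the following sections):

    import torch
    import torchvision
    import intel_extension_for_pytorch as ipex   # importing the module enables the extension

    model = torchvision.models.resnet50().eval()
    model = ipex.optimize(model)                 # apply the extension's operator optimizations

    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))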

    + +

    To quantize the existing FP32 model to an int8 model using static quantization:

    + +
    1. Prepare the quantization configuration. For the default static quantization configuration, use ipex.quantization.default_static_qconfig.
    2. Prepare the model for calibration using the ipex.quantization.prepare method.
    3. Perform calibration against the dataset. This calibration is specific to static quantization, as it needs a representative dataset to determine the optimal quantization parameters, so the user should provide data to the model in batches to calibrate it.
    4. Convert the model from FP32 to int8 using the ipex.quantization.convert method. This function converts the FP32 model to int8 based on the applied calibration and configuration.
    + +
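    Put together, the four steps above look roughly like the sketch below. The ResNet-50 model and the random calibration data are just stand-ins, and the import path for prepare/convert is an assumption based on the bare calls used later in this article:

    import torch
    import torchvision
    import intel_extension_for_pytorch as ipex
    from intel_extension_for_pytorch.quantization import prepare, convert

    model_fp32 = torchvision.models.resnet50().eval()
    qconfig = ipex.quantization.default_static_qconfig
    example_inputs = torch.randn(1, 3, 224, 224)

    prepared = prepare(model_fp32, qconfig, example_inputs=example_inputs, inplace=False)

    # calibration pass over a representative dataset (random data here only as a stand-in)
    for _ in range(10):
        prepared(torch.randn(8, 3, 224, 224))

    model_int8 = convert(prepared)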

    To quantize the existing FP32 model to an int8 model using dynamic quantization, which is similar to static quantization:

    + +
    1. Prepare the quantization configuration. For the default dynamic quantization configuration, use ipex.quantization.default_dynamic_qconfig.
    2. Prepare the FP32 model using the ipex.quantization.prepare method. Provide the parameters, such as the FP32 model to quantize, the prepared configuration, example inputs, and other information.
    3. Convert the model from FP32 to int8 using the ipex.quantization.convert method. The input model is the model prepared in Step 2.
    + +

    Code Sample

    + +

    Dataset

    + +

    For static quantization, the model is calibrated with the CIFAR-10 dataset. CIFAR-10 is a subset of the 80 million tiny images dataset collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton.

    + +

    This dataset contains 60,000 images in 10 classes (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and truck). Every class has exactly 6,000 images. All images are 32 x 32 pixels and are colored. Also, the classes are completely mutually exclusive, which means there is no overlap between classes.

    + +

    Implementation

    + +

    The code sample demonstrates how to quantize (using static and dynamic quantization) a ResNet*-50 model using Intel Extension for PyTorch. The following steps are implemented in the code sample:

    + +

    Download and Prepare the Dataset

    + +

    Here, we use the CIFAR-10 dataset available in torchvision.

    + +
    1. To make the data fit the model:
    • Transform the data.
    • Change the size of the images from 32 x 32 pixels to 224 x 224 pixels.
    • Convert them to tensors.
    • Normalize them.

    2. Prepare the transformations of the dataset as shown:
    + +
    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize((224, 224)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    + +
    3. Initialize the dataset.
    test_dataset = torchvision.datasets.CIFAR10(root=DATA, train=False, transform=transform, download=True)
    + +

    Prepare the Data Loader

    + +

    To load a dataset for static quantization calibration in specific size batches, create the loader as shown:

    + +
    calibration_data_loader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=128
    )
    + +

    Create the Model

    + +

    Use the pretrained ResNet-50 model available in the Torchvision library with default weights. The prepared model is FP32.

    + +
    model_fp32 = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
    +
    + +

    Apply Static Quantization

    + +

    Create a staticQuantize function that implements the steps described previously.

    + +
    1. To perform static quantization, we need:
    • The FP32 model loaded earlier
    • Example data
    • The calibration dataset

    2. Prepare the quantization configuration:
    + +
    qconfig_static = ipex.quantization.default_static_qconfig
    + +

    In this code sample, we are using the default quantization configuration, but you can also define your own.

    + +
    3. Prepare the model using the declared configuration:
    + +
    prepared_model_static = prepare(model_fp32,
                                    qconfig_static,
                                    example_inputs=data,
                                    inplace=False)
    + +
    4. Calibrate the model with the calibration dataset. Feed the model with successive batches of data from the dataset.
    + +
    for batch_idx, (data, target) in enumerate(calibration_data_loader):
        prepared_model_static(data)
        if batch_idx % 10 == 0:
            print("Batch %d/%d complete, continue ..." % (batch_idx + 1, len(calibration_data_loader)))
    + +
    5. Convert the model.
    + +
    converted_model_static = convert(prepared_model_static)
    +
    + +

    Apply Dynamic Quantization

    + +

    Create the dynamicQuantize function similar to the staticQuantize function.

    + +
    1. To perform dynamic quantization, we only need:
    • The FP32 model loaded earlier
    • Example data

    2. Prepare the quantization configuration:
    + +
    qconfig_dynamic = ipex.quantization.default_dynamic_qconfig
    +
    + +
    3. Prepare the model.
    + +
    prepared_model_dynamic = prepare(model_fp32,
                                     qconfig_dynamic,
                                     example_inputs=data,
                                     inplace=False)
    + +
    4. Convert the model from FP32 to int8.
    + +
    converted_model_dynamic = convert(prepared_model_dynamic)
    +
    + +

    In this way, two functions are created to take advantage of the optimizations that quantization offers:

    + +
    • dynamicQuantize for dynamic quantization of models
    • staticQuantize for static quantization of models
    + +

    Next Steps

    + +

    Get started with Intel Extension for PyTorch quantization today and use it to achieve better accuracy results for deep learning workloads. Additionally, Intel® Neural Compressor provides quantization to improve the speed of inference.

    + +

    Check out and incorporate Intel’s other AI and machine learning framework optimizations and end-to-end portfolio of tools into your AI workflow.

    + +

    Learn about the unified, open, standards-based oneAPI programming model that forms the foundation of Intel’s AI Software Portfolio to help you prepare, build, deploy, and scale your AI solutions.

    + +

    For more details about the 4th gen Intel® Xeon® Scalable processors, visit the Intel® AI platform overview where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs.

    + +

    Additional Resources

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerated-cpu-inference/index.html b/blog/accelerated-cpu-inference/index.html new file mode 100644 index 000000000000..e395bbf19d5f --- /dev/null +++ b/blog/accelerated-cpu-inference/index.html @@ -0,0 +1,1097 @@ + + + + + + + + + + + + + Accelerated CPU Inference with PyTorch Inductor using torch.compile | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    by Intel

    +

    Story at a Glance

    + +
    • Although the PyTorch* Inductor C++/OpenMP* backend has enabled users to take advantage of modern CPU architectures and parallel processing, it has lacked optimizations, resulting in the backend performing worse than eager mode in terms of end-to-end performance.
    • Intel optimized the Inductor backend using a hybrid strategy that classified operations into two categories: Conv/GEMM and non-Conv/GEMM element-wise and reduction ops.
    • For popular deep learning models, this hybrid strategy demonstrates promising performance improvements compared to eager mode and improves the C++/OpenMP backend’s efficiency and reliability for PyTorch models.
    + +
    + +

    Inductor Backend Challenges

    + +

    The PyTorch Inductor C++/OpenMP backend enables users to take advantage of modern CPU architectures and parallel processing to accelerate computations.
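    Since Inductor is the default torch.compile backend, invoking it on CPU is a one-line change; a minimal sketch with a placeholder model:

    import torch
    import torchvision

    model = torchvision.models.resnet50().eval()
    x = torch.randn(16, 3, 224, 224)

    compiled = torch.compile(model)   # TorchInductor emits C++/OpenMP kernels on CPU

    with torch.no_grad():
        out = compiled(x)             # first call compiles; subsequent calls reuse the kernels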

    + +

    However, during the early stages of its development, the backend lacked some optimizations, which prevented it from fully utilizing the CPU computation capabilities. As a result, for most models the C++/OpenMP backend performed worse than eager mode in terms of end-to-end performance, with 45% of TorchBench, 100% of Hugging Face, and 75% of TIMM models performing worse than eager mode.

    + +

    In this post, we highlight Intel’s optimizations to the Inductor CPU backend, including the technologies and results.

    + +

    We optimized the backend by using a hybrid strategy that classified operations into two categories: Conv/GEMM and non-Conv/GEMM element-wise and reduction ops. Post-op fusion and weight prepacking using the oneDNN performance library were utilized to optimize the former, while explicit vectorization in C++ codegen was used to optimize the latter.

    + +

    This hybrid strategy demonstrated promising performance improvements compared to eager mode, particularly on popular deep learning models such as Inductor Hugging Face, Inductor TorchBench and Inductor TIMM. Overall, Intel’s optimizations improve the C++/OpenMP backend’s efficiency and reliability for PyTorch models.

    + +

    Figure 1: Performance Speedup Ratio Trend

    + +

    Performance Status of Intel Hybrid Optimizations

    + +

    Compared to eager mode with the hybrid optimizations, the C++/OpenMP backend shows promising performance improvements. We measured the performance of the three Inductor benchmark suites—TorchBench, Hugging Face, and TIMM—and the results are as follows. (Note: we publish our performance data twice per week on GitHub.)

    + +

    Overall, these optimizations help to ensure that the C++/OpenMP backend provides efficient and reliable support for PyTorch models.

    + +

    Passrate

    + +
    +----------+------------+-------------+-------------+
    | Compiler | torchbench | huggingface | timm_models |
    +----------+------------+-------------+-------------+
    | inductor | 93%, 56/60 | 96%, 44/46  | 100%, 61/61 |
    +----------+------------+-------------+-------------+
    + +

    Geometric mean speedup (Single-Socket Multi-threads)

    + +
    +----------+------------+-------------+-------------+
    | Compiler | torchbench | huggingface | timm_models |
    +----------+------------+-------------+-------------+
    | inductor |   1.39x    |    1.20x    |    1.73x    |
    +----------+------------+-------------+-------------+
    + +

    Individual Model Performance

    + +

    Figure 2: TorchBench FP32 Performance (Single-Socket Multi-threads)

    + +

    Figure 3: Hugging Face FP32 Performance (Single-Socket Multi-thread)

    + +

    Figure 4: TIMM FP32 Performance (Single-Socket Multi-threads)

    + +

    Geometric mean speedup (Single-core Single-thread)

    + +
    +----------+------------+-------------+-------------+
    | Compiler | torchbench | huggingface | timm_models |
    +----------+------------+-------------+-------------+
    | inductor |    1.29x   |    1.15x    |    1.37x    |
    +----------+------------+-------------+-------------+
    + +

    Figure 5: TorchBench FP32 Performance (Single-Socket Single-thread)

    + +

    Figure 6: Hugging Face FP32 Performance (Single-Socket Single Thread)

    + +

    Figure 7: TIMM FP32 Performance (Single-Socket Single-thread)

    + +

    Technical Deep Dive

    + +

    Now, let’s take a closer look at the two primary optimizations used in the Inductor C++/OpenMP backend:

    + +
    1. Weight prepacking and post-operation fusion via the oneDNN library
    2. Explicit vectorization in Inductor C++ codegen
    + +

    Weight Prepacking & Post-op Fusion via oneDNN

    + +

    Shorthand for the Intel® oneAPI Deep Neural Network Library, the oneDNN library provides a range of post-op fusions (i.e., fusing convolution and matmul with their consecutive operations) that can benefit popular models. The Intel® Extension for PyTorch has implemented most of these fusions and has achieved significant performance improvements. As a result, we have upstreamed all of these fusions that have been applied in Intel’s PyTorch extension to Inductor, enabling a wider range of models to benefit from these optimizations. We have defined these fusions as operators under the mkldnn namespace. This allows the Python module to invoke these mkldnn operations directly.

    + +

    Currently, the defined fused operations are as follows. You can find these defined fused operations at RegisterMkldnnOpContextClass.cpp.

    + +
    • _linear_pointwise: Fuses Linear and its post-unary element-wise operations
    • _linear_pointwise.binary: Fuses Linear and its post-binary element-wise operations
    • _convolution_pointwise: Fuses Convolution and its post-unary element-wise operations
    • _convolution_pointwise.binary: Fuses Convolution and its post-binary element-wise operations
    + +

    The detailed fusion patterns are defined in the mkldnn.py file: convolution/linear + sigmoid/hardsigmoid/tanh/hardtanh/hardswish/leaky_relu/gelu/relu/relu6/silu, and convolution/linear + add/add_/iadd/sub/sub_.

    + +

    On the Inductor side, we apply these fusions on the FX graph that has been lowered. We have defined mkldnn_fuse_fx as the entry point to apply all the fusions. The code snippet for this is as follows:

    + +
    def mkldnn_fuse_fx(gm: torch.fx.GraphModule, example_inputs):
        ...
        gm = fuse_unary(gm)
        gm = fuse_binary(gm)
        ...
        if config.cpp.weight_prepack:
            gm = pack_module(gm)
        return gm
    + +

    In the mkldnn_fuse_fx function, we apply fusion on the FX graph that hasn’t been lowered yet. To fuse convolution/linear and its consecutive elementwise operations, we invoke fuse_unary and fuse_binary as follows:

    + +
    gm = fuse_unary(gm)
    gm = fuse_binary(gm)
    + +

    In addition to the post-op fusion, we apply weight prepacking to improve the Conv/GEMM performance further:

    + +
       gm = pack_module(gm)
    +
    + +

    Weight prepacking involves rearranging the weight tensor in a blocked layout, which:

    + +
    • can improve vectorization and cache reuse compared to plain formats like NCHW or NHWC;
    • can help avoid weight reordering at runtime, which can reduce overhead and improve performance; and
    • increases memory usage as the tradeoff.
    + +

    For these reasons, we provide the config.cpp.weight_prepack flag in Inductor to give users more control over this optimization, allowing them to enable it based on their specific needs.
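    A sketch of how that flag can be toggled (assuming it is exposed at torch._inductor.config.cpp.weight_prepack, matching the config.cpp.weight_prepack name used above):

    import torch
    import torch._inductor.config as inductor_config

    # trade the prepacking speedup for lower memory usage
    inductor_config.cpp.weight_prepack = False

    model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())
    compiled = torch.compile(model)
    out = compiled(torch.randn(8, 64))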

    + +

    Explicit Vectorization in Inductor C++ Codegen

    + +

    Vectorization is a key optimization technique that can significantly improve the performance of numerical computations. By utilizing SIMD (Single Instruction, Multiple Data) instructions, vectorization enables multiple computations to be performed simultaneously on a single processor core, which can lead to significant performance improvements.

    + +

    In the Inductor C++/OpenMP backend, we use Intel® AVX2 and Intel® AVX-512 ISA (Instruction Set Architecture) options for vectorization by leveraging the aten vectorization library to facilitate the implementation. Aten vectorization supports multiple platforms, including x86 and Arm, as well as multiple data types. It can be extended to support other ISAs easily by adding more VecISA sub-classes. This allows Inductor to easily support other platforms and data types in the future.

    + +

    Due to differences in platforms, the C++/OpenMP backend of Inductor starts by detecting the CPU features to determine the vectorization bit width at the beginning of code generation. By default, if the machine supports both AVX-512 and AVX2, the backend will choose 512-bit vectorization.

    + +

    If the hardware supports vectorization, the C++/OpenMP backend first detects whether the loop body can be vectorized or not. There are primarily three scenarios in which we are not able to generate a vectorized kernel:

    + +
    1. The loop body lacks vector intrinsics support, e.g., rand and atomic_add.
    2. The loop body lacks efficient vector intrinsics support, e.g., non-contiguous load/store.
    3. Data types for which vectorization is not yet supported but is a work in progress, e.g., integer, double, half, and bfloat16.
    + +

    To address this issue, the C++/OpenMP backend uses CppVecKernelChecker to detect whether all operations in a particular loop body can be vectorized or not. In general, we classified the operations into two categories by identifying if they depend on the context.

    + +

    For most elementwise operations such as add, sub, relu, vectorization is straightforward, and their execution does not depend on context.

    + +

    However, for certain other operations, their semantics are more complex and their execution depends on context through static analysis.

    + +

    For example, let’s consider the where operation that takes in mask, true_value, and false_value while the mask value is loaded from a uint8 tensor. The fx graph could be as follows:

    + +
    graph():
        %ops : [#users=9] = placeholder[target=ops]
        %get_index : [#users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
        %load : [#users=1] = call_method[target=load](args = (%ops, arg1_1, %get_index), kwargs = {})
        %to_dtype : [#users=1] = call_method[target=to_dtype](args = (%ops, %load, torch.bool), kwargs = {})
        ...
        %where : [#users=1] = call_method[target=where](args = (%ops, %to_dtype, %to_dtype_2, %to_dtype_3), kwargs = {})
    + +

    Regarding uint8, it is a general data type and could be used for computation but is not limited to being used as Boolean for mask. Hence, we need to analyze its context statically. In particular, the CppVecKernelChecker will check whether a uint8 tensor is only used by to_dtype and to_dtype is only used by where. If yes, it could be vectorized. Otherwise, it will fall back to the scalar version. The generated code could be as follows:

    + +

    Scalar Version

    + +
    auto tmp0 = in_ptr0[i1 + (17*i0)];
    auto tmp3 = in_ptr1[i1 + (17*i0)];
    auto tmp1 = static_cast<bool>(tmp0);
    auto tmp2 = static_cast<float>(-33.0);
    auto tmp4 = tmp1 ? tmp2 : tmp3;
    tmp5 = std::max(tmp5, tmp4);
    + +

    Vectorization Version

    + +
    float g_tmp_buffer_in_ptr0[16] = {0};
    // Convert the flag to float for vectorization.
    flag_to_float(in_ptr0 + (16*i1) + (17*i0), g_tmp_buffer_in_ptr0, 16);
    auto tmp0 = at::vec::Vectorized<float>::loadu(g_tmp_buffer_in_ptr0);
    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + (16*i1) + (17*i0));
    auto tmp1 = (tmp0);
    auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(-33.0));
    auto tmp4 = decltype(tmp2)::blendv(tmp3, tmp2, tmp1);
    + +

    In addition to context analysis, the C++/OpenMP backend also incorporates several other vectorization-related optimizations. These include:

    + +
    • Tiled kernel implementation for supporting transpose load - cpp.py
    • Data type demotion based on value range - cpp.py
    • Replacement of the sleef implementation with the oneDNN/oneMKL implementation for optimizing aten vectorization - #94577, #92289, #91613
    + +

    In summary, we examined vectorization optimization in Inductor C++ backend for FP32 training and inference of 150 benchmark models with 90% of inference kernels and 71% of training kernels being vectorized.

    + +

    In terms of inference, a total of 28,185 CPP kernels were generated, with 25,579 (90%) of them being vectorized, while the remaining 10% were scalar. As for training, 103,084 kernels were generated, with 73,909 (71%) being vectorized and 29% not vectorized.

    + +

    The results indicate that the vectorization of inference kernels is quite impressive (there is still some work to be done in training kernels since we just started to work on the training). The remaining non-vectorized kernels are analyzed in different categories, highlighting the next steps to improve vectorization coverage: index-related operations, int64 support, vertical reduction, vectorization with fallback, and more.

    + +

    In addition, we also optimized the C++/OpenMP backend with other optimizations like buffer-reuse and CppWrapper.

    + +

    Future Work

    + +

    As the next step, we will continue optimizing the C++/OpenMP backend and extend it to support more data types. This includes:

    + +
    1. Improve vectorization coverage
    2. Support and optimize low-precision kernels, including BF16, FP16, and quantization
    3. Training optimization
    4. Loop tiling
    5. Autotune
    6. Further fusion optimization of Conv/GEMM kernels
    7. Explore alternative codegen paths: clang/llvm/triton
    + +

    Summary

    + +

    The Inductor C++/OpenMP backend is a flexible and efficient backend for the CPU. This blog describes the optimizations used in the C++/OpenMP backend of Inductor for inference and training of three benchmark suites – TorchBench, Hugging Face and TIMM. The primary optimizations include weight prepacking and post-operation fusion via the oneDNN library, as well as explicit vectorization in Inductor C++ codegen using AVX2 and AVX-512 instructions.

    + +

    The results show that 90% of inference kernels and 71% of training kernels are vectorized, indicating impressive vectorization for inference and room for improvement in training. In addition, we also applied other optimizations like buffer-reuse and CppWrapper. And we will continuously focus on the future work mentioned above to further improve the performance.

    + +

    Acknowledgements

    + +

    The results presented in this blog post are the culmination of a collaborative effort between the Intel PyTorch team and Meta. We would like to express our sincere gratitude to @jansel, @desertfire, and @Chillee for their invaluable contributions and unwavering support throughout the development process. Their expertise and dedication have been instrumental in achieving the optimizations and performance improvements discussed here.

    + +

    Configuration Details

    + +

    Hardware Details

    Manufacturer:     Amazon EC2
    Product Name:     c6i.16xlarge
    CPU Model:        Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
    Installed Memory: 128GB (1x128GB DDR4 3200 MT/s [Unknown])
    OS:               Ubuntu 22.04.2 LTS
    Kernel:           5.19.0-1022-aws
    Microcode:        0xd000389
    GCC:              gcc (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0
    GLIBC:            ldd (Ubuntu GLIBC 2.35-0ubuntu3.1) 2.35
    Binutils:         GNU ld (GNU Binutils for Ubuntu) 2.38
    Python:           Python 3.10.6
    OpenSSL:          OpenSSL 3.0.2 15 Mar 2022 (Library: OpenSSL 3.0.2 15 Mar 2022)
    + +

    Software Details

    SW                 Nightly commit   Main commit
    Pytorch            a977a12          0b1b063
    Torchbench         /                a0848e19
    torchaudio         0a652f5          d5b2996
    torchtext          c4ad5dd          79100a6
    torchvision        f2009ab          b78d98b
    torchdata          5cb3e6d          f2bfd3d
    dynamo_benchmarks  fea73cb          /
    + +

    Configuration

    + +
    • Intel OpenMP
    • Jemalloc - oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1
    • Single-Socket Multi-threads: #of Instances: 1; Cores/Instance: 32
    • Single-Core Single-thread: #of Instances: 1; Cores/Instance: 1
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerated-diffusers-pt-20/index.html b/blog/accelerated-diffusers-pt-20/index.html new file mode 100644 index 000000000000..d66b22162c0f --- /dev/null +++ b/blog/accelerated-diffusers-pt-20/index.html @@ -0,0 +1,741 @@ + + + + + + + + + + + + + Accelerated Diffusers with PyTorch 2.0 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    March 16, 2023

    +

    Accelerated Diffusers with PyTorch 2.0


    by Pedro Cuenca, Patrick von Platen, Suraj Patil, Sayak Paul

    +

    PyTorch 2.0 has just been released. Its flagship new feature is torch.compile(), a one-line code change that promises to automatically improve performance across codebases. We have previously checked on that promise in Hugging Face Transformers and TIMM models, and delved deep into its motivation, architecture and the road ahead.

    + +

    As important as torch.compile() is, there’s much more to PyTorch 2.0. Notably, PyTorch 2.0 incorporates several strategies to accelerate transformer blocks, and these improvements are very relevant for diffusion models too. Techniques such as FlashAttention, for example, have become very popular in the diffusion community thanks to their ability to significantly speed up Stable Diffusion and achieve larger batch sizes, and they are now part of PyTorch 2.0.

    + +

    In this post we discuss how attention layers are optimized in PyTorch 2.0 and how these optimizations are applied to the popular 🧨 Diffusers library. We finish with a benchmark that shows how the use of PyTorch 2.0 and Diffusers immediately translates to significant performance improvements across different hardware.

    + +

    Update (June 2023): a new section has been added to show dramatic performance improvements of torch.compile() with the latest version of PyTorch (2.0.1), after going through the process of fixing graph breaks in the diffusers codebase. A more detailed analysis of how to find and fix graph breaks will be published in a separate post.

    + +

    Accelerating transformer blocks

    + +

    PyTorch 2.0 includes a scaled dot-product attention function as part of torch.nn.functional. This function encompasses several implementations that can be applied depending on the inputs and the hardware in use. Before PyTorch 2.0, you had to search for third-party implementations and install separate packages in order to take advantage of memory optimized algorithms, such as FlashAttention. The available implementations are:

    +
    • FlashAttention, from the official FlashAttention project.
    • Memory-Efficient Attention, from the xFormers project.
    • A native C++ implementation suitable for non-CUDA devices or when high precision is required.
    + +

    All these methods are available by default, and PyTorch will try to select the optimal one automatically through the use of the new scaled dot-product attention (SDPA) API. You can also individually toggle them for finer-grained control, see the documentation for details.
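    For reference, a minimal sketch of calling the new API directly and restricting it to a single backend (assuming a CUDA device):

    import torch
    import torch.nn.functional as F

    q = torch.randn(2, 8, 1024, 64, device="cuda", dtype=torch.float16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    # PyTorch picks the best available backend automatically
    out = F.scaled_dot_product_attention(q, k, v)

    # fine-grained control: restrict execution to the memory-efficient backend
    with torch.backends.cuda.sdp_kernel(enable_flash=False,
                                        enable_math=False,
                                        enable_mem_efficient=True):
        out = F.scaled_dot_product_attention(q, k, v)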

    + +

    Using scaled dot-product attention in diffusers

    + +

    The incorporation of Accelerated PyTorch 2.0 Transformer attention to the Diffusers library was achieved through the use of the set_attn_processor method, which allows for pluggable attention modules to be configured. In this case, a new attention processor was created, which is enabled by default when PyTorch 2.0 is available. For clarity, this is how you could enable it manually (but it’s usually not necessary since diffusers will automatically take care of it):

    + +
    from diffusers import StableDiffusionPipeline
    from diffusers.models.cross_attention import AttnProcessor2_0

    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
    pipe.to("cuda")
    pipe.unet.set_attn_processor(AttnProcessor2_0())

    prompt = "a photo of an astronaut riding a horse on mars"
    image = pipe(prompt).images[0]
    + +

    Stable Diffusion Benchmark

    + +

    We ran a number of tests using accelerated dot-product attention from PyTorch 2.0 in Diffusers. We installed diffusers from pip and used nightly versions of PyTorch 2.0, since our tests were performed before the official release. We also used torch.set_float32_matmul_precision('high') to enable additional fast matrix multiplication algorithms.

    + +

    We compared results with the traditional attention implementation in diffusers (referred to as vanilla below) as well as with the best-performing solution in pre-2.0 PyTorch: PyTorch 1.13.1 with the xFormers package (v0.0.16) installed.

    + +

    Results were measured without compilation (i.e., no code changes at all), and also with a single call to torch.compile() to wrap the UNet module. We did not compile the image decoder because most of the time is spent in the 50 denoising iterations that run UNet evaluations.
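    Schematically, the compiled configuration only differs from the snippet earlier in this post by one line (float16 weights are used here purely as an example, and the first pipeline call includes compilation time):

    import torch
    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")

    # compile only the UNet: it dominates the 50 denoising iterations
    pipe.unet = torch.compile(pipe.unet)

    image = pipe("a photo of an astronaut riding a horse on mars").images[0]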

    + +

    Results in float32

    + +

    Diffusers Speedup vs xFormers float32

    + +

    The following figures explore performance improvement vs batch size for various representative GPUs belonging to different generations. We collected data for each combination until we reached maximum memory utilization. Vanilla attention runs out of memory earlier than xFormers or PyTorch 2.0, which explains the missing bars for larger batch sizes. Similarly, A100 (we used the 40 GB version) is capable of running batch sizes of 64, but the other GPUs could only reach 32 in our tests.

    + +

    Diffusers Inference Speedup vs Vanilla and xFormers Attention (A100, float32)

    + +

    Diffusers Inference Speedup vs Vanilla and xFormers Attention (3090, float32)

    + +

    Diffusers Inference Speedup vs Vanilla and xFormers Attention (4090, float32)

    + +

    Diffusers Inference Speedup vs Vanilla and xFormers Attention (V100, float32)

    + +

    We found very significant performance improvements over vanilla attention across the board, without even using torch.compile(). An out of the box installation of PyTorch 2.0 and diffusers yields about 50% speedup on A100 and between 35% and 50% on 4090 GPUs, depending on batch size. Performance improvements are more pronounced for modern CUDA architectures such as Ada (4090) or Ampere (A100), but they are still very significant for older architectures still heavily in use in cloud services.

    + +

    In addition to faster speeds, the accelerated transformers implementation in PyTorch 2.0 allows much larger batch sizes to be used. A single 40GB A100 GPU runs out of memory with a batch size of 10, and 24 GB high-end consumer cards such as 3090 and 4090 cannot generate 8 images at once. Using PyTorch 2.0 and diffusers we could achieve batch sizes of 48 for 3090 and 4090, and 64 for A100. This is of great significance for cloud services and applications, as they can efficiently process more images at a time.

    + +

    When compared with PyTorch 1.13.1 + xFormers, the new accelerated transformers implementation is still faster and requires no additional packages or dependencies. In this case we found moderate speedups of up to 2% on datacenter cards such as A100 or T4, but performance was great on the two last generations of consumer cards: up to 20% speed improvement on 3090 and between 10% and 45% on 4090, depending on batch size.

    + +

    When torch.compile() is used, we get an additional performance boost of (typically) 2% and 3% over the previous improvements. As compilation takes some time, this is better geared towards user-facing inference services or training. Update: improvements achieved by torch.compile() are much larger when graph breaks are minimized, see the new section for details.

    + +

    Results in float16

    + +

    Diffusers Speedup vs xFormers float16

    + +

    Diffusers Inference Speedup vs Vanilla and xFormers Attention (A100, float16)

    + +

    Diffusers Inference Speedup vs Vanilla and xFormers Attention (4090, float16)

    + +

    Diffusers Inference Speedup vs Vanilla and xFormers Attention (3090, float16)

    + +

    When we consider float16 inference, the performance improvements of the accelerated transformers implementation in PyTorch 2.0 are between 20% and 28% over standard attention, across all the GPUs we tested, except for the 4090, which belongs to the more modern Ada architecture. This GPU benefits from a dramatic performance improvement when using PyTorch 2.0 nightlies. With respect to optimized SDPA vs xFormers, results are usually on par for most GPUs, except again for the 4090. Adding torch.compile() to the mix boosts performance a few more percentage points across the board.

    + +

    Performance of torch.compile() after minimizing graph breaks

    + +

    In the previous sections we saw that using the accelerated transformers implementation of PyTorch 2.0 provides important performance improvements with respect to earlier versions of PyTorch (with or without xFormers). However, torch.compile() only contributed modest marginal improvements. With the help of the PyTorch team we discovered that the reason for those moderate improvements was that some operations in the diffusers source code were causing graph breaks, which prevented torch.compile() from taking full advantage of graph optimizations.

    + +

    After fixing the graph breaks (see these PRs for details), we measured the additional improvement of torch.compile() vs the uncompiled version of PyTorch 2, and we saw very important incremental performance gains. The following chart was obtained using a nightly version of PyTorch 2 downloaded on May 1st, 2023, and it shows improvements in the range of ~13% to 22% for most workloads. The performance gains get better for modern GPU families, achieving more than 30% for A100. There are also two outliers in the chart. First, we see a performance decrease on T4 for a batch size of 16, which imposes a huge memory pressure on that card. At the opposite end of the spectrum, we see a performance increase on A100 of more than 100% when using a batch size of only 1, which is interesting but not representative of real-world use of a GPU with such a large amount of RAM – larger batch sizes capable of serving multiple customers will usually be more interesting for service deployment on A100.

    + +

    Diffusers Speedup using torch.compile() in float16

    + +

    To stress it again, these performance gains are additional to the ones achieved by migrating to PyTorch 2 and using the accelerated transformers scaled dot-product attention implementation. We recommend using torch.compile() when deploying diffusers in production.

    + +

    Conclusions

    + +

    PyTorch 2.0 comes with multiple features to optimize the crucial components of the foundational transformer block, and they can be further improved with the use of torch.compile. These optimizations lead to significant memory and time improvements for diffusion models, and remove the need for third-party library installations.

    + +

    To take advantage of these speed and memory improvements all you have to do is upgrade to PyTorch 2.0 and use diffusers >= 0.13.0.

    + +

    For more examples and in-detail benchmark numbers, please also have a look at the Diffusers with PyTorch 2.0 docs.

    + +

    Acknowledgement

    + +

    The authors are grateful to the PyTorch team for creating such excellent software.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerated-generative-diffusion-models/index.html b/blog/accelerated-generative-diffusion-models/index.html new file mode 100644 index 000000000000..dfda1620ec09 --- /dev/null +++ b/blog/accelerated-generative-diffusion-models/index.html @@ -0,0 +1,1122 @@ + + + + + + + + + + + + + Accelerated Generative Diffusion Models with PyTorch 2 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    by Grigory Sizov, Michael Gschwind, Hamid Shojanazeri, Driss Guessous, Daniel Haziza, Christian Puhrsch

    +

    TL;DR: PyTorch 2.0 nightly offers out-of-the-box performance improvement for Generative Diffusion models by using the new torch.compile() compiler and optimized implementations of Multihead Attention integrated with PyTorch 2.

    + +

    Introduction

    + +

    A large part of the recent progress in Generative AI came from denoising diffusion models, which allow producing high quality images and videos from text prompts. This family includes Imagen, DALLE, Latent Diffusion, and others. However, all models in this family share a common drawback: generation is rather slow, due to the iterative nature of the sampling process by which the images are produced. This makes it important to optimize the code running inside the sampling loop.

    + +

    We took an open source implementation of a popular text-to-image diffusion model as a starting point and accelerated its generation using two optimizations available in PyTorch 2: compilation and fast attention implementation. Together with a few minor memory processing improvements in the code these optimizations give up to 49% inference speedup relative to the original implementation without xFormers, and 39% inference speedup relative to using the original code with xFormers (excluding the compilation time), depending on the GPU architecture and batch size. Importantly, the speedup comes without a need to install xFormers or any other extra dependencies.

    + +

    The table below shows the improvement in runtime between the original implementation with xFormers installed and our optimized version with PyTorch-integrated memory efficient attention (originally developed for and released in the xFormers library) and PyTorch compilation. The compilation time is excluded.

    + +

    Runtime improvement in % compared to original+xFormers

    + +

    See the absolute runtime numbers in section “Benchmarking setup and results summary”

    +-----------------------+--------------+--------------+--------------+
    | GPU                   | Batch size 1 | Batch size 2 | Batch size 4 |
    +-----------------------+--------------+--------------+--------------+
    | P100 (no compilation) | -3.8         | 0.44         | 5.47         |
    | T4                    | 2.12         | 10.51        | 14.2         |
    | A10                   | -2.34        | 8.99         | 10.57        |
    | V100                  | 18.63        | 6.39         | 10.43        |
    | A100                  | 38.5         | 20.33        | 12.17        |
    +-----------------------+--------------+--------------+--------------+
    + +

    One can notice the following:

    + +
    • The improvements are significant for powerful GPUs like A100 and V100. For those GPUs the improvement is most pronounced for batch size 1.
    • For less powerful GPUs we observe smaller speedups (or in two cases slight regressions). The batch size trend is reversed here: improvement is larger for larger batches.
    + +

    In the following sections we describe the applied optimizations and provide detailed benchmarking data, comparing the generation time with various optimization features on/off.

    + +

    Specifically, we benchmark 5 configurations, and the plots below compare their absolute performance for different GPUs and batch sizes. For definitions of these configurations see the section “Benchmarking setup and results summary”.

    + +

    Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 1

    + +

    Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 2

    + +

    Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 4

    + +

    Optimizations

    + +

    Here we’ll go into more detail about the optimizations introduced into the model code. These optimizations rely on features of PyTorch 2.0 which has been released recently.

    + +

    Optimized Attention

    + +

    One part of the code which we optimized is the scaled dot-product attention. Attention is known to be a heavy operation: naive implementation materializes the attention matrix, leading to time and memory complexity quadratic in sequence length. It is common for diffusion models to use attention (CrossAttention) as part of Transformer blocks in multiple parts of the U-Net. Since the U-Net runs at every sampling step, this becomes a critical point to optimize. Instead of a custom attention implementation, one can use torch.nn.MultiheadAttention, which in PyTorch 2 has an optimized attention implementation integrated into it. This optimization schematically boils down to the following pseudocode:

    + +
    class CrossAttention(nn.Module):
        def __init__(self, ...):
            # Create matrices: Q, K, V, out_proj
            ...
        def forward(self, x, context=None, mask=None):
            # Compute out = SoftMax(Q*K/sqrt(d))V
            # Return out_proj(out)
            ...
    + +

    gets replaced with

    + +
    class CrossAttention(nn.Module):
        def __init__(self, ...):
            self.mha = nn.MultiheadAttention(...)
        def forward(self, x, context):
            return self.mha(x, context, context)
    + +

    The optimized implementation of attention was available already in PyTorch 1.13 (see here) and widely adopted (see e.g. HuggingFace transformers library example). In particular, it integrates memory-efficient attention from the xFormers library and flash attention from https://arxiv.org/abs/2205.14135. PyTorch 2.0 expands this to additional attention functions such as cross attention and custom kernels for further acceleration, making it applicable to diffusion models.

    + +

    Flash attention is available on GPUs with compute capability SM 7.5 or SM 8.x - for example, on T4, A10, and A100, which are included in our benchmark (you can check compute capability of each NVIDIA GPU here). However, in our tests on A100 the memory efficient attention performed better than flash attention for the particular case of diffusion models, due to the small number of attention heads and small batch size. PyTorch understands this and in this case chooses memory efficient attention over flash attention when both are available (see the logic here). For full control over the attention backends (memory-efficient attention, flash attention, “vanilla math”, or any future ones), power users can enable and disable them manually with the help of the context manager torch.backends.cuda.sdp_kernel.

    + +

    Compilation

    + +

    Compilation is a new feature of PyTorch 2.0, enabling significant speedups with a very simple user experience. To invoke the default behavior, simply wrap a PyTorch module or a function into torch.compile:

    + +
    model = torch.compile(model)
    +
    + +

The PyTorch compiler then turns the Python code into a set of instructions which can be executed efficiently without Python overhead. The compilation happens dynamically the first time the code is executed. With the default behavior, under the hood PyTorch utilizes TorchDynamo to compile the code and TorchInductor to further optimize it. See this tutorial for more details.

    + +

Although the one-liner above is enough for compilation, certain modifications in the code can squeeze out a larger speedup. In particular, one should avoid so-called graph breaks - places in the code which PyTorch can’t compile. As opposed to previous PyTorch compilation approaches (like TorchScript), the PyTorch 2 compiler doesn’t fail in this case. Instead it falls back on eager execution - so the code runs, but with reduced performance. We introduced a few minor changes to the model code to get rid of graph breaks. This included eliminating functions from libraries not supported by the compiler, such as inspect.isfunction and einops.rearrange. See this doc to learn more about graph breaks and how to eliminate them.
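One way to surface graph breaks during development (a minimal sketch, assuming PyTorch 2.0; toy_step is a made-up function) is to compile with fullgraph=True, which errors out at a break instead of silently falling back to eager:

import torch

def toy_step(x):
    y = x.sin()
    # .item() pulls a value into Python and causes a graph break
    if y.sum().item() > 0:
        y = y + 1
    return y

# Default behavior: the break is tolerated; the broken part runs eagerly
compiled = torch.compile(toy_step)
print(compiled(torch.randn(8)))

# Strict mode: raise an error at the graph break so it can be found and fixed
strict = torch.compile(toy_step, fullgraph=True)
# strict(torch.randn(8))  # would error, pointing at the offending line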

    + +

    Theoretically, one can apply torch.compile on the whole diffusion sampling loop. However, in practice it is enough to just compile the U-Net. The reason is that torch.compile doesn’t yet have a loop analyzer and would recompile the code for each iteration of the sampling loop. Moreover, compiled sampler code is likely to generate graph breaks - so one would need to adjust it if one wants to get a good performance from the compiled version.
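In practice that means wrapping only the denoiser called inside the sampling loop. A sketch is below; the attribute path is illustrative and depends on the particular Stable Diffusion codebase, so treat it as a placeholder rather than the exact code used here:

# Compile only the U-Net used inside the sampling loop;
# the sampler itself keeps running in eager mode.
model.model.diffusion_model = torch.compile(model.model.diffusion_model)

# Schematic sampling loop (pseudocode):
# for t in timesteps:
#     eps = model.model.diffusion_model(x, t, context)  # compiled call
#     x = sampler_update(x, eps, t)                     # stays eager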

    + +

    Note that compilation requires GPU compute capability >= SM 7.0 to run in non-eager mode. This covers all GPUs in our benchmarks - T4, V100, A10, A100 - except for P100 (see the full list).

    + +

    Other optimizations

    + +

    In addition, we have improved efficiency of GPU memory operations by eliminating some common pitfalls, e.g. creating a tensor on GPU directly rather than creating it on CPU and later moving to GPU. The places where such optimizations were necessary were determined by line-profiling and looking at CPU/GPU traces and Flame Graphs.
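As a generic example of this pitfall (not a line taken from the model code), the second form below avoids an intermediate CPU tensor and a host-to-device copy:

import torch

# Slower: allocates on the CPU, then copies to the GPU
noise = torch.randn(4, 4, 64, 64).to("cuda")

# Faster: allocates directly on the GPU
noise = torch.randn(4, 4, 64, 64, device="cuda")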

    + +

    Benchmarking setup and results summary

    + +

    We have two versions of code to compare: original and optimized. On top of this, several optimization features (xFormers, PyTorch memory efficient attention, compilation) can be turned on/off. Overall, as mentioned in the introduction, we will be benchmarking 5 configurations:

    + +
      +
    • Original code without xFormers
    • +
    • Original code with xFormers
    • +
    • Optimized code with vanilla math attention backend and no compilation
    • +
    • Optimized code with memory-efficient attention backend and no compilation
    • +
    • Optimized code with memory-efficient attention backend and compilation
    • +
    + +

As the original version, we took the code that uses PyTorch 1.12 and a custom implementation of attention. The optimized version uses nn.MultiheadAttention in CrossAttention and PyTorch 2.0.0.dev20230111+cu117. It also has a few other minor optimizations in PyTorch-related code.

    + +

The tables below show the runtime of each version of the code in seconds, and the percentage improvement compared to the original with xFormers. The compilation time is excluded.

    + +

Runtimes for batch size 1. In parentheses: relative improvement with respect to the “Original with xFormers” row

| Configuration | P100 | T4 | A10 | V100 | A100 |
|---|---|---|---|---|---|
| Original without xFormers | 30.4s (-19.3%) | 29.8s (-77.3%) | 13.0s (-83.9%) | 10.9s (-33.1%) | 8.0s (-19.3%) |
| Original with xFormers | 25.5s (0.0%) | 16.8s (0.0%) | 7.1s (0.0%) | 8.2s (0.0%) | 6.7s (0.0%) |
| Optimized with vanilla math attention, no compilation | 27.3s (-7.0%) | 19.9s (-18.7%) | 13.2s (-87.2%) | 7.5s (8.7%) | 5.7s (15.1%) |
| Optimized with mem. efficient attention, no compilation | 26.5s (-3.8%) | 16.8s (0.2%) | 7.1s (-0.8%) | 6.9s (16.0%) | 5.3s (20.6%) |
| Optimized with mem. efficient attention and compilation | - | 16.4s (2.1%) | 7.2s (-2.3%) | 6.6s (18.6%) | 4.1s (38.5%) |

    Runtimes for batch size 2

| Configuration | P100 | T4 | A10 | V100 | A100 |
|---|---|---|---|---|---|
| Original without xFormers | 58.0s (-21.6%) | 57.6s (-84.0%) | 24.4s (-95.2%) | 18.6s (-63.0%) | 12.0s (-50.6%) |
| Original with xFormers | 47.7s (0.0%) | 31.3s (0.0%) | 12.5s (0.0%) | 11.4s (0.0%) | 8.0s (0.0%) |
| Optimized with vanilla math attention, no compilation | 49.3s (-3.5%) | 37.9s (-21.0%) | 17.8s (-42.2%) | 12.7s (-10.7%) | 7.8s (1.8%) |
| Optimized with mem. efficient attention, no compilation | 47.5s (0.4%) | 31.2s (0.5%) | 12.2s (2.6%) | 11.5s (-0.7%) | 7.0s (12.6%) |
| Optimized with mem. efficient attention and compilation | - | 28.0s (10.5%) | 11.4s (9.0%) | 10.7s (6.4%) | 6.4s (20.3%) |

    Runtimes for batch size 4

| Configuration | P100 | T4 | A10 | V100 | A100 |
|---|---|---|---|---|---|
| Original without xFormers | 117.9s (-20.0%) | 112.4s (-81.8%) | 47.2s (-101.7%) | 35.8s (-71.9%) | 22.8s (-78.9%) |
| Original with xFormers | 98.3s (0.0%) | 61.8s (0.0%) | 23.4s (0.0%) | 20.8s (0.0%) | 12.7s (0.0%) |
| Optimized with vanilla math attention, no compilation | 101.1s (-2.9%) | 73.0s (-18.0%) | 28.3s (-21.0%) | 23.3s (-11.9%) | 14.5s (-13.9%) |
| Optimized with mem. efficient attention, no compilation | 92.9s (5.5%) | 61.1s (1.2%) | 23.9s (-1.9%) | 20.8s (-0.1%) | 12.8s (-0.9%) |
| Optimized with mem. efficient attention and compilation | - | 53.1s (14.2%) | 20.9s (10.6%) | 18.6s (10.4%) | 11.2s (12.2%) |

To minimize fluctuations and external influence on the performance of the benchmarked code, we ran each version of the code one after another, and then repeated this sequence 10 times: A, B, C, D, E, A, B, … So the results of a typical run would look like the one in the picture below. Note that one shouldn’t rely on comparison of absolute run times between different graphs, but comparison of run times inside one graph is pretty reliable, thanks to our benchmarking setup.

    + +

    Denoising diffusion model generation benchmarks

    + +

Each run of the text-to-image generation script produces several batches, the number of which is regulated by the CLI parameter --n_iter. In the benchmarks we used n_iter = 2, but introduced an additional “warm-up” iteration, which doesn’t contribute to the run time. This was necessary for the runs with compilation, because compilation happens the first time the code runs, and so the first iteration is much longer than all subsequent ones. To make the comparison fair, we also introduced this additional “warm-up” iteration to all other runs.
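A minimal sketch of this measurement pattern is shown below; generate_batch is a placeholder for one full text-to-image run, not a function from the repository:

import time
import torch

def timed_runs(generate_batch, n_iter=2):
    generate_batch()              # warm-up: triggers compilation, excluded from timing
    torch.cuda.synchronize()
    times = []
    for _ in range(n_iter):
        start = time.perf_counter()
        generate_batch()
        torch.cuda.synchronize()  # wait for the GPU before reading the clock
        times.append(time.perf_counter() - start)
    return times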

    + +

The numbers in the tables above are for 2 iterations (plus a “warm-up” one), prompt ”A photo”, seed 1, the PLMS sampler, and autocast turned on.

    + +

    Benchmarks were done using P100, V100, A100, A10 and T4 GPUs. The T4 benchmarks were done in Google Colab Pro. The A10 benchmarks were done on g5.4xlarge AWS instances with 1 GPU.

    + +

    Conclusions and next steps

    + +

We have shown that new features of PyTorch 2 - the compiler and the optimized attention implementation - give performance improvements exceeding or comparable to what previously required installation of an external dependency (xFormers). PyTorch achieved this, in particular, by integrating memory efficient attention from xFormers into its codebase. This is a significant improvement for user experience, given that xFormers, being a state-of-the-art library, in many scenarios requires a custom installation process and long builds.

    + +

    There are a few natural directions in which this work can be continued:

    + +
      +
    • The optimizations we implemented and described here are only benchmarked for text-to-image inference so far. It would be interesting to see how they affect training performance. PyTorch compilation can be directly applied to training; enabling training with PyTorch optimized attention is on the roadmap
    • +
    • We intentionally minimized changes to the original model code. Further profiling and optimization can probably bring more improvements
    • +
    • At the moment compilation is applied only to the U-Net model inside the sampler. Since there is a lot happening outside of U-Net (e.g. operations directly in the sampling loop), it would be beneficial to compile the whole sampler. However, this would require analysis of the compilation process to avoid recompilation at every sampling step
    • +
    • Current code only applies compilation within the PLMS sampler, but it should be trivial to extend it to other samplers
    • +
    • Besides text-to-image generation, diffusion models are also applied to other tasks - image-to-image and inpainting. It would be interesting to measure how their performance improves from PyTorch 2 optimizations
    • +
    + +

    See if you can increase performance of open source diffusion models using the methods we described, and share the results!

    + +

    Resources

    + + + +

    Acknowledgements

    + +

    We would like to thank Geeta Chauhan, Natalia Gimelshein, Patrick Labatut, Bert Maher, Mark Saroufim, Michael Voznesensky and Francisco Massa for their valuable advice and early feedback on the text.

    + +

Special thanks to Yudong Tao for initiating the work on using PyTorch native attention in diffusion models.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerated-image-seg/index.html b/blog/accelerated-image-seg/index.html new file mode 100644 index 000000000000..be1fa3e207ad --- /dev/null +++ b/blog/accelerated-image-seg/index.html @@ -0,0 +1,878 @@ + + + + + + + + + + + + + Accelerated Image Segmentation using PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Intel + +

    +

    Using Intel® Extension for PyTorch to Boost Image Processing Performance

    + +

    PyTorch delivers great CPU performance, and it can be further accelerated with Intel® Extension for PyTorch. I trained an AI image segmentation model using PyTorch 1.13.1 (with ResNet34 + UNet architecture) to identify roads and speed limits from satellite images, all on the 4th Gen Intel® Xeon® Scalable processor.

    + +

    I will walk you through the steps to work with a satellite image dataset called SpaceNet5 and how I optimized the code to make deep learning workloads feasible on CPUs just by flipping a few key switches.

    + +

    Before we get started, some housekeeping…

    + +

    The code accompanying this article is available in the examples folder in the Intel Extension for PyTorch repository. I borrowed heavily from the City-Scale Road Extraction from Satellite Imagery (CRESI) repository. I adapted it for the 4th Gen Intel Xeon processors with PyTorch optimizations and Intel Extension for PyTorch optimizations. In particular, I was able to piece together a workflow using the notebooks here.

    + +

    You can find the accompanying talk I gave on YouTube.

    + +

    I also highly recommend these articles for a detailed explanation of how to get started with the SpaceNet5 data:

    + + + +

    I referenced two Hugging Face blogs by Julien Simon; he ran his tests on the AWS instance r7iz.metal-16xl:

    + + + +

The potential cost savings from using a CPU instance instead of a GPU instance on the major cloud service providers (CSPs) can be significant. The latest processors are still being rolled out to the CSPs, so I’m using a 4th Gen Intel Xeon processor that is hosted on the Intel® Developer Cloud (you can sign up for the Beta here: cloud.intel.com).

    + +

    On AWS, you can select from the r7iz.* EC2 instances after you sign up for the preview here (Figure 1). At the time of writing, the new AI-acceleration engine, Intel® Advanced Matrix Extensions (Intel® AMX), is only available on bare metal but it should soon be enabled on the virtual machines.

    + +

    List of 4th Gen Xeon  instances on AWS EC2

    + +

    Figure 1. List of 4th Gen Xeon instances on AWS EC2 (image by author)

    + +

    On Google Cloud* Platform, you can select from the 4th Gen Xeon Scalable processors C3 VMs (Figure 2).

    + +

    List of 4th Gen Intel Xeon Scalable processor instances on Google Cloud Platform

    + +

    Figure 2. List of 4th Gen Intel Xeon Scalable processor instances on Google Cloud Platform (image by author)

    + +

    Hardware Introduction and Optimizations

    + +

The 4th Gen Intel Xeon processors were released in January 2023, and the bare-metal instance I am using has two sockets (each with 56 physical cores), 504 GB of memory, and Intel AMX acceleration. I installed a few key libraries in the backend to control and monitor the sockets, memory, and cores that I am using on the CPU:

    + +

    numactl (with sudo apt-get install numactl)

    + +

libjemalloc-dev (with sudo apt-get install libjemalloc-dev)

    + +

    intel-openmp (with conda install intel-openmp)

    + +

    gperftools (with conda install gperftools -c conda-forge)

    + +

    Both PyTorch and Intel Extension for PyTorch have helper scripts so that one does not need to explicitly use intel-openmp and numactl, but they do need to be installed in the backend. In case you want to set them up for other work, here is what I used for OpenMP* …

    + +
    export OMP_NUM_THREADS=36
    +export KMP_AFFINITY=granularity=fine,compact,1,0
    +export KMP_BLOCKTIME=1
    +
    + +

… where OMP_NUM_THREADS is the number of threads allocated to the job, KMP_AFFINITY affects thread affinity settings (including packing threads close to one another and pinning threads), and KMP_BLOCKTIME sets the time in milliseconds that an idle thread should wait before going to sleep.
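If you want a quick sanity check of what PyTorch actually ends up using (an optional check, not part of the training scripts), you can query or set the intra-op thread count from Python:

import torch

print("intra-op threads:", torch.get_num_threads())  # usually follows OMP_NUM_THREADS
torch.set_num_threads(36)                            # or set it explicitly in code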

    + +

    Here’s what I used for numactl

    + +
    numactl -C 0-35 --membind=0 train.py
    +
    + +

    …where -C specifies which cores to use and --membind instructs the program to only use one socket (socket 0 in this case).

    + +

    SpaceNet Data

    + +

    I am using a satellite image dataset from the SpaceNet 5 Challenge. Different cities can be downloaded for free from an AWS S3 bucket:

    + +
    aws s3 ls s3://spacenet-dataset/spacenet/SN5_roads/tarballs/ --human-readable
    +
    + +
    2019-09-03 20:59:32    5.8 GiB SN5_roads_test_public_AOI_7_Moscow.tar.gz
    +2019-09-24 08:43:02    3.2 GiB SN5_roads_test_public_AOI_8_Mumbai.tar.gz
    +2019-09-24 08:43:47    4.9 GiB SN5_roads_test_public_AOI_9_San_Juan.tar.gz
    +2019-09-14 13:13:26   35.0 GiB SN5_roads_train_AOI_7_Moscow.tar.gz
    +2019-09-14 13:13:34   18.5 GiB SN5_roads_train_AOI_8_Mumbai.tar.gz
    +
    + +

    You can use the following commands to download and unpack a file:

    + +
    aws s3 cp s3://spacenet-dataset/spacenet/SN5_roads/tarballs/SN5_roads_train_AOI_7_Moscow.tar.gz .
    +tar -xvzf ~/spacenet5data/moscow/SN5_roads_train_AOI_7_Moscow.tar.gz
    +
    + +

    Dataset Preparation

    + +

    I used the Moscow satellite image dataset, which consists of 1,352 images of 1,300 by 1,300 pixels with corresponding street labels in separate text files. The dataset contains both 8-band multispectral images and 3-band RGB images. Figure 3 shows four sample RGB satellite images and their corresponding generated masks. I used the speed_masks.py script from the CRESI repository to generate the segmentation masks.

    + +

    Satellite image 3-channel RGB chips from Moscow (top row) and corresponding pixel segmentation masks with varying speed limits

    + +

    Figure 3. Satellite image 3-channel RGB chips from Moscow (top row) and corresponding pixel segmentation masks with varying speed limits (bottom row) (image by author)

    + +

There is a JSON configuration file that must be updated for all remaining components: training and validation split, training, and inference. An example configuration can be found here. I perform an 80:20 training/validation split, making sure to point to the correct folder of satellite images and corresponding masks for training. The configuration parameters are explained in more detail in the notebook under examples in the Intel Extension for PyTorch GitHub repository here.
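As a rough sketch of such an 80:20 split (the paths and file pattern below are placeholders, not the actual CRESI layout), one could shuffle the list of chips and cut it at 80%:

import random
from pathlib import Path

image_dir = Path("data/moscow/images")        # placeholder path
images = sorted(image_dir.glob("*.tif"))      # placeholder file pattern
random.Random(42).shuffle(images)

split = int(0.8 * len(images))
train_images, val_images = images[:split], images[split:]
print(len(train_images), "training chips,", len(val_images), "validation chips")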

    + +

    Training a ResNet34 + UNet Model

    + +

I made some changes to the cresi code, described below, in order to run on a CPU and optimize the training. To run natively on a CPU, replace self.model = nn.DataParallel(model).cuda() with self.model = nn.DataParallel(model) in the train.py script, as shown in the snippet below. In the 01_train.py script, remove torch.randn(10).cuda().
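Schematically, the first change is just dropping the .cuda() call:

# Original (GPU) line in train.py
self.model = nn.DataParallel(model).cuda()

# CPU version used here
self.model = nn.DataParallel(model)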

    + +

    To optimize training, add import intel_extension_for_pytorch as ipex to the import statements in the train.py script. Just after defining the model and optimizer as follows:

    + +
    self.model = nn.DataParallel(model)
    +self.optimizer = optimizer(self.model.parameters(), lr=config.lr)
    +
    + +

Add the ipex.optimize line to use BF16 precision instead of FP32:

    + +
    self.model, self.optimizer = ipex.optimize(self.model, 
    +    optimizer=self.optimizer,dtype=torch.bfloat16)
    +
    + +

    Add a line to do mixed-precision training just before running a forward pass and calculating the loss function:

    + +
    with torch.cpu.amp.autocast():
    +    if verbose:
    +        print("input.shape, target.shape:", input.shape, target.shape)
    +    output = self.model(input)
    +    meter = self.calculate_loss_single_channel(output, target, meter, training, iter_size)
    +
    + +

    Now that we have optimized our training code, we can move onto training our model.

    + +

    Like the winner of the SpaceNet 5 competition, I trained a ResNet34 encoder + UNet decoder model. It is pretrained from ImageNet weights, and the backbone is left completely unfrozen during training. The training can be run with the 01_train.py script, but in order to control the use of hardware I used a helper script. There are actually two helper scripts: one that comes with stock PyTorch and one that comes with Intel Extension for PyTorch. They both accomplish the same thing, but the first one from stock is torch.backends.xeon.run_cpu, and the second one from Intel Extension for PyTorch is ipexrun.

    + +

Here is what I ran on the command line:

    + +
    python -m torch.backends.xeon.run_cpu --ninstances 1 \
    +  --ncores_per_instance 32 \
    +  --log_path /home/devcloud/spacenet5data/moscow/v10_xeon4_devcloud22.04/logs/run_cpu_logs \
    +  /home/devcloud/cresi/cresi/01_train.py \
    +  /home/devcloud/cresi/cresi/configs/ben/v10_xeon4_baseline_ben.json --fold=0
    +
    + +
    ipexrun --ninstances 1 \
    +--ncore_per_instance 32 \
    +/home/devcloud/cresi/cresi/01_train.py \
    +/home/devcloud/cresi/cresi/configs/ben/v10_xeon4_baseline_ben.json --fold=0
    +
    + +

    In both cases, I am asking PyTorch to run training on one socket with 32 cores. Upon running, I get a printout of what environment variables get set in the backend to understand how PyTorch is using the hardware:

    + +
    INFO - Use TCMalloc memory allocator
    +INFO - OMP_NUM_THREADS=32
    +INFO - Using Intel OpenMP
    +INFO - KMP_AFFINITY=granularity=fine,compact,1,0
    +INFO - KMP_BLOCKTIME=1
    +INFO - LD_PRELOAD=/home/devcloud/.conda/envs/py39/lib/libiomp5.so:/home/devcloud/.conda/envs/py39/lib/libtcmalloc.so
    +INFO - numactl -C 0-31 -m 0 /home/devcloud/.conda/envs/py39/bin/python -u 01_train.py configs/ben/v10_xeon4_baseline_ben.json --fold=0
    +
    + +

    During training, I make sure that my total loss function is decreasing (i.e., the model is converging on a solution).

    + +

    Inference

    + +

    After training a model, we can start to make predictions from satellite images alone. In the eval.py inference script, add import intel_extension_for_pytorch as ipex to the import statements. After loading the PyTorch model, use Intel Extension for PyTorch to optimize the model for BF16 inference:

    + +
    model = torch.load(os.path.join(path_model_weights, 
    +    'fold{}_best.pth'.format(fold)), 
    +    map_location = lambda storage, 
    +    loc: storage)
    +model.eval()
    +model = ipex.optimize(model, dtype = torch.bfloat16)
    +
    + +

    Just prior to running prediction, add two lines for mixed precision:

    + +
    with torch.no_grad():
    +    with torch.cpu.amp.autocast():
    +        for data in pbar:
    +            samples = torch.autograd.Variable(data['image'], volatile=True)
    +            predicted = predict(model, samples, flips=self.flips)
    +
    + +

    To run inference, we can use the 02_eval.py script. Now that we have a trained model, we can make predictions on satellite images (Figure 4). We can see that it does seem to map the roads closely to the image!

    + +

    Moscow satellite image and accompanying prediction of roads

    + +

    Figure 4. Moscow satellite image and accompanying prediction of roads (image by author)

    + +

    I realize that the model I’ve trained is overfit to the Moscow image data and probably won’t generalize well to other cities. However, the winning solution to this challenge used data from six cities (Las Vegas, Paris, Shanghai, Khartoum, Moscow, Mumbai) and performs well on new cities. In the future, one thing that would be worth testing is training on all six cities and running inference on another city to reproduce their results.

    + +

    Note on Post-Processing

    + +

    There are further post-processing steps that can be performed to add the mask as graph features to maps. You can read more about the post-processing steps here:

    + +

    The SpaceNet 5 Baseline — Part 3: Extracting Road Speed Vectors from Satellite Imagery

    + +

    Post-processing scripts

    + +

    Conclusions

    + +

    In summary, we:

    + +
      +
    • Created 1,352 image training masks (with speed limits) to correspond to our training satellite image data (from .geojson text file labels)
    • +
    • Defined our configuration file for training and inference
    • +
    • Split up our data into training and validation sets
    • +
    • Optimized our code for CPU training, including using Intel Extension for PyTorch and BF16
    • +
    • Trained a performant ResNet34 + UNet model on a 4th Gen Intel Xeon CPU
    • +
    • Ran initial inference to see the prediction of a speed limit mask
    • +
    + +

You can find detailed benchmarks for the 4th Gen Intel Xeon CPU here.

    + +

    Next Steps

    + +

    Extend the optimizations on an Intel CPU by using the Intel Extension for PyTorch:

    + +

    pip install intel-extension-for-pytorch

    + +

    git clone https://github.com/intel/intel-extension-for-pytorch

    + +

    Get in touch with me on LinkedIn if you have any more questions!

    + +

    More information about the Intel Extension for PyTorch can be found here.

    + +

    Get the Software

    + +

    I encourage you to check out Intel’s other AI Tools and Framework optimizations and learn about the open, standards-based oneAPI multiarchitecture, multivendor programming model that forms the foundation of Intel’s AI software portfolio.

    + +

For more details about the 4th Gen Intel Xeon Scalable processors, visit AI Platform, where you can learn how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines.

    + +

    PyTorch Resources

    + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerated-pytorch-2/index.html b/blog/accelerated-pytorch-2/index.html new file mode 100644 index 000000000000..727cbe888481 --- /dev/null +++ b/blog/accelerated-pytorch-2/index.html @@ -0,0 +1,691 @@ + + + + + + + + + + + + + Accelerated PyTorch 2 Transformers | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    March 28, 2023

    +

    + Accelerated PyTorch 2 Transformers +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Michael Gschwind, Driss Guessous, Christian Puhrsch + +

    +

The PyTorch 2.0 release includes a new high-performance implementation of the PyTorch Transformer API with the goal of making training and deployment of state-of-the-art Transformer models affordable. Following the successful release of “fastpath” inference execution (“Better Transformer”), this release introduces high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SDPA).

    + +

You can take advantage of the new fused SDPA kernels either by calling the new SDPA operator directly (as described in the SDPA tutorial), or transparently via integration into the pre-existing PyTorch Transformer API. All features of the PyTorch Transformer API will continue to work compatibly: many features are mapped to high-performance SDPA kernels, other features (e.g., need_weights, as per below) cannot be supported at higher performance, and expanded high-performance support for further features may still be under active development.
    +
    +Similar to the “fastpath” architecture, custom kernels are fully integrated into the PyTorch Transformer API – thus, using the native Transformer and MultiHeadAttention API will enable users to transparently see significant speed improvements. Unlike the “fastpath” architecture, the newly introduced “custom kernels” support many more use cases including models using Cross-Attention, Transformer Decoders, and for training models, in addition to the existing fastpath inference for fixed and variable sequence length Transformer Encoder and Self Attention use cases.

    + +

    To take full advantage of different hardware models and Transformer use cases, multiple SDPA custom kernels are supported, with custom kernel selection logic that will pick the highest-performance kernel for a given model and hardware type. In particular, the first custom kernels included with the PyTorch 2.0 release are the Flash Attention kernel (sdpa_flash, for 16-bit floating point training and inference on Nvidia GPUs with SM80+ architecture level) and the xFormers memory-efficient attention kernel (sdpa_mem_eff, for 16-bit and 32-bit floating point training and inference on a broad range of Nvidia GPUs). A general-purpose kernel sdpa_math provides an implementation when the custom kernels are not applicable.

    + +

As mentioned, custom kernels provide a wider range of support for execution scenarios. To ensure efficient execution (e.g., to use GPU tensor cores), model configurations need to meet a small number of requirements. This list of requirements will evolve over time, prospectively relaxing constraints limiting the usage of currently supported custom kernels, or providing additional kernels in the future.

    + +

    For the most up to date list of custom kernels and dispatch constraints, you can refer to sdp_utils.h. As of PyTorch 2.0, the existing fused SDPA kernels have the following constraints:

    + +
      +
    • Flash Attention only supports 16 bit floating point data types (float16 and bfloat16).
    • +
    • The head dimension must be a multiple of 8 for 16-bit floating point numbers and a multiple of 4 for 32-bit floating point numbers. At present, the maximum head_dim support for the Flash Attention custom kernel is 128.
    • +
    • The CUDA architecture level must be sm5x or better for the mem_efficient kernel, and sm80 for Flash Attention.
    • +
• Flash Attention supports arbitrary dropout; in PyTorch 2.0 the mem_efficient kernel does not support dropout (i.e., dropout must be set to zero for this kernel to be selected in PyTorch 2.0).
    • +
    • To support variable-sequence length batches, all SDPA kernels support Nested Tensor inputs that combine input data and padding information using variable sequence length tensors for forward. (You can find more information about Nested Tensors in the Nested Tensor tutorial.)
    • +
    • You can specify both a key_padding_mask and an attn_mask by combining them before passing them to the SDPA operator. In particular, you can use the per-batch-element key padding mask of the nn.Transformer API to implement training for variable-sequence length inputs in a batch.
    • +
    • At present, the only attention mask supported by fused kernel implementation is the causal mask commonly used for training. To specify the causal mask in custom kernels, it must be specified with the is_causal boolean and attn_mask must be None.
    • +
    • Support for Nested Tensors is still under development. Specifically, in PyTorch 2.0, only the sdpa_math kernel supports training with Nested Tensors. Also, PyTorch 2.0 does not support Nested Tensors as part of code being compiled with torch.compile().
    • +
    • The SDPA operator does not support returning averaged attention weights because computing them defeats the optimizations that enabled fused kernels to execute more efficiently. The argument need_weights for torch.nn.MultiheadAttention’s forward function defaults to True. In order to use the fused kernels, need_weights needs to be set to need_weights=False.
    • +
    + +

    We find that an attention mask is rarely used in real-world applications, except for the causal mask during training. Consequently, we reduce kernel complexity and compute cost by building in the option to use a causal mask as attention mask, and select this new capability with the is_causal parameter introduced in conjunction with the new SDPA operator.

    + +

    Providing the is_causal Boolean flag for the frequently used causal mask also obviates the expensive and memory-intensive allocation of a causal mask, increasing training memory efficiency by allowing more memory to be used for large batch sizes, and reduce memory bandwidth and cache contention – which are both at a premium in GPU accelerators – by not needing to load an attention mask tensor.
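As an illustration of the difference (a sketch with arbitrary shapes, not code from the release), is_causal=True gives the same result as building and passing an explicit lower-triangular mask, without materializing that mask:

import torch
import torch.nn.functional as F

q = k = v = torch.randn(2, 8, 128, 64)   # (batch, heads, seq_len, head_dim)

# Explicit causal mask: allocates a (seq_len, seq_len) boolean tensor
causal_mask = torch.tril(torch.ones(128, 128, dtype=torch.bool))
out_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=causal_mask)

# Same computation without allocating or loading a mask tensor
out_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)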

    + +

    If the constraints of none of the available custom kernels are met, then training falls back to using the default sdpa_math kernel, implementing the mathematical equations for scaled dot product attention using a sequence of PyTorch operator to implement SDPA. This is the most general “catch-all” fallback kernel to ensure successful training for all models.

    + +

In addition to the existing Transformer API, model developers may also use the scaled dot product attention kernels directly by calling the new scaled_dot_product_attention() operator. This operator may be used to efficiently implement multi-head attention by combining it with in-projection and out-projection, as described in the SDPA tutorial.
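A minimal sketch of that pattern follows (written for this post, not the nn.MultiheadAttention implementation itself): an input projection, the fused SDPA call, and an output projection.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinySelfAttention(nn.Module):
    def __init__(self, embed_dim: int, num_heads: int):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.in_proj = nn.Linear(embed_dim, 3 * embed_dim)   # fused Q/K/V projection
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):                                    # x: (batch, seq, embed_dim)
        b, s, e = x.shape
        qkv = self.in_proj(x).reshape(b, s, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)                 # each: (batch, heads, seq, head_dim)
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        out = out.transpose(1, 2).reshape(b, s, e)
        return self.out_proj(out)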

    + +

    In addition to adding custom kernels, Accelerated PyTorch 2 Transformers are integrated with PyTorch 2.0 compilation. To use your model while benefiting from the additional acceleration of PT2-compilation (for inference or training), pre-process the model with

    + +
    model = torch.compile(model)
    +
    + +

    We have achieved major speedups for training transformer models and in particular large language models with Accelerated PyTorch 2 Transformers using a combination of custom kernels and torch.compile().

    + +

    Better Transformer chart +Figure: Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for nanoGPT shown here.

    + +

Finally, because the custom kernels are much more memory efficient, try increasing the training batch size to achieve faster training.

    + +

    In addition to automatic kernel selection, a context manager enables developers to override the kernel selection algorithm – this is not required for day to day operation, but enables developers to debug their code as well as enable performance engineers to override kernel selection. The SDPA tutorial provides additional information on using the SDPA context manager.

    + +

    In addition to availability as part of the nn.Transformer API, Accelerated PyTorch 2 Transformer custom kernels are also available in conjunction with the torchtext, torchvision, and fairseq domain libraries with the launch of PyTorch 2.0.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerated-pytorch-inference/index.html b/blog/accelerated-pytorch-inference/index.html new file mode 100644 index 000000000000..12b4baa23f2d --- /dev/null +++ b/blog/accelerated-pytorch-inference/index.html @@ -0,0 +1,1047 @@ + + + + + + + + + + + + + Accelerated PyTorch inference with torch.compile on AWS Graviton processors | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Sunita Nadampalli + +

    +

    Summary

    + +

Originally, PyTorch used an eager mode where each PyTorch operation that forms the model is run independently as soon as it’s reached. PyTorch 2.0 introduced torch.compile to speed up PyTorch code over the default eager mode. In contrast to eager mode, torch.compile pre-compiles the entire model into a single graph in a manner that’s optimal for running on a given hardware platform. AWS optimized the PyTorch torch.compile feature for AWS Graviton3 processors. This optimization results in up to 2x better performance for Hugging Face model inference (based on geomean of performance improvement for 33 models) and up to 1.35x better performance for TorchBench model inference (geomean of performance improvement for 45 models) compared to the default eager mode inference across several natural language processing (NLP), computer vision (CV), and recommendation models on AWS Graviton3-based Amazon EC2 instances. Starting with PyTorch 2.3.1, the optimizations are available in torch Python wheels and AWS Graviton PyTorch deep learning container (DLC).

    + +

    In this blog post, we show how we optimized torch.compile performance on AWS Graviton3-based EC2 instances, how to use the optimizations to improve inference performance, and the resulting speedups.

    + +

    Why torch.compile and what’s the goal?

    + +

    In eager mode, operators in a model are run immediately as they are encountered. It’s easier to use, more suitable for machine learning (ML) researchers, and hence is the default mode. However, eager mode incurs runtime overhead because of redundant kernel launch and memory read overhead. Whereas in torch compile mode, operators are first synthesized into a graph, wherein one operator is merged with another to reduce and localize memory reads and total kernel launch overhead.

    + +

    The goal for the AWS Graviton team was to optimize torch.compile backend for Graviton3 processors. PyTorch eager mode was already optimized for Graviton3 processors with Arm Compute Library (ACL) kernels using oneDNN (also known as MKLDNN). So, the question was, how to reuse those kernels in torch.compile mode to get the best of graph compilation and the optimized kernel performance together?

    + +

    Results

    + +

The AWS Graviton team extended the torch inductor and oneDNN primitives to reuse the ACL kernels and optimize compile mode performance on Graviton3 processors. Starting with PyTorch 2.3.1, the optimizations are available in the torch Python wheels and AWS Graviton DLC. Please see the Running an inference section that follows for the instructions on installation, runtime configuration, and how to run the tests.

    + +

To demonstrate the performance improvements, we used NLP, CV, and recommendation models from TorchBench and the most downloaded NLP models from Hugging Face across Question Answering, Text Classification, Token Classification, Translation, Zero-Shot Classification, Summarization, Feature Extraction, Text Generation, Text2Text Generation, Fill-Mask, and Sentence Similarity tasks to cover a wide variety of customer use cases.

    + +

We started with measuring TorchBench model inference latency, in milliseconds (msec), for the eager mode, which is marked 1.0 with a red dotted line in the following graph. Then we compared the improvements from torch.compile for the same model inference; the normalized results are plotted in the graph. You can see that for the 45 models we benchmarked, there is a 1.35x latency improvement (geomean for the 45 models).

    + +

    PyTorch model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using TorchBench framework

    + +

    Image 1: PyTorch model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using TorchBench framework. The reference eager mode performance is marked as 1.0. (higher is better)
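For reference, the geometric mean of per-model speedups quoted in this post can be computed as follows (the numbers below are made up for illustration):

import numpy as np

# speedup = eager latency / torch.compile latency, one entry per model (illustrative values)
speedups = np.array([1.2, 1.5, 1.1, 1.8, 1.3])
geomean = np.exp(np.log(speedups).mean())
print(f"geomean speedup: {geomean:.2f}x")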

    + +

Similar to the preceding TorchBench inference performance graph, we started with measuring the Hugging Face NLP model inference latency, in msec, for the eager mode, which is marked 1.0 with a red dotted line in the following graph. Then we compared the improvements from torch.compile for the same model inference; the normalized results are plotted in the graph. You can see that for the 33 models we benchmarked, there is around a 2x performance improvement (geomean for the 33 models).

    + +

    Hugging Face NLP model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using Hugging Face example scripts

    + +

    Image 2: Hugging Face NLP model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using Hugging Face example scripts. The reference eager mode performance is marked as 1.0. (higher is better)

    + +

    Running an inference

    + +

    Starting with PyTorch 2.3.1, the optimizations are available in the torch Python wheel and in AWS Graviton PyTorch DLC. This section shows how to run inference in eager and torch.compile modes using torch Python wheels and benchmarking scripts from Hugging Face and TorchBench repos.

    + +

    To successfully run the scripts and reproduce the speedup numbers mentioned in this post, you need an instance from the Graviton3 family (c7g/r7g/m7g/hpc7g) of hardware. For this post, we used the c7g.4xl (16 vcpu) instance. The instance, the AMI details, and the required torch library versions are mentioned in the following snippet.

    + +
    Instance: c7g.4xl instance
    +Region: us-west-2
    +AMI: ami-05cc25bfa725a144a (Ubuntu 22.04/Jammy with 6.5.0-1017-aws kernel)
    +
    +# Install Python
    +sudo apt-get update
    +sudo apt-get install -y python3 python3-pip
    +
    +# Upgrade pip3 to the latest version
    +python3 -m pip install --upgrade pip
    +
    +# Install PyTorch and extensions
    +python3 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
    +
    + +

    The generic runtime tunings implemented for eager mode inference are equally applicable for the torch.compile mode, so, we set the following environment variables to further improve the torch.compile performance on AWS Graviton3 processors.

    + +
    # Enable the fast math GEMM kernels, to accelerate fp32 inference with bfloat16 gemm
    +export DNNL_DEFAULT_FPMATH_MODE=BF16
    +
    +# Enable Linux Transparent Huge Page (THP) allocations,
    +# to reduce the tensor memory allocation latency
    +export THP_MEM_ALLOC_ENABLE=1
    +
    +# Set LRU Cache capacity to cache the primitives and avoid redundant
    +# memory allocations
    +export LRU_CACHE_CAPACITY=1024
    +
    + +

    TORCHBENCH BENCHMARKING SCRIPTS

    + +

TorchBench is a collection of open source benchmarks used to evaluate PyTorch performance. We benchmarked 45 models using the scripts from the TorchBench repo. The following code shows how to run the scripts for the eager mode and the compile mode with the inductor backend.

    + +
    # Set OMP_NUM_THREADS to number of vcpus, 16 for c7g.4xl instance
    +export OMP_NUM_THREADS=16
    +
    +# Install the dependencies
    +sudo apt-get install -y libgl1-mesa-glx
    +sudo apt-get install -y libpangocairo-1.0-0
    +python3 -m pip install psutil numpy transformers pynvml numba onnx onnxruntime scikit-learn timm effdet gym doctr opencv-python h5py==3.10.0 python-doctr 
    +
    +# Clone pytorch benchmark repo
    +git clone https://github.com/pytorch/benchmark.git
    +cd benchmark
    +# PyTorch benchmark repo doesn't have any release tags. So,
    +# listing the commit we used for collecting the performance numbers
    +git checkout 9a5e4137299741e1b6fb7aa7f5a6a853e5dd2295
    +
    +# Setup the models
    +python3 install.py 
    +
+# Collect eager mode performance using the following command. The results will be
    +# stored at .userbenchmark/cpu/metric-<timestamp>.json.
    +python3 run_benchmark.py cpu --model BERT_pytorch,hf_Bert,hf_Bert_large,hf_GPT2,hf_Albert,hf_Bart,hf_BigBird,hf_DistilBert,hf_GPT2_large,dlrm,hf_T5,mnasnet1_0,mobilenet_v2,mobilenet_v3_large,squeezenet1_1,timm_efficientnet,shufflenet_v2_x1_0,timm_regnet,resnet50,soft_actor_critic,phlippe_densenet,resnet152,resnet18,resnext50_32x4d,densenet121,phlippe_resnet,doctr_det_predictor,timm_vovnet,alexnet,doctr_reco_predictor,vgg16,dcgan,yolov3,pytorch_stargan,hf_Longformer,timm_nfnet,timm_vision_transformer,timm_vision_transformer_large,nvidia_deeprecommender,demucs,tts_angular,hf_Reformer,pytorch_CycleGAN_and_pix2pix,functorch_dp_cifar10,pytorch_unet --test eval --metrics="latencies,cpu_peak_mem"
    +
    +# Collect torch.compile mode performance with inductor backend
    +# and weights pre-packing enabled. The results will be stored at
    +# .userbenchmark/cpu/metric-<timestamp>.json
    +python3 run_benchmark.py cpu --model BERT_pytorch,hf_Bert,hf_Bert_large,hf_GPT2,hf_Albert,hf_Bart,hf_BigBird,hf_DistilBert,hf_GPT2_large,dlrm,hf_T5,mnasnet1_0,mobilenet_v2,mobilenet_v3_large,squeezenet1_1,timm_efficientnet,shufflenet_v2_x1_0,timm_regnet,resnet50,soft_actor_critic,phlippe_densenet,resnet152,resnet18,resnext50_32x4d,densenet121,phlippe_resnet,doctr_det_predictor,timm_vovnet,alexnet,doctr_reco_predictor,vgg16,dcgan,yolov3,pytorch_stargan,hf_Longformer,timm_nfnet,timm_vision_transformer,timm_vision_transformer_large,nvidia_deeprecommender,demucs,tts_angular,hf_Reformer,pytorch_CycleGAN_and_pix2pix,functorch_dp_cifar10,pytorch_unet --test eval --torchdynamo inductor --freeze_prepack_weights --metrics="latencies,cpu_peak_mem"
    +
    + +

    On successful completion of the inference runs, the script stores the results in JSON format. The following is the sample output:

    + +
    {
    + "name": "cpu"
    + "environ": {
    +     "pytorch_git_version": "d44533f9d073df13895333e70b66f81c513c1889"
    +  },
    +  
    +  "metrics": {
    +       "BERT_pytorch-eval_latency": 56.3769865,
    +       "BERT_pytorch-eval_cmem": 0.4169921875
    +  }
    +}
    +
    + +

    HUGGING FACE BENCHMARKING SCRIPTS

    + +

    Google T5 Small Text Translation model is one of the around 30 Hugging Face models we benchmarked. We’re using it as a sample model to demonstrate how to run inference in eager and compile modes. The additional configurations and APIs required to run it in compile mode are highlighted in BOLD. Save the following script as google_t5_small_text_translation.py.

    + +
    import argparse
    +from transformers import T5Tokenizer, T5Model
    +import torch
    +from torch.profiler import profile, record_function, ProfilerActivity
    +import torch._inductor.config as config
    +config.cpp.weight_prepack=True
    +config.freezing=True
    +
    +def test_inference(mode, num_iter):
    +    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    +    model = T5Model.from_pretrained("t5-small")
    +
    +    input_ids = tokenizer(
    +        "Studies have been shown that owning a dog is good for you", return_tensors="pt"
    +    ).input_ids  # Batch size 1
    +    decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
    +
    +    if (mode == 'compile'):
    +        model = torch.compile(model)
    +
    +    with torch.no_grad():
    +        for _ in range(50):
    +            outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
    +
    +        with profile(activities=[ProfilerActivity.CPU]) as prof:
    +            with record_function("model_inference"):
    +                for _ in range(num_iter):
    +                    outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
    +
    +    print(prof.key_averages().table(sort_by="self_cpu_time_total"))
    +
    +def main() -> None:
    +    global m, args
    +    parser = argparse.ArgumentParser(__doc__)
    +    parser.add_argument(
    +        "-m",
    +        "--mode",
    +        choices=["eager", "compile"],
    +        default="eager",
    +        help="Which test to run.",
    +    )
    +    parser.add_argument(
    +        "-n",
    +        "--number",
    +        type=int,
    +        default=100,
    +        help="how many iterations to run.",
    +    )
    +    args = parser.parse_args()
    +    test_inference(args.mode, args.number)
    +
    +if __name__ == "__main__":
    +    main()
    +
    + +

    Run the script with the following steps:

    + +
# Set OMP_NUM_THREADS to 4 because
+# the scripts are running inference in sequence, and
+# they don't need a large number of vcpus
    +export OMP_NUM_THREADS=4
    +
    +# Install the dependencies
    +python3 -m pip install transformers
    +
    +# Run the inference script in Eager mode
    +# using number of iterations as 1 just to show the torch profiler output
    +# but for the benchmarking, we used 1000 iterations.
    +python3 google_t5_small_text_translation.py -n 1 -m eager
    +
    +# Run the inference script in torch compile mode
    +python3 google_t5_small_text_translation.py -n 1 -m compile
    +
    + +

    On successful completion of the inference runs, the script prints the torch profiler output with the latency breakdown for the torch operators. The following is the sample output from torch profiler:

    + +
    # Torch profiler output for the eager mode run on c7g.xl (4vcpu)
    +------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
    +------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                aten::mm        40.71%      12.502ms        40.71%      12.502ms     130.229us            96  
    +         model_inference        26.44%       8.118ms       100.00%      30.708ms      30.708ms             1  
    +               aten::bmm         6.85%       2.102ms         9.47%       2.908ms      80.778us            36  
    +            aten::matmul         3.73%       1.146ms        57.26%      17.583ms     133.205us           132  
    +            aten::select         1.88%     576.000us         1.90%     583.000us       0.998us           584  
    +         aten::transpose         1.51%     464.000us         1.83%     563.000us       3.027us           186  
    +------------------------ ------------ ------------ ------------ ------------ ------------ -------------------
    +Self CPU time total: 30.708ms
    +
    +# Torch profiler output for the compile mode run for the same model on the same instance
    +---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
    +---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +        mkldnn::_linear_pointwise        37.98%       5.461ms        45.91%       6.602ms      68.771us            96  
    +            Torch-Compiled Region        29.56%       4.251ms        98.53%      14.168ms      14.168ms             1  
    +                        aten::bmm        14.90%       2.143ms        21.73%       3.124ms      86.778us            36  
    +                     aten::select         4.51%     648.000us         4.62%     665.000us       1.155us           576  
    +                       aten::view         3.29%     473.000us         3.29%     473.000us       1.642us           288  
    +                      aten::empty         2.53%     364.000us         2.53%     364.000us       3.165us           115  
    +--------------------------------- ------------ ------------ ------------ ------------ ------------ --------------------
    +Self CPU time total: 14.379ms
    +
    + +

    Technical deep dive: What are the challenges and optimization details

    + +

    Underpinning torch.compile are new technologies – TorchDynamo, AOTDispatcher, and TorchInductor.

    + +

    TorchDynamo captures PyTorch programs safely using Python Frame Evaluation Hooks
    +AOTDispatcher overloads PyTorch’s autograd engine as a tracing autodiff for generating ahead-of-time backward traces.
    +TorchInductor is a deep learning compiler that generates fast code for multiple accelerators and backends.

    + +

    The PyTorch compilation process source

    + +

    Image 3: The PyTorch compilation process

    + +

    When torch.compile is invoked, torch dynamo rewrites Python bytecode to extract sequences of PyTorch operations into an FX Graph, which is then compiled with inductor backend. For a typical inference scenario where the graph is frozen and gradient calculations are disabled, the inductor invokes platform specific optimizations like graph rewrite into more performant operators, operator fusion, and weights pre-packing.
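To get a feel for what an FX graph looks like, here is a side illustration using torch.fx symbolic tracing on a toy module (this is not the TorchDynamo capture path itself, just a convenient way to print a graph):

import torch
import torch.fx

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        return torch.nn.functional.gelu(self.linear(x))

traced = torch.fx.symbolic_trace(TinyModel())
print(traced.graph)   # lists placeholder, call_module, call_function and output nodes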

    + +

    However, on Graviton3, the inductor wasn’t able to perform any of those optimizations because there was no aarch64 backend defined. To fix this, we extended the inductor’s FX passes to pick oneDNN operators for linear layer compilation on Graviton3 processors with ACL backend. The code snippet for this follows:

    + +
packed_weight_op = (
+    mkldnn._reorder_linear_weight
+    if (is_bf16_weight or mkldnn._is_mkldnn_acl_supported())
+    else ...  # non-ACL branch elided in this excerpt
+)
+
    +packed_linear_inputs: Tuple[Any, ...] = (input, packed_weight_node)
    +if is_bf16_weight or mkldnn._is_mkldnn_acl_supported():
    +    packed_linear_inputs += (bias, "none", [], "")
    +    packed_linear_op = mkldnn._linear_pointwise.default
    +
    + +

After this was done, the FX pass was successful in compiling the matmul operators to linear_pointwise. The following snippet highlights the matmul operator in the original model:

    + +
     %attention_scores   : [num_users=1] = call_function[target=torch.matmul](args = (%query_layer, %transpose), kwargs = {})
    + %attention_scores_1 : [num_users=1] = call_function[target=operator.truediv](args = (%attention_scores, 8.0), kwargs = {})
    + %attention_scores_2 : [num_users=1] = call_function[target=operator.add](args = (%attention_scores_1, %extended_attention_mask_3), kwargs = {})
    +
    + +

    The following snippet highlights the linear_pointwise operator in the compiled graph:

    + +
    %_linear_pointwise_default_140 : [num_users=2] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%add_7, %_frozen_param278, %_frozen_param16, none, [], ), kwargs = {})
    +%mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.5), kwargs = {})
    +%mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.7071067811865476), kwargs = {})
    +%erf   : [num_users=1] = call_function[target=torch.ops.aten.erf.default](args = (%mul_6,), kwargs = {})
    +%add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%erf, 1), kwargs = {})
    +
    + +

This completes the torch inductor changes required to compile the graph into optimized operators on AWS Graviton3 processors. Next comes the actual inference where the compiled graph is dispatched to be run. oneDNN with ACL was the backend we chose during the inductor compilation, so the new operators were dispatched to oneDNN as expected, for example, mkldnn._linear_pointwise. However, due to gaps in oneDNN ACL primitives, the operators were run with C++ reference kernels instead of the optimized ACL kernels. Hence, the compile performance was still significantly behind the eager mode performance.

    + +

    There were mainly three areas where oneDNN ACL primitives lack support for torch.compile mode. The following section talks about them in detail.

    + +

    1. ACL primitives didn’t have support for weights in blocked layout

    + +

    ACL primitives originally designed for eager mode supported weights only in the standard channels last (NHWC) format, without any pre-packing. Whereas weights pre-packing into blocked layout is one of the main optimizations in the inductor compilation passes where the weights are reordered into blocks specific to the runtime platform. This avoids the redundant and on-the-fly reorders when running the General Matrix Multiplication (GEMM), which otherwise would be the bottleneck for inference performance. But the ACL primitives didn’t have support for blocked layout and hence the operators were run with oneDNN C++ reference kernels instead.

    + +

    2. Mixed precision primitives weren’t supported in oneDNN

    + +

    AWS Graviton3 processors support bfloat16 MMLA instructions which can be used to accelerate fp32 inference with bfloat16 GEMM as a mixed precision compute. ACL supports bfloat16 mixed precision GEMM kernels, and are integrated into oneDNN as a fast math compute option for the existing fp32 operators. However, the fast math approach didn’t work for compile mode because of weights pre-packing optimization. The compile mode requires explicit mixed precision primitive implementation in oneDNN in order to use bfloat16 acceleration.

    + +

    3. ACL primitives didn’t support fused kernels for some of the activation functions

    + +

In eager mode, operators are dispatched individually, since each operator is run on its own as soon as it's reached. In compile mode, by contrast, operator fusion is another important optimization, where operators are fused together for runtime efficiency. For example, the Gaussian Error Linear Unit (GELU) is one of the most widely used activation functions in transformer-based neural network architectures, so it's typical to have a linear layer (with matrix multiplications) followed by a GELU activation. As part of compiling the model into efficient operators, torch inductor fuses the matmul and GELU into a single linear_pointwise + gelu operator. However, the oneDNN ACL primitives didn't have support for fused kernels with GELU.

    + +

    We addressed these gaps by extending oneDNN primitives to handle the additional layouts and new primitive definitions. The following sections talk about the optimizations in detail.

    + +

    Optimization 1: Extended ACL primitives to accept weight tensors in blocked layout

    + +

We extended the ACL primitives to accept weight tensors in blocked layout in addition to the standard NHWC format. The code snippet for this is as follows:

    + +
    const bool is_weights_md_format_ok
    +                    = utils::one_of(weights_format_kind_received,
    +                      format_kind::any, format_kind::blocked);
    +
    +
    +const memory_desc_t weights_md_received = weights_md_;
    +acl_utils::reorder_to_weight_format(aip.wei_tensor_info,
    +             weights_md_, expected_weight_format, inner_dim, o_dim,
    +             remaining_dims, {});
    +
    +ACL_CHECK_SUPPORT(
    +     (weights_format_kind_received == format_kind::blocked)
    +      && !(dnnl_memory_desc_equal(
    +      &weights_md_received, &weights_md_)),
    +      "specified blocked format not supported by ACL, use "
    +      "format_kind_t::any to find a supported blocked format for "
    +      "your platform");
    +
    + +

    Optimization 2: Defined new ACL primitives to handle mixed precision operators (weights in bfloat16 and activations in fp32)

    + +

    We defined mixed precision primitive definitions and updated the existing oneDNN ACL fp32 primitives to handle bfloat16 tensors.

    + +
     /* With graph compilation, we are able to reorder and pre-pack the weights during the model load
    +  * and compilation phase itself so that redundant and on-the-fly reorders can be avoided.
    +  * This primitive definition is to support gemm fastmath mode for the compile scenario where src is
    +  * in fp32 and weights are in bf16
    +  */
    + {{forward, f32, bf16, f32}, {
    +    CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t)
    +    nullptr,
    + }},
    +
    + +

    Optimization 3: Disabled operator fusion pass in torch inductor

    + +

We bypassed the operator fusion pass in torch inductor so that the compiled graph doesn't contain GELU-fused operators. This is a temporary workaround to enable ACL kernels in torch.compile; work is in progress to enable the operator fusion pass in future PyTorch releases. With this workaround, we were able to successfully dispatch the linear layer to ACL. As shown in the following torch.profiler output, the aten::addmm (one of the variants of the matmul operator) and aten::gelu operators in the original model (highlighted in Image 4) were compiled to mkldnn::_linear_pointwise without gelu operator fusion (highlighted in Image 5).

    + +
    ---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
    +---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                aten::addmm        73.32%      46.543ms        74.49%      47.287ms     647.767us            73  
    +            model_inference         9.92%       6.296ms       100.00%      63.479ms      63.479ms             1  
    +                  aten::bmm         4.37%       2.776ms         5.46%       3.467ms     144.458us            24  
    +                aten::copy_         1.74%       1.102ms         1.74%       1.102ms       8.103us           136  
    +                 aten::gelu         1.50%     950.000us         1.50%     950.000us      79.167us            12  
    +
    + +

    Image 4: torch.profiler output for Hugging Face bert base model inference in Eager mode, showing addmm and gelu operators

    +
     
    + +
    -----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
    +-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                            mkldnn::_linear_pointwise        53.61%      15.529ms        57.53%      16.665ms     228.288us            73  
    +                                Torch-Compiled Region        36.95%      10.705ms        99.31%      28.769ms      28.769ms             1  
    +    aten::_scaled_dot_product_flash_attention_for_cpu         3.67%       1.064ms         4.43%       1.284ms     107.000us            12  
    +                                           aten::view         1.97%     572.000us         1.97%     572.000us       2.509us           228  
    +                                          aten::empty         1.38%     399.000us         1.38%     399.000us       3.270us           122 
    +
    + +

    Image 5: torch.profiler output for Hugging Face Bert base model inference in torch.compile mode, showing linear_pointwise operator without gelu fusion

    + +

Lastly, the gelu operator was compiled into erf (the error function) and dispatched to the inductor auto-vectorization backend. The following snippets show the erf operator in the compiled graph and it being run via libm.so.

    + +
    %_linear_pointwise_default_140 : [num_users=2] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%add_7, %_frozen_param278, %_frozen_param16, none, [], ), kwargs = {})
    +%mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.5), kwargs = {})
    +%mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.7071067811865476), kwargs = {})
    +%erf   : [num_users=1] = call_function[target=torch.ops.aten.erf.default](args = (%mul_6,), kwargs = {})
    +%add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%erf, 1), kwargs = {})
    +
    + +

    Image 6: snippet after post grad pass showing erf function in the compiled graph

    +
     
    + +
         0.82%     0.40%  python3  libm.so.6            [.] erff32
    +     0.05%     0.00%  python3  libtorch_python.so   [.] torch::autograd::THPVariable_erf
    +     0.05%     0.00%  python3  libtorch_cpu.so      [.] at::_ops::erf::call
    +
    + +

    Image 7: Linux perf report showing erf dispatch to libm.so

    + +
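As a quick sanity check, the erf-based graph in Image 6 is just the exact form of GELU written out, and it can be reproduced in a couple of lines of plain PyTorch. This is purely illustrative and not part of the Graviton changes:

import torch

x = torch.randn(16)
# Exact GELU written out with erf; this matches the 0.5 * x * (1 + erf(x / sqrt(2)))
# pattern shown in the post-grad graph of Image 6.
gelu_decomposed = 0.5 * x * (1.0 + torch.erf(x * 0.7071067811865476))
torch.testing.assert_close(gelu_decomposed, torch.nn.functional.gelu(x))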

    With this work, we were able to optimize torch.compile performance on Graviton3 processors by using inductor graph compilation along with the oneDNN+ACL backend.

    + +

    TorchBench enhancements

    + +

To demonstrate the torch.compile performance improvements on AWS Graviton3 processors, we extended the TorchBench framework with a new argument that enables graph freezing and weights pre-packing and disables torch autograd for the eval test mode. The code snippet for this is as follows:

    + +
    parser.add_argument(
    + "—freeze_prepack_weights",
    + action='store_true',
    + help="set to freeze the graph and prepack weights",
    + )
    +
    +if args.freeze_prepack_weights:
    + torch._inductor.config.freezing=True
    + torch._inductor.config.cpp.weight_prepack=True
    +
    + +

    Image 8: Added freeze_prepack_weights option for torchdynamo backend in TorchBench to demonstrate torch.compile performance improvements on AWS Graviton3 processors

    + +

We have upstreamed all of these optimizations, and starting with PyTorch 2.3.1 they are available in the torch Python wheels and the AWS Graviton PyTorch DLC (Deep Learning Containers).

    + +

    What’s next

    + +

Next, we're extending the torch inductor CPU backend support to compile the Llama model, and adding support for fused GEMM kernels to enable the torch inductor operator fusion optimization on AWS Graviton3 processors.

    + +

    Conclusion

    + +

    In this tutorial, we covered how we optimized torch.compile performance on AWS Graviton3-based EC2 instances, how to use the optimizations to improve PyTorch model inference performance, and demonstrated the resulting speedups. We hope that you will give it a try! If you need any support with ML software on Graviton, please open an issue on the AWS Graviton Technical Guide GitHub.

    + +

    Acknowledgements

    + +

    We would like to thank the PyTorch community for the baseline torch.compile framework and their continued efforts to optimize it further.

    + +

    References: https://pytorch.org/assets/pytorch2-2.pdf

    + +

    Author

    + +

    Sunita Nadampalli is a Software Development Manager and AI/ML expert at AWS. She leads AWS Graviton software performance optimizations for AI/ML and HPC workloads. She is passionate about open source software development and delivering high-performance and sustainable software solutions for SoCs based on the Arm ISA.

    + +
diff --git a/blog/accelerating-gemms-triton/index.html b/blog/accelerating-gemms-triton/index.html new file mode 100644 index 000000000000..cdce705fc69a --- /dev/null +++ b/blog/accelerating-gemms-triton/index.html @@ -0,0 +1,753 @@

Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton | PyTorch

by Meta: Less Wright, IBM: Adnan Hoque

    +

    2D block quantization for Float8 (FP8) holds the promise of improving the accuracy of Float8 quantization while also accelerating GEMM’s for both inference and training. In this blog, we showcase advances using Triton for the two main phases involved in doing block quantized Float8 GEMMs.

    + +

    For the incoming quantization of A and B tensors from high precision (BFloat16) to Float8, we showcase GridQuant which leverages a mini-grid stride loop style of processing with nearly 2x speedups (99.31%) over a current 2D block quantization kernel.

    + +

    For the Float8 GEMM, we showcase 3 new developments for Triton - Warp Specialization, TMA and a persistent kernel to effectively create a cooperative style kernel (an alternative to the Ping-Pong schedule). As a result, we achieve ~1.2x speedup over our best-performing SplitK kernel from last year.

    + +


    + +

    Figure 1: A comparison of the 2D quantization speedup over a current baseline, across a range of sizes. (lower-is-better)

    + +

    Why 2D Blockwise Quantization for FP8?

    + +

    Generally speaking, the accuracy of fp8 quantization improves as we move from tensor-wise scaling, to row-wise scaling, to 2D block-wise, and then finally to column-wise scaling. This is because features for a given token are stored in each column, and thus each column in that tensor is more similarly scaled.

    + +

To minimize the number of outliers of a given numerical set, we want to find commonality so that numbers are being scaled in a similar fashion. For transformers, this means column-based quantization could be optimal. However, columnar memory access is massively inefficient because the data is laid out in memory in a row-wise contiguous manner; column-wise loading would require memory accesses with large strides to pull isolated values, contrary to the core tenets of efficient memory access.

    + +

However, 2D block quantization is the next best option, as it captures some aspects of columnar scaling while remaining memory efficient to load, since we can vectorize these loads with 2D vectorization. Therefore, we want to find ways to improve the speed of 2D block quantization, which is why we developed the GridQuant kernel.

    + +

    For the quantization process, we need to 2D block quantize both the higher precision BF16 incoming tensors (A = input activations, B = weights) and then proceed to do the Float8 matmul using the quantized tensors and their 2D block scaling values, and return an output C tensor in BF16.

    + +

    How does GridQuant improve 2D block quantization efficiency?

    + +

The GridQuant kernel has several improvements over the initial baseline quantization implementation, which was a standard tile-based implementation. It makes two full passes through the entire input tensor and works as follows:

    + +

    Phase 1 - Determine the max abs value for each 256x256 sub block from the incoming high precision tensor.

    + +

    1 - We divide the BF16 tensor into 256 x 256 sub blocks. This quantization size is configurable, but 256x256 is the default as it provides a blend of quantization precision and processing efficiency.

    + +

    2 - Each 256x256 sub-block is subdivided into 64 sub-blocks arranged in an 8x8 pattern, with each sub-block processing a 32x32 element block. A single warp (32 threads) handles the computation for all elements within its assigned 32x32 block.

    + +

    3 - We declare a 32x32 max_vals array in shared memory. This will store the current max val for each position i,j as the 2d vector block moves across the entire 256x256 sub_block.

    + +

    This is an important improvement because it means we can do vectorized, rather than scalar, updates to the max vals scoring system and allows for much more efficient updates.

    + +


    + +

    Figure 2: The Fractionalized layout of an incoming tensor - a grid of 256x256 is created across the tensor, and within each 256x256 block, it is further refined into 32x32 sub blocks. A 32x32 max_vals is created for each 256x256 block.

    + +

    4 - Each warp processes a 32x32 chunk and because we are using 4 warps, we ensure the Triton compiler can pipeline the memory loads for the next 32x32 chunk with the actual processing of absmax calculations for the current chunk. This ensures that the warp scheduler is able to toggle warps loading data with those processing and keep the SM continuously busy.

    + +

    5 - The 32x32 2D vector block processing is moved across and through the entire 256x256 subblock in a grid stride looping fashion, with each warp updating the shared memory 32x32 max_vals against its current 32x32 sub-block. Thus max_vals[i,j] holds the latest max value as each sub block is processed.

    + +

After completing the 256x256 block grid stride loop, the max_vals matrix is then itself reduced to find the absolute single max value for that entire 256x256 block.

    + +

    This gives us our final scaling factor value for this 2D 256 x 256 block.

    + +

    Phase 2 - Quantize the 256x256 block values to Float8, by using the single max value scaling factor found during Phase 1.

    + +

    Next, we make a second pass through the entire 256x256 block to rescale all the numbers using this max value found in phase 1 to convert them to the float 8 format.

    + +

    Because we know we need to do 2 complete passes, for the loads during the phase 1 portion we instruct the triton compiler to keep these values in cache at higher priority (evict policy = last).

    + +

    This means that during the second pass, we can get a high hit rate from the L2 cache which provides much faster memory access than going all the way to HBM.

    + +
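To illustrate the cache hint, here is a minimal, standalone Triton kernel (not the GridQuant kernel itself) that computes a per-block absmax and marks its loads with evict_last so that the same cache lines are more likely to still be resident in L2 for a second pass:

import torch
import triton
import triton.language as tl

@triton.jit
def block_absmax_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n_elements
    # Hint that these cache lines should be kept resident for a later pass.
    x = tl.load(x_ptr + offs, mask=mask, other=0.0, eviction_policy="evict_last")
    tl.store(out_ptr + pid, tl.max(tl.abs(x), axis=0))

x = torch.randn(1 << 20, device="cuda")
out = torch.empty(1 << 10, device="cuda")
block_absmax_kernel[(1 << 10,)](x, out, x.numel(), BLOCK_SIZE=1024)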

With the 2D block quantization processing complete for all 256x256 blocks, we can return the new Float8 quantized tensor along with its scaling factor matrix, which we'll use in the next phase of the GEMM processing. This input quantization is repeated for the second input tensor as well, meaning we end up with A_Float8, A_scaling_matrix, B_Float8, and B_scaling_matrix.

    + +
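Putting the two phases together, a plain-PyTorch (non-Triton) reference of what the quantization produces, namely one scale per 256x256 block plus an FP8 tensor, might look like the sketch below. It assumes dimensions divisible by 256, a CUDA device, and a PyTorch build with the float8_e4m3fn dtype:

import torch

def block_quant_fp8(x: torch.Tensor, block: int = 256):
    # Reference (slow) 2D block quantization: one scale per block x block tile.
    M, N = x.shape
    xb = x.reshape(M // block, block, N // block, block)
    # Phase 1: absolute max per tile.
    amax = xb.abs().amax(dim=(1, 3), keepdim=True).float()
    scale = amax / torch.finfo(torch.float8_e4m3fn).max
    # Phase 2: rescale and cast to FP8.
    x_fp8 = (xb / scale).to(torch.float8_e4m3fn).reshape(M, N)
    return x_fp8, scale.squeeze(1).squeeze(-1)

a = torch.randn(512, 1024, dtype=torch.bfloat16, device="cuda")
a_fp8, a_scale = block_quant_fp8(a)   # a_scale has shape (2, 4)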

    GridQuant - GEMM Kernel

    + +

    The GridQuant-GEMM kernel takes in the four outputs from the quantization above for processing. Our high-performance GEMM kernel features several new Triton developments to achieve SOTA performance for matrix shape profiles relevant in LLM inference during the decoding phase.

    + +

    These new features are commonly found in Hopper optimized kernels like FlashAttention-3 and Machete, built using CUTLASS 3.x. Here, we discuss these methods and showcase the performance benefits that can be achieved leveraging them in Triton.

    + +

    Tensor Memory Accelerator (TMA)

    + +

The TMA unit on NVIDIA Hopper GPUs is a dedicated hardware unit for load/store operations that act on the multidimensional tensors commonly found in AI workloads. This has several important benefits.

    + +

Transferring data between global and shared memory can occur without involving other resources on the GPU SMs, freeing up registers and CUDA Cores. Further, when used in warp-specialized kernels, light-weight TMA operations can be assigned to a producer warp, allowing for a high degree of overlap of memory transfers and computation.

    + +

    For more details on how TMA is used in Triton see our previous blog.

    + +

    Warp-Specialization (Cooperative Persistent Kernel Design)

    + +

    Warp Specialization is a technique to leverage pipeline parallelism on GPUs. This experimental feature enables the expression of specialized threads through a tl.async_task API, allowing the user to specify how operations in a Triton program should be “split” amongst warps. The cooperative Triton kernel performs different types of computation and loads that each take place on their own dedicated hardware. Having dedicated hardware for each of these specialized tasks makes it possible to realize parallelism efficiently for operations that have no data dependency.

    + +


    + +

    Figure 3. Logical view of dedicated HW units in NVIDIA H100 SM

    + +

    The operations in our kernel that create the pipeline are:

    + +

    A - Load per-block scale from GMEM into SMEM (cp.async engine)

    + +

    B - Load activation (A) and Weight (B) tiles from GMEM into SMEM (TMA)

    + +

    C - Matrix-Multiplication of A tile and B tile = C tile (Tensor Core)

    + +

    D - Scale C tile with per-block scale from A and per-block scale from B (CUDA core)

    + +

These steps can be assigned to "tasks" which are carried out by specialized warp groups in a threadblock. The cooperative strategy has three warp groups: a producer warp group that is responsible for feeding the compute units, and two consumer warp groups that perform the computation. The two consumer warp groups each work on half of the same output tile.

    + +


    + +

    Figure 4. Warp-Specialized Persistent Cooperative kernel (source: NVIDIA)

    + +

    This is different from the ping-pong schedule we discussed in our previous blog, where each consumer warp group works on different output tiles. We note that the Tensor Core ops are not overlapped with the epilogue computation. Decreased utilization of the Tensor Core pipeline during the epilogue phase of the computation will reduce register pressure for the consumer warp group compared to ping-pong which always keeps the Tensor Core busy, thus allowing for larger tile sizes.

    + +

Lastly, our kernel is designed to be persistent when the grid size exceeds the number of available compute units on H100 GPUs (132). Persistent kernels remain active on the GPU for an extended period and compute multiple output tiles during their lifetime. Our kernel leverages TMA's asynchronous shared-to-global memory stores, continuing to work on the next output tile rather than incurring the cost of scheduling multiple threadblocks.

    + +

    Microbenchmarks

    + +


    + +

    Figure 5: Latency comparison (us) of Gridquant-GEMM vs our best performing SplitK kernel for small batch regime and Llama3 8192 N,K sizing. (lower-is-better)

    + +

    The Warp-Specialized Triton kernel achieves SOTA performance at the above small-M and square matrix shapes, achieving a nearly 1.2x speedup over the SplitK Triton kernel, which was the previous best performing strategy for Triton GEMMs in this low arithmetic intensity regime. For future work, we plan to tune our kernel performance for the medium-to-large M regime and non-square matrices.

    + +

    Conclusion and Future Work

    + +

Future work includes benchmarking GridQuant on end-to-end workflows. In addition, we plan to run more extensive benchmarks on non-square (rectangular) matrices as well as medium-to-large M sizes. Finally, we plan to explore ping-pong style warp specialization in Triton versus the current cooperative implementation.

    + +
diff --git a/blog/accelerating-generative-ai-2/index.html b/blog/accelerating-generative-ai-2/index.html new file mode 100644 index 000000000000..016bb9edaa7f --- /dev/null +++ b/blog/accelerating-generative-ai-2/index.html @@ -0,0 +1,889 @@

Accelerating Generative AI with PyTorch II: GPT, Fast | PyTorch

by Team PyTorch

    +

    This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In this blog we’ll focus on LLM optimization.

    + +

    Over the past year, generative AI use cases have exploded in popularity. Text generation has been one particularly popular area, with lots of innovation among open-source projects such as llama.cpp, vLLM, and MLC-LLM.

    + +

    While these projects are performant, they often come with tradeoffs in ease of use, such as requiring model conversion to specific formats or building and shipping new dependencies. This begs the question: how fast can we run transformer inference with only pure, native PyTorch?

    + +

    As announced during our recent PyTorch Developer Conference, the PyTorch team wrote a from-scratch LLM almost 10x faster than baseline, with no loss of accuracy, all using native PyTorch optimizations. We leverage a breadth of optimizations including:

• torch.compile (with a static kv-cache)
• int8 and int4 weight-only quantization (the latter with GPTQ)
• Speculative decoding
• Tensor parallelism

    And, even better, we can do it in less than 1000 lines of native PyTorch code.

    + +

    If this excites you enough to jump straight into the code, check it out at https://github.com/pytorch-labs/gpt-fast!

    + +

    Screen recording

    + +

    Note: We will be focusing on latency (i.e. batch size=1) for all of these benchmarks. Unless otherwise specified, all benchmarks are run on an A100-80GB, power limited to 330W.

    + +

    Starting Point (25.5 tok/s)

    + +

    Let’s start off with an extremely basic and simple implementation.

    + +

    simple implementation

    + +

    Sadly, this does not perform very well. But why? Looking at a trace reveals the answer - it’s heavily CPU overhead bound! What this means is that our CPU is not able to tell the GPU what to do fast enough for the GPU to be fully utilized.

    + +

    trace

    + +

    Imagine the GPU as this super massive factory with a ridiculous amount of compute available. Then, imagine the CPU as some messenger shuttling instructions back and forth to the GPU. Remember, in large scale deep learning systems, the GPU is responsible for doing 100% of the work! In such systems, the only role of the CPU is to tell the GPU what work it should be doing.

    + +

    factory

    + +

    So, the CPU runs over and tells the GPU to do an “add”, but by the time the CPU can give the GPU another chunk of work, the GPU has long finished the previous chunk of work.

    + +

    Despite the fact that the GPU needs to perform thousands of computations while the CPU only needs to do orchestration work, this is surprisingly common! There’s a variety of reasons for this, ranging from the fact that the CPU is likely running some single-threaded Python to the fact that GPUs are just incredibly fast nowadays.

    + +

    Regardless of the reason, we now find ourselves in the overhead-bound regime. So, what can we do? One, we could rewrite our implementation in C++, perhaps even eschew frameworks entirely and write raw CUDA. Or…. we could just send more work to the GPU at once.

    + +

    factory

    + +

    By just sending a massive chunk of work at once, we can keep our GPU busy! Although during training, this may just be accomplished by increasing your batch size, how do we do this during inference?

    + +

    Enter torch.compile.

    + +

    Step 1: Reducing CPU overhead through torch.compile and a static kv-cache (107.0 tok/s)

    + +

    Torch.compile allows us to capture a larger region into a single compiled region, and particularly when run with mode=”reduce-overhead”, is very effective at reducing CPU overhead. Here, we also specify fullgraph=True, which validates that there are no “graph breaks” in your model (i.e. portions that torch.compile cannot compile). In other words, it ensures that torch.compile is running to its fullest potential.

    + +

    To apply it, we simply wrap a function (or a module) with it.

    + +
    torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
    +
    + +

    However, there are a couple of nuances here that make it somewhat nontrivial for folks to get significant performance boosts from applying torch.compile to text generation.

    + +

    The first obstacle is the kv-cache. The kv-cache is an inference-time optimization that caches the activations computed for the previous tokens (see here for a more in-depth explanation). However, as we generate more tokens, the “logical length” of the kv-cache grows. This is problematic for two reasons. One is that reallocating (and copying!) the kv-cache every time the cache grows is simply expensive. The other one is that this dynamism makes it harder to reduce the overhead, as we are no longer able to leverage approaches like cudagraphs.

    + +

    To resolve this, we use a “static” kv-cache, which means that we statically allocate the maximum size of the kv-cache, and then mask out the unused values in the attention portion of the computation.

    + +
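A minimal sketch of what a static kv-cache can look like (modeled loosely on the approach in gpt-fast, with hypothetical shapes): the buffers are allocated once at the maximum sequence length, and each decode step writes in place at the current positions, so tensor shapes never change between steps.

import torch

class StaticKVCache(torch.nn.Module):
    def __init__(self, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16):
        super().__init__()
        shape = (1, n_heads, max_seq_len, head_dim)
        self.register_buffer("k_cache", torch.zeros(shape, dtype=dtype))
        self.register_buffer("v_cache", torch.zeros(shape, dtype=dtype))

    def update(self, input_pos, k_new, v_new):
        # input_pos: (S,) positions being written; k_new/v_new: (1, n_heads, S, head_dim)
        self.k_cache[:, :, input_pos] = k_new
        self.v_cache[:, :, input_pos] = v_new
        # Fixed-size tensors are returned; unused slots get masked in the attention.
        return self.k_cache, self.v_cache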

    code

    + +

    The second obstacle is the prefill phase. Transformer text generation is best thought of as a two phase process: 1. The prefill where the entire prompt is processed, and 2. Decoding where each token is generated autoregressively.

    + +

    Although decoding can be made entirely static once the kv-cache is made static, the prefill stage still requires significantly more dynamism, due to having a variable prompt length. Thus, we actually need to compile the two stages with separate compilation strategies.

    + +
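One way to express the two strategies (a sketch; prefill and decode_one_token stand in for the corresponding functions in gpt-fast):

import torch

def prefill(model, tokens, input_pos):            # processes the whole prompt at once
    return model(tokens, input_pos)

def decode_one_token(model, token, input_pos):    # generates a single next token
    return model(token, input_pos)

# Decode is fully static once the kv-cache is static, so it can use CUDA graphs
# via "reduce-overhead"; prefill sees variable prompt lengths, so compile it with
# dynamic shapes instead.
decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
prefill = torch.compile(prefill, dynamic=True)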

    compile

    + +

    Although these details are a bit tricky, the actual implementation is not very difficult at all (see gpt-fast)! And the performance boost is dramatic.

    + +

    chart

    + +

    All of a sudden, our performance improves by more than 4x! Such performance gains are often common when one’s workload is overhead bound.

    + +

    Sidenote: How is torch.compile helping?

    + +

It is worth disentangling how exactly torch.compile is improving performance. There are two main factors behind torch.compile's performance gains.

    + +

The first factor, as mentioned above, is overhead reduction. torch.compile is able to reduce overhead through a variety of optimizations, but one of the most effective ones is called CUDAGraphs. torch.compile applies this automatically for you when "reduce-overhead" is set, saving the extra work and code you would need to write to do this yourself manually without torch.compile.

    + +

    The second factor, however, is that torch.compile simply generates faster kernels. In the decoding benchmark above, torch.compile actually generates every single kernel from scratch, including both the matrix multiplications and the attention! And even cooler, these kernels are actually faster than the built in alternatives (CuBLAS and FlashAttention2)!

    + +

    This may sound implausible to many of you, considering how hard it is to write efficient matrix multiplication/attention kernels, and how much manpower has been put into CuBLAS and FlashAttention. The key here, however, is that transformer decoding has very unusual computational properties. In particular, because of the KV-cache, for BS=1 every single matrix multiplication in a transformer is actually a matrix vector multiplication.

    + +

    This means that the computations are completely memory-bandwidth bound, and as such, are well within the range of compilers to automatically generate. And in fact, when we benchmark torch.compile’s matrix-vector multiplications against CuBLAS, we find that torch.compile’s kernels are actually quite a bit faster!

    + +

    code

    + +

    code

    + +

    Step 2: Alleviating memory bandwidth bottleneck through int8 weight-only quantization (157.4 tok/s)

    + +

    So, given that we’ve already seen massive speedups from applying torch.compile, is it possible to do even better? One way to think about this problem is to compute how close we are to the theoretical peak. In this case, the largest bottleneck is the cost of loading the weights from GPU global memory to registers. In other words, each forward pass requires us to “touch” every single parameter on the GPU. So, how fast can we theoretically “touch” every single parameter in a model?

    + +

    weights

    + +

    To measure this, we can use Model Bandwidth Utilization (MBU). This measures what percentage of our memory bandwidth we’re able to use during inference.

    + +

    Computing it is pretty simple. We simply take the total size of our model (# params * bytes per param) and multiply it by the number of inferences we can do per second. Then, we divide this by the peak bandwidth of the GPU to get our MBU.

    + +

    MBU

    + +

    For example, for our above case, we have a 7B parameter model. Each parameter is stored in fp16 (2 bytes per parameter), and we achieved 107 tokens/s. Finally, our A100-80GB has a theoretical 2 TB/s of memory bandwidth.

    + +

    MBU

    + +

Putting this all together, we get 72% MBU! This is quite good, considering that even just copying memory struggles to break 85%.

    + +
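Spelled out as a quick calculation (the ~6.7B parameter count for Llama-7B and the ~2 TB/s peak bandwidth are approximations):

params = 6.74e9              # approximate parameter count of Llama-7B
bytes_per_param = 2          # fp16
tokens_per_second = 107
peak_bandwidth = 2.0e12      # A100-80GB, roughly 2 TB/s

mbu = params * bytes_per_param * tokens_per_second / peak_bandwidth
print(f"MBU ~= {mbu:.0%}")   # ~72%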

But… it does mean that we're pretty close to the theoretical limit here, and that we're clearly bottlenecked on just loading our weights from memory. It doesn't matter what we do - without changing the problem statement in some manner, we might only be able to eke out another 10% in performance.

    + +

    Let’s take another look at the above equation. We can’t really change the number of parameters in our model. We can’t really change the memory bandwidth of our GPU (well, without paying more money). But, we can change how many bytes each parameter is stored in!

    + +

    MBU

    + +

    Thus, we arrive at our next technique - int8 quantization. The idea here is simple. If loading our weights from memory is our main bottleneck, why don’t we just make the weights smaller?

    + +

    MBU

    + +

    Note that this is quantizing only the weights - the computation itself is still done in bf16. This makes this form of quantization easy to apply with very little to no accuracy degradation.

    + +
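To make the idea concrete, here is a minimal sketch of int8 weight-only quantization for a single linear layer: the weights get one floating-point scale per output row and are stored in int8, then dequantized back to bf16 on the fly so that the matmul itself still runs in bf16. Without torch.compile fusing that dequant into the matmul, this naive version is actually slower, which is exactly what the light-blue line below shows.

import torch
import torch.nn.functional as F

w = torch.randn(4096, 4096, dtype=torch.bfloat16)
scale = w.abs().amax(dim=1, keepdim=True) / 127.0        # one scale per output row
w_int8 = torch.round(w / scale).to(torch.int8)           # 2x smaller than bf16

def linear_int8(x, w_int8, scale, bias=None):
    # Dequantize on the fly; the computation itself stays in bf16.
    return F.linear(x, w_int8.to(x.dtype) * scale, bias)

x = torch.randn(1, 4096, dtype=torch.bfloat16)
y = linear_int8(x, w_int8, scale)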

    Moreover, torch.compile can also easily generate efficient code for int8 quantization. Let’s look again at the above benchmark, this time with int8 weight-only quantization included.

    + +

    code

    + +

    code

    + +

    As you can see from the dark blue line (torch.compile + int8), there is a significant performance improvement when using torch.compile + int8 weight-only quantization! Moreover, the light-blue line (no torch.compile + int8) is actually much worse than even the fp16 performance! This is because in order to take advantage of the perf benefits of int8 quantization, we need the kernels to be fused. This shows one of the benefits of torch.compile - these kernels can be automatically generated for the user!

    + +

    Applying int8 quantization to our model, we see a nice 50% performance improvement, bringing us up to 157.4 tokens/s!

    + +

    chart

    + +

    Step 3: Reframing the problem using speculative decoding

    + +

    Even after using techniques like quantization, we’re still faced with another problem. In order to generate 100 tokens, we must load our weights 100 times.

    + +

    diagram

    + +

    Even if the weights are quantized, we still must load our weights over and over, once for each token we generate! Is there any way around this?

    + +

    At first glance, the answer might seem like no - there’s a strict serial dependency in our autoregressive generation. However, as it turns out, by utilizing speculative decoding, we’re able to break this strict serial dependency and obtain speedups!

    + +

    engineers

    + +

    Imagine you had a senior engineer (called Verity), who makes the right technical decisions but is rather slow at writing code. However, you also have a junior engineer (called Drake), who doesn’t always make the right technical decisions but can write code much faster (and cheaper!) than Verity. How can we take advantage of Drake (the junior engineer) to write code faster while ensuring that we are still making the right technical decisions?

    + +

    engineers

    + +

    First, Drake goes through the labor-intensive process of writing the code, making technical decisions along the way. Next, we give the code to Verity to review.

    + +

    engineers

    + +

    Upon reviewing the code, Verity might decide that the first 3 technical decisions Drake made are correct, but the last 2 need to be redone. So, Drake goes back, throws away his last 2 decisions, and restarts coding from there.

    + +

    Notably, although Verity (the senior engineer) has only looked at the code once, we are able to generate 3 pieces of validated code identical to what she would have written! Thus, assuming Verity is able to review the code faster than it would have taken her to write those 3 pieces herself, this approach comes out ahead.

    + +

In the context of transformer inference, Verity would be played by the larger model whose outputs we want for our task, called the verifier model. Similarly, Drake would be played by a smaller model that's able to generate text much faster than the larger model, called the draft model. So, we would generate 8 tokens using the draft model, and then process all 8 tokens in parallel using the verifier model, throwing out the ones that don't match.

    + +

    Like mentioned above, one crucial property of speculative decoding is that it does not change the quality of the output. As long as the time it takes for generating the tokens using the draft model + verifying the tokens is less than it would have taken to generate those tokens, we come out ahead.

    + +

    One of the great things about doing this all in native PyTorch is that this technique is actually really easy to implement! Here’s the entirety of the implementation, in about 50 lines of native PyTorch.

    + +

    code

    + +
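A compressed sketch of the greedy-acceptance version of this loop (draft_model and verifier_model are hypothetical callables returning next-token logits; the real gpt-fast implementation also handles kv-caches and sampling):

import torch

@torch.no_grad()
def speculative_step(draft_model, verifier_model, tokens, k=8):
    # 1. The draft model proposes k tokens autoregressively (cheap).
    draft = tokens
    for _ in range(k):
        next_tok = draft_model(draft)[:, -1].argmax(dim=-1, keepdim=True)
        draft = torch.cat([draft, next_tok], dim=1)

    # 2. The verifier scores all k proposals in a single parallel forward pass.
    verify = verifier_model(draft[:, :-1])[:, -k:].argmax(dim=-1)   # (1, k)
    proposed = draft[:, -k:]                                        # (1, k)

    # 3. Accept the longest matching prefix, plus one "free" token from the verifier.
    matches = (verify == proposed)[0].long()
    n_accept = int(matches.cumprod(dim=0).sum())
    return torch.cat([tokens, verify[:, : n_accept + 1]], dim=1)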

    Although speculative decoding guarantees that we have mathematically identical results compared to regular generation, it does have the property that the runtime performance varies depending on the generated text, as well as how aligned the draft and verifier model are. For example, when running CodeLlama-34B + CodeLlama-7B, we’re able to obtain a 2x boost in tokens/s for generating code. On the other hand, when using Llama-7B + TinyLlama-1B, we’re only able to obtain about a 1.3x boost in tokens/s.

    + +

    Sidenote: Running this on AMD

    + +

    Like mentioned above, every single kernel in decoding is generated from scratch by torch.compile, and is converted into OpenAI Triton. As AMD has a torch.compile backend (and also a Triton backend), we can simply go through all of the optimizations above… but on an AMD GPU! With int8 quantization, we’re able to achieve 102.5 tokens/s with one GCD (i.e. one half) of a MI250x!

    + +

    chart

    + +

    Step 4: Reducing the size of the weights even more with int4 quantization and GPTQ (202.1 tok/s)

    + +

    Of course, if reducing the weights down from 16 bits to 8 bits allows for speedups by reducing the number of bytes we need to load, reducing the weights down to 4 bits would result in even larger speedups!

    + +

    Unfortunately, when reducing weights down to 4-bits, the accuracy of the model starts to become a much larger concern. From our preliminary evals, we see that although using int8 weight-only quantization has no perceptible accuracy degradation, using int4 weight-only quantization does.

    + +

    table

    + +

    There are 2 main tricks we can use to limit the accuracy degradation of int4 quantization.

    + +

    The first one is to have a more granular scaling factor. One way to think about the scaling factor is that when we have a quantized tensor representation, it is on a sliding scale between a floating point tensor (each value has a scaling factor) and an integer tensor (no values have a scaling factor). For example, with int8 quantization, we had one scaling factor per row. If we want higher accuracy, however, we can change that to “one scaling factor per 32 elements”. We choose a group size of 32 to minimize accuracy degradation, and this is also a common choice among the community.

    + +
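A sketch of what "one scaling factor per 32 elements" means in practice (PyTorch has no native int4 dtype, so the values are kept in int8 here; real kernels pack two 4-bit values per byte):

import torch

group_size = 32
w = torch.randn(4096, 4096, dtype=torch.bfloat16)
wg = w.reshape(-1, group_size)                            # one row per group of 32 weights
scales = wg.abs().amax(dim=1, keepdim=True) / 7.0         # int4 range is [-8, 7]
w_int4 = torch.clamp(torch.round(wg / scales), -8, 7).to(torch.int8)
# Dequantizing with (w_int4 * scales) approximately reconstructs wg, group by group.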

The other one is to use a more advanced quantization strategy than simply rounding the weights. For example, approaches like GPTQ leverage example data in order to calibrate the weights more accurately. In this case, we prototype an implementation of GPTQ in the repository based on PyTorch's recently released torch.export.

    + +

    In addition, we need kernels that fuse int4 dequantize with the matrix vector multiplication. In this case, torch.compile is unfortunately not able to generate these kernels from scratch, so we leverage some handwritten CUDA kernels in PyTorch.

    + +

    These techniques require some additional work, but putting them all together results in even better performance!

    + +

    chart

    + +

    Step 5: Combining everything together (244.7 tok/s)

    + +

    Finally, we can compose all of the techniques together to achieve even better performance!

    + +

    chart

    + +

    Step 6: Using Tensor Parallelism

    + +

    So far, we’ve been restricting ourselves to minimizing latency while on a single GPU. In many settings, however, we have access to multiple GPUs. This allows us to improve our latency further!

    + +

    To get an intuitive sense of why this would allow us to improve our latency, let’s take a look at the prior equation for MBU, particularly the denominator. Running on multiple GPUs gives us access to more memory bandwidth, and thus, higher potential performance.

    + +

    MBU

    + +

    As for which parallelism strategy to pick, note that in order to reduce our latency for one example, we need to be able to leverage our memory bandwidth across more devices simultaneously. This means that we need to split the processing of one token across multiple devices. In other words, we need to use tensor parallelism.

    + +

    Luckily, PyTorch also provides low-level tools for tensor-parallelism that compose with torch.compile. We are also working on higher-level APIs for expressing tensor parallelism, stay tuned for those!

    + +

    However, even without a higher-level API, it’s actually still quite easy to add tensor parallelism. Our implementation comes in at 150 lines of code, and doesn’t require any model changes.

    + +
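The core idea, stripped down to a single linear layer (a sketch that assumes torch.distributed is already initialized across the participating GPUs; this is not the gpt-fast implementation itself): shard the weight along its input dimension, compute a partial matmul on every rank, and all-reduce the partial results.

import torch
import torch.distributed as dist

def row_parallel_linear(x_shard, w_shard):
    # x_shard: this rank's slice of the activation's feature dimension
    # w_shard: this rank's (out_features, in_features / world_size) weight slice
    partial = x_shard @ w_shard.t()                 # local partial result
    dist.all_reduce(partial, op=dist.ReduceOp.SUM)  # sum partials across all ranks
    return partial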

    code

    + +

    We are still able to take advantage of all the optimizations mentioned previously, which all can continue to compose with tensor parallelism. Combining these together, we’re able to serve Llama-70B at 55 tokens/s with int8 quantization!

    + +

    chart

    + +

    Conclusion

    + +

    Let’s take a look at what we’re able to accomplish.

    + +
1. Simplicity: Ignoring quantization, model.py (244 LOC) + generate.py (371 LOC) + tp.py (151 LOC) comes out to 766 LOC to implement fast inference + speculative decoding + tensor-parallelism.
2. Performance: With Llama-7B, we're able to use compile + int4 quant + speculative decoding to reach 241 tok/s. With Llama-70B, we're able to also throw in tensor-parallelism to reach 80 tok/s. These are both close to or surpassing SOTA performance numbers!
    + +

    PyTorch has always allowed for simplicity, ease of use, and flexibility. However, with torch.compile, we can throw in performance as well.

    + +

    The code can be found here: https://github.com/pytorch-labs/gpt-fast. We hope that the community finds it useful. Our goal with this repo is not to provide another library or framework for people to import. Instead, we encourage users to copy-paste, fork, and modify the code in the repo.

    + +

    Acknowledgements

    + +

    We would like to thank the vibrant open source community for their continual support of scaling LLMs, including:

    + +
• Lightning AI for supporting PyTorch and work in flash attention, int8 quantization, and LoRA fine-tuning.
• GGML for driving forward fast, on-device inference of LLMs
• Andrej Karpathy for spearheading simple, interpretable and fast LLM implementations
• MLC-LLM for pushing 4-bit quantization performance on heterogeneous hardware
    + +
diff --git a/blog/accelerating-generative-ai-3/index.html b/blog/accelerating-generative-ai-3/index.html new file mode 100644 index 000000000000..ff1463ec8868 --- /dev/null +++ b/blog/accelerating-generative-ai-3/index.html @@ -0,0 +1,920 @@

Accelerating Generative AI Part III: Diffusion, Fast | PyTorch

by Sayak Paul and Patrick von Platen (Hugging Face 🤗)

    +

This post is the third part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In part two, we showed how to accelerate Llama-7B by almost 10x using only native PyTorch optimizations. In this blog, we'll focus on speeding up text-to-image diffusion models by up to 3x.

    + +

    We will leverage an array of optimizations including:

    + +
• Running with the bfloat16 precision
• scaled_dot_product_attention (SDPA)
• torch.compile
• Combining q,k,v projections for attention computation
• Dynamic int8 quantization
    + +

    We will primarily focus on Stable Diffusion XL (SDXL), demonstrating a latency improvement of 3x. These techniques are PyTorch-native, which means you don’t have to rely on any third-party libraries or any C++ code to take advantage of them.

    + +

    Enabling these optimizations with the 🤗Diffusers library takes just a few lines of code. If you’re already feeling excited and cannot wait to jump to the code, check out the accompanying repository here: https://github.com/huggingface/diffusion-fast.

    + +

    SDXL Chart

    + +

    (The discussed techniques are not SDXL-specific and can be used to speed up other text-to-image diffusion systems, as shown later.)

    + +

    Below, you can find some blog posts on similar topics:

    + + + +

    Setup

    + +

    We will demonstrate the optimizations and their respective speed-up gains using the 🤗Diffusers library. Apart from that, we will make use of the following PyTorch-native libraries and environments:

    + +
• Torch nightly (to benefit from the fastest kernels for efficient attention; 2.3.0.dev20231218+cu121)
• 🤗 PEFT (version: 0.7.1)
• torchao (commit SHA: 54bcd5a10d0abbe7b0c045052029257099f83fd9)
• CUDA 12.1
    + +

    For an easier reproduction environment, you can also refer to this Dockerfile. The benchmarking numbers presented in this post come from a 400W 80GB A100 GPU (with its clock rate set to its maximum capacity).

    + +

    Since we use an A100 GPU (Ampere architecture) here, we can specify torch.set_float32_matmul_precision("high") to benefit from the TF32 precision format.

    + +

    Run inference using a reduced precision

    + +

    Running SDXL in Diffusers just takes a few lines of code:

    + +
    from diffusers import StableDiffusionXLPipeline
    +
    +## Load the pipeline in full-precision and place its model components on CUDA.
    +pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0").to("cuda")
    +
+## Run the attention ops without the efficient SDPA path (use the default attention processor).
    +pipe.unet.set_default_attn_processor()
    +pipe.vae.set_default_attn_processor()
    +
    +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
    +image = pipe(prompt, num_inference_steps=30).images[0]
    +
    + +

    But this isn’t very practical as it takes 7.36 seconds to generate a single image with 30 steps. This is our baseline which we will try to optimize one step at a time.

    + +

    SDXL Chart

    + +

    Here, we’re running the pipeline with the full precision. We can immediately cut down the inference time by using a reduced precision such as bfloat16. Besides, modern GPUs come with dedicated cores for running accelerated computation benefiting from reduced precision. To run the computations of the pipeline in the bfloat16 precision, we just need to specify the data type while initializing the pipeline:

    + +
    from diffusers import StableDiffusionXLPipeline
    +
    +pipe = StableDiffusionXLPipeline.from_pretrained(
    +	"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
    +).to("cuda")
    +
+## Run the attention ops without the efficient SDPA path (use the default attention processor).
    +pipe.unet.set_default_attn_processor()
    +pipe.vae.set_default_attn_processor()
    +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
    +image = pipe(prompt, num_inference_steps=30).images[0]
    +
    + +

    SDXL Chart

    + +

    By using a reduced precision, we’re able to cut down the inference latency from 7.36 seconds to 4.63 seconds.

    + +

    Some notes on the use of bfloat16

    + +
• Using a reduced numerical precision (such as float16, bfloat16) to run inference doesn't affect the generation quality but significantly improves latency.
• The benefits of using the bfloat16 numerical precision as compared to float16 are hardware-dependent. Modern generations of GPUs tend to favor bfloat16.
• Furthermore, in our experiments, we found bfloat16 to be much more resilient when used with quantization in comparison to float16.
    + +

    (We later ran the experiments in float16 and found out that the recent versions of torchao do not incur numerical problems from float16.)

    + +

    Use SDPA for performing attention computations

    + +

By default, Diffusers uses scaled_dot_product_attention (SDPA) for performing attention-related computations when using PyTorch 2. SDPA provides faster and more efficient kernels to run intensive attention-related operations. To run the pipeline with SDPA, we simply don't set any attention processor, like so:

    + +
    from diffusers import StableDiffusionXLPipeline
    +
    +pipe = StableDiffusionXLPipeline.from_pretrained(
    +	"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
    +).to("cuda")
    +
    +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
    +image = pipe(prompt, num_inference_steps=30).images[0]
    +
    + +

    SDPA gives a nice boost from 4.63 seconds to 3.31 seconds.

    + +

    SDXL Chart

    + +

    Compiling the UNet and VAE

    + +

    We can ask PyTorch to perform some low-level optimizations (such as operator fusion and launching faster kernels with CUDA graphs) by using torch.compile. For the StableDiffusionXLPipeline, we compile the denoiser (UNet) and the VAE:

    + +
    from diffusers import StableDiffusionXLPipeline
    +import torch
    +
    +pipe = StableDiffusionXLPipeline.from_pretrained(
    +    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
    +).to("cuda")
    +
    +## Compile the UNet and VAE.
    +pipe.unet = torch.compile(pipe.unet, mode="max-autotune", fullgraph=True)
    +pipe.vae.decode = torch.compile(pipe.vae.decode, mode="max-autotune", fullgraph=True)
    +
    +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
    +
    +## First call to `pipe` will be slow, subsequent ones will be faster.
    +image = pipe(prompt, num_inference_steps=30).images[0]
    +
    + +

    Using SDPA attention and compiling both the UNet and VAE reduces the latency from 3.31 seconds to 2.54 seconds.

    + +

    SDXL Chart

    + +

    Notes on torch.compile

    + +

torch.compile offers different backends and modes. As we're aiming for maximum inference speed, we opt for the inductor backend with the "max-autotune" mode. "max-autotune" uses CUDA graphs and optimizes the compilation graph specifically for latency. Using CUDA graphs greatly reduces the overhead of launching GPU operations. It saves time by using a mechanism to launch multiple GPU operations through a single CPU operation.

    + +

Specifying fullgraph to be True ensures that there are no graph breaks in the underlying model, so that torch.compile can run to its fullest potential. In our case, the following compiler flags were also important to set explicitly:

    + +
    torch._inductor.config.conv_1x1_as_mm = True
    +torch._inductor.config.coordinate_descent_tuning = True
    +torch._inductor.config.epilogue_fusion = False
    +torch._inductor.config.coordinate_descent_check_all_directions = True
    +
    + +

    For the full list of compiler flags, refer to this file.

    + +

    We also change the memory layout of the UNet and the VAE to “channels_last” when compiling them to ensure maximum speed:

    + +
    pipe.unet.to(memory_format=torch.channels_last)
    +pipe.vae.to(memory_format=torch.channels_last)
    +
    + +

    In the next section, we’ll show how to improve the latency even further.

    + +

    Additional optimizations

    + +

    No graph breaks during torch.compile

    + +

Ensuring that the underlying model/method can be fully compiled is crucial for performance (torch.compile with fullgraph=True). This means having no graph breaks. We did this for the UNet and VAE by changing how we access the returned variables. Consider the following example:

    + +

    code example

    + +

    Getting rid of GPU syncs after compilation

    + +

    During the iterative reverse diffusion process, we call step() on the scheduler each time after the denoiser predicts the less noisy latent embeddings. Inside step(), the sigmas variable is indexed. If the sigmas array is placed on the GPU, indexing causes a communication sync between the CPU and GPU. This causes a latency, and it becomes more evident when the denoiser has already been compiled.

    + +

    But if the sigmas array always stays on the CPU (refer to this line), this sync doesn’t take place, hence improved latency. In general, any CPU <-> GPU communication sync should be none or be kept to a bare minimum as it can impact inference latency.

    + +
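One way to spot this kind of accidental sync during development (an illustration, not part of the diffusion-fast changes; it requires a CUDA device, and PyTorch documents the debug mode as not catching every synchronizing call):

import torch

torch.cuda.set_sync_debug_mode("warn")     # warn whenever an op forces a CPU<->GPU sync

sigmas_gpu = torch.linspace(1.0, 0.0, 30, device="cuda")
_ = sigmas_gpu[5].item()                   # reading a GPU scalar back to the host forces a sync

torch.cuda.set_sync_debug_mode("default")
sigmas_cpu = sigmas_gpu.cpu()              # keep the array on the CPU instead
_ = sigmas_cpu[5].item()                   # no device sync needed for this read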

    Using combined projections for attention ops

    + +

    Both the UNet and the VAE used in SDXL make use of Transformer-like blocks. A Transformer block consists of attention blocks and feed-forward blocks.

    + +

    In an attention block, the input is projected into three sub-spaces using three different projection matrices – Q, K, and V. In the naive implementation, these projections are performed separately on the input. But we can horizontally combine the projection matrices into a single matrix and perform the projection in one shot. This increases the size of the matmuls of the input projections and improves the impact of quantization (to be discussed next).

    + +
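In isolation, the trick looks like this (a standalone illustration with random weights, not the Diffusers implementation):

import torch
import torch.nn.functional as F

dim = 1024
wq, wk, wv = (torch.randn(dim, dim) for _ in range(3))
x = torch.randn(2, 77, dim)

q, k, v = F.linear(x, wq), F.linear(x, wk), F.linear(x, wv)   # three separate matmuls

w_qkv = torch.cat([wq, wk, wv], dim=0)                        # (3*dim, dim)
q2, k2, v2 = F.linear(x, w_qkv).chunk(3, dim=-1)              # one larger matmul, then split

torch.testing.assert_close(q, q2)                             # same result, fewer kernel launches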

    Enabling this kind of computation in Diffusers just takes a single line of code:

    + +
    pipe.fuse_qkv_projections()
    +
    + +

    This will make the attention operations for both the UNet and the VAE take advantage of the combined projections. For the cross-attention layers, we only combine the key and value matrices. To learn more, you can refer to the official documentation here. It’s worth noting that we leverage PyTorch’s scaled_dot_product_attention here internally.

    + +

    These additional techniques improved the inference latency from 2.54 seconds to 2.52 seconds.

    + +

    SDXL Chart

    + +

    Dynamic int8 quantization

    + +

    We selectively apply dynamic int8 quantization to both the UNet and the VAE. This is because quantization adds additional conversion overhead to the model that is hopefully made up for by faster matmuls (dynamic quantization). If the matmuls are too small, these techniques may degrade performance.

    + +

    Through experimentation, we found that certain linear layers in the UNet and the VAE don’t benefit from dynamic int8 quantization. You can check out the full code for filtering those layers here (referred to as dynamic_quant_filter_fn below).
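    For illustration only, such a filter function roughly has the following shape; the actual size and shape criteria used for SDXL live in the linked code, and the (module, fqn) signature shown here is an assumption:

    import torch

    def dynamic_quant_filter_fn(module, fqn):
        # Hypothetical criteria: quantize only reasonably large, well-aligned linear layers.
        return (
            isinstance(module, torch.nn.Linear)
            and module.in_features > 16
            and module.in_features % 16 == 0
            and module.out_features % 16 == 0
        )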

    + +

    We leverage the ultra-lightweight pure PyTorch library torchao to use its user-friendly APIs for quantization:

    + +
    from torchao.quantization import apply_dynamic_quant
    +
    +apply_dynamic_quant(pipe.unet, dynamic_quant_filter_fn)
    +apply_dynamic_quant(pipe.vae, dynamic_quant_filter_fn)
    +
    + +

    Since this quantization support is limited to linear layers only, we also turn suitable pointwise convolution layers into linear layers to maximize the benefit. We also specify the following compiler flags when using this option:

    + +
    torch._inductor.config.force_fuse_int_mm_with_mul = True
    +torch._inductor.config.use_mixed_mm = True
    +
    + +

    To prevent any numerical issues stemming from quantization, we run everything in the bfloat16 format.

    + +

    Applying quantization this way improved the latency from 2.52 seconds to 2.43 seconds.

    + +

    SDXL Chart

    + +

    Resources

    + +

    We welcome you to check out the following codebases to reproduce these numbers and extend the techniques to other text-to-image diffusion systems as well:

    + + + +

    Other links

    + + + +

    Improvements in other pipelines

    + +

    We applied these techniques to other pipelines to test the generality of our approach. Below are our findings:

    + +

    SSD-1B

    + +

    SSD-1B Chart

    + +

    Stable Diffusion v1-5

    + +

    Stable Diffusion v1-5 chart

    + +

    PixArt-alpha/PixArt-XL-2-1024-MS

    + +

    It’s worth noting that PixArt-Alpha uses a Transformer-based architecture as its denoiser for the reverse diffusion process instead of a UNet.

    + +

    PixArt-alpha/PixArt-XL-2-1024-MS chart

    + +

    Note that for Stable Diffusion v1-5 and PixArt-Alpha, we didn’t explore the best shape combination criteria for applying dynamic int8 quantization. It might be possible to get better numbers with a better combination.

    + +

    Collectively, the methods we presented offer substantial speedup over the baseline without degradation in the generation quality. Furthermore, we believe that these methods should complement other optimization methods popular in the community (such as DeepCache, Stable Fast, etc.).

    + +

    Conclusion and next steps

    + +

    In this post, we presented a basket of simple yet effective techniques that can help improve the inference latency of text-to-image Diffusion models in pure PyTorch. In summary:

    + +
      +
    • Using reduced precision (bfloat16) to perform the computations
    • Scaled dot product attention for running the attention blocks efficiently
    • torch.compile with “max-autotune” to optimize for latency
    • Combining the different projections together for computing attention
    • Dynamic int8 quantization
    + +

    We believe there’s a lot to be explored in terms of how we apply quantization to a text-to-image diffusion system. We didn’t exhaustively explore which layers in the UNet and the VAE tend to benefit from dynamic quantization. There might be opportunities to further speed things up with a better combination of the layers being targeted for quantization.

    + +

    We kept the text encoders of SDXL untouched other than just running them in bfloat16. Optimizing them might also lead to improvements in latency.

    + +

    Acknowledgements

    + +

    Thanks to Ollin Boer Bohan, whose VAE was used throughout the benchmarking process as it is numerically more stable under reduced precision.

    + +

    Thanks to Hugo Larcher from Hugging Face for helping with infrastructure.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-generative-ai-4/index.html b/blog/accelerating-generative-ai-4/index.html new file mode 100644 index 000000000000..adb243eb09ef --- /dev/null +++ b/blog/accelerating-generative-ai-4/index.html @@ -0,0 +1,823 @@ + + + + + + + + + + + + + Accelerating Generative AI with PyTorch IV: Seamless M4T, fast | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Yejin Lee, Carole-Jean Wu, Christian Puhrsch, Joel Schlosser, Driss Guessous, Jeffrey Wan, Joe Isaacson, Can Balioglu, Juan Pino + +

    +

    This post is the fourth part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. To skip to the code, check out our github (seamless_communication, fairseq2). We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In part two, we showed how to accelerate Llama-7B by almost 10x using only native PyTorch optimizations. In part three, we showed how to accelerate text-to-image diffusion models up to 3x using only native PyTorch optimizations.

    + +

    In this blog, we’ll focus on speeding up FAIR’s Seamless M4T-v2 model, achieving a 2x speedup for the text decoder module and a 30x speedup for the vocoder module, which results in a 2.7x speedup for end-to-end inference with no loss of accuracy, by using CUDA Graph and native PyTorch optimizations:

    + + + +

    End to End Inference Speedup

    + +

    Introduction

    + +

    Seamless M4T is an open-source foundational speech/text translation and transcription technology developed by FAIR. Seamless M4T is a massively multilingual and multimodal machine translation model, with the latest version (Seamless M4T-v2) released on November 30th, 2023. The high-level model architecture of Seamless M4T-v2 is illustrated in Figure 1.

    + +

    Model Architecture of Seamless M4T-v2

    + +

    Figure 1. Model Architecture of Seamless M4T-v2.

    + +

    Accelerating inference latency is crucial for translation models to improve user experience through faster communication across languages. In particular, batch_size=1 is often used for fast translation where latency matters a lot in applications such as chatbots, speech translation, and live subtitling. Therefore, we conducted the performance analysis on inference with batch_size=1, as shown in Figure 2 to understand the Amdahl’s Law bottleneck. Our results indicate that the text decoder and vocoder are the most time-consuming modules, accounting for 61% and 23% of the inference time, respectively.

    + +

    Text decoder and vocoder are the most time consuming module. Breakdown of inference time by modules for English-Spanish S2ST (Speech-to-Speech-Text) task for batch_size=1 on A100 GPU.

    + +

    Figure 2. The text decoder and vocoder are the most time-consuming modules. Breakdown of inference time by module for the English-Spanish S2ST (Speech-to-Speech-Text) task with batch_size=1 on an A100 GPU.

    + +

    To take a closer look at the performance bottleneck of the text decoder and vocoder, we analyzed the GPU traces for both modules on the 8th sample of the English-Spanish translation example from the FLEURS dataset, as shown in Figure 3. The traces revealed that the text decoder and vocoder are heavily CPU-bound modules. We observed a significant gap incurred by CPU overhead that delayed the launch of GPU kernels, resulting in a substantial increase in execution time for both modules.

    + +

    CPU and GPU trace for Text Decoder

    + +

    (a) CPU and GPU trace for Text Decoder

    + +

    CPU and GPU trace for Vocoder

    + +

    (b) CPU and GPU trace for Vocoder

    + +

    Figure 3. Text Decoder and Vocoder are heavily CPU-bound modules. CPU and GPU trace for (a) Text Decoder (b) Vocoder for the 8th sample for English-Spanish translation example of FLEURS dataset. The trace is obtained by running inference with batch_size=1 on A100 gpu.

    + +

    Based on the real-system performance analysis showing that the text decoder and vocoder are heavily CPU-bound modules in Seamless M4T-v2, we enabled torch.compile + CUDA Graph for those modules. In this post, we share the modifications required to enable torch.compile + CUDA Graph on each module for the batch_size=1 inference scenario, a discussion of CUDA Graph, and our next steps.

    + +

    Torch.compile with CUDA Graph

    + +

    torch.compile is a PyTorch API that allows users to compile PyTorch models into a standalone executable or script, and is generally used to optimize model performance by removing unnecessary overhead.

    + +

    CUDA Graph is a feature provided by NVIDIA that allows for the optimization of kernel launches in CUDA applications. It creates an execution graph of CUDA kernels, which can be pre-processed and optimized by the driver before being executed on the GPU. The main advantage of using CUDA Graph is that it reduces the overhead associated with launching individual kernels, as the graph can be launched as a single unit, reducing the number of API calls and data transfers between the host and device. This can lead to significant performance improvements, especially for applications that have a large number of small kernels or repeat the same set of kernels multiple times. If this is something you are interested in learning more about, check out this paper that highlights the important role of data for accelerated computing: Where is the data? Why you cannot debate CPU vs. GPU performance without the answer by our own Kim Hazelwood! This is when NVIDIA was heavily investing in general-purpose GPU (GPGPUs) and before deep learning revolutionized the computing industry!

    + +

    However, because CUDA Graph operates on 1) fixed memory pointers and 2) fixed tensor shapes that are recorded at compile time, we introduced the following improvements so that the CUDA Graph can be reused across multiple input sizes. This prevents CUDA Graph generation for each iteration and lets the data inside the CUDA Graph be reused across different runs to share the KV cache across multiple decoding steps.

    + +

    Text Decoder

    + +

    The text decoder in Seamless is a decoder from NLLB [1] that performs T2TT (Text-to-Text Translation). This module is also CPU-bound: the GPU execution time is not long enough to hide the CPU overhead because auto-regressive generation requires sequential processing of tokens, which limits the amount of parallelism that can be achieved on the GPU. Based on this observation, we enabled torch.compile + CUDA Graph for the text decoder to reduce the dominating CPU overhead, as shown in Figure 4.

    + +

    CPU and GPU trace for Text Decoder after torch.compile + CUDA Graph are enabled

    + +

    Figure 4. CPU and GPU trace for Text Decoder after torch.compile + CUDA Graph are enabled.

    + +

    1. Updating and retrieving KV cache

    + +

    During inference, the text decoder has two computation phases: a prefill phase that consumes the prompt and an incremental generation phase that generates output tokens one by one. Given a high enough batch size or input length, prefill operates on a sufficiently high number of tokens in parallel — GPU performance is the bottleneck and the CPU overheads do not impact performance significantly. On the other hand, incremental token generation is always executed with sequence length 1 and it is often executed with a small batch size (even 1), e.g. for interactive use cases. Thus, incremental generation can be limited by the CPU speed and thus is a good candidate for torch.compile + CUDA Graph.

    + +

    However, during the incremental token generation phase, the sequence_length dimension of key and value involved in the attention computation increases by one with each step, while the sequence length of query always remains 1. Specifically, key/value are generated by appending the newly computed key/value of sequence length 1 to the key/value stored in the KV cache so far. But as mentioned above, CUDA Graph records all the tensor shapes during compilation and replays with the recorded shapes. Thus, a few modifications were made to address this issue, following the great work here.

    + +

    a) We modify the KV-cache handling to take the indices in which to write new values in a CUDA Tensor (i.e., valid_seq_pos) rather than a Python integer.

    + +

    Modification to KV cache append and get

    + +

    Figure 5. Modification to KV cache append and get
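    As a rough, self-contained sketch of the idea (hypothetical shapes and names, not the actual Seamless code), the cache update becomes a purely GPU-side indexed write:

    import torch

    num_heads, max_seq_len, head_dim = 16, 1024, 64
    cache_k = torch.zeros(1, num_heads, max_seq_len, head_dim, device="cuda")

    # The write position is a CUDA tensor, so no Python int (and no host sync) is involved.
    valid_seq_pos = torch.tensor([5], device="cuda")
    new_k = torch.randn(1, num_heads, 1, head_dim, device="cuda")

    # Write the new key into the fixed-size cache at the current decoding step, in place.
    cache_k.index_copy_(2, valid_seq_pos, new_k)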

    + +

    b) We also modify attention to work with the fixed shape of key and value over the max_seq_length. We only compute softmax over the sequence positions up to the current decoding step (i.e., valid_seq_pos). To mask out sequence positions > current decoding step (i.e., valid_seq_pos), we create a boolean mask tensor (i.e., mask) where sequence positions > valid_seq_pos are set to False.

    + +

    Helper function to generate valid_seq_pos and mask

    + +

    Figure 6. Helper function to generate valid_seq_pos and mask
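    A minimal sketch of such a helper (hypothetical names, simplified relative to the actual implementation):

    import torch

    def make_mask(valid_seq_pos: torch.Tensor, max_seq_len: int) -> torch.Tensor:
        # True for positions decoded so far, False for positions beyond the current step.
        positions = torch.arange(max_seq_len, device=valid_seq_pos.device)
        return positions[None, :] <= valid_seq_pos[:, None]

    mask = make_mask(torch.tensor([5], device="cuda"), max_seq_len=1024)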

    + +

    It’s important to note that these modifications result in an increase in the amount of computation required, as we compute attention over more sequence positions than necessary (up to max_seq_length). However, despite this drawback, our results demonstrate that torch.compile + CUDA Graph still provide significant performance benefits compared to standard PyTorch code.

    + +

    c) As different inference samples have different sequence lengths, the inputs to be projected to key and value for the cross-attention layers also have different shapes. Thus, we pad the input to a static shape and generate a padding mask to mask out the padded output.

    + +

    2. Memory Pointer Management

    + +

    As CUDA Graph records memory pointers along with the shapes of tensors, it is important for different inference samples to correctly reference the recorded memory pointers (e.g., the KV cache) to avoid compiling a CUDA Graph for each inference sample. However, some parts of the Seamless codebase made different inference samples refer to different memory addresses, so we made the following modifications.

    + +

    e) Seamless adopts beam search as a text decoding strategy. In the beam search process, we need to perform KV cache reordering for all the attention layers at each incremental decoding step to make sure each selected beam operates on its corresponding KV cache, as shown in the code snippet below.

    + +

    KV cache reordering operation for beam search decoding strategy

    + +

    Figure 8. KV cache reordering operation for beam search decoding strategy.

    + +

    The above code allocates new memory space and overwrites the original memory pointers for cache_k and cache_v. Thus, we modified the KV cache reordering to keep the memory pointer of each cache as recorded during compilation by using the copy_ operator.

    + +

    In-place update for KV cache using copy_ operator

    + +

    Figure 9. In-place update for KV cache using copy_ operator
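    For illustration (hypothetical shapes, not the actual Seamless code), the reordering then looks like an in-place copy rather than a reassignment:

    import torch

    num_beams, num_heads, max_seq_len, head_dim = 4, 16, 1024, 64
    cache_k = torch.randn(num_beams, num_heads, max_seq_len, head_dim, device="cuda")
    cache_v = torch.randn(num_beams, num_heads, max_seq_len, head_dim, device="cuda")
    beam_indices = torch.tensor([0, 0, 2, 3], device="cuda")  # beams kept at this step

    # In-place update: the tensors keep the storage (and pointers) recorded by CUDA Graph.
    cache_k.copy_(cache_k.index_select(0, beam_indices))
    cache_v.copy_(cache_v.index_select(0, beam_indices))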

    + +

    f) After enabling torch.compile + CUDA Graph for the text decoder by modifying the code as mentioned above, the overhead of the text decoder shifts to KV cache reordering, as shown in Figure 10. KV cache reordering repeatedly calls index_select 96 times (assuming 24 decoder layers, where each layer consists of two types of attention layers with caches for key and value).

    + +

    CPU and GPU trace for Text Decoder after enabling torch.compile + CUDA Graph

    + +

    Figure 10. CPU and GPU trace for Text Decoder after enabling torch.compile + CUDA Graph.

    + +

    As part of accelerating the text decoder, we additionally applied torch.compile to the KV cache reordering to benefit from kernel fusion, as shown in Figure 11. Note that we cannot use CUDA Graph (mode='max-autotune') here, because the copy_ operation modifies the inputs, which violates the static input requirement of the CUDA Graph mode of torch.compile.

    + +

    Applying torch.compile to KV Cache reordering

    + +

    Figure 11. Applying torch.compile to KV Cache reordering.
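    A sketch of what wrapping the reordering in torch.compile might look like (hypothetical function; note the explicit no-CUDA-Graph mode):

    import torch

    @torch.compile(mode="max-autotune-no-cudagraphs")
    def reorder_kv_caches(caches, beam_indices):
        # caches: one (cache_k, cache_v) pair per attention layer.
        for cache_k, cache_v in caches:
            cache_k.copy_(cache_k.index_select(0, beam_indices))
            cache_v.copy_(cache_v.index_select(0, beam_indices))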

    + +

    As a result of enabling torch.compile for the KV cache reordering, the GPU kernels that were launched separately (Figure 12(a)) are now fused, so there are far fewer GPU kernels to launch (Figure 12(b)).

    + +

    CPU and GPU trace for KV cache reordering before enabling torch.compile

    + +

    (a) CPU and GPU trace for KV cache reordering before enabling torch.compile

    + +

    CPU and GPU trace for KV cache reordering after enabling torch.compile

    + +

    (b) CPU and GPU trace for KV cache reordering after enabling torch.compile

    + +

    Figure 12. CPU and GPU trace for KV cache reordering (a) before and (b) after enabling torch.compile

    + +

    Vocoder

    + +

    The vocoder in Seamless is a HiFi-GAN unit-vocoder that converts generated units to waveform output, where a unit is a representation of speech that combines different aspects such as phonemes and syllables, and can be used to generate sounds that are audible to humans. The vocoder is a relatively simple module consisting of Conv1d and ConvTranspose1d layers, and is a CPU-bound module as shown in Figure 3. Based on this observation, we decided to enable torch.compile + CUDA Graph for the vocoder to reduce the disproportionately large CPU overhead, as shown in Figure 13. But there were several fixes to be made first.

    + +

    CPU and GPU trace for Vocoder after torch.compile + CUDA Graph are enabled

    + +

    Figure 13. CPU and GPU trace for Vocoder after torch.compile + CUDA Graph are enabled.

    + +

    a) The input tensor shape of the vocoder is different across inference samples. But as CUDA Graph records the tensor shapes and replays them, we had to pad the input to a fixed size with zeros. Since the vocoder only consists of Conv1d layers, we do not need an additional padding mask; padding with zeros is sufficient.

    + +

    b) The vocoder consists of conv1d layers wrapped with torch.nn.utils.weight_norm (see here). However, applying torch.compile directly to the vocoder incurs a graph break as shown below, which leads to a suboptimal performance improvement. This graph break happens inside the hook-handling part of the PyTorch weight_norm code.

    + +
    [1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] Graph break: setattr(UserDefinedObjectVariable) <function Module.__setattr__ at 0x7fac8f483c10> from user code at:
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/vocoder.py", line 49, in forward
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     return self.code_generator(x, dur_prediction)  # type: ignore[no-any-return]
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     return forward_call(*args, **kwargs)
    +[2023-12-13 04:26:16,822] [1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/codehifigan.py", line 101, in forward
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     return super().forward(x)
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/hifigan.py", line 185, in forward
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     x = self.ups[i](x)
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1550, in _call_impl
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     args_result = hook(self, args)
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py", line 65, in __call__
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     setattr(module, self.name, self.compute_weight(module))
    +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] 
    +
    + +

    Since the weights of the layers do not change during inference, weight normalization is not needed. So we simply removed weight normalization from the vocoder as shown in Figure 14, by utilizing the remove_weight_norm function already provided in the Seamless codebase (here).

    + +

    Removing weight_norm for Vocoder

    + +

    Figure 14. Removing weight_norm for Vocoder
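    A minimal sketch of stripping weight normalization from every submodule before compiling (the Seamless codebase exposes its own helper for this; the function below is only an illustration):

    import torch.nn as nn
    from torch.nn.utils import remove_weight_norm

    def strip_weight_norm(module: nn.Module) -> None:
        # Bake the normalized weights in so torch.compile no longer hits the setattr hook.
        for m in module.modules():
            try:
                remove_weight_norm(m)
            except ValueError:
                pass  # this submodule never had weight_norm applied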

    + +

    Performance Evaluation + Impact of CUDA Graph

    + +

    Figure 15 shows the speedup results when enabling torch.compile(mode=”max-autotune”) + CUDA Graph on the text decoder and vocoder. We achieve a 2x speedup for the text decoder and a 30x speedup for the vocoder, leading to a 2.7x faster end-to-end inference time.

    + + + + + + +
    + +Inference time speedup of text decoder and vocoder of applying torch.compile and torch.compile + CUDA Graph + + +Inference time speedup of text decoder and vocoder of applying torch.compile and torch.compile + CUDA Graph +
    + +

    Figure 15. Inference time speedup of text decoder and vocoder of applying torch.compile and torch.compile + CUDA Graph

    + +

    We also report the speedups for the text decoder and vocoder using torch.compile without CUDA Graph, which is supported by torch.compile’s API (i.e., torch.compile(mode="max-autotune-no-cudagraphs")), to identify the impact of CUDA Graph on performance. Without CUDA Graph, the speedups for the text decoder and vocoder drop to 1.17x and 18.4x. While still quite significant, this indicates the important role of CUDA Graph. We conclude that Seamless M4T-v2 spends a lot of time launching CUDA kernels, especially at small batch sizes (e.g., 1) where the GPU kernel execution time is not long enough to amortize the GPU kernel launch time.

    + +

    End-to-end inference speedup of applying torch.compile and CUDA graph incrementally

    + +

    Figure 16. End-to-end inference speedup of applying torch.compile and CUDA graph incrementally. a) “Inc. Decoding”: Apply torch.compile only to the text decoder b) “Inc. Decoding w/ CUDA Graph”: Apply torch.compile + CUDA Graph to the text decoder c) “+KV Cache Reordering”: Additionally apply torch.compile to KV cache reordering operation upon b) d) “+Vocoder”: Additionally apply torch.compile to the vocoder upon c) e) “+Vocoder w/ CUDA Graph”: Additionally apply torch.compile + CUDA Graph to the vocoder upon d).

    + +

    Figure 16 represents the cumulative effect of applying torch.compile with and without CUDA Graph to the modules. The results indicate a significant improvement in the end-to-end inference speedup, demonstrating the effectiveness of these techniques in optimizing the overall latency. As a result, we gain 2.7x end-to-end inference speedup for Seamless M4T-v2 with batch_size=1.

    + +

    Acknowledgements

    + +

    We thank the PyTorch team and Seamless team for their tremendous support with this work.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-generative-ai-segment-anything-2/index.html b/blog/accelerating-generative-ai-segment-anything-2/index.html new file mode 100644 index 000000000000..f69568073751 --- /dev/null +++ b/blog/accelerating-generative-ai-segment-anything-2/index.html @@ -0,0 +1,1915 @@ + + + + + + + + + + + + + Accelerating Generative AI with PyTorch: Segment Anything 2 - Fast and furious inference with low latency and fast cold starts | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Team PyTorch + +

    +

    This post is a follow-up to our first entry in the multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch and a focus on latency and elastic scalability. We use torch.compile and torch.export to create highly optimized low latency versions of SAM2 that can be quickly scaled up on new instances.

    + +

    By utilizing AOTInductor’s (AOTI) ahead-of-time compilation via torch.export, reduced precision, batched prompts and GPU preprocessing we observe up to 13x improvement in p90 execution latency and queue times compared to regular eager mode PyTorch.

    + +

    We calculate our final results and demonstrate the improvement in a realistic deployment on auto-scaling cloud infrastructure from Modal.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 execution latency +
    +(ms / improvement) +
    p90 execution latency +
    +(ms / improvement) +
    + eager float32 + AOTI float16 + eager float32 + AOTI float16 +
    AMG + 741 + 112 (6.6x) + 1140 + 176 (6.5x) +
    SPS + 98 + 20 (4.9x) + 130 + 28 (4.6x) +
    MPS + 269 + 38 (7.1x) + 714 + 52 (13.7x) +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 queue time (ms / improvement) + p90 queue time (ms / improvement) +
    + eager float32 + AOTI float16 + eager float32 + AOTI float16 +
    AMG + 201 + 41 (4.9x) + 815 + 327 (2.6x) +
    SPS + 31 + 33 (0.9x) + 441 + 49 (9.0x) +
    MPS + 40 + 37 (1.1x) + 942 + 75 (12.6x) +
    + +

    The Tasks

    + +

    The first post focused on processing a small number of varying prompts (points of interest) per image. These points represented the center points of the ground truth masks. For this post, we’ll now focus on a broader set of tasks: single prompt segmentation (SPS), multi prompt segmentation (MPS), and automatic mask generation (AMG), which generates the full set of masks for the input image without a given set of prompts. The first post focused on MPS only.

    + +

    comparison of 3 images

    + +

    The little star in the image represents a user prompt. For AMG there are no prompts and masks are filtered down heuristically from a dense grid of initial candidate prompts (guesses). For SPS and MPS user prompts are derived from the center points of AMG masks. For SPS we choose the mask with the largest area.

    + +

    Note that SAM2 uses a different backbone than SAM1. In particular, we only consider the largest and most accurate sam2.1_hiera_large backbone for this blog.

    + +

    We aggregate the scripts needed to reproduce the results in torchao’s example folder and incrementally upstream the more stable parts of the changes to the SAM2 model in torchao to the main SAM2 repository. So if you are interested in taking a look at the cutting-edge variant or would like to contribute experimental features, please don’t hesitate to reach out to the torchao repository and team. For the more stable and latest model version, please head on over to SAM2 directly.

    + +

    Overview

    + +

    We categorize the changes presented here into two. Fast changes constrain themselves to techniques that are not meant to affect model accuracy. Furious changes sacrifice some numerical accuracy for additional speed by making use of approximations such as low-precision data types.

    + +

    Approximations may slightly lower precision metrics in favor of significantly improved performance while still passing an end-to-end check based on mean intersection over union (mIoU).

    + +

    To measure the performance improvements we processed 1000 images, which were selected at random from the SAM2 validation dataset. We look at the p50 and p90 latency per image. To measure accuracy we consider the mIoU. Most notably for the AMG task we also define a fail count metric. We consider a comparison failed if the number of masks differs. This turns out to be a fairly unstable quantity and we can see that the other tasks are not as sensitive to small numeric changes as AMG.

    + +

    The Setup

    + +

    We are running the offline experiments on a regular H100 devserver, which is a fairly beefy and performant machine.

    + +

    However, we try to look at these tasks with realistic constraints. In particular, we would like to emulate a server-side inference environment. That means we don’t use DataLoader to hide the latency of image preprocessing or decoding routines.

    + +

    For the latency calculations we include decoding, segmentation and conversion of masks to a dictionary of run-length encoded masks. Or put differently, we exclude loading the images into in-memory host bytearrays and storing the resulting dictionaries as json files on disk. This is meant to emulate a more realistic setting.

    + +

    More concretely, consider the code below for the routines we include in our measurements. For any task gen_masks produces a batched bool Tensor bitmask that represents the corresponding object masks. We then compress this bitmask into a run length encoded (rle) format that can be used to transfer back the results from a remote server much more efficiently.

    + +
    image_tensors = decode_img_bytes(...)
    +masks = gen_masks(image_tensors, ...)
    +rle_dicts = [rle_dict_from_masks(m) for m in masks]
    +
    + +

    Optimizations

    + +

    ao: eager code optimizations

    + +

    The most effective tool for this work is the PyTorch autograd profiler combined with record_function. To build this software, we’ve used the profiler repeatedly to observe the program and confirm the effectiveness of any changes. It’s also important to keep in mind that the profiler itself has overhead. The more data you collect, such as stack traces, the more overhead you introduce, which might skew the collected trace. But it is excellent for finding synchronization points, gaps between kernels, and GPU kernels that take a long time.
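    A minimal sketch of this workflow (the linear layer and inputs below are stand-ins for the actual SAM2 routines under study):

    import torch
    from torch.profiler import ProfilerActivity, profile, record_function

    model = torch.nn.Linear(1024, 1024).cuda()   # stand-in for the component being profiled
    x = torch.randn(64, 1024, device="cuda")

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("gen_masks"):       # label the region of interest in the trace
            y = model(x)

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
    prof.export_chrome_trace("trace.json")       # inspect in Perfetto UI / chrome://tracing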

    + +

    GPU traces help you understand bottlenecks that are not necessarily easily addressed by compile. We found that AutomaticMaskGeneration in particular is dominated by the data structure used to store the masks and by the routine used to convert the masks to a run-length encoded compressed format. We also found a large part of AMG performance is dominated by the large number of masks created as a single batch. Sometimes candidate masks can be filtered down to fewer candidates earlier in the postprocessing stage by reordering operations. This in turn significantly speeds up the later operations.

    + +

    In order to confirm the accuracy of our implementation we first compare without any changes in settings and using float32 precision. We see that mIoU is unchanged and the masks match perfectly when using the exact same settings. This means that these eager mode changes did not affect the accuracy of these tasks.

    + +

    AMG

    + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU / fail count +
    Baseline + 864 + 1144 + 4350 + reference +
    AO + 693 + 786 + 4010 + 1 / 0 +
    + +

    ao: batching prompts

    + +

    Another lossless performance optimization that we were able to apply is batching the user input prompt calculations. When optimizing for latency at batch size 1 on a server-grade GPU such as an H100 we are often left with a lot of spare memory. We can easily trade off that memory for more performance by processing more points of interest (also called user prompts) at once. Remember that SAM2 is split into two parts: First the backbone (image encoder), second the prediction and decoding of masks based on a set of user prompts / points of interest. It is the second part where we may expect a larger or even varying number of inputs and it is this second part where we apply batching.

    + +

    This causes a large increase in memory, but also much better latency. The baseline generates one mask per prompt in a loop. For AMG the baseline processes 64 prompts at once and all that is needed is to change it to 1024, which is the number of candidate prompts generated. For SPS we process one prompt at a time, but it’s still included below for completeness.

    + +

    AMG

    + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU / fail count +
    Baseline + 864 + 1144 + 4350 + reference +
    AO + batching + 613 + 706 + 33786 + 0.9999995 / 0 +
    + +

    SPS

    + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU +
    Baseline + 116 + 181 + 1337 + reference +
    AO + 110 + 170 + 1339 + 1 +
    + +

    MPS

    + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU +
    Baseline + 276 + 681 + 1337 + reference +
    AO + batching + 126 + 225 + 8021 + 0.9999992 +
    + +

    As a technical side note: Most notably to enable batching for MPS, and to avoid a significant manual rewrite of the code base to support multiple prompts at the same time, we used a Tensor subclass we call MapTensor. A MapTensor allows us to pass a batch of N prompts, but have it advertise a batch size of 1. Any operation is then automatically broadcast to the wrapped Tensor and propagated throughout the prediction part of the model. This works because individual prompt predictions are independent of one another. This is very similar to torch.vmap.

    + +
    center_points_torch = to_map_tensor(center_points_torch)
    +center_points_label_torch = to_map_tensor(center_points_label_torch)
    +masks, scores, _ = mask_generator.predictor.predict(
    +    point_coords=center_points_torch,
    +    point_labels=center_points_label_torch,
    +    multimask_output=True,
    +    return_logits=False,
    +    return_type="torch",
    +)
    +# Unwrapping MapTensor
    +masks = masks.elems
    +scores = scores.elems
    +
    + +

    fast: fullgraph compilation

    + +

    Just as with our first post, we first remove GPU syncs and graph breaks to make use of fullgraph compiled model code with max-autotune kernels where appropriate. After some rewriting, we are able to compile the image encoder and the prediction of masks.
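    A sketch of the pattern (the predictor and attribute names are illustrative and may not match the SAM2 code exactly):

    import torch

    # `predictor` is an already-constructed SAM2 image predictor (hypothetical name).
    # Compile the heavy submodule with no graph breaks allowed and latency-oriented tuning.
    predictor.model.image_encoder = torch.compile(
        predictor.model.image_encoder, mode="max-autotune", fullgraph=True
    )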

    + +

    We run the experiments twice to get a sense of the overhead due to compilation. We run it once in an environment with an empty TORCHINDUCTOR_CACHE_DIR and then again while ingesting the artifacts from the previous run. In particular, auto-tuning can take a long time and happens on the first call in a pristine environment. We call the second run “warm”. The first iteration is typically expected to be slow due to various other related initialization processes, but compile increases it significantly, even if an existing cache is used and the same exact shapes are fed again. Having said that, an overhead of a few seconds in a warm environment is often still stomachable on the very first call.

    + +

    Most of these drawbacks can be mitigated and compiling causes a significant improvement in latency and reduction in memory.

    + +

    AMG

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU / +
    +fail count +
    first iteration +
    +(ms) +
    AO + batching + 613 + 706 + 33786 + 0.9999995 / 0 + 1125 +
    + compile (cold) + 423 + 513 + 29349 + skipped + 404866 +
    + compile (warm) + 439 + 530 + 29349 + 0.994 / 190 + 8544 +
    + +

    The number of masks produced per image can vary slightly when using automatic mask segmentation. There is ambiguity in the number of masks per object the model may produce. For example, a car may be subdivided into frames, windows and doors or treated as a whole. When a modification causes the number of masks to change, we consider the comparison failed and we only calculate the mIoU on masks with an exact match. This does not apply to the other tasks. We found that the number of masks generated is very sensitive to small numerical changes. The other tasks use the same code and MPS in particular can help us further verify correctness.

    + +

    SPS

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU + first iteration +
    +(ms) +
    AO + 110 + 170 + 1339 + 1 + 562 +
    + compile (cold) + 102 + 158 + 1343 + skipped + 319954 +
    + compile (warm) + 100 + 160 + 1302 + 0.9999 + 8947 +
    + +

    MPS

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU + first iteration +
    +(ms) +
    AO + batching + 126 + 225 + 8021 + 0.9999992 + 504 +
    + compile (cold) + 129 + 215 + 8021 + skipped + 333308 +
    + compile (warm) + 113 + 213 + 8021 + 0.998 + 8617 +
    + +

    furious: TF32, float16 and GPU preprocessing

    + +

    We found that using float16 is the right level of precision for a few significant subcomponents of the model. In particular, the image encoder and mask decoder weights can be converted entirely to float16. We can also use TensorFloat32 precision for the remaining float32 matrix operations. It should be possible to further reduce the precision and we may address this in a future post. We also move image preprocessing such as image normalization onto the GPU with the furious mode. We can’t use GPU decoding (nvJPEG) routines, because the differences are too significant and the model suffers from significant degradation in mIoU, so image decoding still happens on the CPU.
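    A sketch of the relevant settings (submodule names are illustrative, not necessarily the exact SAM2 attribute names):

    import torch

    # Allow TensorFloat32 for the matmuls and convolutions left in float32.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Convert the heavy submodules to float16 (hypothetical attribute names).
    predictor.model.image_encoder.to(torch.float16)
    predictor.model.sam_mask_decoder.to(torch.float16)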

    + +

    AMG

    + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU / +
    +fail count +
    AO +
    ++ batching +
    ++ compile (warm) +
    439 + 530 + 29349 + 0.994 / 190 +
    + furious + 165 + 240 + 28335 + 0.978 / 306 +
    + +

    This causes a significant degradation in mIoU for the AMG task, but doesn’t affect the other tasks. After an in-depth investigation, we still chalk this up to numerical instability and reordering of operations. More work is needed to further investigate this and it may not be interesting to run the AMG task in lower precision. The other tasks, however, benefit drastically in latency with minimal changes in mIoU.

    + +

    SPS

    + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU +
    AO +
    ++ compile (warm) +
    100 + 160 + 1302 + 0.9999 +
    + furious + 32 + 63 + 861 + 0.9997 +
    + +

    MPS

    + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU +
    AO +
    ++ batching +
    ++ compile (warm) +
    113 + 213 + 8021 + 0.998 +
    + furious + 36 + 64 + 4222 + 0.997 +
    + +

    AOTInductor’s (AOTI) ahead-of-time compilation via torch.export

    + +

    When scaling elastically it often is not possible to accommodate long startup times. That means the first iteration cannot be slow, but we must quickly deliver results. This is when torch.compile’s current compilation overhead can get in the way. To address this we can use AOTInductor’s (AOTI) ahead-of-time compilation via torch.export. AOTI lets us compile the model on a representative input and store the resulting code in a binary that is quick to load and run.

    + +

    AOTI via torch.export is a new feature and we currently can’t export everything that is compilable. We’ve been able to export the image encoder for all tasks but have only been able to export the mask prediction for the AMG and SPS tasks due to varying prompts. torch.export also supports dynamic shapes, but we need to invest a bit more time to prepare the code for it.
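    A rough sketch of the export-then-load flow; the exact AOTI packaging API differs across recent PyTorch releases and the module name is hypothetical, so treat this as illustrative rather than the deployment script itself:

    import torch

    # Ahead of time, on a build machine: export on a representative input and package.
    example = (torch.randn(1, 3, 1024, 1024, device="cuda", dtype=torch.float16),)
    exported = torch.export.export(image_encoder, example)  # image_encoder: hypothetical module
    package_path = torch._inductor.aoti_compile_and_package(exported)

    # Later, on a fresh replica: load the precompiled artifact and run it without recompiling.
    compiled_encoder = torch._inductor.aoti_load_package(package_path)
    features = compiled_encoder(*example)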

    + +

    AMG: AO + batching + furious

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU / +
    +fail count +
    first iteration +
    +(ms) +
    + compile (warm) + 165 + 240 + 28335 + 0.978 / 306 + 10341 +
    + load export +
    +(cold) +
    162 + 233 + 27927 + 0.974 / 308 + 906 +
    + +

    SPS: AO + furious

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU + first iteration +
    +(ms) +
    + compile (warm) + 32 + 63 + 861 + 0.9997 + 7989 +
    + load export +
    +(cold) +
    35 + 66 + 1686 + 0.9997 + 763 +
    + +

    Note that loading the exported model significantly increases memory. It likely only increases peak memory utilization, because initialization really needs to be delayed before loading up an exported model to avoid having twice the weights in memory at once. This is something we could address, but the memory consumption is nowhere near the limit. We don’t see an increase in the other tasks, because AMG and MPS peak memory is dominated by processing batches of masks. One way to reduce that could be to operate on masks in the rle format (or some other sparse format) earlier on, but for now, there is no reason for this given the current memory consumption and focus on latency.

    + +

    MPS: AO + batching + furious

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU + first iteration +
    +(ms) +
    + compile (warm) + 36 + 64 + 4222 + 0.997 + 9626 +
    + load export +
    +(cold) +
    43 + 72 + 3813 + 0.997 + 747 +
    + +

    Using export by itself doesn’t seem to benefit from extensive warmup and can be run in a pristine new inductor cache directory. But again, we do not evict the CUDA cache or other caches. In the section on Modal, we are running some of these experiments in a pristine environment.

    + +

    When only processing 1000 images in a new process, using export can really be worth it to save out on compile and other cold start overhead.

    + +

    bonus: More GPU preprocessing

    + +

    At this point, the latency is fairly low. In particular, for the SPS and MPS tasks we are processing at around 30ms to 40ms. Let’s bring back the pseudo-code from the setup section again.

    + +
    image_tensors = decode_img_bytes(...)
    +masks = gen_masks(image_tensors, ...)
    +rle_dicts = [rle_dict_from_masks(m) for m in masks]
    +
    + +

    Further profiling showed that at this point decode_img_bytes takes about 10ms. In particular, it uses torchvision’s ToTensor transform to convert from a numpy ndarray to a scaled float32 torch.Tensor. The bytes passed to ToTensor have already been decoded and converted to a numpy ndarray. By slightly rewriting ToTensor using torchvision’s v2 API and moving the smaller, uint8 decoded Tensor to the GPU before scaling, we can gain another 10ms in latency. Without including decode_img_bytes in our analysis, we would have missed this opportunity, which has real-world impact on server-side inference.

    + +
    from torchvision.transforms import v2
    +import torch
    +
    +image_tensor = torch.from_numpy(image_tensor)  # image_tensor: decoded uint8 numpy ndarray
    +image_tensor = image_tensor.permute((2, 0, 1))
    +image_tensor = image_tensor.cuda()
    +image_tensor = v2.ToDtype(torch.float32, scale=True)(image_tensor)
    +
    + +

    Note in particular that using pinned memory to perform asynchronous data transfers doesn’t apply, since the time it takes to move the Tensor into pinned memory isn’t worth the gain in asynchronicity for this data movement. For future work, we might want to explore further improvements here by using more advanced direct memory transfer techniques.

    + +

    AMG: AO + batching + furious

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU / +
    +fail count +
    first iteration +
    +(ms) +
    + load export +
    +(cold) +
    162 + 233 + 27927 + 0.974 / 308 + 906 +
    + load export (warm) + 157 + 230 + 27927 + 0.974 / 308 + 799 +
    + load export (warm) +
    ++ preproc +
    136 + 208 + 27950 + 0.977 / 311 + 908 +
    + +

    SPS: AO + furious

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU + first iteration +
    +(ms) +
    + load export +
    +(cold) +
    35 + 66 + 1686 + 0.9997 + 763 +
    + load export (warm) + 31 + 63 + 1686 + 0.9997 + 683 +
    + load export (warm) +
    ++ preproc +
    19 + 25 + 1711 + 0.9997 + 658 +
    + +

    MPS: AO + batching + furious

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 latency (ms) + p90 latency (ms) + memory (MiB) + mIoU + first iteration +
    +(ms) +
    + load export +
    +(cold) +
    43 + 72 + 3813 + 0.997 + 747 +
    + load export (warm) + 53 + 81 + 3813 + 0.997 + 807 +
    + load export (warm) +
    ++ preproc +
    31 + 41 + 3837 + 0.997 + 671 +
    + +

    This small change has a significant impact on the SPS and MPS task.

    + +

    Deploying on Modal

    + +

    Finally, we deployed our optimized inference onto Modal, a serverless infrastructure provider, to demonstrate that the benefits of these optimizations can be realized in a more realistic deployment setting.

    + +

    In particular, compilation and AOTI via torch.export requires extra work. In a naïve deployment that work might be added to every single inference execution, adding latency that dwarfs any improvements from a faster model. This is particularly challenging with elastic or autoscaling infrastructure, where replicas of our inference service need to be regularly and automatically created and destroyed.

    + +

    We share a deployment script in the torchao repository (cli_on_modal.py) to demonstrate one pattern for an elastic deployment. We build the exported models ahead of time and then upload them to distributed storage. Relative to eager execution, this adds a bit of extra work when replicas spin up since they need to read this data over a network, but this is far less costly than compilation or export.

    + +

    We benchmarked this deployment with a large batch inference workload: sending 1000 images for concurrent processing. The deployment scales up to ten replicas on ten GPUs at peak and scales down to zero GPUs when inactive.

    + +

    First, let’s look at the execution latencies.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 execution latency +
    +(ms / improvement) +
    p90 execution latency +
    +(ms / improvement) +
    + eager float32 + AOTI float16 + eager float32 + AOTI float16 +
    + + Modal + Offline + + Modal + Offline +
    AMG + 741 + 112 (6.6x) + 136 (5.4x) + 1140 + 176 (6.5x) + 208 (5.5x) +
    SPS + 98 + 20 (4.9x) + 19 (5.2x) + 130 + 28 (4.6x) + 25 (5.2x) +
    MPS + 269 + 38 (7.1x) + 31 (8.7x) + 714 + 52 (13.7x) + 41 (17.4x) +
    + +

    We notice that execution latencies on Modal and Offline are fairly close, especially relative to the baseline, indicating that optimizing the deployment offline was a reasonable proxy for optimizing the deployment directly.

    + +

    In addition to execution latency, our batch workload has queueing time, since there are fewer replicas than there are inputs, and so some inputs have to wait in line.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + p50 queue time (ms) + p90 queue time (ms) +
    + eager float32 + AOTI float16 + eager float32 + AOTI float16 +
    AMG + 201 + 41 (4.9x) + 815 + 327 (2.6x) +
    SPS + 31 + 33 (0.9x) + 441 + 49 (9.0x) +
    MPS + 40 + 37 (1.1x) + 942 + 75 (12.6x) +
    + +

    Even though the queueing system provided by the infrastructure is unchanged, the queue latencies also decrease when we use our optimized model – in the p90 case by a factor of 2 to 12. That’s because when we finish previous inputs faster (from reduced execution latency) we can pull our next inputs sooner (reducing their queueing time).

    + +

    If you’re interested in optimizing SAM2 inference or deployments further, don’t hesitate to reach out to us at the torchao repository!

    + +

    Conclusions

    + +

    We rewrote Meta’s original SAM2 in pure PyTorch with little loss of accuracy and a strong focus on latency. We deployed our optimized inference onto Modal, a serverless infrastructure provider, to demonstrate that the benefits of these optimizations can be realized in a more realistic deployment setting.

    + +

    By utilizing AOTInductor’s (AOTI) ahead-of-time compilation via torch.export, reduced precision, batched prompts and GPU preprocessing we observe up to 13x improvement in p90 execution latency and queue times compared to regular eager mode PyTorch.

    + +

    With elastic or autoscaling infrastructure, where replicas of our inference service need to be regularly and automatically created and destroyed, a naïve deployment of torch.compile can add work to inference execution that dwarfs any improvements from a faster model. By utilizing AOTInductor’s (AOTI) ahead-of-time compilation via torch.export, we are able to upload exported models ahead of time and read this data over a network, which enables us to get the benefits of compilation without significantly increased work.

    + +

    For more details on how to reproduce the data in this blog post, check out the experiments folder of torchao. Please don’t hesitate to contact us or open an issue if you run into any technical issues.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-generative-ai/index.html b/blog/accelerating-generative-ai/index.html new file mode 100644 index 000000000000..3f2b6a8eb46a --- /dev/null +++ b/blog/accelerating-generative-ai/index.html @@ -0,0 +1,902 @@ + + + + + + + + + + + + + Accelerating Generative AI with PyTorch: Segment Anything, Fast | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Team PyTorch + +

    +

    This post is the first part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples of how these features can be combined to see how far we can push PyTorch native performance.

    + +

    As announced during the PyTorch Developer Conference 2023, the PyTorch team rewrote Meta’s Segment Anything (“SAM”) Model resulting in 8x faster code than the original implementation, with no loss of accuracy, all using native PyTorch optimizations. We leverage a breadth of new PyTorch features:

    + +
      +
    • Torch.compile: A compiler for PyTorch models
    • GPU quantization: Accelerate models with reduced precision operations
    • Scaled Dot Product Attention (SDPA): Memory efficient attention implementations
    • Semi-Structured (2:4) Sparsity: A GPU optimized sparse memory format
    • Nested Tensor: Batch together non-uniformly sized data into a single Tensor, such as images of different sizes.
    • Custom operators with Triton: Write GPU operations using Triton Python DSL and easily integrate it into PyTorch’s various components with custom operator registration.
    + +

    We encourage readers to copy-paste code from our implementation of SAM on Github and ask us questions on Github.

    + +

    A quick glimpse of increasing throughput and decreasing memory overhead

    + +

    A quick glimpse of increasing throughput and decreasing memory overhead with our newly released, PyTorch native, features. Benchmarks run on p4d.24xlarge instance (8x A100s).

    + +

    SegmentAnything Model

    + +

    SAM is a zero-shot vision model for generating promptable image masks.

    + +

    sam image masks

    + +

    The SAM architecture [described in its paper] includes multiple prompt and image encoders based on the Transformer architecture. Of these, we measured performance across the smallest and largest vision transformer backbones: ViT-B and ViT-H. For simplicity, we only show traces for the ViT-B model.

    + +

    Optimizations

    + +

    Below we tell the story of optimizing SAM: profiling, identifying bottlenecks, and building new features into PyTorch that solve these problems. Throughout, we showcase our new PyTorch features: torch.compile, SDPA, Triton kernels, Nested Tensor and semi-structured sparsity. The following sections are progressively built upon each other, ending with our SAM-fast, now available on Github. We motivate each feature using real kernel and memory traces, using fully PyTorch native tooling, and visualize these traces with Perfetto UI.

    + +

    Baseline

    + +

    Our SAM baseline is Facebook Research’s unmodified model, using float32 dtype and a batch size of 1. After some initial warmup, we can look at a kernel trace using the PyTorch Profiler:

    + +

    kernel trace

    + +

    We notice two areas ripe for optimization.

    + +

    The first is long calls to aten::index, the underlying call resulting from a Tensor index operation (e.g., []). While the actual GPU time spent on aten::index is relatively low, aten::index launches two kernels, and a blocking cudaStreamSynchronize happens in between. This means the CPU is waiting for the GPU to finish processing until it launches the second kernel. To optimize SAM, we should aim to remove blocking GPU syncs causing idle time.

    + +

    The second is significant time spent on the GPU in matrix multiplication (dark green on stream 7 above). This is common in Transformers. We can significantly speed up SAM if we can reduce the amount of GPU time spent on matrix multiplication.

    + +

    We can measure the throughput (img/s) and memory overhead (GiB) from out of the box SAM to establish a baseline:

    + +

    throughput (img/s) and memory overhead (GiB) from out of the box SAM

    + +

    Bfloat16 Half precision (+GPU syncs and batching)

    + +

To address the second issue, the significant time spent in matrix multiplication, we can turn to bfloat16. Bfloat16 is a commonly used half-precision type. Through less precision per parameter and activation, we can save significant time and memory in computation. When reducing the precision of parameters, it's critical to validate end-to-end model accuracy.

    + +

    replacing padding dtypes with half precision, bfloat16

    + +

    Shown here is an example of replacing padding dtypes with half precision, bfloat16. Code is here.

    + +

Besides simply setting model.to(torch.bfloat16), we also have to change a few small places that assume the default dtype.
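As a rough, self-contained sketch of what this looks like (a toy module rather than the actual SAM code), both the parameters and the inputs need to move to the new dtype:

import torch
import torch.nn as nn

# Toy stand-in for the model; the real change is model-wide via .to(torch.bfloat16).
model = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 8))
model = model.to(torch.bfloat16)

# Inputs (and any manually created tensors that assumed float32) must match.
x = torch.randn(4, 64, dtype=torch.bfloat16)
with torch.no_grad():
    y = model(x)
print(y.dtype)  # torch.bfloat16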

    + +

    Now, in order to remove GPU syncs we need to audit operations that cause them. We can find these pieces of code by searching the GPU traces for calls to cudaStreamSynchronize. In fact, we found two locations that we were able to rewrite to be sync-free.

    + +

    code sample 1

    + +

    replacing padding dtypes with half precision, bfloat16

    + +

Specifically, we see that within SAM's image encoder, there are variables acting as coordinate scalers, q_coords and k_coords. These are both allocated and processed on the CPU. However, once these variables are used to index into rel_pos_resized, the index operation automatically moves them to the GPU. This copy-over causes the GPU sync we observed above. We also notice a second call to index in SAM's prompt encoder, which we can rewrite using torch.where as shown above.
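The general pattern is illustrated below with a small, hypothetical example (not SAM's actual encoder code): building index tensors on the GPU, or using torch.where, avoids the implicit host-to-device copy that can trigger the sync.

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
rel_pos_resized = torch.randn(16, 32, device=device)

# Sync-prone pattern: coordinate tensors created on the CPU get copied to the
# GPU when used as indices, which can surface as a cudaStreamSynchronize.
q_coords = torch.arange(8)[:, None]
k_coords = torch.arange(8)[None, :]
idx = q_coords - k_coords + 7
biased = rel_pos_resized[idx]

# Sync-free pattern: create the coordinates directly on the same device.
q_coords = torch.arange(8, device=device)[:, None]
k_coords = torch.arange(8, device=device)[None, :]
idx = q_coords - k_coords + 7
biased = rel_pos_resized[idx]

# torch.where can likewise replace data-dependent indexing while staying on-device.
labels = torch.tensor([1, -1, 2, -1], device=device)
cleaned = torch.where(labels == -1, torch.zeros_like(labels), labels)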

    + +

    Kernel trace

    + +

After applying these changes, we begin to see significant time between individual kernel calls. This is typically observed with small batch sizes (1 here) due to the overhead of launching GPU kernels. To get a closer look at practical areas for optimization, we can start to profile SAM inference with batch size 8:

    + +

    profile SAM inference with batch size 8
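For readers who want to reproduce this kind of trace, a minimal torch.profiler setup looks roughly like the following (a generic sketch assuming a CUDA device, not the exact benchmarking harness we used):

import torch
from torch.profiler import profile, ProfilerActivity

model = torch.nn.Linear(1024, 1024).cuda().to(torch.bfloat16)  # stand-in for SAM
x = torch.randn(8, 1024, device="cuda", dtype=torch.bfloat16)  # "batch size 8"

# Warm up so one-time costs don't pollute the trace.
for _ in range(3):
    model(x)
torch.cuda.synchronize()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    model(x)
    torch.cuda.synchronize()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
prof.export_chrome_trace("trace.json")  # the exported trace can be opened in Perfetto UI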

    + +

Looking at the time spent per kernel, we observe that most of SAM's GPU time is spent on elementwise kernels and the softmax operation. With this, we now see that matrix multiplications have become a much smaller relative overhead.

    + +

    matrix multiplications have become a much smaller relative overhead

    + +

Taking the GPU sync and bfloat16 optimizations together, we have now improved SAM performance by up to 3x.

    + +

    SAM performance by up to 3x

    + +

    Torch.compile (+graph breaks and CUDA graphs)

    + +

    When observing a large number of small operations, such as the elementwise kernels profiled above, turning to a compiler to fuse operations can have strong benefits. PyTorch’s recently released torch.compile does a great job optimizing by:

1. Fusing together sequences of operations such as nn.LayerNorm or nn.GELU into a single GPU kernel that is called once, and
2. Epilogues: fusing operations that immediately follow matrix multiplication kernels to reduce the number of GPU kernel calls.

    Through these optimizations, we reduce the number of GPU global memory roundtrips, thus speeding up inference. We can now try torch.compile on SAM’s image encoder. To maximize performance we use a few advanced compile techniques such as:

• Using torch.compile's max-autotune mode, which enables CUDA graphs and shape-specific kernels with custom epilogues.
• Setting TORCH_LOGS="graph_breaks,recompiles" to manually verify that we are not running into graph breaks or recompiles.
• Padding the batch of images input to the encoder with zeros, which ensures compile accepts static shapes and can therefore always use shape-specific optimized kernels with custom epilogues without recompilations.

predictor.model.image_encoder = \
    torch.compile(predictor.model.image_encoder, mode=use_compile)
    + +

    Kernel trace

    + +

    Kernel trace

    + +

    torch.compile is working beautifully. We launch a single CUDA graph, which makes up a significant portion of GPU time within the timed region. Let’s run our profile again and look at the percentage of GPU time spent in specific kernels:

    + +

    the percentage of GPU time spent in specific kernels

    + +

We now see that softmax makes up a significant portion of the time, followed by various GEMM variants. In summary, we observe the following measurements for batch size 8 and the changes described above.

    + +

    measurements for batch size 8 and above

    + +

    SDPA: scaled_dot_product_attention

    + +

Next up, we can tackle one of the most common areas of transformer performance overhead: the attention mechanism. Naive attention implementations scale quadratically in time and memory with sequence length. PyTorch's scaled_dot_product_attention operation, built upon the principles of Flash Attention, FlashAttentionV2 and xFormers' memory-efficient attention, can significantly speed up GPU attention. Combined with torch.compile, this operation allows us to express and fuse a common pattern within variants of MultiheadAttention. After a small set of changes we can adapt the model to use scaled_dot_product_attention.

    + +

    PyTorch native attention implementation

    + +

    PyTorch native attention implementation, see code here.
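In its simplest form, the change amounts to handing the query, key and value tensors to the fused operator. A minimal sketch (the shapes below are illustrative, not SAM's):

import torch
import torch.nn.functional as F

# (batch, num_heads, seq_len, head_dim) inputs; PyTorch dispatches to a fused
# kernel (FlashAttention or memory-efficient attention) when the inputs allow it.
q = torch.randn(8, 12, 1024, 64, device="cuda", dtype=torch.bfloat16)
k = torch.randn(8, 12, 1024, 64, device="cuda", dtype=torch.bfloat16)
v = torch.randn(8, 12, 1024, 64, device="cuda", dtype=torch.bfloat16)

out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([8, 12, 1024, 64])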

    + +

    Kernel trace

    + +

    We can now see that in particular the memory efficient attention kernel is taking up a large amount of computational time on the GPU:

    + +

    memory efficient attention kernel is taking up a large amount of computational time on the GPU

    + +

Using PyTorch's native scaled_dot_product_attention, we can significantly increase the batch size. We now observe the following measurements for batch size 32 and the changes described above.

    + +

    batch size 32 and above

    + +

    Triton: Custom SDPA for fused relative positional encoding

    + +

    Transitioning away from inference throughput for a moment, we started profiling overall SAM memory. Within the image encoder, we saw significant spikes in memory allocation:

    + +

    spikes in memory allocation

    + +

    Zooming in, we see this allocation happens within add_decomposed_rel_pos, on the following line:

    + +

    we see this allocation happens within add_decomposed_rel_pos

    + +

    The attn variable here is the addition of two smaller tensors: rel_h of shape (B, q_h, q_w, k_h, 1) and rel_w of shape (B, q_h, q_w, 1, k_w).

    + +

    It’s not surprising that the memory efficient attention kernel (used via SDPA) is taking a long time with an attention bias size over 3.0GiB. If instead of allocating this large attn tensor, we thread into SDPA the two smaller rel_h and rel_w tensors, and only construct attn as needed, we’d anticipate significant performance gain.

    + +

Unfortunately this is not a trivial modification; SDPA kernels are highly optimized and written in CUDA. We can turn to Triton, which has an easy-to-follow tutorial on a FlashAttention implementation. After some significant digging, and in close collaboration with xFormers' Daniel Haziza, we found one case of input shapes for which it is relatively straightforward to implement a fused version of the kernel. The details have been added to the repository. Surprisingly, this can be done in under 350 lines of code for the inference case.

    + +

    This is a great example of extending PyTorch with a new kernel, straightforwardly built with Triton code.

    + +

    Kernel trace

    + +

    kernel trace

    + +

    With our custom positional Triton kernel we observe the following measurements for batch size 32.

    + +

    we observe the following measurements for batch size 32

    + +

    NT: NestedTensor and batching predict_torch

    + +

    We have spent a lot of time on the image encoder. This makes sense, since it takes up the most amount of computational time. At this point however it is fairly well optimized and the operator that takes the most time would require significant additional investment to be improved.

    + +

We made an interesting observation about the mask prediction pipeline: for each image we have, there is an associated size, coords, and fg_labels Tensor. Each of these tensors is of a different batch size. Each image itself is also of a different size. This representation of data looks like jagged data. With PyTorch's recently released NestedTensor, we can modify our data pipeline to batch the coords and fg_labels Tensors into a single NestedTensor. This can have significant performance benefits for the prompt encoder and mask decoder that follow the image encoder. Invoking:

    + +
torch.nested.nested_tensor(data, dtype=dtype, layout=torch.jagged)
    + +
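A small, self-contained sketch of the idea (using made-up per-image point counts rather than the actual SAM pipeline):

import torch

# Three images with different numbers of prompt coordinates form jagged data.
coords_per_image = [
    torch.randn(2, 2),   # 2 points for image 0
    torch.randn(5, 2),   # 5 points for image 1
    torch.randn(3, 2),   # 3 points for image 2
]

nt = torch.nested.nested_tensor(coords_per_image, dtype=torch.float32,
                                layout=torch.jagged)
print(nt.is_nested)   # True
for t in nt.unbind():
    print(t.shape)    # (2, 2), (5, 2), (3, 2) -- one entry per image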

    Kernel trace

    + +

    Kernel trace

    + +

    we can launch kernels much faster from the CPU than the GPU can process

    + +

We can now see that we launch kernels from the CPU much faster than the GPU can process them, and that the CPU spends a long time waiting at the end of our timed region for the GPU to finish (cudaDeviceSynchronize). We also no longer see any idle time (white space) between kernels on the GPU.

    + +

With Nested Tensor, we observe the following measurements for batch size 32 and the changes described above.

    + +

    batch size 32 and above changes

    + +

    int8: quantization and approximating matmul

    + +

    We notice in the above trace, that significant time is now spent in GEMM kernels. We’ve optimized enough that we now see matrix multiplication account for more time in inference than scaled dot product attention.

    + +

Building on earlier learnings from going from fp32 to bfloat16, let's go a step further, emulating even lower precision with int8 quantization. Looking at quantization methods, we focus on dynamic quantization, wherein our model observes the range of possible inputs and weights of a layer and subdivides the expressible int8 range to uniformly "spread out" observed values. Ultimately each float input will be mapped to a single integer in the range [-128, 127]. For more information, see PyTorch's tutorial on quantization.
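Concretely, for a single tensor the symmetric variant of this mapping looks like the following sketch (illustrative of the idea, not the exact scheme used in our implementation):

import torch

def quantize_int8(x: torch.Tensor):
    # Spread the observed range uniformly over the expressible int8 range.
    scale = x.abs().max().clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return q, scale

def dequantize_int8(q: torch.Tensor, scale: torch.Tensor):
    return q.to(torch.float32) * scale

x = torch.randn(4, 8)
q, scale = quantize_int8(x)
x_hat = dequantize_int8(q, scale)
print((x - x_hat).abs().max())  # small quantization error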

    + +

Reducing precision can immediately lead to peak memory savings, but to realize inference speedups, we have to make full use of int8 throughout SAM's operations. This requires building an efficient int8@int8 matrix multiplication kernel, as well as the casting logic to translate from high to low precision (quantization) and back from low to high (dequantization). Utilizing the power of torch.compile, we can compile and fuse these quantization and dequantization routines into efficient single kernels and epilogues of our matrix multiplication. The resulting implementation is fairly short, at less than 250 lines of code. For more information on the APIs and usage, see pytorch-labs/ao.

    + +

While it's common to see some accuracy regression when quantizing models at inference time, SAM has been particularly robust to lower precision inference, with minimal loss of accuracy. With quantization added, we now observe the following measurements for batch size 32 and the changes described above.

    + +

    batch size 32 and above changes

    + +

    sparse: Semi-structured (2:4) sparsity

    + +

    Matrix multiplications are still our bottleneck. We can turn to the model acceleration playbook with another classic method to approximate matrix multiplication: sparsification. By sparsifying our matrices (i.e., zeroing out values), we could theoretically use fewer bits to store weight and activation tensors. The process by which we decide which weights in the tensor to set to zero is called pruning. The idea behind pruning is that small weights in a weight tensor contribute little to the net output of a layer, typically the product of weights with activations. Pruning away small weights can potentially reduce model size without significant loss of accuracy.

    + +

Methods for pruning vary, from completely unstructured, wherein weights are greedily pruned, to highly structured, wherein large sub-components of a tensor are pruned at a time. The choice of method is not trivial. While unstructured pruning may in theory have the least impact on accuracy, GPUs are highly efficient at multiplying large, dense matrices and may suffer significant performance degradation in sparse regimes. One recent pruning method supported in PyTorch, semi-structured (or 2:4) sparsity, seeks to strike a balance. This sparse storage reduces the original tensor by a significant 50%, while simultaneously producing a dense tensor output that can leverage highly performant, 2:4 GPU kernels. See the following picture for an illustration.

    + +

    dense tensor output that can leverage highly performant, 2:4 GPU kernels

    + +

    From developer.nvidia.com/blog/exploiting-ampere-structured-sparsity-with-cusparselt

    + +

    In order to use this sparse storage format and the associated fast kernels we need to prune our weights such that they adhere to the constraints for the format. We pick the two smallest weights to prune in a 1 by 4 region, measuring the performance vs accuracy tradeoff. It is easy to change a weight from its default PyTorch (“strided”) layout to this new, semi-structured sparse layout. To implement apply_sparse(model) we only require 32 lines of Python code:

    + +
import torch
from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor

# Sparsity helper functions
def apply_fake_sparsity(model):
    """
    This function simulates 2:4 sparsity on all linear layers in a model.
    It uses the torch.ao.pruning flow.
    """
    # torch.ao.pruning flow
    from torch.ao.pruning import WeightNormSparsifier
    sparse_config = []
    for name, mod in model.named_modules():
        if isinstance(mod, torch.nn.Linear):
            sparse_config.append({"tensor_fqn": f"{name}.weight"})

    sparsifier = WeightNormSparsifier(sparsity_level=1.0,
                                      sparse_block_shape=(1,4),
                                      zeros_per_block=2)
    sparsifier.prepare(model, sparse_config)
    sparsifier.step()

    sparsifier.step()
    sparsifier.squash_mask()


def apply_sparse(model):
    apply_fake_sparsity(model)
    for name, mod in model.named_modules():
        if isinstance(mod, torch.nn.Linear):
            mod.weight = torch.nn.Parameter(to_sparse_semi_structured(mod.weight))
    + +

    With 2:4 sparsity, we observe peak performance on SAM with vit_b and batch size 32:

    + +

    With 2:4 sparsity, we observe peak performance on SAM with vit_b and batch size 32

    + +

    Conclusion

    + +

    Wrapping up, we are excited to have announced our fastest implementation of Segment Anything to date. We rewrote Meta’s original SAM in pure PyTorch with no loss of accuracy using a breadth of newly released features:

• Torch.compile: PyTorch's native JIT compiler, providing fast, automated fusion of PyTorch operations [tutorial]
• GPU quantization: accelerate models with reduced precision operations [api]
• Scaled Dot Product Attention (SDPA): a new, memory efficient implementation of Attention [tutorial]
• Semi-Structured (2:4) Sparsity: accelerate models with fewer bits to store weights and activations [tutorial]
• Nested Tensor: highly optimized, ragged array handling for non-uniform batch and image sizes [tutorial]
• Triton kernels: custom GPU operations, easily built and optimized via Triton

    For more details on how to reproduce the data presented in this blog post, check out the experiments folder of segment-anything-fast. Please don’t hesitate to contact us or open an issue if you run into any technical issues.

    + +

    In our next post, we are excited to share similar performance gains with our PyTorch natively authored LLM!

    + +

    Acknowledgements

    + +

    We would like to thank Meta’s xFormers team including Daniel Haziza and Francisco Massa for authoring SDPA kernels and helping us design our custom one-off Triton kernel.

Accelerating Inference on x86-64 Machines with oneDNN Graph | PyTorch
by Intel

    Supported in PyTorch 2.0 as a beta feature, oneDNN Graph leverages aggressive fusion patterns to accelerate inference on x86-64 machines, especially Intel® Xeon® Scalable processors.

    + +

    oneDNN Graph API extends oneDNN with a flexible graph API to maximize the optimization opportunity for generating efficient code on AI hardware. It automatically identifies the graph partitions to be accelerated via fusion. The fusion patterns focus on fusing compute-intensive operations such as convolution, matmul, and their neighbor operations for both inference and training use cases.

    + +

    In PyTorch 2.0 and beyond, oneDNN Graph can help accelerate inference on x86-64 CPUs (primarily, Intel Xeon processor-based machines) with Float32 and BFloat16 (with PyTorch’s Automatic Mixed Precision support) datatypes. With BFloat16, speedup is limited to machines that support AVX512_BF16 ISA (Instruction Set Architecture), as well as machines that also support AMX_BF16 ISA.

    + +

    oneDNN Graph Usage

    + +

    From a user’s perspective, the usage is quite simple and intuitive, with the only change in code being an API invocation. To leverage oneDNN Graph with JIT-tracing, a model is profiled with an example input as shown below in Figure 1.

    + +

    Figure 1. A code-snippet that demonstrates using oneDNN Graph

    + +

    Fig. 1: A code-snippet that demonstrates using oneDNN Graph
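A minimal sketch of this kind of invocation, using PyTorch's standard TorchScript APIs (the exact snippet in Fig. 1 may differ slightly), is shown below:

import torch

# Enable oneDNN Graph fusion for TorchScript inference (beta in PyTorch 2.0).
torch.jit.enable_onednn_fusion(True)

model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 16, 3), torch.nn.BatchNorm2d(16), torch.nn.ReLU()
).eval()
example_input = torch.rand(32, 3, 224, 224)  # static shape; only this shape benefits

with torch.no_grad():
    traced = torch.jit.trace(model, example_input)
    traced = torch.jit.freeze(traced)
    # A couple of warm-up runs let the fusion passes and kernel caches kick in.
    traced(example_input)
    traced(example_input)
    output = traced(example_input)

For BFloat16, the same tracing and inference flow would additionally run under PyTorch's CPU Automatic Mixed Precision (autocast) context.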

    + +

oneDNN Graph receives the model's graph and identifies candidates for operator fusion with respect to the input shape of the example input. Currently, only static shapes are supported; inputs of any other shape will not be fused and will not see a performance benefit.

    + +

    Measurements

    + +

    To ensure reproducibility of results, we used a fork of TorchBench to measure inference speed-up of some Vision models on an AWS m7i.16xlarge instance, which uses 4th Gen Intel® Xeon® Scalable processors.

    + +

    The baseline for comparison was torch.jit.optimize_for_inference which only supports Float32 datatype. The batch-size for each model was based on the respective batch size being used for them in TorchBench.

    + +

    In Figure 2, we depict the inference speedup of using oneDNN Graph over PyTorch alone. The geomean speedup with oneDNN Graph for Float32 datatype was 1.24x, and the geomean speedup for BFloat16 datatype was 3.31x1.

    + +

    Figure 2. Inference speedup with oneDNN Graph over default CPU JIT Fuser (which only uses Float32 datatype)

    + +

    Fig. 2: Inference speedup with oneDNN Graph over default CPU JIT Fuser (which only uses Float32 datatype)

    + +

    Future work

    + +

oneDNN Graph is currently supported in PyTorch through TorchScript, but work is already underway at Intel to integrate it with the Inductor-CPU backend as a prototype feature in a future PyTorch release. Dynamo makes supporting dynamic shapes easier in PyTorch, and we would like to introduce dynamic-shape support with Inductor-CPU. We also plan to add int8 quantization support.

    + +

    Acknowledgements

    + +

    The results presented in this blog are a joint effort between Meta and the Intel PyTorch team. Special thanks to Elias Ellison from Meta who spent precious time thoroughly reviewing the PRs and gave us helpful feedback.

Accelerating Large Language Models with Accelerated Transformers | PyTorch
by Lucas Pasqualin, Driss Guessous, Christian Puhrsch, Bertrand Maher, Michael Gschwind

    TL;DR. We show how to use Accelerated PyTorch 2.0 Transformers and the newly introduced torch.compile() method to accelerate Large Language Models on the example of nanoGPT, a compact open-source implementation of the GPT model from Andrej Karpathy. Using the new scaled dot product attention operator introduced with Accelerated PT2 Transformers, we select the flash_attention custom kernel and achieve faster training time per batch (measured with Nvidia A100 GPUs), going from a ~143ms/batch baseline to ~113 ms/batch. In addition, the enhanced implementation using the SDPA operator offers better numerical stability. Finally, further optimizations are achieved using padded inputs, which when combined with flash attention lead to ~87ms/batch.

    + +

    Recent times have seen exponential adoption of large language models (LLMs) and Generative AI in everyday life. Tightly coupled with these ever-growing models is the ever-growing training cost - in terms of both time and hardware utilization. The PyTorch team has tackled these challenges head on with Accelerated PyTorch 2 Transformers (previously known as “Better Transformer”) and JIT Compilation in PyTorch 2.0.

    + +

    In this blog post, we explore training optimizations gained by utilizing custom kernel implementations of SDPA - also known as scaled dot product attention - a critical layer in transformer models. The custom kernel for SDPA replaces several discrete sequential operations with one globally optimized kernel which avoids allocating a large amount of intermediate CUDA memory. This approach offers a number of advantages, including but not limited to: higher performance computation of SDPA by reducing memory bandwidth bottleneck, reduced memory footprint to support larger batch sizes, and finally added numerical stability by prescaling input tensors. These optimizations are demonstrated on nanoGPT, an open-source implementation of GPT from Andrej Karpathy.

    + +

    Background

    + +

    Scaled dot product attention is the fundamental building block of multihead attention, as introduced in “Attention is All You Need”, and has a wide range of applications in LLM and Generative AI models.

    + +

    The Transformer model architecture

    + +

    Figure 1: The Transformer model architecture based on “Attention is All You Need”. With the new PyTorch SDPA operator, Multi-Head Attention is efficiently implemented by a linear layer for the in-projection, the SDPA operator, and a linear layer for the out-projection.

    + +

    With the new scaled_dot_product_attention operator, multihead attention can be implemented in just 3 steps: in projection with a linear layer, SDPA, and out projection with a linear layer.

    + +
# In Projection
# variable descriptions:
# q,k,v = Query, Key, Value tensors
# bsz = batch size
# num_heads = Number of heads for Multihead Attention
# tgt_len = Target length
# src_len = Source Length
# head_dim: Head Dimension
q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
q = q.view(bsz, num_heads, tgt_len, head_dim)
k = k.view(bsz, num_heads, src_len, head_dim)
v = v.view(bsz, num_heads, src_len, head_dim)

# Scaled Dot Product Attention
attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)

# Out Projection
attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
    + +

PyTorch 2.0 supports multiple different kernels optimized for specific use cases, with specific requirements. A kernel picker picks the best kernel for a particular combination of input parameters. If no optimized "custom kernel" for a particular combination of input parameters can be identified, the kernel picker selects a general kernel that can handle all input combinations.

    + +

    While future releases may extend this set of operators, PyTorch 2.0 launches with 3 implementations for the SDPA operator:

1. A generic kernel which implements the mathematical equation of SDPA in the function sdpa_math()
2. An optimized kernel based on the paper "Flash Attention", which supports evaluation of SDPA with 16-bit floating point data types on compute architecture SM80 (A100).
3. An optimized kernel based on the paper "Self-Attention Does Not Need O(n^2) Memory" and implemented in xFormers, which supports both 32- and 16-bit floating point data types on a wider range of architectures (SM40 and later). This blog post refers to this kernel as the mem_efficient kernel.

    Note that both optimized kernels (two and three listed above), support a key padding mask and limit the supported attention mask to causal attention. Accelerated PyTorch 2.0 Transformers today only support the causal mask when it is specified using the is_causal boolean. When a mask is specified, the general-purpose kernel will be selected because it is too expensive to analyze the contents of a provided mask to determine if it is the causal mask. Additional explanations on the constraints for each kernel can be found in the Accelerated PT2 Transformer blog.

    + +

    Enabling Accelerated Transformers with nanoGPT

    + +

    The SDPA operator being a critical component of the GPT model, we identified the open source nanoGPT model as an excellent candidate for both demonstrating the ease of implementation and benefits of PyTorch 2.0’s Accelerated Transformers. The following demonstrates the exact process by which Accelerated Transformers was enabled on nanoGPT.

    + +

    This process largely revolves around replacing the existing SDPA implementation with the newly added F.scaled_dot_product_attention operator from functional.py. This process can be easily adapted to enable the operator in many other LLMs. Alternatively, users can instead choose to call F.multi_head_attention_forward() or utilize the nn.MultiHeadAttention module directly where applicable. The following code snippets are adapted from Karpathy’s nanoGPT repository.

    + +

    Step 1: Identify the existing SDPA implementation

    + +

    In the case of nanoGPT, SDPA is implemented in the model’s CausalSelfAttention class. The original implementation at time of writing is adapted below for this post.

    + +

    The original implementation at time of writing

    + +

    Step 2: Replace with Torch’s scaled_dot_product_attention

    + +

    At this point we can note the following:

• Lines 36 - 42 define the mathematical implementation of SDPA which we are replacing
• The mask applied on line 39 is no longer relevant since we are using scaled_dot_product_attention's is_causal flag.
• The dropout layer used in line 41 is also now unnecessary.

    Swapping out the SDPA implementation for torch’s scaled_dot_product_attention and removing the now redundant code yields the following implementation.

    + +


    + +

Alternatively, the original mask can be passed into the attn_mask field; however, due to the kernel constraints mentioned above, that would limit the implementation to the generic sdpa_math kernel.
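For reference, a condensed sketch of the swapped-in attention forward pass (adapted to be self-contained; not a verbatim copy of nanoGPT's CausalSelfAttention) looks like this:

import torch
import torch.nn.functional as F

def causal_self_attention(x, c_attn, c_proj, n_head):
    # x: (batch, seq_len, n_embd); c_attn and c_proj are the usual nanoGPT Linears.
    B, T, C = x.size()
    q, k, v = c_attn(x).split(C, dim=2)
    q = q.view(B, T, n_head, C // n_head).transpose(1, 2)  # (B, nh, T, hs)
    k = k.view(B, T, n_head, C // n_head).transpose(1, 2)
    v = v.view(B, T, n_head, C // n_head).transpose(1, 2)

    # The fused operator replaces the manual matmul/mask/softmax/dropout sequence;
    # is_causal=True removes the need for the explicit causal mask buffer.
    y = F.scaled_dot_product_attention(q, k, v, attn_mask=None,
                                       dropout_p=0.0, is_causal=True)
    y = y.transpose(1, 2).contiguous().view(B, T, C)
    return c_proj(y)

n_embd, n_head = 64, 4
c_attn = torch.nn.Linear(n_embd, 3 * n_embd)
c_proj = torch.nn.Linear(n_embd, n_embd)
x = torch.randn(2, 16, n_embd)
print(causal_self_attention(x, c_attn, c_proj, n_head).shape)  # torch.Size([2, 16, 64])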

    + +

    Step 3 (Bonus): Faster matmuls with padding

    + +

    On top of the performance improvements from SDPA, our analysis yielded a nice ancillary win. In Andrej’s words “The most dramatic optimization to nanoGPT so far (~25% speedup) is to simply increase the vocab size from 50257 to 50304 (nearest multiple of 64).”

    + +

    Tweet by Andrej Karpathy

    + +

    The vocab size determines the dimensions of matmuls in the output layer of GPT, and these are so large that they were taking a majority of the time for the entire training loop! We discovered that they were achieving performance significantly below the peak throughput achievable on the A100 GPU, and guessed from NVIDIA’s matmul documentation that 64-element alignment would yield better results. Indeed, padding these matmuls achieves nearly a 3x speedup! The underlying cause is that unaligned memory accesses significantly reduce efficiency. A deeper analysis can be found in this Twitter thread.
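The padding itself is just a rounding of the vocabulary size up to the next multiple of 64, for example:

vocab_size = 50257
padded_vocab_size = ((vocab_size + 63) // 64) * 64
print(padded_vocab_size)  # 50304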

    + +

    With this optimization we were able to further reduce training time from ~113 ms (using flash attention) to ~87 ms per batch.

    + +

    Results

    + +

    The figure below demonstrates the performance gained using Pytorch custom kernels. Here are the exact figures:

• baseline (nanoGPT implementation): ~143ms
• sdpa_math (generic): ~134ms (6.71% faster)
• mem_efficient kernel: ~119ms (20.16% faster)
• flash_attention kernel: ~113ms (26.54% faster)
• flash_attention + padded vocab: ~87ms (64.37% faster)

    All code was run on an 8 x NVIDIA Corporation A100 server with 80 GB HBM [A100 SXM4 80GB], and for the purpose of this experiment dropout was set to 0.

    + +

    Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models

    + +

    Figure 2: Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for nanoGPT shown here.

    + +

    Enhancing Numerical Model Stability

    + +

    In addition to being faster, PyTorch’s implementation offers increased numerical stability by avoiding loss of precision in many execution scenarios. There is a great explanation here, but essentially the PyTorch implementation scales the Query and Key matrices before multiplication, which is said to be more stable and avoid loss of precision. Because of the merged custom kernel architecture of SDPA, this scaling does not introduce additional overhead in the computation of the attention result. In comparison, an implementation from the individual computational components would require separate pre-scaling at additional cost. For an additional explanation, see Appendix A.

    + +

    Improved Memory Consumption

    + +

Yet another large advantage of using the torch SDPA kernels is the reduced memory footprint, which allows for the utilization of larger batch sizes. The following chart compares the best validation loss after one hour of training for both flash attention and the baseline implementation of causal attention. As can be seen, the maximum batch size achieved with the baseline causal attention implementation (on an 8 x NVIDIA Corporation A100 server with 80 GB HBM) was 24, significantly less than the maximum achieved with flash attention, which was 39.

    + +

    Using Flash Attention enables the usage of larger batch sizes

    + +

    Figure 3: Using Flash Attention enables the usage of larger batch sizes, allowing users to achieve lower validation loss after one hour of training (smaller is better).

    + +

    Conclusion

    + +

    Accelerated PyTorch 2 Transformers were designed to make the training and production deployment of state-of-the-art transformer models affordable and integrated with PyTorch 2.0 model JIT compilation. The newly introduced PyTorch SDPA operator provides improved performance for training Transformer models and is particularly valuable for the expensive Large Language Model training. In this post we demonstrate a number of optimizations on the exemplary nanoGPT model including:

• Over 26% training speedup, when compared against the baseline with constant batch size
• An additional speedup achieved with padded vocabulary, bringing the total optimization to approximately 64% compared to the baseline
• Additional numerical stability

    Appendix A: Analyzing Attention Numeric Stability

    + +

    In this section we provide a more in depth explanation of the previously mentioned enhanced numerical stability which is gained by prescaling SDPA’s input vectors. The following is a simplified version of nanoGPT’s mathematical implementation of SDPA. The important thing to note here is that the query undergoes matrix multiplication without being scaled.

    + +
# nanoGPT implementation of SDPA
# notice q (our query vector) is not scaled !
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
att = F.softmax(att, dim=-1)

# Dropout is set to 0, so we can safely ignore this line in the implementation
# att = self.attn_dropout(att)

y_nanogpt = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    + +

    The following is the equivalent mathematical implementation in torch’s scaled_dot_product_attention.

    + +
# PyTorch implementation of SDPA
embed_size = q.size(-1)
scaling_factor = math.sqrt(math.sqrt(embed_size))
q = q / scaling_factor  # notice q _is_ scaled here !

# same as above, but with scaling factor
att = q @ (k.transpose(-2, -1) / scaling_factor)
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
att = F.softmax(att, dim=-1)

# Dropout is set to 0, so we can safely ignore this line in the implementation
# att = self.attn_dropout(att)

y_scale_before = att @ v
    + +

    Mathematically both approaches should be equivalent, however our experimentation shows that in practice we receive different results from each approach.

    + +

    Using the approach above, we verified y_scale_before matches the expected output from using the scaled_dot_product_attention method while y_nanogpt does not.

    + +

    The torch.allclose method was used to test equivalence. Specifically, we showed that:

    + +
y_sdpa = torch.nn.functional._scaled_dot_product_attention(
    q,
    k,
    v,
    attn_mask=self.bias[:,:,:T,:T] != 0,
    dropout_p=0.0,
    need_attn_weights=False,
    is_causal=False,
)

torch.allclose(y_sdpa, y_nanogpt)       # False, indicating fp issues
torch.allclose(y_sdpa, y_scale_before)  # True, as expected
    + +

    Appendix B: Reproducing Experiment Results

    + +

    Researchers seeking to reproduce these results should start with the following commit from Andrej’s nanoGPT repository - b3c17c6c6a363357623f223aaa4a8b1e89d0a465. This commit was used as the baseline when measuring the per batch speed improvements. For results which include padded vocabulary optimizations (which yielded the most significant improvements to batch speed), use the following commit - 77e7e04c2657846ddf30c1ca2dd9f7cbb93ddeab. From either checkout, selecting kernels for experimentation is made trivial with the use of the torch.backends API.

    + +

    The desired kernel can be selected via a context manager:

    + +
with torch.backends.cuda.sdp_kernel(
    enable_math=False,
    enable_flash=False,
    enable_mem_efficient=True
):
    train(model)
    + +
Accelerating Llama3 FP8 Inference with Triton Kernels | PyTorch
by Adnan Hoque, Less Wright, Chih Chieh Yang

    1.0 Summary

    + +

    We present an optimized Triton FP8 GEMM (General Matrix-Matrix Multiply) kernel TK-GEMM, which leverages SplitK parallelization. For small batch size inference, TK-GEMM delivers up to 1.94x over the base Triton matmul implementation, 1.87x speedup over cuBLAS FP8 and 1.71x over cuBLAS FP16 for Llama3-70B inference problem sizes on NVIDIA H100 GPUs.

    + +

    TK-GEMM Speedup over PyTorch (calling cuBLAS) for Llama3-70B Attention Layer Matrix Shapes (N=K=8192)

    + +

    Figure 1. TK-GEMM Speedup over PyTorch (calling cuBLAS) for Llama3-70B Attention Layer Matrix Shapes (N=K=8192)

    + +

In this blog, we will cover how we designed an optimized kernel using Triton for FP8 inference and tuned it for Llama3-70B inference. We will cover FP8 (8-bit floating point), a new datatype supported by Hopper generation GPUs (SM90), the key SM90 features that Triton supports, and how we modified the parallelization to be able to maximize memory throughput for memory-bound (inference) problem sizes.

    + +

    We also dedicate a section on CUDA graphs, an important technology that will help materialize kernel level speedups and enable developers who want to use Triton kernels in production settings to get additional performance gain.

    + +

    Repo and code available at: https://github.com/pytorch-labs/applied-ai

    + +

    2.0 FP8 Datatype

    + +

    The FP8 datatype was introduced jointly by Nvidia, Arm and Intel and serves as a successor to 16-bit floating point types. With half the bit count, it has the potential to provide significant throughput improvements over its predecessors for Transformer networks. The FP8 datatype consists of 2 formats:

    + +

E4M3 (4-bit exponent and 3-bit mantissa). Able to store +/- 448 and nan.
E5M2 (5-bit exponent and 2-bit mantissa). Able to store +/- 57,344, nan and inf.

    + +

    BF16, FP16, FP8 E4M3 and FP8 E5M2

    + +

    Above: BF16, FP16, FP8 E4M3 and FP8 E5M2.
    +To show precision differences, the closest representation to 0.3952 is shown in each format.
    +Image Credit: Nvidia

    + +

We use E4M3 in inference and forward-pass training due to its higher precision, and E5M2 in the training backward pass due to its higher dynamic range. Nvidia has designed their H100 FP8 Tensor Core to provide a peak of 3958 TFLOPS, 2x the FLOPS of the FP16 Tensor Core.
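Recent PyTorch releases expose both formats as tensor dtypes; a tiny sketch (not part of the TK-GEMM kernel itself) that contrasts their rounding behavior:

import torch

x = torch.randn(4, 4)

x_e4m3 = x.to(torch.float8_e4m3fn)  # higher precision, max ~448: inference / forward
x_e5m2 = x.to(torch.float8_e5m2)    # higher dynamic range, max ~57344: backward grads

# Round-trip back to float32 to inspect the quantization error of each format.
print((x - x_e4m3.float()).abs().max())
print((x - x_e5m2.float()).abs().max())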

    + +

    We designed our Triton kernel with these hardware innovations in mind and in the rest of the blog we will discuss methods to leverage and verify that these features are indeed being utilized by the Triton compiler.

    + +

    3.0 Triton Hopper Support and FP8 Tensor Core Instruction

    + +

    The Hopper GPU architecture has added the following new features that we can expect will accelerate FP8 GEMM.

• TMA (Tensor Memory Accelerator) Hardware Unit
• WGMMA (Warp Group Matrix Multiply-Accumulate Instruction)
• Threadblock Clusters

    Triton currently takes advantage of one of these features, the wgmma instruction, whereas PyTorch (calling cuBLAS) leverages all 3 which makes these speedups even more impressive. To fully take advantage of the Hopper FP8 Tensor Core, the wgmma is necessary even though the older mma.sync instruction is still supported.

    + +

    The key difference between the mma and wgmma instructions is that instead of 1 CUDA warp being responsible for an output shard, an entire warp group, 4 CUDA warps, asynchronously contributes to an output shard.

    + +

    To see what this instruction looks like in practice, and to verify that our Triton Kernel is indeed utilizing this feature we analyzed the PTX and SASS assembly using nsight compute.

    + +

    PTX Assembly

    + +

    Figure 2. PTX Assembly

    + +

    This instruction is further lowered into a QGMMA instruction in SASS.

    + +

    SASS Assembly

    + +

    Figure 3. SASS Assembly

    + +

    Both instructions tell us that we are multiplying two FP8 E4M3 input tensors and accumulating in F32, which confirms that the TK-GEMM Kernel is utilizing the FP8 Tensor Core and the lowering is being done correctly.

    + +

    4.0 SplitK Work Decomposition

    + +

    TK-GEMM vs Base Triton GEMM TFLOPS for M = 1-64

    + +

    Figure 4. TK-GEMM vs Base Triton GEMM TFLOPS for M = 1-64

    + +

The base Triton FP8 GEMM implementation does not perform well for the small M regime, where for a matrix multiplication of A (MxN) x B (NxK), M < N, K. To optimize for this type of matrix profile, we applied a SplitK work decomposition instead of the Data Parallel decomposition found in the base Triton kernel. This greatly improved latencies for the small M regime.

    + +

    For background, SplitK launches additional thread blocks along the k dimension to calculate partial output sums. The partial results from each thread block are then summed using an atomic reduction. This allows for finer grained work decomposition with resultant performance improvements. More details on SplitK are available in our arxiv paper.
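To make the decomposition concrete, here is a tiny PyTorch sketch of the SplitK idea, partial sums over slices of K that are then reduced (the real kernel performs this across Triton thread blocks with atomic adds):

import torch

def splitk_matmul(a, b, split_k=4):
    # a: (M, K), b: (K, N). Each "job" handles one slice of K and produces a
    # partial output; the partials are then summed (the kernel uses atomics).
    M, K = a.shape
    chunk = K // split_k
    partials = [
        a[:, i * chunk:(i + 1) * chunk] @ b[i * chunk:(i + 1) * chunk, :]
        for i in range(split_k)
    ]
    return torch.stack(partials).sum(dim=0)

a = torch.randn(1, 1024)      # small M regime (M=1)
b = torch.randn(1024, 1024)
assert torch.allclose(splitk_matmul(a, b), a @ b, atol=1e-3)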

    + +

    After carefully tuning the other relevant hyperparameters for our kernel such as tile sizes, number of warps and the number of pipeline stages to Llama3-70B problem sizes we were able to produce up to 1.94x speedup over the Triton base implementation. For a more comprehensive introduction to hyperparameter tuning, see our blog.

    + +

    NCU profiler times for TK-GEMM under varying batch sizes, and compared with PyTorch (calling cuBLAS) FP8 and FP16.

    + +

    Above: NCU profiler times for TK-GEMM under varying batch sizes, and compared with PyTorch (calling cuBLAS) FP8 and FP16.

    + +

    Note that starting at M=32, the cuBLAS FP8 kernel starts to outperform TK-GEMM. For M >= 32, we suspect that hyperparameters we found are not optimal, and thus another set of experiments is required to determine the optimal parameters for the mid-sized M regime.

    + +

    5.0 CUDA Graphs to Enable End-to-End Speedup

    + +

To be able to realize these speedups in an end-to-end setting, we must take into account both the kernel execution time (GPU duration) and the wall time (CPU+GPU duration). Triton kernels, which are handwritten (as opposed to torch.compile generated), are known to suffer from high kernel launch latencies. If we use torch profiler to trace the TK-GEMM kernel, we can see the call stack on the CPU side to pinpoint exactly what is causing the slowdown.

    + +

    CPU Launch Overhead: 2.413ms

    + +

    Figure 5. CPU Launch Overhead: 2.413ms

    + +

    From above, we see that the majority of the wall time of our optimized kernel is dominated by JIT (Just-in-Time) compilation overhead. To combat this we can use CUDA graphs.

    + +

    CUDA Graphs Visualization

    + +

    Figure 6. CUDA Graphs Visualization
    +Image Credit: PyTorch

    + +

The key idea is that instead of multiple kernel launches, we can create and instantiate a graph once (a one-time cost) and then submit that instance of the graph for execution. To illustrate this point, we simulate a Llama3-70B Attention layer. As shown in the figure below, generated using Nsight Systems, the time between each GEMM is 165us compared to the 12us spent on the actual matmul, due to the CPU kernel launch overhead. This means that 92% of the time in an Attention layer the GPU is idle and not doing any work.

    + +

    Simulated Llama3-70B Attention Layer with TK-GEMM

    + +

    Figure 7. Simulated Llama3-70B Attention Layer with TK-GEMM

    + +

    To show the impact of CUDA graphs, we then created a graph of the TK-GEMM kernel in the toy Attention layer and replayed the graph. Below, we can see that the gaps between kernel executions are reduced to 6.65us.

    + +

    Simulated Llama3-70B Attention Layer with TK-GEMM and CUDA Graphs

    + +

    Figure 8. Simulated Llama3-70B Attention Layer with TK-GEMM and CUDA Graphs
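In PyTorch, capturing and replaying such a graph follows the standard torch.cuda.CUDAGraph pattern; a minimal sketch with a plain matmul standing in for TK-GEMM:

import torch

a = torch.randn(16, 8192, device="cuda")
b = torch.randn(8192, 8192, device="cuda")
out = torch.empty(16, 8192, device="cuda")

# Warm up on a side stream before capture, as recommended for CUDA graphs.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    torch.mm(a, b, out=out)
torch.cuda.current_stream().wait_stream(s)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    torch.mm(a, b, out=out)   # captured, not executed

for _ in range(10):
    g.replay()                # one launch replays the whole captured sequence
torch.cuda.synchronize()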

    + +

    In practice, this optimization would result in a 6.4x speedup of a single attention layer in Llama3-70B, over naively using TK-GEMM in a model without CUDA graphs.

    + +

    6.0 Potential Future Optimization Paths

    + +

    TMA Hardware Unit

    + +

    Figure 9. TMA Hardware Unit
    +Image Credit: Nvidia

    + +

    The Nvidia H100 features a TMA hardware unit. The dedicated TMA unit frees up registers and threads to do other work, as address generation is completely handled by the TMA. For memory bound problem sizes, this can provide even further gain when Triton enables support for this feature.

    + +

    Tensor Core Utilization (Arrows Indicate Degrees of Freedom)

    + +

    Figure 10. Tensor Core Utilization (Arrows Indicate Degrees of Freedom)

    + +

To identify how well we are utilizing the Tensor Core, we can analyze the roofline chart. Notice that we are in the memory-bound region, as expected for small M. To improve kernel latency, we can either increase the arithmetic intensity, which with a fixed problem size can only be achieved by exploiting data locality and other loop optimizations, or increase the memory throughput. This requires a more optimal parallel algorithm, specialized for the FP8 datatype as well as for the problem-size characteristics we expect to see in FP8 inference.

    + +

    DRAM Throughput Circled, 1.65TB/s vs Peak 3.35TB/s on H100 (M=16, N=8192, K=8192)

    + +

    Figure 11. DRAM Throughput Circled, 1.65TB/s vs Peak 3.35TB/s on H100 (M=16, N=8192, K=8192)

    + +

    Lastly, we can see that we are only achieving around 50% of peak DRAM throughput on the NVIDIA H100. High performance GEMM kernels typically achieve around 70-80% of peak throughput. This means that there is still a lot of room to improve and the techniques mentioned above (loop unrolling, optimized parallelization) are needed for additional gain.

    + +

    7.0 Future Work

    + +

    For future research, we would like to explore CUTLASS 3.x and CuTe to leverage more direct control over Hopper features especially in terms of obtaining direct TMA control and exploring pingpong architectures, which have shown promising results for FP8 GEMM.

    + +
Accelerating LLM Inference with GemLite, TorchAO and SGLang | PyTorch
by Teams at PyTorch, Mobius Labs and SGLang

Large Language Models (LLMs) are typically very resource-intensive, requiring significant amounts of memory, compute and power to operate effectively. Quantization provides a solution by reducing weights and activations from 16-bit floats to lower bitrates (e.g., 8-bit, 4-bit, 2-bit), achieving significant speedup and memory savings and also enabling support for larger batch sizes.

    + +

Existing solutions for low precision inference work well for small batch sizes, but suffer from the following issues:

• Performance drops when we increase the batch size
• Restrictions on types of quantization; for example, some kernels only support symmetric quantization, which could have implications on the accuracy of the model at lower bits
• Interplay between quantization, serialization, and tensor parallelism (TP) makes it difficult to load quantized models and requires changes to user models

    To address these challenges, we created an end-to-end, performant, modular and extensible low-precision inference solution integrating the following libraries:

• GemLite, a Triton kernel library, tackles the performance limitations of large batch sizes and restrictions on the types of quantization
• TorchAO, a PyTorch-native library, provides a streamlined experience for quantization, sparsity, and tensor parallelism (with DTensor)
• SGLang, a fast, efficient and hackable serving framework for Large Language Models (LLM) and Vision Language Models (VLM) with extensive model support

If you're interested in trying this out in SGLang, please follow these repro instructions. For the rest of the blog, we'll walk through relevant details for GemLite, TorchAO and SGLang, both in terms of the design of each library and their integration, in addressing the problems we mentioned above. At the end, we'll present the benchmarking results on the Llama 3.1-8B model across different batch sizes and tensor parallel sizes.

    + +

    1. Teaser of Results

    + +

    Following is a summary of the results in 8xH100 machine on Llama 3.1-8B for decode. For all experiments, the baseline is bfloat16 torch.compiled model:

|                          | bfloat16 w/ torch.compile | int4 weight only quantization, group size 64 | float8 per row dynamic quantization |
| Batch size 1, TP size 1  | 131 tokens/sec            | 255 tokens/sec (1.95x speedup)               | 166 tokens/sec (1.27x speedup)      |
| Batch size 32, TP size 1 | 2799 tokens/sec           | 3241 tokens/sec (1.16x speedup)              | 3586 tokens/sec (1.28x speedup)     |
| Batch size 32, TP size 4 | 5575 tokens/sec           | 6334 tokens/sec (1.14x speedup)              | 6159 tokens/sec (1.10x speedup)     |
    + +

    Our solution supports NVIDIA GPUs, including H100 and A100, and achieves speedup over the compiled bfloat16 baseline across batch sizes and TP sizes for both int4 weight only (from 1.14x to 1.95x) and float8 dynamic quantization (from 1.10x to 1.28x). Note that quantization may have a small impact on accuracy, which is outside the scope of this blogpost. Our int4 weight-only quantization is compatible with accuracy preserving techniques like HQQ. Please refer to TorchAO’s README, this benchmark, and this blog for more information.

    + +

    2. GemLite: Kernel Development

    + +

    The kernels were developed as part of GemLite, a project dedicated to optimizing low-bit matrix multiplication kernels. Developed using Triton, GemLite provides highly flexible and performant solutions across various activations, bitrates and hardware. In a nutshell, the kernels offer:

• Support for various activation data types: fp16, int8 and fp8
• Compatibility: works seamlessly with non-packed (e.g., int8, fp8) and packed formats (e.g., uint4, uint2, uint1)
• Performance Optimization: includes optimized kernels and autotuning tools to achieve high performance across different hardware and batch sizes
• Integration: compatible with torch.compile and CUDA graphs, ensuring support for advanced features like tensor parallelism

    Kernel Selection

    + +

    Optimizing kernel selection for large language model (LLM) generation requires addressing the distinct needs of different batch sizes. LLM workloads involve a mix of compute-bound and memory-bound iterations: smaller batch sizes are memory-bound, while larger batch sizes become compute-bound. GemLite kernels are designed to adapt to these varying demands, ensuring optimal execution for each scenario.

    + +

    In memory-bound scenarios, where data transfer is the limiting factor, the processor often waits for data to be fetched, leading to underutilized computational resources. For batch size = 1, a GEMV kernel performs best, whereas for larger batch sizes, GEMM kernels are more efficient. For batch sizes between 2 and 64, when matrices are “skinny,” a GEMM-SPLITK kernel is used to enable better GPU utilization (arXiv).
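A hypothetical sketch of that dispatch logic (the names and thresholds here are illustrative, not GemLite's actual selection code):

def select_kernel(batch_size: int) -> str:
    if batch_size == 1:
        return "GEMV"           # single sample, memory-bound
    if batch_size <= 64:
        return "GEMM_SPLITK"    # "skinny" matrices: split K for better utilization
    return "GEMM"               # larger batches become compute-bound

for bs in (1, 8, 32, 128):
    print(bs, select_kernel(bs))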

    + +

    GemLite includes the following kernels optimized for each of these scenarios:

    + +

    Single Sample Inference

    + +

    For single-sample inferences, we use GEMV kernels. However, asymmetric quantization methods require additional metadata, such as scales and zero points, to be loaded for each block. This can lead to increased memory transfer, so careful handling is essential.

    + +

    Specifically, for packed data, our experiments indicate that loading scales and zero points only once per two consecutive blocks minimizes redundant operations. Since these blocks share the same metadata, this approach results in:

• 5–8% end-to-end inference speedup compared to the default GEMV kernel
• 30–40% improvement over the traditional Split-K method

    This new kernel/algorithm, GEMV_REVSPLITK, is available here.

    + +

    For non-packed data, the GEMV_SPLITK algorithm is employed. This algorithm iterates over the k-dimension to compute the dot product without relying on Triton’s tl.dot.

    + +

    Batched Inference

    + +

    For moderate batch sizes, we use the GEMM-based Split-K method (arXiv) which splits the k-dimension (weight rows) into multiple jobs. The optimal-split SPLIT_K parameter is found by autotuning values ranging from 1 to 16. Setting SPLIT_K=1 enables a fallback implementation to a GEMM kernel, allowing the same kernel code to be used for compute-bound batch sizes starting from 32 and 64, depending on the matrix shape and the device.

    + +

    Maximizing High Performance: Key Implementation Insights

    + +

Various implementation details must be carefully addressed to achieve high performance. The following are some of the key aspects we focused on:

    + +
      +
    1. +

      Autotuning for Performance

      + +

      Autotuning is critical for achieving optimal kernel performance. Since this process can be time-intensive, GemLite provides tools to automatically save and load autotuning results for all kernels. This ensures that the autotuning process is performed only once per GPU device, minimizing runtime, reducing repetitive overhead, and maintaining consistent performance across runs.

      +
    2. +
    3. +

      Ensuring Kernel Correctness

      + +

Ensuring kernel correctness across different quantization and configuration settings is essential. Triton’s early configuration pruning plays a key role in this process. For example, during Split-K tuning, configurations are selected only if K is divisible by BLOCK_SIZE_K × SPLIT_K, and BLOCK_SIZE_K is further pruned based on the group-size value (see the sketch after this list). This approach ensures both efficiency and correctness in kernel operation.

      +
    4. +
    5. +

      Overcoming Bit-Unpacking Bottlenecks

      + +

When deploying on data center-grade GPUs like NVIDIA’s A100 and H100, performance bottlenecks related to bit-unpacking were observed. To mitigate these, various bit-packing configurations were explored, including packing along columns versus rows and experimenting with different bit-packing widths (e.g., 8-bit vs. 32-bit). Notably, transitioning from 32-bit to 8-bit packing delivered performance improvements of up to 18% on the A100 and 6% on the H100.

      +
    6. +
    7. +

      torch.compile compatibility

      + +

      To ensure seamless compatibility with PyTorch’s torch.compile, kernel calls are wrapped in a custom_op. This integration allows advanced features such as pre-hooks and early configuration pruning to function correctly, delivering accurate results without sacrificing performance. While some of these features are not yet fully supported in PyTorch, the custom_op implementation effectively bridges the gap, ensuring smooth integration and high performance.

      +
    8. +
    + +
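To make the divisibility-based pruning in point 2 concrete, here is a small Python sketch; the config dictionaries are hypothetical stand-ins for Triton autotune configs, and the exact group-size rule may differ from GemLite’s implementation:

def prune_splitk_configs(configs, K, group_size):
    kept = []
    for cfg in configs:
        block_k, split_k = cfg["BLOCK_SIZE_K"], cfg["SPLIT_K"]
        # Correctness: the K-tiling must evenly divide the reduction dimension.
        if K % (block_k * split_k) != 0:
            continue
        # Illustrative group-size rule: BLOCK_SIZE_K and the quantization
        # group size should tile each other cleanly.
        if block_k % group_size != 0 and group_size % block_k != 0:
            continue
        kept.append(cfg)
    return kept

# Example: keep only valid configs for K=4096, group_size=64
configs = [{"BLOCK_SIZE_K": bk, "SPLIT_K": sk} for bk in (16, 32, 64) for sk in (1, 2, 4, 8)]
valid = prune_splitk_configs(configs, K=4096, group_size=64)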

    3. TorchAO

    + +

    TorchAO is a PyTorch native quantization and sparsity library for both training and inference, featuring simple user APIs to train, quantize and deploy low precision models, and composability with other PyTorch features like distributed inference and torch.compile.

    + +

PyTorch does not support low precision dtypes or different packing formats by default. With tensor subclasses, we extend PyTorch’s native Tensor abstraction and model quantization as a dtype conversion, while different packing formats for custom kernels are handled through layouts. For example, we support quantized linear operations with int4 weights, packed in a Tensor Core friendly layout, with tinygemm or GemLite kernel implementations. More details can be found here.
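For example, the int4 weight-only path can be applied with a one-line API call. The snippet below is a minimal sketch; exact import paths and config names may vary across TorchAO versions:

import torch
import torch.nn as nn
from torchao.quantization import quantize_, int4_weight_only

# Toy stand-in for a real checkpoint; int4 weight-only expects bfloat16 weights on CUDA.
model = nn.Sequential(nn.Linear(1024, 1024)).cuda().to(torch.bfloat16)

# 4-bit groupwise weight-only quantization with group size 64, packed into a
# Tensor Core friendly layout and served by the tinygemm kernel.
quantize_(model, int4_weight_only(group_size=64))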

    + +

    flow diagram

    + +

    Apart from more PyTorch native abstractions for developers, we want to highlight two benefits of this design for modeling users.

    + +
      +
    1. +

Serialization: Save and load quantized weights into a state_dict just like a floating point model, eliminating the need to transform the floating point model into a quantized model before the quantized weights are loaded (see the sketch after this list). This reduces the friction of distributing and deploying quantized models.

      +
    2. +
    3. +

      Composability: Seamless integration with downstream features like tensor parallel, allowing users to focus on modeling without worrying about compatibility with tensor parallel, torch.compile, and other PyTorch features. Since these features are implemented with Tensor level abstraction, users can quantize and do distributed inference with no model changes most of the time.

      +
    4. +
    + +
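A minimal sketch of the serialization flow from point 1 above, reusing the int4 config from the earlier example (the checkpoint path and toy model are illustrative):

import torch
import torch.nn as nn
from torchao.quantization import quantize_, int4_weight_only

# Quantize once and save: the quantized weights live in an ordinary state_dict.
model = nn.Sequential(nn.Linear(1024, 1024)).cuda().to(torch.bfloat16)
quantize_(model, int4_weight_only(group_size=64))
torch.save(model.state_dict(), "quantized_checkpoint.pt")

# Load elsewhere: because quantization is modeled as a tensor subclass, the
# quantized weights can be assigned directly into a freshly built float model.
fresh = nn.Sequential(nn.Linear(1024, 1024)).cuda().to(torch.bfloat16)
fresh.load_state_dict(torch.load("quantized_checkpoint.pt"), assign=True)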

    GemLite Kernel Integration

    + +

To achieve the aforementioned benefits for the GemLite kernel, we integrated GemLite into TorchAO. This integration takes advantage of GemLite’s wide support and flexibility to allow for weight-only quantization at 4 and 8 bits, under asymmetric and symmetric quantization schemes, 32 and 8 bit packing sizes, as well as grouped and ungrouped quantization. We enable this integration via the quantize_ API, which can be used alongside the GemLite constructor as follows:

    + +
    quantize_(model, gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth))
    +
    + +

The primary difficulty in creating this integration was making sure that the TorchAO composability guarantees were satisfied for the entire breadth of GemLite quantization kernel options. While the primary integration was relatively straightforward, making sure every different quantization type and its associated kernels worked well with tensor parallel was non-trivial.

    + +

    Torch Tensor Parallel

    + +

    Tensor Parallelism is an effective way to speed up LLM inference. TP shards large matrices of linear or embedding modules onto multiple devices, typically in column-wise or row-wise styles. As the weight matrix gets distributed, computation is decomposed too. For example, the column-wise pattern below enables simultaneous matrix-vector multiply on four devices:

    + +

    equation

    + +

    PyTorch implements TP by converting a regular tensor (e.g. matrix A) into a DTensor:

    + +
    dtensor = _shard_tensor(mA, device_mesh, (Shard(0),))
    +
    + +

Since DTensor stores meta information about the sharding, it knows how to reconstruct the full result when needed. Take the Transformer feedforward module for example: since the up projection and down projection use column-wise and row-wise sharding respectively, DTensor will automatically perform an all-reduce on the ranks’ results as they move into the next operation. Such automation allows model authors to focus on computation without worrying about the communication needed for distributed execution.

    + +

    Tensor Parallel and Quantization Order

    + +

Since both DTensor and quantization are tensor-level transformations, the application order matters for ensuring a workflow works across different setups. We have two observations: (i) checkpoints are typically saved in quantized formats, to avoid paying the quantization overhead before each run; and (ii) TP may run on a different number of devices, depending on resource constraints or service agreements. As such, we first apply quantization to the original tensor and, if reuse is desired, save it to disk. At service launch time, we load the quantized checkpoint and shard the tensors into DTensors on-the-fly as we load them into the model.

    + +

    Tensor Parallel Support in TorchAO

    + +

Since we quantize the model first and then distribute the Tensor, we’ll have DTensor(QuantizedTensor(weight)), where DTensor is the distributed Tensor class and QuantizedTensor is a quantized tensor class in TorchAO. QuantizedTensor should support the operators called when constructing a DTensor, including slice and view ops. To make sure the overall execution is efficient, slicing the packed weight in dimensions 0 and 1 should match the result of first slicing the unpacked weight and then packing it (the pack and slice operations should commute); otherwise, the packing format is not compatible with tensor parallelism.
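This commutation requirement can be phrased as a small property check; pack and the two slice specifications below are hypothetical placeholders for a kernel’s packing routine and the TP shard boundaries:

import torch

def pack_and_slice_commute(weight, pack, plain_slice, packed_slice):
    # plain_slice slices the unpacked weight; packed_slice is the corresponding
    # slice in packed coordinates (they differ when packing shrinks that dimension).
    sliced_then_packed = pack(weight[plain_slice])
    packed_then_sliced = pack(weight)[packed_slice]
    return torch.equal(sliced_then_packed, packed_then_sliced)

# Trivial example with an identity "packing" and a row-wise shard across 2 ranks:
w = torch.randn(8, 8)
assert pack_and_slice_commute(w, lambda t: t.contiguous(), (slice(0, 4),), (slice(0, 4),))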

    + +

    4. SGLang

    + +

    SGLang is a fast serving framework for large language models and vision language models. It is known for its almost zero-overhead batch scheduler and fast constrained decoding. It is mainly implemented in Python, lightweight, and easy to hack. It is also one of the first frameworks to integrate torch.compile.

    + +

    TorchAO integration in SGLang

    + +

We integrated the TorchAO quantize_ API, which applies a specific type of quantization to a model, into SGLang. So far it supports int4 weight-only quantization (both the tinygemm and GemLite versions), float8 dynamic quantization, and a few other types of quantization. Users can enable quantization by adding the --torchao-config argument to the benchmarking script. The currently enabled options also support tensor parallelism through composition with DTensor, which is enabled with the --tp-size option.

    + +

    Torch Native Tensor Parallel Support in SGLang

    + +

Existing model definitions in SGLang use special linear modules that are coupled with the tensor parallelism style, for example: MergedColumnParallelLinear, QKVParallelLinear and RowParallelLinear. To decouple the model definition from the tensor parallelization style, we defined a PyTorch-native model that uses the plain nn.Linear module from PyTorch and relies on PyTorch tensor parallelism APIs for parallelization and torch.compile for speedup. At the related module hierarchies, we add a dictionary describing how a submodule should be parallelized. For example, in class LlamaAttention, we define:

    + +
    _tp_plan = {
    +    "qkv_proj": "Colwise_Sharded",
    +    "o_proj": "Rowwise",
    +}
    +
    + +

    where "qkv_proj" and "o_proj" are the FQNs of the wqkv and wo projections, and the values are their TP styles.

    + +

    We then define a TP engine in model_parallel.py. It searches for _tp_plan recursively within the model, and applies the indicated TP styles to the submodules using PyTorch’s parallelize_module API.
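A simplified sketch of what such an engine might look like is shown below; the mapping from plan strings to TP styles is illustrative, and the real implementation in model_parallel.py may handle more styles and options:

import torch.nn as nn
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# Illustrative mapping from _tp_plan strings to PyTorch TP styles.
STYLE_MAP = {
    "Colwise_Sharded": ColwiseParallel(),
    "Rowwise": RowwiseParallel(),
}

def apply_tp_plans(model: nn.Module, device_mesh) -> None:
    # Walk the module tree and, wherever a submodule declares a _tp_plan,
    # parallelize the named projections with the corresponding style.
    for module in model.modules():
        plan = getattr(module, "_tp_plan", None)
        if plan:
            parallelize_module(
                module,
                device_mesh,
                {name: STYLE_MAP[style] for name, style in plan.items()},
            )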

    + +

    5. Results

    + +

    The evaluation focused on two popular quantization techniques for H100 machines: int4 weight-only quantization and float8 dynamic quantization. These methods were chosen due to their widespread use in optimizing memory efficiency and computational performance on H100 machines, making them ideal candidates for benchmarking against various workloads.

    + +
      +
    • int4 Weight-Only Quantization: This method significantly reduces memory footprint and accelerates decode for memory-bound workloads, with minimal impact on performance in compute-intensive scenarios like prefill or larger batch sizes. We present results for bf16, GemLite, and tinygemm kernels below, across various batch sizes and tensor parallel configurations
    • +
    • float8 Dynamic Quantization: While offering less memory savings, this method often provides higher accuracy and balanced speedups for both memory-bound and compute-bound tasks. With Hopper-grade hardware and native fp8 support, the efficient cutlass/cuBLAS kernels used by AO contribute to a significant speedup
    • +
    + +

The graphs below show the decode tokens/sec for different TP sizes; each graph shows the results across different batch sizes and for different types of quantization:

    + +
      +
    • BF16 is our bfloat16, torch.compile’d baseline
    • +
    • tinygemm-4-64 is using int4_weight_only quantization in TorchAO, it’s a 4 bit groupwise quantization with group size of 64, using tinygemm kernel
    • +
    • gemlite-4-64 is using gemlite_uintx_weight_only quantization in TorchAO, 4 means 4 bit, and 64 is also the group size, using GemLite kernel
    • +
    • fp8dq-per_row is using float8_dynamic_activation_float8_weight quantization in TorchAO, both activation and weights are quantized with per row scales
    • +
    + +

    bar chart

    + +

    bar chart

    + +

    bar chart

    + +

    For int4 weight-only quantization, at batch size 1, the tinygemm kernel achieved the best performance. However, its efficiency declined with increasing batch sizes. Conversely, GemLite effectively bridged this gap, delivering superior performance at larger batch sizes. GemLite also achieved a 9–10x speedup during the prefill phase compared to tinygemm, despite ongoing performance optimizations constrained by Triton.

    + +

Float8 dynamic quantization showed a consistent 1.3x speedup over bfloat16 with tensor parallel size 1 across different batch sizes, and 1.1x to 1.2x speedups at larger tensor parallel sizes. As the tensor parallel size increases, the overall speedup decreases, which is expected due to the reduction in matmul size. Note that we do expect to get a speedup for prefill as well, but since we rely on torch.compile for speedup and prefill compilation is not enabled in SGLang yet, we leave this for future work.

    + +

    Repro Instructions

    + +

    We conducted benchmarks on an 8xH100 machine using GemLite 0.4.1, SGLang built from commit feb2b76, TorchAO nightly 0.8.0.dev20241223+cu124, and PyTorch 2.5.1. The Llama-3.1 Instruct models were chosen as the architecture for evaluation.

    + +
    BATCH_SIZE=16
    +# Note: gemlite is only compatible with float16
    +# while int4wo-64 (tinygemm-4-64 as shown in the graph) and fp8dq-per_row should use bfloat16
    +DTYPE=float16
    +# int4wo-64, fp8dq-per_tensor
    +TORCHAO_CONFIG=gemlite-4-64
    +TP_SIZE=2
    +# Decode performance
    +python3 -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' --dataset-name random --random-input 1024 --random-output 512 --random-range 1 --num-prompts $BATCH_SIZE --enable-torch-compile --dtype $DTYPE --torchao-config $TORCHAO_CONFIG --tp-size $TP_SIZE
    +
    +# Example output
    +# Benchmark...
+# [2024-12-20 12:42:16 TP0] Prefill batch. #new-seq: 2, #new-token: 2046, #cached-token: 4, cache hit rate: 0.06%, token usage: 0.00, #running-req: 0, #queue-req: 0
+# ...
+# [2024-12-20 12:45:35 TP0] Decode batch. #running-req: 16, #token: 16763, token usage: 0.01, gen throughput (token/s): 2.20, #queue-req: 0
+# [2024-12-20 12:45:38 TP0] Decode batch. #running-req: 16, #token: 24443, token usage: 0.02, gen throughput (token/s): 2739.89, #queue-req: 0
    +
    +# We reported the last throughput (token/s) as the performance for decode
    +
    + +

    Conclusion

    + +

With performant and extensible kernels from GemLite, the PyTorch native architecture optimization library TorchAO, and the high performance inference framework SGLang, we showcased fast end-to-end quantized inference for both int4 and float8 across different batch sizes and tensor parallel sizes, with simple and composable user APIs that reduce the resource requirements for LLMs. This integration is our first step towards meeting the needs of fast inference across different models, workloads, precisions and hardware, and we look forward to continuing to advance the state of the art for end-to-end mixed and low precision LLM inference.

    + +

    Our immediate future work focuses on the following:

    + +
      +
    • Exploring diverse combinations of weight and activation quantization to strike the best balance between speed and accuracy
    • +
    • Extending support to additional GPU architectures to broaden accessibility
    • +
    • Enhancing compatibility with MoE models to address growing demands in scalable inference
    • +
• Allowing for easy integration of fast custom kernels in TorchAO so that they can be easily leveraged by SGLang and other inference frameworks
    • +
• Developing an auto-quantization tool in TorchAO that lets users trade off performance and accuracy (we did not measure the accuracy impact in this blogpost)
    • +
    • Better integration with tensor parallelism in SGLang to support running larger models
    • +
• Enabling torch.compile for the prefill phase in SGLang
    • +
    + +

    We also invite the community to actively test, provide feedback, and contribute to shaping the future of fast and efficient LLM inference.

    + +
Accelerating MoE model inference with Locality-Aware Kernel Design

    + by + + Adnan Hoque, Less Wright, Antoni Virós Martin, Chih-Chieh Yang + +

    +

    1.0 Summary

    + +

    We show that by implementing column-major scheduling to improve data locality, we can accelerate the core Triton GEMM (General Matrix-Matrix Multiply) kernel for MoEs (Mixture of Experts) up to 4x on A100, and up to 4.4x on H100 Nvidia GPUs. This post demonstrates several different work decomposition and scheduling algorithms for MoE GEMMs and shows, at the hardware level, why column-major scheduling produces the highest speedup.

    + +

    Repo and code available at: https://github.com/pytorch-labs/applied-ai/tree/main/kernels/triton/inference/col_major_moe_gemm.

    + +


    Figure 1A. Optimized Fused MoE GEMM Kernel TFLOPs on A100 for varying Batch Sizes M

    + +


    Figure 1B. Optimized Fused MoE GEMM Kernel TFLOPs on H100 for varying Batch Sizes M

    + +

    2.0 Background

    + +

OpenAI’s Triton is a hardware-agnostic language and compiler that, as our prior blog post has shown, can be used to accelerate quantization workflows. We also showed that, in terms of kernel development, much of the same learnings and performance analysis tools from CUDA can be leveraged to provide similar insights into how Triton kernels work under the hood, and to guide subsequent measures to speed up these kernels in latency-sensitive environments. As Triton becomes increasingly adopted in production settings, it is important that developers understand the common tips and tricks for developing performant kernels, as well as the generality of these methods across different architectures and workflows. Thus, this post will explore how we optimized the Triton kernel developed by vLLM for the popular Mixture of Experts (MoE) Mixtral model using classical techniques, and how these techniques can be implemented in Triton to achieve performance gains.

    + +

    Mixtral 8x7B is a sparse Mixture of Experts Language Model. Unlike the classical dense transformer architecture, each transformer block houses 8 MLP layers where each MLP is an ‘expert’. As a token flows through, a router network selects which 2 of the 8 experts should process that token and the results are then combined. The selected experts for the same token vary at each layer. As a result, while Mixtral 8x7B has a total of 47B params, during inference only 13B params are active.

    + +

    The MoE GEMM (General Matrix-Matrix Multiply) kernel receives a stacked weight matrix containing all the experts, and must subsequently route each token to the TopK (2 for Mixtral) experts by utilizing a mapping array produced by the resultant scores of the router network. In this post, we provide methods to efficiently parallelize this computation during inference time, specifically during autoregression (or decoding stages).

    + +

    3.0 Work Decomposition - SplitK

    + +

    We have previously shown that for the matrix problem sizes found in LLM inference, specifically in the context of W4A16 quantized inference, GEMM kernels can be accelerated by applying a SplitK work decomposition. Thus, we started our MoE acceleration research by implementing SplitK in the vLLM MoE Kernel, which produced speedups of approximately 18-20% over the Data Parallel approach.

    + +

    This result shows that the SplitK optimization can be used as a part of a more formulaic approach to improving/developing Triton kernels in inference settings. To build intuition about these different work decompositions, let’s consider a simple example for the multiplication of two 4x4 matrices and SplitK=2.

    + +

    In the data parallel GEMM kernel shown below, the computation for a single block of the output matrix will be handled by 1 threadblock, TB0.

    + +


    Figure 2. Data Parallel GEMM

    + +

In contrast, in the SplitK kernel, the work required to compute 1 block in the output matrix is “split” or shared amongst 2 thread blocks TB0 and TB1. This provides better load balancing and increased parallelism.

    + +


    Figure 3. SplitK GEMM

    + +

The key idea is that we’ve increased our parallelism from MN to MN*SplitK. This approach does incur some costs, such as adding inter-threadblock communication via atomic operations. However, these costs are minimal compared to the savings in other constrained GPU resources like shared memory and registers. Most importantly, the SplitK strategy provides superior load balancing characteristics for skinny matrices, which is the common matrix profile in MoE inference during decoding.
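The decomposition can be illustrated with a few lines of NumPy, mirroring the 4x4, SplitK=2 example above (the loop models the two thread blocks TB0 and TB1):

import numpy as np

# 4x4 example with SplitK=2: two "thread blocks" each reduce half of the K
# dimension and accumulate into the same output block (an atomic add on the GPU).
A = np.random.rand(4, 4)
B = np.random.rand(4, 4)

C = np.zeros((4, 4))
for tb in range(2):                     # models TB0 and TB1
    ks = slice(tb * 2, (tb + 1) * 2)    # this block's half of K
    C += A[:, ks] @ B[ks, :]

assert np.allclose(C, A @ B)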

    + +

    4.0 GEMM Hardware Scheduling - Column Major

    + +

    To improve upon the ~20% speedup with SplitK we focused our investigation on the logic that controls the hardware scheduling of the GEMM in Triton Kernels. Our profiling of the vLLM MoE kernel showed a low L2 cache hit rate, thus we investigated three scheduling options - column-major, row-major and grouped launch. Due to some intrinsic properties of MoE models, such as large expert matrices, and having to dynamically load TopK (2 for Mixtral) matrices during the duration of the kernel, cache reuse/hit rate becomes a bottleneck that this optimization will target.

    + +

    For background, in our previous blog, we touched on the concept of “tile swizzling”, a method to achieve greater L2 cache hit rate. This concept relates to how the software schedules the GEMM onto the SMs of a GPU. In Triton, this schedule is determined by the pid_m and pid_n calculations. Our key insight is that for skinny matrix multiplications, a column-major ordering ensures optimal reuse of the columns of the weight matrix, B. To illustrate this, let’s take a look at a snippet of what a column major computation of pid_m, and pid_n would look like:

    + +


    Figure 4. Column Major ordering in PyTorch

    + +
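Since the snippet is shown as an image above, here is a plain-Python sketch of the same column-major mapping; the grid and block sizes are illustrative, and the real Triton kernel obtains pid from tl.program_id:

# Plain-Python model of the column-major tile schedule.
M, N, BLOCK_M, BLOCK_N = 8, 16, 4, 4
grid_m = (M + BLOCK_M - 1) // BLOCK_M   # number of output tiles along M
grid_n = (N + BLOCK_N - 1) // BLOCK_N   # number of output tiles along N

for pid in range(grid_m * grid_n):      # pid comes from tl.program_id in Triton
    pid_m = pid % grid_m                # advance down the M dimension first ...
    pid_n = pid // grid_m               # ... then move to the next output column
    # visits C(0,0), C(1,0), ..., C(grid_m-1,0), C(0,1), C(1,1), ...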

    From above, we note that with this mapping, we schedule the GEMM such that we calculate the output blocks of C in the following order: C(0, 0), C(1, 0), C(2, 0),… etc. To understand the implications we provide the following illustration:

    + +

    Activation matrix / Weight matrix

    + +

    L1/L2 Cache

    + +

    C - Output Matrix

    + +

    Figure 5. Cache Reuse Pattern for a Column-Major GEMM Schedule

    + +

In the above simplified view of a column-major schedule, let’s assume, for a GEMM with a skinny activation matrix A, that the entire matrix can fit in the GPU cache, which is a reasonable assumption for the problem sizes we encounter in MoE inference. This allows for maximal reuse of the columns of the weight matrix B, since each column of B can be re-used for the corresponding output tile calculations C(0,0), C(1,0) and C(2,0). Consider instead a row-major schedule: C(0,0), C(0,1), C(0,2), etc. We would have to evict the column of B, and issue multiple load instructions to DRAM, to calculate the same number of output blocks.

    + +

    An important design consideration when optimizing kernels is a memory access pattern that results in the least amount of global load instructions. This optimal memory access pattern is achieved with the column-major schedule. The results below showcase the performance of the three schedules we investigated:

    + +


    Figure 6. Comparison of GEMM Schedules on A100 for varying Batch Sizes M

    + +

    The column-major schedule provides up to a 4x speedup over the other patterns, and as we’ll show in the next section, provides an optimal memory access pattern due to greatly improved data locality.

    + +

    5.0 Nsight Compute Analysis - Throughput and Memory Access Pattern

    + +

    For performance analysis, we focus on the M = 2 case for the H100. A similar study can be done for the A100 as many of the same observations carry over. We note the following salient results, that showcase the impact of our optimizations.

    + +


    Figure 7. H100 Memory Throughput Chart for M = 2. Note the very large increase in the cache hit rates L1 cache hit rate (+2696%) and L2 cache hit rate (+254%).

    + +


    Figure 8. H100 Memory Instruction Statistics M = 2. Note the 49% reduction in global memory loads.

    + +

    These statistics show that our optimizations had the intended effect, which can be seen in the reduced cache misses, reduced memory accesses and the resultant 2.7x speedup. More concretely, the trace shows us a 2.54x increase in L2 hit rate (Figure 7), and a ~50% reduction in DRAM accesses (Figure 8).

    + +

    These improvements ultimately yield the reduced latency, with the optimized kernel being 2.7x faster for bs=2 and 4.4x for bs=512.

    + +

    6.0 Future Work

    + +

Our kernel was tested in FP16, which showcases the numerics and performance of column-major scheduling for MoE, but most production models use BFloat16. We encountered a limitation in Triton, in that tl.atomic_add does not support BFloat16, and hit launch latency concerns that would require CUDA graph support for column-major production use. In initial testing this translated to a 70% end-to-end speedup, but we encountered some expert mapping inconsistencies in an end-to-end environment that are not reflected in the test environment, so further work is needed to fully realize these speedups.

    + +

    For future work, we intend to move this into a CUDA kernel which will ensure full BFloat16 support and reduced launch latency relative to Triton, and potentially resolve the expert routing inconsistency. We’ve also previously published work on enabling GPTQ W4A16 with Triton GEMM kernels, so natural follow-on work would include fusing dequantization into this kernel to allow for a GPTQ quantized inference path.

    + +

    7.0 Reproducibility

    + +

    We have open sourced the Triton kernel code along with an easy to run performance benchmark for readers interested in comparing or verifying the performance on their own GPU.

    + +

    Acknowledgements

    + +

    We want to thank Daniel Han, Raghu Ganti, Mudhakar Srivatsa, Bert Maher, Gregory Chanan, Eli Uriegas, and Geeta Chauhan for their review of the presented material and Woosuk from the vLLM team as we built on his implementation of the Fused MoE kernel.

    + +
Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity

    + by + + Jesse Cai, Daniel Haziza, Supriya Rao + +

    +

    Over the past year, we’ve added support for semi-structured (2:4) sparsity into PyTorch. With just a few lines of code, we were able to show a 10% end-to-end inference speedup on segment-anything by replacing dense matrix multiplications with sparse matrix multiplications.

    + +

However, matrix multiplications are not unique to neural network inference - they happen during training as well. By expanding on the core primitives we used earlier to accelerate inference, we were also able to accelerate model training. We wrote a replacement nn.Linear layer, SemiSparseLinear, that is able to achieve a 1.3x speedup across the forward + backward pass of the linear layers in the MLP block of ViT-L on an NVIDIA A100.

    + +

    End-to-end, we see a wall time reduction of 6% for a DINOv2 ViT-L training, with virtually no accuracy degradation out of the box (82.8 vs 82.7 on ImageNet top-1 accuracy).

    + +

    2 strategies for training a ViT model

    + +

    We compare 2 strategies for training a ViT model for 125k iterations on 4x NVIDIA A100s: either fully dense (blue), or sparse for 70% of the training, then dense (orange). Both achieve similar results on the benchmarks, but the sparse variant trains 6% faster. For both experiments, we evaluate the intermediate checkpoints with and without sparsity.

    + +

    As far as we are aware, this is the first OSS implementation of accelerated sparse training and we’re excited to provide a user API in torchao. You can try accelerating your own training runs with just a few lines of code:

    + +
    # Requires torchao and pytorch nightlies and CUDA compute capability 8.0+
    +import torch
    +from torchao.sparsity.training import (
    +    SemiSparseLinear,
    +    swap_linear_with_semi_sparse_linear,
    +)
    +
    +model = torch.nn.Sequential(torch.nn.Linear(1024, 4096)).cuda().half()
    +
    +# Specify the fully-qualified-name of the nn.Linear modules you want to swap
    +sparse_config = {
    +    "seq.0": SemiSparseLinear
    +}
    +
    +# Swap nn.Linear with SemiSparseLinear, you can run your normal training loop after this step
    +swap_linear_with_semi_sparse_linear(model, sparse_config)
    +
    + +

    How does this work?

    + +

    The general idea behind sparsity is simple: skip calculations involving zero-valued tensor elements to speed up matrix multiplication. However, simply setting weights to zero isn’t enough, as the dense tensor still contains these pruned elements and dense matrix multiplication kernels will continue to process them, incurring the same latency and memory overhead. To achieve actual performance gains, we need to replace dense kernels with sparse kernels that intelligently bypass calculations involving pruned elements.

    + +

    These kernels work on sparse matrices, which remove the pruned elements and store the specified elements in a compressed format. There are many different sparse formats, but we’re particularly interested in semi-structured sparsity, also known as 2:4 structured sparsity or fine-grained structured sparsity or more generally N:M structured sparsity.

    + +


    2:4 sparse compressed representation. Original Source

    + +

    A 2:4-sparse matrix is a matrix where at most 2 elements are non-zero for every 4 elements, as illustrated in the image above. Semi-structured sparsity is attractive because it exists in a goldilocks spot of performance and accuracy:

    + +
      +
    1. NVIDIA GPUs since Ampere offer hardware acceleration and library support (cuSPARSELt) for this format, with matrix multiplication being up to 1.6x faster
    2. +
    3. Pruning models to fit this sparsity pattern does not degrade accuracy as much as other patterns. NVIDIA’s whitepaper shows pruning then retraining is able to recover accuracy for most vision models.
    4. +
    + +


    Illustration of 2:4 (sparse) matrix multiplication on NVIDIA GPUs. Original source

    + +

    Accelerating inference with semi-structured sparsity is straightforward. Since our weights are fixed during inference, we can prune and compress the weight ahead of time (offline) and store the compressed sparse representation instead of our dense tensor.

    + +

    flow chart

    + +

    Then, instead of dispatching to dense matrix multiplication we dispatch to sparse matrix multiplication, passing in the compressed sparse weight instead of the normal dense one. For more information about accelerating models for inference using 2:4 sparsity, please refer to our tutorial.
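As a minimal sketch of this inference flow using PyTorch’s semi-structured sparsity API (the shapes and the fixed alternating mask are illustrative; a real workflow would prune by magnitude as described above):

import torch
from torch.sparse import to_sparse_semi_structured

# Offline: zero out two of every four weight elements (here with a fixed
# alternating mask for illustration), then store the compressed 2:4 form.
linear = torch.nn.Linear(4096, 4096).half().cuda()
mask = torch.tensor([0, 0, 1, 1], dtype=torch.bool, device="cuda").tile(4096, 1024)
with torch.no_grad():
    linear.weight = torch.nn.Parameter(to_sparse_semi_structured(linear.weight * mask))

# Online: the sparse weight dispatches the linear layer to the 2:4 sparse GEMM.
x = torch.rand(32, 4096).half().cuda()
with torch.inference_mode():
    y = linear(x)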

    + +

    Extending sparse inference acceleration to training

    + +

    In order to use sparsity to reduce the training time of our models, we need to consider when the mask is calculated, as once we store the compressed representation the mask is fixed.

    + +

    Training with a fixed mask applied to an existing trained dense model (also known as pruning) does not degrade accuracy, but this requires two training runs - one to obtain the dense model and another to make it sparse, offering no speedups.

    + +

    Instead we’d like to train a sparse model from scratch (dynamic sparse training), but training from scratch with a fixed mask will lead to a significant drop in evaluations, as the sparsity mask would be selected at initialization, when the model weights are essentially random.

    + +

    To maintain the accuracy of the model when training from scratch, we prune and compress the weights at runtime, so that we can calculate the optimal mask at each step of the training process.

    + +

    Conceptually you can think of our approach as an approximate matrix multiplication technique, where we `prune_and_compress` and dispatch to `sparse_GEMM` in less time than a `dense_GEMM` call would take. This is difficult because the native pruning and compression functions are too slow to show speedups.

    + +

    Given the shapes of our ViT-L training matrix multiplications (13008x4096x1024), we measured the runtime of a dense and sparse GEMM respectively at 538us and 387us. In other words, the pruning and compression step of the weight matrix must run in less than 538-387=151us to have any efficiency gain. Unfortunately, the compression kernel provided in cuSPARSELt already takes 380us (without even considering the pruning step!).

    + +

    Given the max NVIDIA A100 memory IO (2TB/s), and considering that a prune and compress kernel would be memory bound, we could theoretically prune and compress our weight (4096x1024x2 bytes=8MB) in 4us (8MB / 2TB/s)! And in fact, we were able to write a kernel that prunes and compresses a matrix into 2:4-sparse format, and runs in 36 us (10x faster than the compression kernel in cuSPARSELt), making the entire GEMM (including the sparsification) faster. Our kernel is available for use in PyTorch.

    + +


    Our custom sparsification kernel, which includes pruning + compression, is ~30% faster across a linear layer forward+backward. Benchmarks run on a NVIDIA A100-80GB GPU.

    + +

    Writing a performant runtime sparsification kernel

    + +

    There were multiple challenges we faced in order to implement a performant runtime sparsification kernel, which we will explore below.

    + +

    1) Handling the backwards pass

    + +

For the backwards pass, we need to calculate dL/dX and dL/dW for the gradient update and the subsequent layer, which means we need to calculate xWᵀ and xᵀW respectively.

    + +


    Overview of runtime sparsification for training acceleration (FW + BW pass)

    + +

    However this is problematic, because the compressed representation cannot be transposed, since there’s no guarantee that the tensor is 2:4 sparse in both directions.

    + +


    Both matrices are valid 2:4 matrices. However, the right one is no longer a valid 2:4 matrix once transposed because one column contains more than 2 elements

    + +

    Therefore, we prune a 4x4 tile, instead of a 1x4 strip. We greedily preserve the largest values, ensuring that we take at most 2 values for each row / column. While this approach is not guaranteed to be optimal, as we sometimes only preserve 7 values instead of 8, it efficiently calculates a tensor that is 2:4 sparse both row-wise and column-wise.
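A NumPy sketch of this greedy selection is shown below; the real kernel does the equivalent work in registers with a branchless sorting network rather than argsort:

import numpy as np

def prune_tile_2to4(tile: np.ndarray) -> np.ndarray:
    # Greedily keep the largest-magnitude values of a 4x4 tile, allowing at most
    # two kept values per row and per column, so the result stays 2:4 sparse in
    # both directions (sometimes only 7 of the 8 possible values survive).
    keep = np.zeros_like(tile, dtype=bool)
    row_kept = np.zeros(4, dtype=int)
    col_kept = np.zeros(4, dtype=int)
    for flat in np.argsort(-np.abs(tile), axis=None):   # largest magnitude first
        r, c = divmod(int(flat), 4)
        if row_kept[r] < 2 and col_kept[c] < 2:
            keep[r, c] = True
            row_kept[r] += 1
            col_kept[c] += 1
    return tile * keep

pruned = prune_tile_2to4(np.random.randn(4, 4))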

    + +

    We then compress both the packed tensor and the packed transpose tensor, storing the transpose tensor for the backwards pass. By calculating both the packed and packed transpose tensor at the same time, we avoid a secondary kernel call in the backwards pass.

    + +


    Our kernel prunes the weight matrix in registers, and writes the compressed values in global memory. It also prunes at the same time W.t, which is needed for the backward pass, minimizing the memory IO

    + +

There’s some additional transpose trickery needed to handle the backwards pass - the underlying hardware only supports operations where the first matrix is sparse. For weight sparsification during inference, when we need to calculate xWᵀ we rely on transpose properties to swap the order of the operands.

    + +

    Math formula

    + +

    During inference, we use torch.compile to fuse the outer transpose into subsequent pointwise ops in order to avoid paying a performance penalty.

    + +

    However in the case of the backwards pass of training, we have no subsequent pointwise op to fuse with. Instead, we fuse the transposition into our matrix multiplication by taking advantage of cuSPARSELt’s ability to specify the row / column layout of the result matrix.

    + +

    2) Kernel tiling for efficient memory-IO

    + +

    In order for our kernel to be as efficient as possible, we want to coalesce our reads / writes, as we found that memory IO to be the main bottleneck. This means that within a CUDA thread, we want to read/write chunks of 128 bytes at a time, so that multiple parallel reads/writes can be coalesced into a single request by the GPU memory controller.

    + +

    Therefore, instead of a thread handling a single 4x4 tile, which is only 4x4x2 = 32 bytes, we decided that each thread will handle 4 4x4 tiles (aka an 8x8 tile), which allows us to operate 8x8x2 =128 byte chunks.

    + +

    Kernel tiling for efficient memory-IO

    + +

    3) Sorting elements in a 4x4 tile without warp-divergence

    + +

    For each individual 4x4 tile within our thread we calculate a bitmask that specifies which elements to prune and which elements to keep. To do this we sort all 16 elements and greedily preserve elements, so long as they do not break our 2:4 row / col constraint. This preserves only the weights with the largest values.

    + +

    Crucially we observe that we are only ever sorting a fixed number of elements, so by using a branchless sorting network, we can avoid warp divergence.

    + +

    Sorting network diagram

    + +

    For clarity, the transposed packed tensor and metadata are omitted. Sorting network diagram taken from Wikipedia.

    + +

Warp divergence occurs when we have conditional execution inside a thread block. In CUDA, work items in the same work group (thread block) are dispatched at the hardware level in batches (warps). If we have conditional execution, such that some work-items in the same batch run different instructions, then they are masked when the warp is dispatched, or dispatched sequentially.

    + +

    For example, if we have some code like if (condition) do(A) else do(B), where condition is satisfied by all the odd-numbered work items, then the total runtime of this conditional statement is do(A) + do(B), since we would dispatch do(A) for all odd-numbered work-items, masking out even-numbered work-items, and do(B) for all even numbered work-items, masking out odd-numbered work-items. This answer provides more information about warp divergence.

    + +

    4) Writing the compressed matrices and metadata

    + +

    Once the bitmask has been computed, the weight data has to be written back in a compressed format in global memory. This is not trivial, because the data needs to stay in registers, and it’s not possible to index registers (eg C[i++] = a prevents us from storing C in registers). Furthermore, we found that nvcc was using many more registers than we expected, which caused register spilling and impacted global performance. We write this compressed matrix to global memory in Column-Major format to make the writes more efficient.

    + +

    compressed matrix to global memory in Column-Major format

    + +

    We also need to write the cuSPARSELt metadata as well. This metadata layout is quite similar to the one from the open-source CUTLASS library and is optimized for being loaded efficiently through shared-memory in the GEMM kernel with the PTX ldmatrix instruction.

    + +

    However, this layout is not optimized to be written efficiently: the first 128 bits of the metadata tensor contains metadata about the first 32 columns of the rows 0, 8, 16 and 24. Recall that each thread handles an 8x8 tile, which means that this information is scattered across 16 threads.

    + +

    We rely on a series of warp-shuffle operations, once for the original and transposed representation respectively to write the metadata. Fortunately, this data represents less than 10% of the total IO, so we can afford to not fully coalesce the writes.

    + +

    DINOv2 Sparse Training: Experimental Setup and Results

    + +

    For our experiments, the ViT-L model is trained on ImageNet for 125k steps using the DINOv2 method. All our experiments were run on 4x AMD EPYC 7742 64-core CPUs and 4x NVIDIA A100-80GB GPUs. During sparse training, the model is trained with 2:4 sparsity enabled for the first part of the training, where only half of the weights are enabled. This sparsity mask on the weights is dynamically recomputed at every step, as weights are continuously updated during the optimization. For the remaining steps, the model is trained densely, producing a final model without 2:4 sparsity (except the 100% sparse training setup), which is then evaluated.

Training setup                                   | ImageNet 1k log-regression
0% sparse (125k dense steps, baseline)           | 82.8
40% sparse (50k sparse -> 75k dense steps)       | 82.9
60% sparse (75k sparse -> 50k dense steps)       | 82.8
70% sparse (87.5k sparse -> 37.5k dense steps)   | 82.7
80% sparse (100k sparse -> 25k dense steps)      | 82.7
90% sparse (112.5k sparse -> 12.5k dense steps)  | 82.0
100% sparse (125k sparse steps)                  | 82.3 (2:4-sparse model)
    + +

    sparsity training diagrams

    + +

    During the sparse training steps, in the backward pass we obtain a dense gradient for the sparse weights. For the gradient descent to be sound, we should also sparsify this gradient before using it in the optimizer to update the weights. Instead of doing that, we use the full dense gradient to update the weights - we found this to work better in practice: this is the STE (Straight Through Estimator) strategy. In other words, we update all the parameters at every step, even the ones we don’t use.

    + +

    Conclusion and Future Work

    + +

    In this blog post, we’ve shown how to accelerate neural network training with semi-structured sparsity and explained some of the challenges we faced. We were able to achieve a 6% end to end speedup on DINOv2 training with a small 0.1 pp accuracy drop.

    + +

    There are several areas of expansion for this work:

    + +
      +
    • Expansion to new sparsity patterns: Researchers have created new sparsity patterns like V:N:M sparsity that use the underlying semi-structured sparse kernels to allow for more flexibility. This is especially interesting for applying sparsity to LLMs, as 2:4 sparsity degrades accuracy too much, but we have seen some positive results for more general N:M pattern.
    • +
    • Performance optimizations for sparse fine-tuning: This post covers sparse training from scratch, but oftentimes we want to fine-tune a foundational model. In this case, a static mask may be sufficient to preserve accuracy which would enable us to make additional performance optimizations.
    • +
    • More experiments on pruning strategy: We calculate the mask at each step of the network, but calculating the mask every n steps may yield better training accuracy. Overall, figuring out the best strategy to use semi-structured sparsity during training is an open area of research.
    • +
    • Compatibility with fp8: The hardware also supports fp8 semi-structured sparsity, and this approach should work similarly with fp8 in principle. In practice, we would need to write similar sparsification kernels, and could possibly fuse them with the scaling of the tensors.
    • +
    • Activation Sparsity: Efficient sparsification kernels also enable to sparsify the activations during training. Because the sparsification overhead grows linearly with the sparsified matrix size, setups with large activation tensors compared to the weight tensors could benefit more from activation sparsity than weight sparsity. Furthermore, activations are naturally sparse because of the usage of ReLU or GELU activation functions, reducing accuracy degradation.
    • +
    + +

    If you are interested in these problems, please feel free to open an issue / PR in torchao, a community we’re building for architecture optimization techniques like quantization and sparsity. Additionally, if you have general interest in sparsity please reach out in CUDA-MODE (#sparsity)

    + +
Accelerating PyTorch Vision Models with Channels Last on CPU

    + by + + Mingfei Ma (Intel), Vitaly Fedyunin (Meta), Wei Wei (Meta) + +

    +

    Overview

    + +

Memory format has a significant impact on performance when running vision models; generally, Channels Last is more favorable from a performance perspective due to better data locality.

    + +

    This blog will introduce fundamental concepts of memory formats and demonstrate performance benefits using Channels Last on popular PyTorch vision models on Intel® Xeon® Scalable processors.

    + +

    Memory Formats Introduction

    + +

    Memory format refers to data representation that describes how a multidimensional (nD) array is stored in linear (1D) memory address space. The concept of memory format has two aspects:

    + +
      +
    • Physical Order is the layout of data storage in physical memory. For vision models, usually we talk about NCHW, NHWC. These are the descriptions of physical memory layout, also referred as Channels First and Channels Last respectively.
    • +
    • Logical Order is a convention on how to describe tensor shape and stride. In PyTorch, this convention is NCHW. No matter what the physical order is, tensor shape and stride will always be depicted in the order of NCHW.
    • +
    + +

    Fig-1 is the physical memory layout of a tensor with shape of [1, 3, 4, 4] on both Channels First and Channels Last memory format (channels denoted as R, G, B respectively):

    + +

    + +

    + +

    +Fig-1 Physical memory layout of Channels First and Channels Last +
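This logical-vs-physical distinction can be checked directly from the tensor strides (the shape stays NCHW in both cases):

import torch

x = torch.rand(1, 3, 4, 4)                    # Channels First (contiguous)
print(x.shape, x.stride())                    # torch.Size([1, 3, 4, 4]) (48, 16, 4, 1)

y = x.to(memory_format=torch.channels_last)   # Channels Last: NHWC in physical memory
print(y.shape, y.stride())                    # torch.Size([1, 3, 4, 4]) (48, 1, 12, 3)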

    + +

    Memory Formats Propagation

    + +

    The general rule for PyTorch memory format propagation is to preserve the input tensor’s memory format. Which means a Channels First input will generate a Channels First output and a Channels Last input will generate a Channels Last output.

    + +

For Convolution layers, PyTorch uses oneDNN (oneAPI Deep Neural Network Library) by default to achieve optimal performance on Intel CPUs. Since it is physically impossible to achieve highly optimized performance directly with the Channels First memory format, input and weight are first converted to a blocked format and then computed. oneDNN may choose different blocked formats according to input shapes, data type and hardware architecture, for vectorization and cache reuse purposes. The blocked format is opaque to PyTorch, so the output needs to be converted back to Channels First. Though the blocked format brings optimal computing performance, the format conversions may add overhead and therefore offset the performance gain.

    + +

On the other hand, oneDNN is optimized for the Channels Last memory format and can use it directly for optimal performance, so PyTorch simply passes a memory view to oneDNN. This means the conversion of the input and output tensors is saved. Fig-2 indicates the memory format propagation behavior of convolution on PyTorch CPU (the solid arrow indicates a memory format conversion, and the dashed arrow indicates a memory view):

    + +

    + +

    + +

    +Fig-2 CPU Conv memory format propagation +

    + +

In PyTorch, the default memory format is Channels First. In case a particular operator doesn’t have support for Channels Last, the NHWC input is treated as a non-contiguous NCHW tensor and therefore falls back to Channels First, which consumes precious memory bandwidth on the CPU and results in suboptimal performance.

    + +

Therefore, it is very important to extend the scope of Channels Last support for optimal performance. And we have implemented Channels Last kernels for the commonly used operators in the CV domain, applicable for both inference and training, such as:

    + +
      +
    • Activations (e.g., ReLU, PReLU, etc.)
    • +
    • Convolution (e.g., Conv2d)
    • +
    • Normalization (e.g., BatchNorm2d, GroupNorm, etc.)
    • +
    • Pooling (e.g., AdaptiveAvgPool2d, MaxPool2d, etc.)
    • +
    • Shuffle (e.g., ChannelShuffle, PixelShuffle)
    • +
    + +

    Refer to Operators-with-Channels-Last-support for details.

    + +

    Native Level Optimization on Channels Last

    + +

    As mentioned above, PyTorch uses oneDNN to achieve optimal performance on Intel CPUs for convolutions. The rest of memory format aware operators are optimized at PyTorch native level, which doesn’t require any third-party library support.

    + +
      +
    • Cache friendly parallelization scheme: keep the same parallelization scheme for all the memory format aware operators, this will help increase data locality when passing each layer’s output to the next.
    • +
    • Vectorization on multiple archs: generally, we can vectorize on the most inner dimension on Channels Last memory format. And each of the vectorized CPU kernels will be generated for both AVX2 and AVX512.
    • +
    + +

While contributing the Channels Last kernels, we tried our best to optimize the Channels First counterparts as well. The fact is that for some operators, such as Convolution and Pooling, it is physically impossible to achieve optimal performance with Channels First.

    + +

    Run Vision Models on Channels Last

    + +

    The Channels Last related APIs are documented at PyTorch memory format tutorial. Typically, we can convert a 4D tensor from Channels First to Channels Last by:

    + +
    # convert x to channels last
    +# suppose x’s shape is (N, C, H, W)
    +# then x’s stride will be (HWC, 1, WC, C)
    +x = x.to(memory_format=torch.channels_last)
    +
    + +

To run models in the Channels Last memory format, you simply need to convert the input and the model to Channels Last and then you are ready to go. The following is a minimal example showing how to run ResNet50 with TorchVision in the Channels Last memory format:

    + +
    import torch
    +from torchvision.models import resnet50
    +
    +N, C, H, W = 1, 3, 224, 224
    +x = torch.rand(N, C, H, W)
    +model = resnet50()
    +model.eval()
    +
    +# convert input and model to channels last
    +x = x.to(memory_format=torch.channels_last)
    +model = model.to(memory_format=torch.channels_last)
    +model(x)
    +
    + +

    The Channels Last optimization is implemented at native kernel level, which means you may apply other functionalities such as torch.fx and torch script together with Channels Last as well.

    + +

    Performance Gains

    + +

    We benchmarked inference performance of TorchVision models on Intel® Xeon® Platinum 8380 CPU @ 2.3 GHz, single instance per socket (batch size = 2 x number of physical cores). Results show that Channels Last has 1.3x to 1.8x performance gain over Channels First.

    + +

    + +

    + +

    The performance gain primarily comes from two aspects:

    + +
      +
    • For Convolution layers, Channels Last saved the memory format conversion to blocked format for activations, which improves the overall computation efficiency.
    • +
    • For Pooling and Upsampling layers, Channels Last can use vectorized logic along the most inner dimension, e.g., “C”, while Channels First can’t.
    • +
    + +

For layers that are not memory format aware, Channels Last and Channels First have the same performance.

    + +

    Conclusion & Future Work

    + +

In this blog we introduced fundamental concepts of Channels Last and demonstrated its performance benefits on CPU for vision models. The current work is limited to 2D models, and we will extend the optimization effort to 3D models in the near future!

    + +

    Acknowledgement

    + +

The results presented in this blog are a joint effort of Meta and the Intel PyTorch team. Special thanks to Vitaly Fedyunin and Wei Wei from Meta who spent precious time and gave substantial assistance! Together we made one more step on the path of improving the PyTorch CPU ecosystem.

    + +

    References

    + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-pytorch-with-cuda-graphs/index.html b/blog/accelerating-pytorch-with-cuda-graphs/index.html new file mode 100644 index 000000000000..1845c5fe2a82 --- /dev/null +++ b/blog/accelerating-pytorch-with-cuda-graphs/index.html @@ -0,0 +1,926 @@ + + + + + + + + + + + + + Accelerating PyTorch with CUDA Graphs | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 26, 2021

    +

    + Accelerating PyTorch with CUDA Graphs +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Vinh Nguyen, Michael Carilli, Sukru Burc Eryilmaz, Vartika Singh, Michelle Lin, Natalia Gimelshein, Alban Desmaison, Edward Yang + +

    +

    Today, we are pleased to announce a new advanced CUDA feature, CUDA Graphs, has been brought to PyTorch. Modern DL frameworks have complicated software stacks that incur significant overheads associated with the submission of each operation to the GPU. When DL workloads are strong-scaled to many GPUs for performance, the time taken by each GPU operation diminishes to just a few microseconds and, in these cases, the high work submission latencies of frameworks often lead to low utilization of the GPU. As GPUs get faster and workloads are scaled to more devices, the likelihood of workloads suffering from these launch-induced stalls increases. To overcome these performance overheads, NVIDIA engineers worked with PyTorch developers to enable CUDA graph execution natively in PyTorch. This design was instrumental in scaling NVIDIA’s MLPerf workloads (implemented in PyTorch) to over 4000 GPUs in order to achieve record-breaking performance.

    + +
    + +
    + +

    CUDA graphs support in PyTorch is just one more example of a long collaboration between NVIDIA and Facebook engineers. torch.cuda.amp, for example, trains with half precision while maintaining the network accuracy achieved with single precision and automatically utilizing tensor cores wherever possible. AMP delivers up to 3X higher performance than FP32 with just a few lines of code change. Similarly, NVIDIA’s Megatron-LM was trained using PyTorch on up to 3072 GPUs. In PyTorch, one of the most performant methods to scale-out GPU training is with torch.nn.parallel.DistributedDataParallel coupled with the NVIDIA Collective Communications Library (NCCL) backend.

    + +

    CUDA Graphs

    + +

CUDA Graphs, which made their debut in CUDA 10, let a series of CUDA kernels be defined and encapsulated as a single unit, i.e., a graph of operations, rather than a sequence of individually-launched operations. They provide a mechanism to launch multiple GPU operations through a single CPU operation, and hence reduce launch overheads.

    + +

The benefits of CUDA graphs can be demonstrated with the simple example in Figure 1. On the top, a sequence of short kernels is launched one-by-one by the CPU. The CPU launching overhead creates a significant gap in between the kernels. If we replace this sequence of kernels with a CUDA graph, initially we will need to spend a little extra time on building the graph and launching the whole graph in one go on the first occasion, but subsequent executions will be very fast, as there will be very little gap between the kernels. The difference is more pronounced when the same sequence of operations is repeated many times, for example, over many training steps. In that case, the initial costs of building and launching the graph will be amortized over the entire number of training iterations. For a more comprehensive introduction to the topic, see our blog + Getting Started with CUDA Graphs and GTC talk Effortless CUDA Graphs.

    + +

    +Cuda graphs reduce launching overhead by bundling multiple GPU operations into a single launchable unit, i.e., a graph. On the top, you can see five individual launches; whereas on the bottom, with CUDA graphs, they are all bundled into a single launch, reducing overhead. +
    + Figure 1. Benefits of using CUDA graphs +

    + +

    NCCL support for CUDA graphs

    + +

    The previously mentioned benefits of reducing launch overheads also extend to NCCL kernel launches. NCCL enables GPU-based collective and P2P communications. With NCCL support for CUDA graphs, we can eliminate the NCCL kernel launch overhead.

    + +

    Additionally, kernel launch timing can be unpredictable due to various CPU load and operating system factors. Such time skews can be harmful to the performance of NCCL collective operations. With CUDA graphs, kernels are clustered together so that performance is consistent across ranks in a distributed workload. This is especially useful in large clusters where even a single slow node can bring down overall cluster level performance.

    + +

    For distributed multi-GPU workloads, NCCL is used for collective communications. If we look at training a neural network that leverages data parallelism, without NCCL support for CUDA graphs, we’ll need a separate launch for each of forward/back propagation and NCCL AllReduce. By contrast, with NCCL support for CUDA graphs, we can reduce launch overhead by lumping together the forward/backward propagation and NCCL AllReduce all in a single graph launch.

    + +

    +With NCCL CUDA graph support, all the kernel launches for NCCL AllReduce for  the forward/backward propagation can be bundled into a graph to reduce overhead launch time. +
    + Figure 2. Looking at a typical neural network, all the kernel launches for NCCL AllReduce can be bundled into a graph to reduce overhead launch time. +

    + +

    PyTorch CUDA Graphs

    + +

    From PyTorch v1.10, the CUDA graphs functionality is made available as a set of beta APIs.

    + +

    API overview

    + +

    PyTorch supports the construction of CUDA graphs using stream capture, which puts a CUDA stream in capture mode. CUDA work issued to a capturing stream doesn’t actually run on the GPU. Instead, the work is recorded in a graph. After capture, the graph can be launched to run the GPU work as many times as needed. Each replay runs the same kernels with the same arguments. For pointer arguments this means the same memory addresses are used. By filling input memory with new data (e.g., from a new batch) before each replay, you can rerun the same work on new data.

    + +

    Replaying a graph sacrifices the dynamic flexibility of typical eager execution in exchange for greatly reduced CPU overhead. A graph’s arguments and kernels are fixed, so a graph replay skips all layers of argument setup and kernel dispatch, including Python, C++, and CUDA driver overheads. Under the hood, a replay submits the entire graph’s work to the GPU with a single call to cudaGraphLaunch. Kernels in a replay also execute slightly faster on the GPU, but eliding CPU overhead is the main benefit.

    + +

    You should try CUDA graphs if all or part of your network is graph-safe (usually this means static shapes and static control flow, but see the other constraints) and you suspect its runtime is at least somewhat CPU-limited.

    + +

    API example

    + +

PyTorch exposes graphs via a raw torch.cuda.CUDAGraph class and two convenience wrappers, torch.cuda.graph and torch.cuda.make_graphed_callables.

    + +

    torch.cuda.graph is a simple, versatile context manager that captures CUDA work in its context. Before capture, warm up the workload to be captured by running a few eager iterations. Warmup must occur on a side stream. Because the graph reads from and writes to the same memory addresses in every replay, you must maintain long-lived references to tensors that hold input and output data during capture. To run the graph on new input data, copy new data to the capture’s input tensor(s), replay the graph, then read the new output from the capture’s output tensor(s).

    + +

    If the entire network is capture safe, one can capture and replay the whole network as in the following example.

    + +
import torch
+
+N, D_in, H, D_out = 640, 4096, 2048, 1024
    +model = torch.nn.Sequential(torch.nn.Linear(D_in, H),
    +                            torch.nn.Dropout(p=0.2),
    +                            torch.nn.Linear(H, D_out),
    +                            torch.nn.Dropout(p=0.1)).cuda()
    +loss_fn = torch.nn.MSELoss()
    +optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    +
    +# Placeholders used for capture
    +static_input = torch.randn(N, D_in, device='cuda')
    +static_target = torch.randn(N, D_out, device='cuda')
    +
    +# warmup
    +# Uses static_input and static_target here for convenience,
    +# but in a real setting, because the warmup includes optimizer.step()
    +# you must use a few batches of real data.
    +s = torch.cuda.Stream()
    +s.wait_stream(torch.cuda.current_stream())
    +with torch.cuda.stream(s):
    +    for i in range(3):
    +        optimizer.zero_grad(set_to_none=True)
    +        y_pred = model(static_input)
    +        loss = loss_fn(y_pred, static_target)
    +        loss.backward()
    +        optimizer.step()
    +torch.cuda.current_stream().wait_stream(s)
    +
    +# capture
    +g = torch.cuda.CUDAGraph()
    +# Sets grads to None before capture, so backward() will create
    +# .grad attributes with allocations from the graph's private pool
    +optimizer.zero_grad(set_to_none=True)
    +with torch.cuda.graph(g):
    +    static_y_pred = model(static_input)
    +    static_loss = loss_fn(static_y_pred, static_target)
    +    static_loss.backward()
    +    optimizer.step()
    +
    +real_inputs = [torch.rand_like(static_input) for _ in range(10)]
    +real_targets = [torch.rand_like(static_target) for _ in range(10)]
    +
    +for data, target in zip(real_inputs, real_targets):
    +    # Fills the graph's input memory with new data to compute on
    +    static_input.copy_(data)
    +    static_target.copy_(target)
    +    # replay() includes forward, backward, and step.
    +    # You don't even need to call optimizer.zero_grad() between iterations
    +    # because the captured backward refills static .grad tensors in place.
    +    g.replay()
    +    # Params have been updated. static_y_pred, static_loss, and .grad
    +    # attributes hold values from computing on this iteration's data.
    +
    + +

    If some of your network is unsafe to capture (e.g., due to dynamic control flow, dynamic shapes, CPU syncs, or essential CPU-side logic), you can run the unsafe part(s) eagerly and use torch.cuda.make_graphed_callables to graph only the capture-safe part(s). This is demonstrated next.

    + +

make_graphed_callables accepts callables (functions or nn.Modules) and returns graphed versions. By default, callables returned by make_graphed_callables are autograd-aware, and can be used in the training loop as direct replacements for the functions or nn.Modules you passed. make_graphed_callables internally creates CUDAGraph objects, runs warmup iterations, and maintains static inputs and outputs as needed. Therefore (unlike with torch.cuda.graph), you don’t need to handle those manually.

    + +

    In the following example, data-dependent dynamic control flow means the network isn’t capturable end-to-end, but make_graphed_callables() lets us capture and run graph-safe sections as graphs regardless:

    + +
import torch
+from itertools import chain
+
+N, D_in, H, D_out = 640, 4096, 2048, 1024
    +
    +module1 = torch.nn.Linear(D_in, H).cuda()
    +module2 = torch.nn.Linear(H, D_out).cuda()
    +module3 = torch.nn.Linear(H, D_out).cuda()
    +
    +loss_fn = torch.nn.MSELoss()
    +optimizer = torch.optim.SGD(chain(module1.parameters(),
    +                                  module2.parameters(),
    +                                  module3.parameters()),
    +                            lr=0.1)
    +
    +# Sample inputs used for capture
    +# requires_grad state of sample inputs must match
    +# requires_grad state of real inputs each callable will see.
    +x = torch.randn(N, D_in, device='cuda')
    +h = torch.randn(N, H, device='cuda', requires_grad=True)
    +
    +module1 = torch.cuda.make_graphed_callables(module1, (x,))
    +module2 = torch.cuda.make_graphed_callables(module2, (h,))
    +module3 = torch.cuda.make_graphed_callables(module3, (h,))
    +
    +real_inputs = [torch.rand_like(x) for _ in range(10)]
    +real_targets = [torch.randn(N, D_out, device="cuda") for _ in range(10)]
    +
    +for data, target in zip(real_inputs, real_targets):
    +    optimizer.zero_grad(set_to_none=True)
    +
    +    tmp = module1(data)  # forward ops run as a graph
    +
    +    if tmp.sum().item() > 0:
    +        tmp = module2(tmp)  # forward ops run as a graph
    +    else:
    +        tmp = module3(tmp)  # forward ops run as a graph
    +
    +    loss = loss_fn(tmp, target)
    +    # module2's or module3's (whichever was chosen) backward ops,
    +    # as well as module1's backward ops, run as graphs
    +    loss.backward()
    +    optimizer.step()
    +
    + +

    Example use cases

    +

    MLPerf v1.0 training workloads

    + +

    The PyTorch CUDA graphs functionality was instrumental in scaling NVIDIA’s MLPerf training v1.0 workloads (implemented in PyTorch) to over 4000 GPUs, setting new records across the board. We illustrate below two MLPerf workloads where the most significant gains were observed with the use of CUDA graphs, yielding up to ~1.7x speedup.

    + + + + + + + + + + + + + + + + + + + + + +
     Number of GPUsSpeedup from CUDA-graphs
    Mask R-CNN2721.70×
    BERT40961.12×
    + +

    Table 1. MLPerf training v1.0 performance improvement with PyTorch CUDA graph.

    + +

    Mask R-CNN

    + +

Deep learning frameworks use GPUs to accelerate computations, but a significant amount of code still runs on CPU cores. CPU cores process meta-data like tensor shapes in order to prepare arguments needed to launch GPU kernels. Processing meta-data is a fixed cost while the cost of the computational work done by the GPUs is positively correlated with batch size. For large batch sizes, CPU overhead is a negligible percentage of total run time cost, but at small batch sizes CPU overhead can become larger than GPU run time. When that happens, GPUs go idle between kernel calls. This issue can be identified on an NSight timeline plot in Figure 3. The plot below shows the “backbone” portion of Mask R-CNN with a per-GPU batch size of 1 before graphing. The green portion shows CPU load while the blue portion shows GPU load. In this profile we see that the CPU is maxed out at 100% load while the GPU is idle most of the time; there is a lot of empty space between GPU kernels.

    + +

    +NSight timeline plot of Mask R-CNN shows that the CPU is maxed out at 100% load while GPU is idle most of the time, and a lot of empty space between GPU kernels +
    + Figure 3: NSight timeline plot of Mask R-CNN +

    + +

CUDA graphs can automatically eliminate CPU overhead when tensor shapes are static. A complete graph of all the kernel calls is captured during the first step; in subsequent steps the entire graph is launched with a single op, eliminating all the CPU overhead, as observed in Figure 4.

    + +

    +With CUDA graph, the entire graph is launched with a single op, eliminating all the CPU overhead +
    + Figure 4: CUDA graphs optimization +

    + +

    With graphing, we see that the GPU kernels are tightly packed and GPU utilization remains high. The graphed portion now runs in 6 ms instead of 31ms, a speedup of 5x. We did not graph the entire model, mostly just the resnet backbone, which resulted in an overall speedup of ~1.7x. +In order to increase the scope of the graph, we made some changes in the software stack to eliminate some of the CPU-GPU synchronization points. In MLPerf v1.0, this work included changing the implementation of torch.randperm function to use CUB instead of Thrust because the latter is a synchronous C++ template library. These improvements are available in the latest NGC container.

    + +

    BERT

    + +

    Similarly, by graph capturing the model, we eliminate CPU overhead and accompanying synchronization overhead. CUDA graphs implementation results in a 1.12x performance boost for our max-scale BERT configuration. To maximize the benefits from CUDA graphs, it is important to keep the scope of the graph as large as possible. To achieve this, we modified the model script to remove CPU-GPU synchronizations during the execution such that the full model can be graph captured. Furthermore, we also made sure that the tensor sizes during the execution are static within the scope of the graph. For instance, in BERT, only a specific subset of total tokens contribute to loss function, determined by a pre-generated mask tensor. Extracting the indices of valid tokens from this mask, and using these indices to gather the tokens that contribute to the loss, results in a tensor with a dynamic shape, i.e. with shape that is not constant across iterations. In order to make sure tensor sizes are static, instead of using the dynamic-shape tensors in the loss computation, we used static shape tensors where a mask is used to indicate which elements are valid. As a result, all tensor shapes are static. Dynamic shapes also require CPU-GPU synchronization since it has to involve the framework’s memory management on the CPU side. With static-only shapes, no CPU-GPU synchronizations are necessary. This is shown in Figure 5.

    + +

    + Synchronization free training eliminates CPU synchronization +
    + Figure 5. By using a fixed size tensor and a boolean mask as described in the text, we are able to eliminate CPU synchronizations needed for dynamic sized tensors +

    + +

    CUDA graphs in NVIDIA DL examples collection

    + +

    Single GPU use cases can also benefit from using CUDA Graphs. This is particularly true for workloads launching many short kernels with small batches. A good example is training and inference for recommender systems. Below we present preliminary benchmark results for NVIDIA’s implementation of the Deep Learning Recommendation Model (DLRM) from our Deep Learning Examples collection. Using CUDA graphs for this workload provides significant speedups for both training and inference. The effect is particularly visible when using very small batch sizes, where CPU overheads are more pronounced.

    + +

    CUDA graphs are being actively integrated into other PyTorch NGC model scripts and the NVIDIA Github deep learning examples. Stay tuned for more examples on how to use it.

    + +

    + CUDA graphs optimization for the DLRM model. The impact is larger for smaller batch sizes where CPU overheads are more pronounced. +

    +

    + CUDA graphs optimization for the DLRM model. The impact is larger for smaller batch sizes where CPU overheads are more pronounced. +
    + Figure 6: CUDA graphs optimization for the DLRM model. +

    + +

    Call to action: CUDA Graphs in PyTorch v1.10

    + +

CUDA graphs can provide substantial benefits for workloads that comprise many small GPU kernels and are hence bogged down by CPU launch overheads. This has been demonstrated in our MLPerf efforts optimizing PyTorch models. Many of these optimizations, including CUDA graphs, have been or will eventually be integrated into our PyTorch NGC model scripts collection and the NVIDIA Github deep learning examples. For now, check out our open-source MLPerf training v1.0 implementation, which can serve as a good starting point to see CUDA graphs in action. Alternatively, try the PyTorch CUDA graphs API on your own workloads.

    + +

We thank the many NVIDIANs and Facebook engineers for their discussions and suggestions: +Karthik Mandakolathur US, +Tomasz Grel, +PLJoey Conway, +Arslan Zulfiqar US

    + +

    Authors bios

    + +

    Vinh Nguyen +DL Engineer, NVIDIA

    + +

    Vinh is a Deep learning engineer and data scientist, having published more than 50 scientific articles attracting more than 2500 citations. At NVIDIA, his work spans a wide range of deep learning and AI applications, including speech, language and vision processing, and recommender systems.

    + +

    Michael Carilli +Senior Developer Technology Engineer, NVIDIA

    + +

Michael worked at the Air Force Research Laboratory optimizing CFD code for modern parallel architectures. He holds a PhD in computational physics from the University of California, Santa Barbara. A member of the PyTorch team, he focuses on making GPU training fast, numerically stable, and easy(er) for internal teams, external customers, and PyTorch community users.

    + +

    Sukru Burc Eryilmaz +Senior Architect in Dev Arch, NVIDIA

    + +

    Sukru received his PhD from Stanford University, and B.S from Bilkent University. He currently works on improving the end-to-end performance of neural network training both at single-node scale and supercomputer scale.

    + +

    Vartika Singh +Tech Partner Lead for DL Frameworks and Libraries, NVIDIA

    + +

    Vartika has led teams working in confluence of cloud and distributed computing, scaling and AI, influencing the design and strategy of major corporations. She currently works with the major frameworks and compiler organizations and developers within and outside NVIDIA, to help the design to work efficiently and optimally on NVIDIA hardware.

    + +

    Michelle Lin +Product Intern, NVIDIA

    + +

    Michelle is currently pursuing an undergraduate degree in Computer Science and Business Administration at UC Berkeley. She is currently managing execution of projects such as conducting market research and creating marketing assets for Magnum IO.

    + +

    Natalia Gimelshein +Applied Research Scientist, Facebook

    + +

    Natalia Gimelshein worked on GPU performance optimization for deep learning workloads at NVIDIA and Facebook. She is currently a member of the PyTorch core team, working with partners to seamlessly support new software and hardware features.

    + +

    Alban Desmaison +Research Engineer, Facebook

    + +

    Alban studied engineering and did a PhD in Machine Learning and Optimization, during which he was an OSS contributor to PyTorch prior to joining Facebook. His main responsibilities are maintaining some core library and features (autograd, optim, nn) and working on making PyTorch better in general.

    + +

    Edward Yang +Research Engineer, Facebook

    + +

    Edward studied CS at MIT and then Stanford before starting at Facebook. He is a part of the PyTorch core team and is one of the leading contributors to PyTorch.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-training-float8-rowwise-crusoe/index.html b/blog/accelerating-training-float8-rowwise-crusoe/index.html new file mode 100644 index 000000000000..eb50f8088e6c --- /dev/null +++ b/blog/accelerating-training-float8-rowwise-crusoe/index.html @@ -0,0 +1,828 @@ + + + + + + + + + + + + + Accelerating Large Scale Training and Convergence with PyTorch Float8 Rowwise on Crusoe 2K H200s | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Meta and Crusoe + +

    +

    Meta: Less Wright, Hamid Shojanazeri, Vasiliy Kuznetsov, Daniel Vega-Myhre, Gokul Nadathur, Will Constable, Tianyu Liu, Tristan Rice, Driss Guessous, Josh Fromm, Luca Wehrstedt, Jiecao Yu +Crusoe: Ethan Petersen, Martin Cala, Chip Smith

    + +

    Working with Crusoe.AI we were provided access to one of their new 2K H200 clusters in Iceland, which enabled us to showcase training accelerations of 34 - 43% at scale by leveraging TorchTitan’s HSDP2 and TorchAO’s new float8 rowwise, with comparable convergence and stability vs BF16.

    + +

    bar chart

    + +

    In this post we detail the synergy of H200’s with PyTorch’s new Float8 rowwise training with TorchTitan’s FSDP2/HSDP2 and CP at scale.

    + +

    Background - what is an H200?

    + +

H200s are an ‘enhanced’ H100, offering the exact same compute as an H100 but with two additional improvements.

    + +
      +
    • Larger global memory, 141GiB HBM3e vs the standard 80GiB HBM3
    • +
• Memory bandwidth is ~43% higher, at 4.8 TB/s vs 3.35 TB/s. The faster memory transfer has an outsized effect on training speed, especially for PyTorch’s AsyncTP.
    • +
    + +

    What is PyTorch Float8 rowwise?

    + +

Float8 Rowwise applies scaling at a finer granularity than the previous ‘tensorwise’ Float8. It is designed to provide finer grained accuracy to support larger workloads, which tend to become more sensitive to quantization at scale and as training progresses.

    + +

    There are two key improvements with Float8 rowwise:

    + +
      +
    • Each row now maintains its own scaling factor versus a single scaling factor for the entire tensor, thus improving quantization precision. Finer grained scaling per row helps reduce the effect of outliers (extreme values that force the quantization scaling factor to stretch and degrade the precision of the normally distributed values) and thus ensures better precision.
    • +
    • The scaling factor itself is now implemented by rounding down to the nearest power of 2. This has been shown to help reduce quantization errors when multiplying/dividing by the scaling factor as well as ensuring large values remain scaled to the same value in both the forward and backward passes.
    • +
    + +
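To make the two improvements above concrete, here is a toy sketch of row-wise scaling with power-of-2 scale factors (illustrative only, not the TorchAO implementation; the 448.0 maximum assumes the float8 e4m3 format, and the final dtype conversion is omitted):

import torch
+
+def rowwise_scale(t: torch.Tensor, max_representable: float = 448.0):
+    # one absolute max, and therefore one scaling factor, per row
+    amax = t.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
+    scale = max_representable / amax
+    # round the scale down to the nearest power of 2
+    scale = torch.exp2(torch.floor(torch.log2(scale)))
+    return t * scale, scale
+
+x = torch.randn(4, 8)
+x_scaled, scales = rowwise_scale(x)
+
+ +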

    Note that other large scale models have been trained using Float8 at 2K scale with a combination of 1x128 groupwise and 128x128 blockwise, with power of 2 scaling factors. They had the same goal of improving Float8’s precision for supporting large scale training.

    + +

    Thus, Float8 rowwise offers a similar promise to enable Float8 for very large scale training, but we wanted to provide proof of stability and convergence at scale, which training on the Crusoe H200 2k cluster provided initial verification thereof.

    + +

    Showcasing Float8 Rowwise Loss convergence vs BF16 at 1600 and 1920 GPU Scale:

    + +

In order to verify comparable loss convergence, we ran two separate runs at both 1920 and 1600 (1.6K) GPU scale using TorchTitan and Llama 3 70B. The 1.6K GPU runs were set for 2.5K iterations, using TorchTitan’s HSDP2 and Context Parallel to enable 2D parallelism.

    + +

    The loss convergence tests were run using Titan’s deterministic mode - this mode effectively freezes most potential sources of variation from run to run, and thus helps ensure that the only substantial change is what we want to test, namely the loss convergence and loss curves of BF16 vs Float8 Rowwise.

    + +

    Note that deterministic mode also slows down training speed because various kernels will not be autotuned to maximize throughput (otherwise we risk using different kernels between runs and introducing variance).

    + +

    Two runs were completed, one with BF16 and the other with Float8 Rowwise.

    + +

    Both runs completed their assigned 2.5k iters without issue, showcasing the Crusoe cluster stability, with FP8 completing at exactly 24 hours and BF16 finishing after 31 hours, 19 minutes.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    DType + Time / Iters + Loss +
    + + +
    BF16 + 24 hours + 3.15453 +
    Float8 Rowwise + 24 hours + 2.86386 +
    + + +
    BF16 + 31 hours, 19 minutes / 2.5K + 2.88109 +
    Float8 Rowwise + 24 hours / 2.5K + 2.86386 +
    + +

At the 24 hour mark, Float8 had completed 2.5K iterations, showcasing the comparative speed up (even in deterministic mode) of float8 training. At that same mark, Float8 had achieved a +9.21% relative improvement in loss compared to BF16 for the same 24 hours of large scale training time.

    + +

    After 31 hours, 19 minutes, the BF16 run finally completed its 2.5k iters.

    + +

    The final loss numbers:
    +BF16 = 2.88109 +Float8 = 2.86386

    + +

    From the loss curves we observed very similar curves at the first and last ⅓ and then a turbulent zone in the middle where both showed similar spikes, but with a slight skew to the relative timing of the spikes.

    + +

    line chart

    + +

    As a result of this, we can see that PyTorch’s Float8 rowwise offers similar convergence but over 33% speedup for the same amount of training time.

    + +

    Long Term Training stability with Float8 Rowwise

    + +

    Beyond showcasing comparable convergence, we also wanted to show longer term training stability with Float8 and thus we launched a 4 day, 15K run at 256 scale.

    + +

    line chart

    + +

    As shown above, Float8 training ran for over 100 hours with no issues, highlighting the long term stability of Float8 Rowwise.

    + +

    Determinism in TorchTitan

    + +

To verify determinism and to see if the spikiness in the longer runs was from scale, we also ran a smaller experiment comprising 2 runs of BF16 and 1 run of Float8 at 256 scale, with HSDP2 only (i.e. without 2D Context Parallel).

    + +

    In this case both BF16 runs had identical curves and final loss, and we saw a similar spikiness zone for all three runs.

    + +

    At the 2K iteration mark, both Float8 and BF16 ending at nearly identical points:
    +BF16 *2 = 3.28538
    +Float8 rowwise = 3.28203

    + +

    line chart

    + +

The above result confirms that neither CP nor scale (2K) is responsible for the spikiness in the loss, as we saw a similar effect at 256 scale as well. The most likely explanation for the loss spikes could be the content distribution in the dataset.

    + +

    For the sake of determinism, the experiments were run with a serialized C4 dataset (not shuffled), meaning the spikes could be from encountering new content within the dataset.

    + +

    Net speedups at various Scales with Float8 rowwise:

    + +

We performed shorter runs at various GPU scales to understand how Float8 Rowwise would scale in terms of training acceleration as cluster sizes expanded. Doubling in scale from 960 to 1920 GPUs, Float8 continued to deliver impressive training speedups, with a range of 34-43% gains compared to BF16. We also want to note that when scaling from 1K to 2K GPUs, communication overhead likely kicked in, and we observed a 4% hit on throughput with BF16.

    + +

    bar chart

    + +

    As shown in the longer training runs at scale above, Float8 rowwise delivered substantial speedups with equal or even slightly improved loss endpoints while delivering 34% speedups at 1920 (DeepSeek) scale.

    + +

    How can I use Float8 Rowwise in my training?

    + +

    Float8 Rowwise is available now for you to use in your large scale training. It is packaged in TorchAO’s latest builds (0.9 and higher) and integrated into TorchTitan natively if you want to get up and running quickly.

    + +

    To activate Float8 Rowwise in TorchTitan:

    + +

First, enable the model converter to hot-swap the nn.Linear layers into float8 linear layers in your model’s .toml file - see line 29:

    + +

    code

    + +

    Secondly, specify the ‘rowwise’ float8 recipe - see line 72:

    + +

    code

    + +

    Note that you have three choices for the ‘recipe_name’:

    + +
      +
    • rowwise which is the recommended default,
    • +
    • tensorwise (the older style float8) and
    • +
    • rowwise_with_gw_hp.
    • +
    + +
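If you are using TorchAO directly rather than through TorchTitan, the same recipes can be selected in code. The sketch below assumes the torchao.float8 API names available at the time of writing (Float8LinearConfig.from_recipe_name and convert_to_float8_training); check your installed torchao version for the exact spelling:

import torch
+import torch.nn as nn
+from torchao.float8 import Float8LinearConfig, convert_to_float8_training
+
+model = nn.Sequential(nn.Linear(4096, 4096), nn.Linear(4096, 4096)).to("cuda", torch.bfloat16)
+
+# pick "rowwise" (recommended default), "tensorwise", or "rowwise_with_gw_hp"
+config = Float8LinearConfig.from_recipe_name("rowwise")
+convert_to_float8_training(model, config=config)  # swaps the nn.Linear modules for float8 linears in place
+
+ +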

    The gw_hp rowwise option keeps the gradients to the weights in BF16 precision during the backwards pass, and this can further enhance float8 precision for extremely sensitive workloads. But, it can ironically be a bit more performant than generic rowwise if the majority of the matmul sizes in your model are smaller (with an estimated tipping point at roughly 13-16K dimensions on H100).

    + +

    Thus while we recommend rowwise as the default, it may be worth comparing with gw_hp on your model to verify which provides the best performance, with an upside of even greater precision.

    + +

    By toggling the model converter on and off with a #, you can directly compare training acceleration between BF16 and Float8 Rowwise to understand the potential speedups for your own training.

    + +

    Future Updates:

    + +

    We’ll have an additional update coming showcasing multiple improvements for Pipeline Parallel and Async Distributed Checkpointing so please stay tuned.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/index.html b/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/index.html new file mode 100644 index 000000000000..ec8a7229d8d0 --- /dev/null +++ b/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/index.html @@ -0,0 +1,770 @@ + + + + + + + + + + + + + Introducing native PyTorch automatic mixed precision for faster training on NVIDIA GPUs | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Mengdi Huang, Chetan Tekur, Michael Carilli + +

    +

    Most deep learning frameworks, including PyTorch, train with 32-bit floating point (FP32) arithmetic by default. However this is not essential to achieve full accuracy for many deep learning models. In 2017, NVIDIA researchers developed a methodology for mixed-precision training, which combined single-precision (FP32) with half-precision (e.g. FP16) format when training a network, and achieved the same accuracy as FP32 training using the same hyperparameters, with additional performance benefits on NVIDIA GPUs:

    + +
      +
    • Shorter training time;
    • +
    • Lower memory requirements, enabling larger batch sizes, larger models, or larger inputs.
    • +
    + +

    In order to streamline the user experience of training in mixed precision for researchers and practitioners, NVIDIA developed Apex in 2018, which is a lightweight PyTorch extension with Automatic Mixed Precision (AMP) feature. This feature enables automatic conversion of certain GPU operations from FP32 precision to mixed precision, thus improving performance while maintaining accuracy.

    + +

    For the PyTorch 1.6 release, developers at NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, torch.cuda.amp. torch.cuda.amp is more flexible and intuitive compared to apex.amp. Some of apex.amp’s known pain points that torch.cuda.amp has been able to fix:

    + +
      +
    • Guaranteed PyTorch version compatibility, because it’s part of PyTorch
    • +
    • No need to build extensions
    • +
    • Windows support
    • +
    • Bitwise accurate saving/restoring of checkpoints
    • +
    • DataParallel and intra-process model parallelism (although we still recommend torch.nn.DistributedDataParallel with one GPU per process as the most performant approach)
    • +
    • Gradient penalty (double backward)
    • +
    • torch.cuda.amp.autocast() has no effect outside regions where it’s enabled, so it should serve cases that formerly struggled with multiple calls to apex.amp.initialize() (including cross-validation) without difficulty. Multiple convergence runs in the same script should each use a fresh GradScaler instance, but GradScalers are lightweight and self-contained so that’s not a problem.
    • +
    • Sparse gradient support
    • +
    + +

    With AMP being added to PyTorch core, we have started the process of deprecating apex.amp. We have moved apex.amp to maintenance mode and will support customers using apex.amp. However, we highly encourage apex.amp customers to transition to using torch.cuda.amp from PyTorch Core.

    + +

    Example Walkthrough

    +

    Please see official docs for usage:

    + + +

    Example:

    + +
    import torch
    +# Creates once at the beginning of training
    +scaler = torch.cuda.amp.GradScaler()
    +
    +for data, label in data_iter:
    +   optimizer.zero_grad()
    +   # Casts operations to mixed precision
    +   with torch.cuda.amp.autocast():
    +      loss = model(data)
    +
    +   # Scales the loss, and calls backward()
    +   # to create scaled gradients
    +   scaler.scale(loss).backward()
    +
    +   # Unscales gradients and calls
    +   # or skips optimizer.step()
    +   scaler.step(optimizer)
    +
    +   # Updates the scale for next iteration
    +   scaler.update()
    +
    + +

    Performance Benchmarks

    +

    In this section, we discuss the accuracy and performance of mixed precision training with AMP on the latest NVIDIA GPU A100 and also previous generation V100 GPU. The mixed precision performance is compared to FP32 performance, when running Deep Learning workloads in the NVIDIA pytorch:20.06-py3 container from NGC.

    + +

    Accuracy: AMP (FP16), FP32

    +

The advantage of using AMP for Deep Learning training is that the models converge to a similar final accuracy while providing improved training performance. To illustrate this point, for ResNet-50 v1.5 training, we see the following accuracy results, where higher is better. Please note that the below accuracy numbers are sample numbers that are subject to run-to-run variance of up to 0.4%. Accuracy numbers for other models, including BERT, Transformer, ResNeXt-101, Mask-RCNN, and DLRM, can be found at the NVIDIA Deep Learning Examples Github.

    + +

    Training accuracy: NVIDIA DGX A100 (8x A100 40GB)

    + + + + + + + + + + + + + + +
     epochs Mixed Precision Top 1(%) TF32 Top1(%)
     90 76.93 76.85
    + +

    Training accuracy: NVIDIA DGX-1 (8x V100 16GB)

    + + + + + + + + + + + + + + + + + + + + + + + + +
     epochs Mixed Precision Top 1(%) FP32 Top1(%)
    5076.2576.26
    9077.0977.01
    25078.4278.30
    + +

    Speedup Performance:

    + +

    FP16 on NVIDIA V100 vs. FP32 on V100

    +

    AMP with FP16 is the most performant option for DL training on the V100. In Table 1, we can observe that for various models, AMP on V100 provides a speedup of 1.5x to 5.5x over FP32 on V100 while converging to the same final accuracy.

    + +
    + +
    +

    Figure 2. Performance of mixed precision training on NVIDIA 8xV100 vs. FP32 training on 8xV100 GPU. Bars represent the speedup factor of V100 AMP over V100 FP32. The higher the better.

    + +

    FP16 on NVIDIA A100 vs. FP16 on V100

    + +

    AMP with FP16 remains the most performant option for DL training on the A100. In Figure 3, we can observe that for various models, AMP on A100 provides a speedup of 1.3x to 2.5x over AMP on V100 while converging to the same final accuracy.

    + +
    + +
    +

    Figure 3. Performance of mixed precision training on NVIDIA 8xA100 vs. 8xV100 GPU. Bars represent the speedup factor of A100 over V100. The higher the better.

    + +

    Call to action

    +

    AMP provides a healthy speedup for Deep Learning training workloads on Nvidia Tensor Core GPUs, especially on the latest Ampere generation A100 GPUs. You can start experimenting with AMP enabled models and model scripts for A100, V100, T4 and other GPUs available at NVIDIA deep learning examples. NVIDIA PyTorch with native AMP support is available from the PyTorch NGC container version 20.06. We highly encourage existing apex.amp customers to transition to using torch.cuda.amp from PyTorch Core available in the latest PyTorch 1.6 release.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-triton/index.html b/blog/accelerating-triton/index.html new file mode 100644 index 000000000000..255769464002 --- /dev/null +++ b/blog/accelerating-triton/index.html @@ -0,0 +1,842 @@ + + + + + + + + + + + + + Accelerating Triton Dequantization Kernels for GPTQ | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Less Wright, Adnan Hoque (IBM) + +

    +

    TL;DR

    + +

    Leveraging a first principles approach, we showcase a step by step process undertaken to accelerate the current Triton GPTQ kernels by 3x (core GPTQ) and 6x (AutoGPTQ). Example: 275us to 47us on a typical Llama style inference input. The goal is to provide a helpful template for accelerating any given Triton kernel. We provide a background on Triton and GPTQ quantization and dequantization process, showcase the impact of coalesced memory access to improve shared and global memory throughput, highlight changes made to reduce warp stalling to improve total throughput, and an overview on integrating Triton kernels into PyTorch code. Longer term, we hope to surpass the existing CUDA native GPTQ kernel with our Triton kernel.

    + +


    + +

Fig 1: Performance benchmarking the optimized AutoGPTQ kernel vs the current AutoGPTQ kernel on H100

    + +


    + +

Fig 2: Performance benchmarking the newly optimized AutoGPTQ kernel vs the current AutoGPTQ kernel on A100

    + +


    + +

Fig 3: Even with these improvements, there remains a gap between our optimized Triton kernel and the CUDA native AutoGPTQ kernel on A100. More to come…

    + +

    1.0 Introduction to Triton

    + +

    The Triton framework provides a hardware agnostic way of programming and targeting GPUs, currently supporting both NVIDIA and AMD, with support for additional hardware vendors in progress. Triton is now a mainstay for PyTorch 2.0 as torch.compile decomposes eager PyTorch and re-assembles it into a high percentage of Triton kernels with PyTorch connecting code.

    + +

    As Triton becomes more widely adopted, it will be essential that programmers understand how to systematically step through the Triton stack (from the high level Python down to the low-level SASS) to address performance bottlenecks in order to optimize GPU efficiency for algorithms that go beyond torch.compile generated kernels.

    + +

    In this post, we will introduce some core concepts of the Triton programming language, how to identify common performance limiters in GPU kernels, and in parallel, tune a quantization kernel used in AutoGPTQ that can be used for high throughput inference applications.

    + +

    Intro to GPTQ Quantization and Dequantization

    + +

GPTQ is a quantization algorithm that is able to compress ultra-large (175B+) LLMs efficiently to a 4-bit integer (INT4) representation, via approximate second order information (Hessian inverse). AutoGPTQ is a framework built on GPTQ, allowing for rapid dequantization and inference/serving of LLMs that have been quantized with GPTQ.

    + +

As part of the AutoGPTQ stack, a Triton GPTQ kernel is provided to handle the dequantization of a model for inference.

    + +

The basic process for INT quantization is shown below and involves determining the scale and zero point, and then computing the quantized 4-bit weight using the scale and zero point:

    + +

    The basic process for INT quantization

    + +

We thus store the 4-bit weights along with the scale and zero-point metadata for each group of weights.

    + +

    To ‘dequant’ these weights, we do the following:

    + +

    To ‘dequant’ these weights

    + +

We then matrix multiply the dequantized weights with the dense input feature matrix for this linear layer.

    + +
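As a rough illustration of the quantize / dequantize math described above, here is a toy sketch in plain PyTorch with per-group asymmetric quantization (it mirrors the general GPTQ storage scheme, not AutoGPTQ’s exact int32 packing layout):

import torch
+
+def quantize_group(w: torch.Tensor, n_bits: int = 4):
+    # per-group scale and zero point derived from the min/max of the weights
+    qmax = 2 ** n_bits - 1
+    w_min = w.min(dim=-1, keepdim=True).values
+    w_max = w.max(dim=-1, keepdim=True).values
+    scale = (w_max - w_min).clamp(min=1e-8) / qmax
+    zero = torch.round(-w_min / scale)
+    q = torch.clamp(torch.round(w / scale) + zero, 0, qmax)  # values fit in 4 bits
+    return q, scale, zero
+
+def dequantize_group(q, scale, zero):
+    # dequant: subtract the zero point and rescale back to floating point
+    return (q - zero) * scale
+
+w = torch.randn(8, 128)                    # one "group" of 128 weights per row
+q, scale, zero = quantize_group(w)
+w_hat = dequantize_group(q, scale, zero)   # approximate reconstruction, ready for the FP16 matmul
+
+ +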

    2.0 Identify the Bottlenecks - Optimizing Matrix Multiplication

    + +

As it turns out, making a fast matrix multiplication kernel is not trivial. A naively implemented matrix multiply will rarely reach peak throughput performance on highly parallel machines like GPUs. So we need to tackle the compute and memory subsystems of our GPU in a hierarchical fashion to make sure we are maximally utilizing each resource.

    + +

We start our optimization process by running the unoptimized Triton kernel through the Nvidia Nsight Compute tool and taking note of some important metrics and warnings:

    + +

    some important metrics and warnings

    + +

    Fig xy (todo)

    + +

    some important metrics and warnings

    + +

    We notice first that both compute and memory throughput are low, 7.40% and 21.19% respectively (fig xy) . Knowing that for typical inference matrix problem sizes, we are in the memory bound regime, we will attempt to optimize the kernel by applying code changes that target the memory subsystem of our A100 GPU.

    + +

    The three topics this post will cover are:

    + +
      +
    1. L2 Optimization
    2. +
    3. Vectorized Load
    4. +
    5. Warp Stalling
    6. +
    + +

Let’s walk through each topic, make the appropriate changes, and see the corresponding impact on our Triton kernel. This Triton kernel is a fused dequantization kernel that dequantizes a packed int32 weight tensor (we will refer to this as the B matrix) into int4 weights, performs matrix multiplication with the activation tensor (referred to as the A matrix) in FP16 mode, and then stores the results back to a matrix C.

    + +

    The above is referred to as W4A16 quantization. Keep in mind that the process we describe can and should be used for the development of any GPU kernel, as these are common bottlenecks in any unoptimized kernel.

    + +

    3.0 L2 Optimization

    + +

    This optimization already exists in the AutoGPTQ kernel, but we’d like to dedicate a section to this to help readers better understand how mapping and execution order of thread blocks is handled in Triton. Thus, we will step through a naive mapping and then a more optimal mapping to see its corresponding impact.

    + +

Let’s build up our kernel naively, starting with a “linear” load from global memory, and then compare it to a more optimized “swizzled” load. Linear vs swizzled determines the execution order of our grid of work on the GPU. Let’s take a look at the hints that the Nvidia Nsight Compute tool provides regarding our kernel’s shared memory access pattern in the naive case:

    + +

    the hints from the Nvidia Nsight Compute Tool

    + +

    To tackle this issue we can use an approach referred to as “tile-swizzling.” The idea of this method is to launch our thread blocks in a more L2 cache friendly order.

    + +

Let’s take a step back and familiarize ourselves with some Triton semantics and make a simple CUDA analogy to understand the concept better. Triton kernels launch “programs”. These so-called programs map to the concept of a thread block in CUDA, and they are the basic unit of parallelism in a Triton kernel. Every program has an associated “pid”, and all the threads in a program are guaranteed to be executing the same instruction.

    + +

The Triton programs will be distributed onto your SMs in a naive way if you do a simple linear mapping of “pid” to a 2D grid location of your output matrix C.

    + +

    This 2D grid location is determined by pid_m and pid_n in Triton. We would like to exploit data and cache locality in the L2 cache of our GPU, when we distribute our grid of work. To do this in Triton we can make the following changes:

    + +

    To do this in Triton

    + +
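For reference, the swizzled ordering follows the grouped-ordering pattern used in the standard Triton matmul tutorial. The pure-Python sketch below shows the index math only (inside the actual kernel, pid comes from tl.program_id(axis=0), and the block/group size names are illustrative placeholders, not AutoGPTQ’s exact variables):

def linear_tile(pid, num_pid_m, num_pid_n):
+    # naive "linear" ordering: walk the output tiles of C row-major
+    return pid // num_pid_n, pid % num_pid_n
+
+def swizzled_tile(pid, num_pid_m, num_pid_n, group_m):
+    # "swizzled" ordering: launch group_m rows of tiles before moving across
+    # columns, so nearby programs reuse the same tiles of A and B from L2
+    num_pid_in_group = group_m * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * group_m
+    group_size_m = min(num_pid_m - first_pid_m, group_m)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+    return pid_m, pid_n
+
+ +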

    The code highlighted in red would be the naive “linear” tile ordering, and the code highlighted in green is the “swizzled” tile ordering. This type of launch promotes a sense of locality. Here is a visual to help understand this better.

    + +

    a sense of locality

    + +

    After incorporating this change, the profiler no longer complains about uncoalesced memory accesses. Let’s take a look at how our memory throughput has changed:

    + +

    how our memory throughput has changed

    + +

    This change was tested on a simple load store kernel. Looking at the GPU speed of light statistics section in the profiler we also see a 112.07% increase in the memory throughput of the simple load kernel, which is what we were after with this optimization. Again, this optimization already exists in the AutoGPTQ kernel, but is the boilerplate logic that every Triton Kernel programmer will have to write in the beginning of their kernel, before any of the exciting dequantization or matrix multiply logic. It is thus important to understand that:

    + +
      +
    1. +

      This mapping is not unique

      +
    2. +
    3. +

      Triton does not automatically handle this kind of optimization for the programmer, and careful thought must be taken to ensure your kernel is optimally handling shared memory accesses

      +
    4. +
    + +

    These are not obvious for those new to Triton, as much of the shared memory access optimization is handled by the Triton compiler. However, in the cases where these are not handled by the compiler, it is important to be able to understand what tools and methods are available to us to be able to influence memory behavior.

    + +

    4.0 Vectorized Load

    + +

    Now, back to the original complaints of our unoptimized kernel. We want to optimize the global memory access pattern of our kernel. From the details page of the Nvidia Nsight compute tool, we see the following note, where the profiler is complaining about uncoalesced global memory accesses.

    + +

    Let’s dig deeper and take a look at the SASS (Assembly) Code load for an unoptimized memory read:

    + +

    an unoptimized memory read

    + +

    This load operation resulted in 32 global load operations that are 16 bit wide. This is not optimal.

    + +

    We would like to do our global memory loads in a vectorized way so that it results in the least amount of load instructions. To combat this we can give the Triton Compiler some help.

    + +

    code block

    + +
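A sketch of that hint pattern, using tl.multiple_of and tl.max_contiguous on the load offsets in a minimal standalone copy kernel (the kernel itself is illustrative, not the AutoGPTQ code; it assumes a CUDA device, an installed triton, and an element count that is a multiple of BLOCK):

import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def copy_kernel(src_ptr, dst_ptr, BLOCK: tl.constexpr):
+    pid = tl.program_id(axis=0)
+    offs = pid * BLOCK + tl.arange(0, BLOCK)
+    # hint: this block of offsets starts at a multiple of BLOCK and is
+    # contiguous, so the loads can be coalesced into wide vector loads
+    offs = tl.max_contiguous(tl.multiple_of(offs, BLOCK), BLOCK)
+    x = tl.load(src_ptr + offs)
+    tl.store(dst_ptr + offs, x)
+
+src = torch.randn(1 << 20, device="cuda")
+dst = torch.empty_like(src)
+grid = lambda meta: (triton.cdiv(src.numel(), meta["BLOCK"]),)
+copy_kernel[grid](src, dst, BLOCK=1024)
+
+ +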

The green highlighted lines above act as a compiler hint. They tell the compiler that these elements are contiguous in memory and that this load operation can be coalesced.

    + +

    Let’s see the effect in assembly after adding these lines.

    + +

    the effect in assembly after adding these lines

    + +

The load is now performed in 4 global load operations that are each 128 bits wide, instead of 32 global load operations that are each 16 bits wide. This means 28 fewer memory fetch instructions and, importantly, a coalesced memory access. This can be seen from the fact that a single thread is no longer accessing consecutive memory addresses, which, without the compiler hint, was the behavior.

    + +

The resulting effect is a 73x speedup in an isolated load operation, and after incorporating it in the full dequantization kernel we were able to see another 6% speedup. Another step in the right direction!

    + +

    5.0 Warp Stalling

    + +

    performance limiter, warp stalling

    + +

    Now putting all the changes back into our full dequantization kernel, we see the following performance limiter, warp stalling.

    + +

    These warp stalls are mostly caused by ‘Long Scoreboard’ stalls, accounting for 92.63% of the total.

    + +

    At a high level, long scoreboard stalls happen when a warp requires data that may not be ready yet in order to be in the “issued” state. In other words GPUs are throughput machines, and we need to hide the latency of load instructions with compute instructions. By loading more data and rearranging where the load instructions are in the script we can take care of this problem.

    + +

    In an ideal scenario, each warp scheduler would be able to issue 1 instruction every clock cycle. Note - Every SM on an A100 GPU has 4 warp schedulers.

    + +

    However – our kernel has bottlenecks and is spending 4.4 cycles in the stall state with the block size that AutoGPTQ Triton kernel deems as optimal given the presets it has.

    + +

    How do we improve this?

    + +

    We want to be able to increase our memory throughput so that we can increase the chance that when a warp issues an instruction, we won’t be waiting for loads to be stored in SRAM so that they can be used for computation. We played around with multiple parameters (such as number of pipeline stages, and number of warps) and the one that had the biggest impact was increasing the block size by a factor of 2 in the k dimension.

    + +
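As a sketch of what that change looks like in an autotuned Triton kernel (the parameter names follow common Triton matmul conventions, not AutoGPTQ’s exact configuration values):

import triton
+
+# doubling BLOCK_K keeps more independent loads in flight per program,
+# which helps hide the long-scoreboard latency behind compute
+configs = [
+    triton.Config({"BLOCK_M": 16, "BLOCK_N": 32, "BLOCK_K": 32}, num_stages=4, num_warps=4),
+    triton.Config({"BLOCK_M": 16, "BLOCK_N": 32, "BLOCK_K": 64}, num_stages=4, num_warps=4),  # 2x in K
+]
+# the list would then be passed to @triton.autotune(configs=configs, key=["M", "N", "K"])
+
+ +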

    These changes yield an immediate impact on both compute and memory throughput.

    + +

    an immediate impact on both compute and memory throughput

    + +

We also see the long scoreboard wait time drop significantly at the step where we shift and scale the quantized weights, which is what we identified as the original bottleneck in the source code. While there are still stalls at this line, only 68% of them are caused by long scoreboard stalls, compared to the original 92%. Ideally, we do not observe ANY stalls, so there is still work to be done here, but a reduction in the amount of stalls caused by long scoreboard tells us that our data is at this point ready to be used (in L1TEX memory) by an instruction that a warp wants to execute, at a higher frequency than the original kernel.

    + +

    1.4x speedup in the execution time of our kernel

    + +

    The corresponding impact is a 1.4x speedup in the execution time of our kernel.

    + +

    6.0 Results

    + +

    By tackling all these problem areas methodically our resulting kernel is 6x faster on the Nvidia A100 GPU than if you were to use the Triton kernel AutoGPTQ provides out-of-the-box.

    + +

    Taking a relevant Llama inference sample data point, the Triton kernel we’ve developed takes 47us to perform dequantization and matrix multiplication compared to the 275us taken by the AutoGPTQ kernel for the same matrix size.

    + +

    By replicating this step-by-step approach it should be possible to get similar speedups in other kernels, and help build understanding on common GPU bottlenecks and how to tackle them.

    + +

    It is important to note that while strides have been made in improving the performance of the AutoGPTQ Triton Kernel, we have still not closed the gap on the current exllamaV2 CUDA kernels found in AutoGPTQ.

    + +

    More research is required to understand how we can further optimize this kernel to match equivalent custom CUDA kernel performance.

    + +

    Summary and Future work

    + +

    Triton extends PyTorch by allowing low level GPU optimizations to be done at a higher level of abstraction than CUDA programming, with the net result that adding optimized Triton kernels can help PyTorch models run faster.

    + +

    Our goal in this post was to show an example of accelerating the GPTQ dequant kernel and provide a template workflow for how the accelerations were achieved.

    + +

For future work, a SplitK work decomposition for the matrix multiplication is a potential speedup we'll investigate.

    + +

    Integrating custom Triton Kernels into PyTorch

    + +

    Given the acceleration shown above, a common question is how to actually use a custom kernel in a given PyTorch codebase.

    + +

A Triton kernel integration contains at least two parts. The first is the actual Triton kernel code, which will be compiled by the Triton compiler:

    + +

    the actual Triton kernel code which will be compiled by the Triton compiler

    + +

Along with the actual kernel code is a Python wrapper, which may or may not subclass PyTorch's autograd Function class, depending on whether it needs to support a backward pass (i.e., for training) or inference only.

    + +

You then simply import the Python class into your PyTorch code wherever you want to use it, much like any other Python / PyTorch function.

    + +

    import the python class into your PyTorch code

    + +

In this case, simply importing and then calling fast_qlinear would invoke the underlying Triton kernel, applying the speed-ups we've shown above to your PyTorch model.
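As a rough usage sketch (the module path, class name, and argument names below are assumptions for illustration, not the wrapper's actual API):

import torch

# Hypothetical import path for the Python wrapper around the optimized Triton kernel.
from fast_qlinear_triton import fast_qlinear

class QuantLinear(torch.nn.Module):
    def __init__(self, qweight, scales, qzeros):
        super().__init__()
        self.qweight, self.scales, self.qzeros = qweight, scales, qzeros

    def forward(self, x):
        # Fused dequantize + matmul via the Triton kernel instead of the stock AutoGPTQ path.
        return fast_qlinear(x, self.qweight, self.scales, self.qzeros)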

    + +

    Acknowledgements

    + +

    Thanks to Jamie Yang and Hao Yu from IBM Research for their technical guidance in the collection of these results.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-whisper-arm-w-transformers/index.html b/blog/accelerating-whisper-arm-w-transformers/index.html new file mode 100644 index 000000000000..a1b205486e00 --- /dev/null +++ b/blog/accelerating-whisper-arm-w-transformers/index.html @@ -0,0 +1,679 @@ + + + + + + + + + + + + + Accelerating Whisper on Arm with PyTorch and Hugging Face Transformers | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Pareena Verma, Arm + +

    +

Automatic speech recognition (ASR) has revolutionized how we interact with technology, paving the way for applications like real-time audio transcription, voice assistants, and accessibility tools. OpenAI Whisper is a powerful model for ASR, capable of multilingual speech recognition and translation.

    + +

    A new Arm Learning Path is now available that explains how to accelerate Whisper on Arm-based cloud instances using PyTorch and Hugging Face transformers.

    + +

    Why Run Whisper on Arm?

    + +

    Arm processors are popular in cloud infrastructure for their efficiency, performance, and cost-effectiveness. With major cloud providers such as AWS, Azure, and Google Cloud offering Arm-based instances, running machine learning workloads on this architecture is becoming increasingly attractive.

    + +

    What You’ll Learn

    + +

The Arm Learning Path provides a structured approach to setting up and accelerating Whisper on Arm-based cloud instances. Here's what you'll cover:

    + +

    1. Set Up Your Environment

    + +

    Before running Whisper, you must set up your development environment. The learning path walks you through setting up an Arm-based cloud instance and installing all dependencies, such as PyTorch, Transformers, and ffmpeg.

    + +

    2. Run Whisper with PyTorch and Hugging Face Transformers

    + +

Once the environment is ready, you will use the Hugging Face Transformers library with PyTorch to load and execute Whisper for speech-to-text conversion. The tutorial provides a step-by-step approach for processing audio files and generating transcripts.
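For reference, a minimal sketch of this step with the Transformers pipeline API might look like the following (the model checkpoint and audio file are illustrative choices, not ones prescribed by the Learning Path):

import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",   # any Whisper checkpoint can be substituted
    torch_dtype=torch.float32,
    device="cpu",                   # runs on the Arm CPU of the cloud instance
)
result = asr("sample_audio.wav")    # audio decoding requires ffmpeg
print(result["text"])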

    + +

    3. Measure and Evaluate Performance

    + +

To ensure efficient execution, you'll learn how to measure transcription speed and compare different optimization techniques. The guide provides insights into interpreting performance metrics and making informed decisions about your deployment.
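One simple way to measure transcription speed, assuming the asr pipeline from the previous sketch, is to time the call directly (the Learning Path may use additional or different metrics):

import time

start = time.perf_counter()
result = asr("sample_audio.wav")    # "asr" is the pipeline created in the previous sketch
elapsed = time.perf_counter() - start
print(f"Transcribed in {elapsed:.2f} s")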

    + +

    Try it Yourself

    + +

Upon completion of this tutorial, you will know how to:

    + +
      +
    • Deploy Whisper on an Arm-based cloud instance.
    • +
    • Implement performance optimizations for efficient execution.
    • +
    • Evaluate transcription speeds and optimize further based on results.
    • +
    + +

    Try the live demo today and see audio transcription in action on Arm: Whisper on Arm Demo.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/achieving-sustainability-goals/index.html b/blog/achieving-sustainability-goals/index.html new file mode 100644 index 000000000000..21d00323f0ad --- /dev/null +++ b/blog/achieving-sustainability-goals/index.html @@ -0,0 +1,710 @@ + + + + + + + + + + + + + Achieving Sustainability Goals with PyTorch and Intel AI | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    This post was contributed by Intel AI in partnership with the PyTorch Foundation.

    + +

    In 2017, the UN Global Compact emphasized digital technology, particularly open source, as crucial for achieving Sustainable Development Goals (SDGs), projecting a potential $2.1 trillion boost to the tech sector by 2030. The SDGs, part of the “2030 Agenda for Sustainable Development,” address global prosperity across various sectors.

    + +

    The Linux Foundation’s Sustainability Initiative aligns projects with sustainable development goals. By assessing project impact, resources can be better allocated for enhancement. Intel is also a contributor to this initiative, and recently presented three use cases with PyTorch and Intel AI to address UN SDG-aligned issues.

    + +

    Sustainability Goals

    + +

    SDG 15: Life on Land

    + +
      +
    • Using a bone likelihood map to pinpoint dinosaur bones, which paves the way for transfer learning to tackle contemporary challenges like wildfire prediction.
    • +
    • Employing transfer learning for wildfire prediction and generating data with Stable Diffusion.
    • +
    + +

    SDG 9: Industry, Innovation, Infrastructure

    + +
      +
    • Identifying crucial minerals, oil, and gas through subsurface models.
    • +
    + +

    Here are the key highlights from the workshops. Read below for a summary, and be sure to watch the full workshop videos and visit the GitHub repositories.

    + +

    Session 1: Introduction to Dinosaur Bone Bed Maps

    + +

    Bob Chesebrough recently led a PyTorch workshop demonstrating how to create a dinosaur bone bed map for Dinosaur National Monument. He shared footage of his discoveries and explained his AI-driven approach, utilizing geological data to pinpoint possible bone-rich areas.

    + +

    Attendees learned to set up JupyterLab, access the training section, and launch a BASH shell. Bob’s classification model, applied to aerial images, facilitated heatmap generation to identify potential bone locations, refined through field data. The GitHub repo “Jurassic” guided participants through directory setup and model optimization steps.

    + +

    Rahul Unnikrishnan Nair demonstrated the use of PyTorch, focusing on performance enhancements. The workshop covered modeling best practices, such as data transformations, class distribution, dropout layers, and efficient training methods. Training and scoring procedures were examined, with a focus on model accuracy and transportability to other regions. Heatmap creation involved cutting images into tiles, considering context for accurate environmental identification.

    + +

Watch the full workshop video here and visit the GitHub repository to access the code sample and experiment with the code using Intel® Extension for PyTorch. Try it out with PyTorch and explore what works best for you. Happy dinosaur bone hunting!

    + +

    Session 2: Seismic Data to Subsurface Models with OpenFWI: Training an AI Model with PyTorch

    + +

    Seismic exploration is crucial for subsurface imaging in mineral and oil/gas exploration. Full waveform inversion (FWI) recreates subsurface sound wave velocities, akin to ultrasound for the Earth.

    + +

    Ben Consolvo, an AI Software Engineering Manager at Intel, presented training AI models directly from seismic data using PyTorch on Intel high-performance processors. FWI, though accurate, is computationally intensive and relies on precise initial models. AI models offer an alternative approach, learning directly from data without the need for precise initializations. Ben explained the challenges of AI models, highlighting the need for diverse datasets and the potential use of CPUs for fine-tuning. He also discussed FWI’s surprising medical applications.

    + +

    Watch the full video here and go to the paper for more details. The GitHub repo is OpenFWI.

    + +

    Session 3: Using PyTorch to Aid Wildfire Prediction

    + +

Forest fires pose significant threats to ecosystems, wildlife, and communities, and machine learning presents a promising approach to enhance prediction accuracy. In this Earth Day webinar, Bob Chesebrough and Rahul Unnikrishnan Nair demonstrated image analysis techniques on the MODIS dataset to predict early forest fire probabilities. By fine-tuning a ResNet18 model with the Intel® Extension for PyTorch, pre-trained models were adjusted with aerial photos, utilizing geo-spatial and color data for fire risk assessment.

    + +

The presenters emphasized the temporal and geographical filtering required for dataset analysis and showcased images from fire-affected areas like Paradise, CA. They highlighted the model's adaptability to different hardware configurations, along with the use of Stable Diffusion to synthesize data when real datasets were unavailable. They also encouraged audience engagement in PyTorch experimentation for early fire detection, extending a challenge to leverage these tools for critical predictive tasks. Join them in this endeavor to enhance wildfire prevention and protection efforts.

    + +

    Watch the full video here and go to the paper for more details. The GitHub repo is ForestFirePrediction.

    + +

    About the Intel Speakers

    + +

    Bob Chesebrough, Sr Solutions Architect

    + +

Bob Chesebrough has over three decades of industry experience in software development and AI solution engineering for Fortune 100 companies and national laboratories. He is also a hobbyist who has logged over 800 miles and 1000 hours in the field finding dinosaur bones. He and his sons discovered an important fossil of the only known crocodilian from the Jurassic in New Mexico; they have also discovered and logged into the museum more than 2,000 bone localities and described a new mass bone bed in New Mexico.

    + +

    Rahul Unnikrishnan Nair, Architect in Applied AI and the Engineering Lead at Intel® Liftoff

    + +

    In his current role at Intel® Liftoff for Startups program, Rahul Nair brings his extensive experience in applied AI and engineering to mentor early-stage AI startups. His dedication lies in helping these startups transform their innovative ideas into fully-fledged, market-ready products with a strong emphasis on use-case-driven, practical engineering and optimization.

    + +

    Ben Consolvo, AI Software Engineering Manager

    + +

    Ben Consolvo is an AI Solutions Engineering Manager at Intel. He has been building a team and a program around Intel’s AI technology paired with Intel’s hardware offerings. He brings a background and passion in data science, particularly in deep learning (DL) and computer vision. He has applied his skills in DL in the cybersecurity industry to automatically identify phishing websites, as well as to the oil and gas industry to identify subsurface features for geophysical imaging.

    + +

    Kelli Belcher, AI Solutions Engineer

    + +

    Kelli Belcher is an AI Solutions Engineer at Intel with over 5 years of experience across the financial services, healthcare, and tech industries. In her current role, Kelli helps build Machine Learning solutions using Intel’s portfolio of open AI software tools. Kelli has experience with Python, R, SQL, and Tableau, and holds a Master of Science in Data Analytics from the University of Texas.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/activation-checkpointing-techniques/index.html b/blog/activation-checkpointing-techniques/index.html new file mode 100644 index 000000000000..be772dd14c2d --- /dev/null +++ b/blog/activation-checkpointing-techniques/index.html @@ -0,0 +1,831 @@ + + + + + + + + + + + + + Current and New Activation Checkpointing Techniques in PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

As models scale in depth, batch size, sequence length, and so on, activation memory becomes an increasingly significant contributor to the overall memory usage. To help address this, PyTorch provides utilities for activation checkpointing, which reduce the number of saved tensors by recomputing them when needed, trading off memory usage for additional compute.

    + +

    In this post, we’ll walk through the basics of what activation memory is, the high-level ideas behind existing activation checkpointing techniques, and also introduce some newer techniques that aim to improve flexibility and provide more optimization/automation out of the box.

    + +

    As we look at these techniques, we’ll compare how these methods fit into a speed vs. memory trade-off diagram and hopefully provide some insight on how to choose the right strategy for your use case.

    + +

    (If you prefer to jump straight to the new APIs, please skip ahead to the “Selective Activation Checkpoint” and “Memory Budget API” sections below.)

    + +

    flow diagram

    + +
    + +

    Activation Memory Basics

    + +

    By default, in eager mode (rather than using torch.compile), PyTorch’s autograd preserves intermediate activations for backward computation. For example, if you call sin on a tensor x during the forward pass, autograd must remember x to compute cos(x) during backward.
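A minimal sketch of this behavior:

import torch

x = torch.randn(4, requires_grad=True)
y = torch.sin(x)       # autograd saves x here so it can compute cos(x) later
y.sum().backward()     # backward uses the saved x: d/dx sin(x) = cos(x)
print(torch.allclose(x.grad, torch.cos(x)))  # True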

    + +

    flow diagram

    + +

    If this tensor x is saved at the beginning of the forward pass, it remains in memory throughout both the forward and backward phases. It can only be cleared after it is used to compute the gradient, which happens at the end of the backward pass (due to the reverse order of execution).

    + +

    Thus, as you proceed through the forward pass and perform more and more operations, you accumulate more and more activations, resulting in more and more activation memory until it (typically) reaches its peak at the start of backward (at which point activations can start to get cleared).

    + +

    flow diagram

    + +

In the diagram above, the orange boxes represent operations, and the black arrows represent their tensor inputs and outputs. The black arrows that cross over to the right represent tensors that autograd saves for backward.

    + +

    A useful way to visually organize this default saving behavior in eager as well as the techniques we’re about to introduce is based on how they trade off speed versus memory.

    + +

    flow diagram

    + +

    The ideal place to be on this diagram is the top-left, where you have “high” speed but also low memory usage.

    + +

    We begin by putting the default saving behavior on the top-right (for reasons we’ll explain in more detail as we introduce more points for other techniques).

    + +
    + +

    Activation Checkpointing (AC)

    + +

    Activation checkpointing (AC) is a popular technique to reduce memory usage in PyTorch.

    + +

    During forward, any operations performed inside the AC’d region do not save tensors for backward. (Only the inputs to the function are saved.) During backward, the intermediate activations needed for gradient computation are rematerialized by running the function a second time.
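A minimal sketch of wrapping a region with the existing checkpoint API (the checkpointed function is just a stand-in):

import torch
from torch.utils.checkpoint import checkpoint

def block(x):
    # Intermediates created here are NOT saved; they are recomputed during backward.
    return torch.relu(torch.sin(x) * torch.cos(x))

x = torch.randn(8, requires_grad=True)
out = checkpoint(block, x, use_reentrant=False)  # only the input x is saved
out.sum().backward()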

    + +

    flow diagram

    + +

    In the diagram (right), the black box shows where activation checkpointing is applied. Compared to the default eager approach (left), this setup results in fewer tensors being saved (1 versus 3).

    + +

    Applying AC on the right parts of the model has the effect of reducing peak memory, because the intermediate activations are no longer materialized in memory when the memory usage typically peaks (at the beginning of backward).

    + +

    On the speed-versus-memory tradeoff diagram, AC is plotted on the bottom-left. Relative to eager mode, it reduces the amount of memory saved for backward but comes with an added cost in compute due to recomputation.

    + +

    flow diagram

    + +

Note that AC's speed versus memory tradeoff can be adjusted by selecting which parts of the forward pass to checkpoint and by defining how many checkpoint regions to use. However, implementing these changes may require modifying your model's structure and can be cumbersome depending on how your code is organized. For the purposes of this diagram, we assume only one region is checkpointed; under this assumption, AC appears as a single point on the tradeoff diagram.

    + +

Also note that "memory" here does not refer to peak memory usage; rather, it indicates how much memory is saved for backward for a fixed region.

    + +
    + +

    torch.compile and min-cut partitioner

    + +

    Another notable approach to keep in mind is torch.compile (introduced in PyTorch 2.0). Like activation checkpointing, torch.compile can also perform some level of recomputation under the hood. Specifically, it traces the forward and backward computations into a single joint graph, which is then processed by a “min-cut” partitioner. This partitioner uses a min-cut/max-flow algorithm to split the graph such that it minimizes the number of tensors that need to be saved for backward.

    + +

    At first glance, this might sound a lot like what we want for activation memory reduction. However, the reality is more nuanced. By default, the partitioner’s primary goal is to reduce runtime. As a result, it only recomputes certain types of operations—primarily simpler, fusible, and non-compute-intensive ops (like pointwise ops).

    + +

    Placing “compile” on the speed-versus-memory tradeoff diagram…

    + +

    flow diagram

    + +

    It is to the top-left of the eager non-AC point, as we expect torch.compile to improve on both speed and memory.

    + +

    On the other hand, relative to activation checkpointing, torch.compile is more conservative about what it recomputes, placing it closer to the top-left on the speed-versus-memory diagram.

    + +
    + +

    Selective Activation Checkpoint [NEW!]

    + +

    While normal checkpointing recomputes every op in a chosen region, selective activation checkpointing (SAC) is an additional setting on top of activation checkpointing that you can apply to have a more granular control over which operations to recompute.

    + +

    This can be useful if you have certain more expensive operations like matmuls which you prefer to avoid recomputing, but still generally want to recompute cheaper operations like pointwise.

    + +

    flow diagram

    + +

    Where plain AC (left) would save a single tensor and then recompute the entire AC’d region, with SAC (right) you can selectively save specific operations (marked red) in the region, so you can avoid recomputing them.

    + +

    To specify what to selectively save, you can specify a policy_fn. To illustrate the additional trade offs you can make with this, we present two simple policy functions.

    + +

    Policy 1: Not recomputing matmuls:

    + +
from torch.utils.checkpoint import CheckpointPolicy  # import needed for the policy values used below
+aten = torch.ops.aten
    +compute_intensive_ops = [  
    +        aten.mm,
    +        aten.bmm,
    +        aten.addmm,
    +] 
    +def policy_fn(ctx, op, *args, **kwargs):
    +    if op in compute_intensive_ops:
    +        return CheckpointPolicy.MUST_SAVE
    +    else:
    +        return CheckpointPolicy.PREFER_RECOMPUTE
    +
    + +

    flow diagram

    + +

    Policy 2: More aggressively save anything compute intensive

    + +
    # torch/_functorch/partitioners.py
    +aten = torch.ops.aten
    +compute_intensive_ops = [  
    +   aten.mm,
    +   aten.convolution,
    +   aten.convolution_backward,
    +   aten.bmm,
    +   aten.addmm,
    +   aten._scaled_dot_product_flash_attention,
    +   aten._scaled_dot_product_efficient_attention,
    +   aten._flash_attention_forward,
    +   aten._efficient_attention_forward,
    +   aten.upsample_bilinear2d,
    +   aten._scaled_mm
    +] 
    +def policy_fn(ctx, op, *args, **kwargs):
    +    if op in compute_intensive_ops:
    +        return CheckpointPolicy.MUST_SAVE
    +    else:
    +        return CheckpointPolicy.PREFER_RECOMPUTE
    +
    + +

    flow diagram

    + +

    On the speed-versus-memory diagram, SAC is plotted as a range of points from closer to AC to closer to Eager, depending on your chosen policy.

    + +

    flow diagram

    + +

    Try it out! (Available in 2.5 as a prototype feature; see docs for more info + copy-pastable example)

    + +
from functools import partial
+from torch.utils.checkpoint import checkpoint, create_selective_checkpoint_contexts, CheckpointPolicy
    +
    +# Create a policy function that returns a CheckpointPolicy
    +def policy_fn(ctx, op, *args, **kwargs):
    +    if op in ops_to_save:
    +        return CheckpointPolicy.MUST_SAVE
    +    else:
    +        return CheckpointPolicy.PREFER_RECOMPUTE
    +
    +# Use the context_fn= arg of the existing checkpoint API
    +out = checkpoint(
    +    fn, *args,
    +    use_reentrant=False,
    +    # Fill in SAC context_fn's policy_fn with functools.partial
    +    context_fn=partial(create_selective_checkpoint_contexts, policy_fn),
    +)
    +
    +
    +
    + +

    (compile-only) Memory Budget API [NEW!]

    + +

As mentioned previously, any given SAC policy can be represented as a point on a speed-memory tradeoff diagram. Not all policies are created equal, however. The "optimal" policies are the ones that fall on a pareto curve; that is, among all policies that incur the same memory overhead, a pareto-optimal policy is the one that minimizes the amount of required compute.

    + +

    For users who are using torch.compile, we offer a memory budget API that automatically applies SAC over your compiled region with a pareto-optimal policy given a user-specified “memory budget” between 0 and 1, where a budget of 0 behaves like plain-AC and a budget of 1 behaves like default torch.compile.

    + +

    flow diagram

    + +

    Below are some real results on a transformer model:

    + +

    flow diagram

    + +

    We observe a 50% memory reduction by recomputing only pointwise ops, with a steady drop-off as you recompute more and more of your matmuls. Attention is the most expensive, so you tend to want to recompute those last.

    + +

    Try it out! (Available in 2.4 as an experimental feature; see this comment block for more info)

    + +
    torch._dynamo.config.activation_memory_budget = 0.5
    +
    +out = torch.compile(fn)(inp)
    +
    + +
    + +

    Conclusion

    + +

    flow diagram

    + +

    In summary, activation checkpointing techniques in PyTorch offer a variety of ways to balance memory and compute demands, from simple region-based checkpointing to more selective and automated methods. By choosing the option that best matches your model’s structure and resource constraints, you can achieve significant memory savings with an acceptable trade-off in compute.

    + +

    Acknowledgements

    + +

    We would like to thank Meta’s xformers team including Francisco Massa for working on the original version of Selective Activation Checkpoint.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/amazon-ads-case-study/index.html b/blog/amazon-ads-case-study/index.html new file mode 100644 index 000000000000..3b092f47b455 --- /dev/null +++ b/blog/amazon-ads-case-study/index.html @@ -0,0 +1,798 @@ + + + + + + + + + + + + + Case Study: Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Yashal Kanungo – Applied Scientist, Kamran Khan - Sr. Technical Product Manager, Shubha Kumbadakone – Sr. Specialist, ML Frameworks + +

    +

    Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out.

    + +

    Amazon Ads helps companies build their brand and connect with shoppers through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses and brands of all sizes, including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies can upload their own ad creatives, which can include images, video, audio, and, of course, products sold on Amazon.

    + +

    + +

    + +

    To promote an accurate, safe, and pleasant shopping experience, these ads must comply with content guidelines. For example, ads cannot flash on and off, products must be featured in an appropriate context, and images and text should be appropriate for a general audience. To help ensure that ads meet the required policies and standards, we needed to develop scalable mechanisms and tools.

    + +

    As a solution, we used machine learning (ML) models to surface ads that might need revision. As deep neural networks flourished over the past decade, our data science team began exploring more versatile deep learning (DL) methods capable of processing text, images, audio, or video with minimal human intervention. To that end, we’ve used PyTorch to build computer vision (CV) and natural language processing (NLP) models that automatically flag potentially non-compliant ads. PyTorch is intuitive, flexible, and user-friendly, and has made our transition to using DL models seamless. Deploying these new models on AWS Inferentia-based Amazon EC2 Inf1 instances, rather than on GPU-based instances, reduced our inference latency by 30 percent and our inference costs by 71 percent for the same workloads.

    + +

    Transition to deep learning

    + +

    Our ML systems paired classical models with word embeddings to evaluate ad text. But our requirements evolved, and as the volume of submissions continued to expand, we needed a method nimble enough to scale along with our business. In addition, our models must be fast and serve ads within milliseconds to provide an optimal customer experience.

    + +

    Over the last decade, DL has become very popular in numerous domains, including natural language, vision, and audio. Because deep neural networks channel data sets through many layers — extracting progressively higher-level features — they can make more nuanced inferences than classical ML models. Rather than simply detecting prohibited language, for example, a DL model can reject an ad for making false claims.

    + +

In addition, DL techniques are transferable: a model trained for one task can be adapted to carry out a related task. For instance, a pre-trained neural network can be optimized to detect objects in images and then fine-tuned to identify specific objects that are not allowed to be displayed in an ad.

    + +

    Deep neural networks can automate two of classical ML’s most time-consuming steps: feature engineering and data labeling. Unlike traditional supervised learning approaches, which require exploratory data analysis and hand-engineered features, deep neural networks learn the relevant features directly from the data. DL models can also analyze unstructured data, like text and images, without the preprocessing necessary in ML. Deep neural networks scale effectively with more data and perform especially well in applications involving large data sets.

    + +

    We chose PyTorch to develop our models because it helped us maximize the performance of our systems. With PyTorch, we can serve our customers better while taking advantage of Python’s most intuitive concepts. The programming in PyTorch is object-oriented: it groups processing functions with the data they modify. As a result, our codebase is modular, and we can reuse pieces of code in different applications. In addition, PyTorch’s eager mode allows loops and control structures and, therefore, more complex operations in the model. Eager mode makes it easy to prototype and iterate upon our models, and we can work with various data structures. This flexibility helps us update our models quickly to meet changing business requirements.

    + +

"Before this, we experimented with other frameworks that were 'Pythonic,' but PyTorch was the clear winner for us here," said Yashal Kanungo, Applied Scientist. "Using PyTorch was easy because the structure felt native to Python programming, which the data scientists were very familiar with."

    + +

    Training pipeline

    + +

    Today, we build our text models entirely in PyTorch. To save time and money, we often skip the early stages of training by fine-tuning a pre-trained NLP model for language analysis. If we need a new model to evaluate images or video, we start by browsing PyTorch’s torchvision library, which offers pretrained options for image and video classification, object detection, instance segmentation, and pose estimation. For specialized tasks, we build a custom model from the ground up. PyTorch is perfect for this, because eager mode and the user-friendly front end make it easy to experiment with different architectures.

    + +

    To learn how to finetune neural networks in PyTorch, head to this tutorial.

    + +

    Before we begin training, we optimize our model’s hyperparameters, the variables that define the network architecture (for example, the number of hidden layers) and training mechanics (such as learning rate and batch size). Choosing appropriate hyperparameter values is essential, because they will shape the training behavior of the model. We rely on the Bayesian search feature in SageMaker, AWS’s ML platform, for this step. Bayesian search treats hyperparameter tuning as a regression problem: It proposes the hyperparameter combinations that are likely to produce the best results and runs training jobs to test those values. After each trial, a regression algorithm determines the next set of hyperparameter values to test, and performance improves incrementally.
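A sketch of what this looks like with the SageMaker Python SDK; the estimator, metric regex, hyperparameter ranges, and S3 paths below are placeholders rather than the values we use in production:

from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter

# "estimator" is assumed to be a previously configured sagemaker.pytorch.PyTorch estimator.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name="validation:f1",
    metric_definitions=[{"Name": "validation:f1", "Regex": "f1=([0-9\\.]+)"}],
    hyperparameter_ranges={
        "lr": ContinuousParameter(1e-5, 1e-3),
        "batch_size": IntegerParameter(16, 128),
    },
    strategy="Bayesian",   # the propose-and-test loop described above
    max_jobs=20,
    max_parallel_jobs=4,
)
tuner.fit({"train": "s3://my-bucket/train", "validation": "s3://my-bucket/val"})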

    + +

    We prototype and iterate upon our models using SageMaker Notebooks. Eager mode lets us prototype models quickly by building a new computational graph for each training batch; the sequence of operations can change from iteration to iteration to accommodate different data structures or to jibe with intermediate results. That frees us to adjust the network during training without starting over from scratch. These dynamic graphs are particularly valuable for recursive computations based on variable sequence lengths, such as the words, sentences, and paragraphs in an ad that are analyzed with NLP.

    + +

    When we’ve finalized the model architecture, we deploy training jobs on SageMaker. PyTorch helps us develop large models faster by running numerous training jobs at the same time. PyTorch’s Distributed Data Parallel (DDP) module replicates a single model across multiple interconnected machines within SageMaker, and all the processes run forward passes simultaneously on their own unique portion of the data set. During the backward pass, the module averages the gradients of all the processes, so each local model is updated with the same parameter values.
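A minimal DDP sketch (it assumes launch via torchrun, which sets LOCAL_RANK, and uses a linear layer as a stand-in for the real model):

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

model = torch.nn.Linear(128, 2).cuda()            # stand-in for the real ad model
ddp_model = DDP(model, device_ids=[local_rank])
optimizer = torch.optim.SGD(ddp_model.parameters(), lr=1e-3)

x = torch.randn(32, 128).cuda()                   # each rank would load its own data shard
loss = ddp_model(x).sum()
loss.backward()                                   # DDP averages gradients across ranks
optimizer.step()
dist.destroy_process_group()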

    + +

    Model deployment pipeline

    + +

    When we deploy the model in production, we want to ensure lower inference costs without impacting prediction accuracy. Several PyTorch features and AWS services have helped us address the challenge.

    + +

    The flexibility of a dynamic graph enriches training, but in deployment we want to maximize performance and portability. An advantage of developing NLP models in PyTorch is that out of the box, they can be traced into a static sequence of operations by TorchScript, a subset of Python specialized for ML applications. Torchscript converts PyTorch models to a more efficient, production-friendly intermediate representation (IR) graph that is easily compiled. We run a sample input through the model, and TorchScript records the operations executed during the forward pass. The resulting IR graph can run in high-performance environments, including C++ and other multithreaded Python-free contexts, and optimizations such as operator fusion can speed up the runtime.
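A minimal tracing sketch (with a stand-in model rather than one of our production NLP models):

import torch

model = torch.nn.Sequential(torch.nn.Linear(16, 8), torch.nn.ReLU()).eval()
example_input = torch.randn(1, 16)

traced = torch.jit.trace(model, example_input)  # records the ops run during forward
traced.save("model_traced.pt")                  # portable artifact, loadable from C++ as well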

    + +

    Neuron SDK and AWS Inferentia powered compute

    + +

We deploy our models on Amazon EC2 Inf1 instances powered by AWS Inferentia, Amazon's first ML silicon designed to accelerate deep learning inference workloads. Inferentia has been shown to reduce inference costs by up to 70% compared to Amazon EC2 GPU-based instances. We used the AWS Neuron SDK, a set of software tools used with Inferentia, to compile and optimize our models for deployment on EC2 Inf1 instances.

    + +

    The code snippet below shows how to compile a Hugging Face BERT model with Neuron. Like torch.jit.trace(), neuron.trace() records the model’s operations on an example input during the forward pass to build a static IR graph.

    + +
    import torch
    +from transformers import BertModel, BertTokenizer
    +import torch.neuron
    +tokenizer = BertTokenizer.from_pretrained("path to saved vocab")
+model = BertModel.from_pretrained("path to the saved model", return_dict=False)
+inputs = tokenizer("sample input", return_tensors="pt")
    +neuron_model = torch.neuron.trace(model,
    +                                  example_inputs = (inputs['input_ids'], inputs['attention_mask']),
    +                                  verbose = 1)
    +output = neuron_model(*(inputs['input_ids'], inputs['attention_mask']))
    +
    + +

    Autocasting and recalibration

    + +

    Under the hood, Neuron optimizes our models for performance by autocasting them to a smaller data type. As a default, most applications represent neural network values in the 32-bit single-precision floating point (FP32) number format. Autocasting the model to a 16-bit format — half-precision floating point (FP16) or Brain Floating Point (BF16) — reduces a model’s memory footprint and execution time. In our case, we decided to use FP16 to optimize for performance while maintaining high accuracy.

    + +

    Autocasting to a smaller data type can, in some cases, trigger slight differences in the model’s predictions. To ensure that the model’s accuracy is not affected, Neuron compares the performance metrics and predictions of the FP16 and FP32 models. When autocasting diminishes the model’s accuracy, we can tell the Neuron compiler to convert only the weights and certain data inputs to FP16, keeping the rest of the intermediate results in FP32. In addition, we often run a few iterations with the training data to recalibrate our autocasted models. This process is much less intensive than the original training.

    + +

    Deployment

    + +

    To analyze multimedia ads, we run an ensemble of DL models. All ads uploaded to Amazon are run through specialized models that assess every type of content they include: images, video and audio, headlines, texts, backgrounds, and even syntax, grammar, and potentially inappropriate language. The signals we receive from these models indicate whether or not an advertisement complies with our criteria.

    + +

    Deploying and monitoring multiple models is significantly complex, so we depend on TorchServe, SageMaker’s default PyTorch model serving library. Jointly developed by Facebook’s PyTorch team and AWS to streamline the transition from prototyping to production, TorchServe helps us deploy trained PyTorch models at scale without having to write custom code. It provides a secure set of REST APIs for inference, management, metrics, and explanations. With features such as multi-model serving, model versioning, ensemble support, and automatic batching, TorchServe is ideal for supporting our immense workload. You can read more about deploying your Pytorch models on SageMaker with native TorchServe integration in this blog post.

    + +

    In some use cases, we take advantage of PyTorch’s object-oriented programming paradigm to wrap multiple DL models into one parent object — a PyTorch nn.Module — and serve them as a single ensemble. In other cases, we use TorchServe to serve individual models on separate SageMaker endpoints, running on AWS Inf1 instances.
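A rough sketch of the ensemble pattern (class and model names here are illustrative, not our production code):

import torch

class AdModerationEnsemble(torch.nn.Module):
    def __init__(self, text_model, image_model):
        super().__init__()
        self.text_model = text_model    # e.g. a fine-tuned NLP classifier
        self.image_model = image_model  # e.g. a fine-tuned CV classifier

    def forward(self, text_inputs, image_inputs):
        # Run each specialized model and return all compliance signals together.
        return {
            "text": self.text_model(text_inputs),
            "image": self.image_model(image_inputs),
        }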

    + +

    Custom handlers

    + +

We particularly appreciate that TorchServe allows us to embed our model initialization, preprocessing, inferencing, and postprocessing code in a single Python script, handler.py, which lives on the server. This script, the handler, preprocesses the unlabeled data from an ad, runs that data through our models, and delivers the resulting inferences to downstream systems. TorchServe provides several default handlers that load weights and architecture and prepare the model to run on a particular device. We can bundle all the additional required artifacts, such as vocabulary files or label maps, with the model in a single archive file.

    + +

    When we need to deploy models that have complex initialization processes or that originated in third-party libraries, we design custom handlers in TorchServe. These let us load any model, from any library, with any required process. The following snippet shows a simple handler that can serve Hugging Face BERT models on any SageMaker hosting endpoint instance.

    + +
import os
+import torch
    +import torch.neuron
    +from ts.torch_handler.base_handler import BaseHandler
    +import transformers
    +from transformers import AutoModelForSequenceClassification,AutoTokenizer
    +
    +class MyModelHandler(BaseHandler):
+    def initialize(self, context):
+        self.manifest = context.manifest
+        properties = context.system_properties
    +        model_dir = properties.get("model_dir")
    +        serialized_file = self.manifest["model"]["serializedFile"]
    +        model_pt_path = os.path.join(model_dir, serialized_file)
    +
    +
    +        self.tokenizer = AutoTokenizer.from_pretrained(
    +                model_dir, do_lower_case=True
    +            )
    +        self.model = AutoModelForSequenceClassification.from_pretrained(
    +                    model_dir
    +                )
    +
+    def preprocess(self, data):
+        input_text = data.get("data")
+        if input_text is None:
+            input_text = data.get("body")
+        # max_length is assumed to be configured elsewhere (e.g. read during initialize)
+        inputs = self.tokenizer.encode_plus(input_text, max_length=int(max_length), pad_to_max_length=True, add_special_tokens=True, return_tensors='pt')
+        return inputs
    +
    +    def inference(self,inputs):
    +        predictions = self.model(**inputs)
    +        return predictions
    +
    +    def postprocess(self, output):
    +        return output
    +
    + +

    Batching

    + +

    Hardware accelerators are optimized for parallelism, and batching — feeding a model multiple inputs in a single step — helps saturate all available capacity, typically resulting in higher throughputs. Excessively high batch sizes, however, can increase latency with minimal improvement in throughputs. Experimenting with different batch sizes helps us identify the sweet spot for our models and hardware accelerator. We run experiments to determine the best batch size for our model size, payload size, and request traffic patterns.

    + +

    The Neuron compiler now supports variable batch sizes. Previously, tracing a model hardcoded the predefined batch size, so we had to pad our data, which can waste compute, slow throughputs, and exacerbate latency. Inferentia is optimized to maximize throughput for small batches, reducing latency by easing the load on the system.

    + +

    Parallelism

    + +

    Model parallelism on multi-cores also improves throughput and latency, which is crucial for our heavy workloads. Each Inferentia chip contains four NeuronCores that can either run separate models simultaneously or form a pipeline to stream a single model. In our use case, the data parallel configuration offers the highest throughput at the lowest cost, because it scales out concurrent processing requests.

    + +

    Data Parallel:

    + +

    + +

    + +

    Model Parallel:

    + +

    + +

    + +

    Monitoring

    + +

    It is critical that we monitor the accuracy of our inferences in production. Models that initially make good predictions can eventually degrade in deployment as they are exposed to a wider variety of data. This phenomenon, called model drift, usually occurs when the input data distributions or the prediction targets change.

    + +

    We use SageMaker Model Monitor to track parity between the training and production data. Model Monitor notifies us when predictions in production begin to deviate from the training and validation results. Thanks to this early warning, we can restore accuracy — by retraining the model if necessary — before our advertisers are affected. To track performance in real time, Model Monitor also sends us metrics about the quality of predictions, such as accuracy, F-scores, and the distribution of the predicted classes.

    + +

    To determine if our application needs to scale, TorchServe logs resource utilization metrics for the CPU, Memory, and Disk at regular intervals; it also records the number of requests received versus the number served. For custom metrics, TorchServe offers a Metrics API.

    + +

    A rewarding result

    + +

    Our DL models, developed in PyTorch and deployed on Inferentia, sped up our ads analysis while cutting costs. Starting with our first explorations in DL, programming in PyTorch felt natural. Its user-friendly features helped smooth the course from our early experiments to the deployment of our multimodal ensembles. PyTorch lets us prototype and build models quickly, which is vital as our advertising service evolves and expands. For an added benefit, PyTorch works seamlessly with Inferentia and our AWS ML stack. We look forward to building more use cases with PyTorch, so we can continue to serve our clients accurate, real-time results.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/amazon-sagemaker-w-torchserve/index.html b/blog/amazon-sagemaker-w-torchserve/index.html new file mode 100644 index 000000000000..23cc500fbc84 --- /dev/null +++ b/blog/amazon-sagemaker-w-torchserve/index.html @@ -0,0 +1,1162 @@ + + + + + + + + + + + + + Accelerate AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe, saving up to 75% on inference costs | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + James Wu, Ankith Gunapal, Li Ning, Subhash Talluri, and Saurabh Trikande + +

    +

    Multi-model endpoints (MMEs) are a powerful feature of Amazon SageMaker designed to simplify the deployment and operation of machine learning (ML) models. With MMEs, you can host multiple models on a single serving container and host all the models behind a single endpoint. The SageMaker platform automatically manages the loading and unloading of models and scales resources based on traffic patterns, reducing the operational burden of managing a large quantity of models. This feature is particularly beneficial for deep learning and generative AI models that require accelerated compute. The cost savings achieved through resource sharing and simplified model management makes SageMaker MMEs an excellent choice for you to host models at scale on AWS.

    + +

Recently, generative AI applications have captured widespread attention and imagination. Customers want to deploy generative AI models on GPUs but at the same time are conscious of costs. SageMaker MMEs support GPU instances and are a great option for these types of applications. Today, we are excited to announce TorchServe support for SageMaker MMEs. This new model server support gives you all the benefits of MMEs while still using the serving stack that TorchServe customers are most familiar with. In this post, we demonstrate how to host generative AI models, such as Stable Diffusion and Segment Anything Model, on SageMaker MMEs using TorchServe and build a language-guided editing solution that can help artists and content creators develop and iterate their artwork faster.

    + +

    Solution overview

    + +

    Language-guided editing is a common cross-industry generative AI use case. It can help artists and content creators work more efficiently to meet content demand by automating repetitive tasks, optimizing campaigns, and providing a hyper-personalized experience for the end customer. Businesses can benefit from increased content output, cost savings, improved personalization, and enhanced customer experience. In this post, we demonstrate how you can build language-assisted editing features using MME TorchServe that allow you to erase any unwanted object from an image and modify or replace any object in an image by supplying a text instruction.

    + +

    The user experience flow for each use case is as follows:

    + +
      +
• To remove an unwanted object, you select the object from the image to highlight it. This action sends the pixel coordinates and the original image to a generative AI model, which generates a segmentation mask for the object. After confirming the correct object selection, you can send the original and mask images to a second model for removal. The detailed illustration of this user flow is demonstrated below.
    • +
    + + + + + + + + + + + + +
    + +Dog on a bench with mouse pointer clicking the dog + + + +Dog on a bench highlighted + + + +A bench without the dog + +
    Step 1: Select an object (“dog”) from the image + Step 2: Confirm the correct object is highlighted + Step 3: Erase the object from the image +
    + +
      +
• To modify or replace an object, you select and highlight the desired object, following the same process as described above. Once you confirm the correct object selection, you can modify the object by supplying the original image, the mask, and a text prompt. The model will then change the highlighted object based on the provided instructions. A detailed illustration of this second user flow is as follows.
    • +
    + + + + + + + + + + + + +
    + +A vase with a cactus and mouse pointer + + + +A vase highlighted + + + +A rounded vase with a cactus + +
    Step 1: Select an object (“vase”) from the image + Step 2: Confirm the correct object is highlighted + Step 3: Provide a text prompt (“futuristic vase”) to modify the object +
    + +

To power this solution, we use three generative AI models: Segment Anything Model (SAM), Large Mask Inpainting Model (LaMa), and Stable Diffusion Inpaint (SD). Here is how these models are utilized in the user experience workflow:

    + + + + + + + + + + +
    To remove an unwanted object + To modify or replace an object +
    + +flow diagram + + + +flow diagram + +
    + +
      +
    1. Segment Anything Model (SAM) is used to generate a segment mask of the object of interest. Developed by Meta Research, SAM is an open-source model that can segment any object in an image. This model has been trained on a massive dataset known as SA-1B, which comprises over 11 million images and 1.1 billion segmentation masks. For more information on SAM, refer to their website and research paper.
    2. +
3. LaMa is used to remove any undesired objects from an image. LaMa is a Generative Adversarial Network (GAN) model that specializes in filling in missing parts of images using irregular masks. The model architecture incorporates image-wide global context and a single-step architecture that uses Fourier convolutions, enabling it to achieve state-of-the-art results at a faster speed. For more details on LaMa, visit their website and research paper.
    4. +
    5. SD 2 inpaint model from Stability AI is used to modify or replace objects in an image. This model allows us to edit the object in the mask area by providing a text prompt. The inpaint model is based on the text-to-image SD model, which can create high-quality images with a simple text prompt. It provides additional arguments such as original and mask images, allowing for quick modification and restoration of existing content. To learn more about Stable Diffusion models on AWS, refer to Create high-quality images with Stable Diffusion models and deploy them cost-efficiently with Amazon SageMaker.
    6. +
    + +

    All three models are hosted on SageMaker MMEs, which reduces the operational burden from managing multiple endpoints. In addition to that, using MME eliminates concerns about certain models being underutilized because resources are shared. You can observe the benefit from improved instance saturation, which ultimately leads to cost savings. The following architecture diagram illustrates how all three models are served using SageMaker MMEs with TorchServe.
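Once such an endpoint is running, a specific model archive can be invoked by passing TargetModel to the SageMaker runtime. The endpoint name and payload below are placeholders; the exact request format depends on your deployment and handler code:

import json
import boto3

runtime = boto3.client("sagemaker-runtime")
response = runtime.invoke_endpoint(
    EndpointName="torchserve-mme-genai",      # placeholder endpoint name
    TargetModel="sd.tar.gz",                  # which archive in the MME S3 prefix to route to
    ContentType="application/json",
    Body=json.dumps({"prompt": "futuristic vase"}),
)
print(response["Body"].read())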

    + +

    flow diagram

    + +

    We have published the code to implement this solution architecture in our GitHub repository. To follow along with the rest of the post, use the notebook file. It is recommended to run this example on a SageMaker notebook instance using the conda_python3 (Python 3.10.10) kernel.

    + +

    Extend the TorchServe container

    + +

    The first step is to prepare the model hosting container. SageMaker provides a managed PyTorch Deep Learning Container (DLC) that you can retrieve using the following code snippet:

    + +
    # Use SageMaker PyTorch DLC as base image
    +baseimage = sagemaker.image_uris.retrieve(
    +    framework="pytorch",
    +    region=region,
    +    py_version="py310",
    +    image_scope="inference",
    +    version="2.0.0",
    +    instance_type="ml.g5.2xlarge",
    +)
    +print(baseimage)
    +
    + +

Because the models require resources and additional packages that are not on the base PyTorch DLC, you need to build a Docker image. This image is then uploaded to Amazon Elastic Container Registry (Amazon ECR) so we can access it directly from SageMaker. The custom-installed libraries are listed in the Docker file:

    + +
    ARG BASE_IMAGE
    +
    +FROM $BASE_IMAGE
    +
    +#Install any additional libraries
    +RUN pip install segment-anything-py==1.0
    +RUN pip install opencv-python-headless==4.7.0.68
    +RUN pip install matplotlib==3.6.3
    +RUN pip install diffusers
    +RUN pip install tqdm
    +RUN pip install easydict
    +RUN pip install scikit-image
    +RUN pip install xformers
    +RUN pip install tensorflow
    +RUN pip install joblib
    +RUN pip install matplotlib
    +RUN pip install albumentations==0.5.2
    +RUN pip install hydra-core==1.1.0
    +RUN pip install pytorch-lightning
    +RUN pip install tabulate
    +RUN pip install kornia==0.5.0
    +RUN pip install webdataset
    +RUN pip install omegaconf==2.1.2
    +RUN pip install transformers==4.28.1
    +RUN pip install accelerate
    +RUN pip install ftfy
    +
    + +

    Run the shell command file to build the custom image locally and push it to Amazon ECR:

    + +
    %%capture build_output
    +
    +reponame = "torchserve-mme-demo"
    +versiontag = "genai-0.1"
    +
    +# Build our own docker image
    +!cd workspace/docker && ./build_and_push.sh {reponame} {versiontag} {baseimage} {region} {account}
    +
    + +

    Prepare the model artifacts

    + +

    The main difference for the new MMEs with TorchServe support is how you prepare your model artifacts. The code repo provides a skeleton folder for each model (models folder) to house the required files for TorchServe. We follow the same four-step process to prepare each model .tar file. The following code is an example of the skeleton folder for the SD model:

    + +
    workspace
    +|--sd
    +   |-- custom_handler.py
    +   |-- model-config.yaml
    +
    + +

    The first step is to download the pre-trained model checkpoints in the models folder:

    + +
    import diffusers
    +import torch
    +import transformers
    +
    +pipeline = diffusers.StableDiffusionInpaintPipeline.from_pretrained(
    +    "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16
    +)
    +
    +sd_dir = "workspace/sd/model"
    +pipeline.save_pretrained(sd_dir)
    +
    + +

    The next step is to define a custom_handler.py file. This is required to define the behavior of the model when it receives a request, such as loading the model, preprocessing the input, and postprocessing the output. The handle method is the main entry point for requests, and it accepts a request object and returns a response object. It loads the pre-trained model checkpoints and applies the preprocess and postprocess methods to the input and output data. The following code snippet illustrates a simple structure of the custom_handler.py file. For more detail, refer to the TorchServe handler API.

    + +
from ts.context import Context
+
+class CustomHandler:
+    def initialize(self, ctx: Context):
+        # Load the pre-trained model checkpoints onto the worker
+        ...
+
+    def preprocess(self, data):
+        # Decode the request payload into model inputs
+        ...
+
+    def inference(self, data):
+        # Run the forward pass and post-process the outputs
+        ...
+
+    def handle(self, data, context):
+        requests = self.preprocess(data)
+        responses = self.inference(requests)
+
+        return responses
    +
    + +

    The last required file for TorchServe is model-config.yaml. The file defines the configuration of the model server, such as number of workers and batch size. The configuration is at a per-model level, and an example config file is shown in the following code. For a complete list of parameters, refer to the GitHub repo.

    + +
    minWorkers: 1
    +maxWorkers: 1
    +batchSize: 1
    +maxBatchDelay: 200
    +responseTimeout: 300
    +
    + +

    The final step is to package all the model artifacts into a single .tar.gz file using the torch-model-archiver module:

    + +
!torch-model-archiver --model-name sd --version 1.0 --handler workspace/sd/custom_handler.py --extra-files workspace/sd/model --config-file workspace/sd/model-config.yaml --archive-format no-archive
+!cd sd && tar cvzf sd.tar.gz .
    +
    + +

    Create the multi-model endpoint

    + +

The steps to create a SageMaker MME are the same as before. In this particular example, you spin up an endpoint using the SageMaker SDK. Start by defining an Amazon Simple Storage Service (Amazon S3) location and the hosting container. This S3 location is where SageMaker dynamically loads the models based on invocation patterns. The hosting container is the custom container you built and pushed to Amazon ECR in the earlier step. See the following code:

    + +
    # This is where our MME will read models from on S3.
    +multi_model_s3uri = output_path
    +
    + +

Then you define a MultiDataModel that captures attributes such as the model location, hosting container, and permission access:

    + +
    print(multi_model_s3uri)
    +model = Model(
    +    model_data=f"{multi_model_s3uri}/sam.tar.gz",
    +    image_uri=container,
    +    role=role,
    +    sagemaker_session=smsess,
    +    env={"TF_ENABLE_ONEDNN_OPTS": "0"},
    +)
    +
    +mme = MultiDataModel(
    +    name="torchserve-mme-genai-" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
    +    model_data_prefix=multi_model_s3uri,
    +    model=model,
    +    sagemaker_session=smsess,
    +)
    +print(mme)
    +
    + +

    The deploy() function creates an endpoint configuration and hosts the endpoint:

    + +
    mme.deploy(
    +    initial_instance_count=1,
    +    instance_type="ml.g5.2xlarge",
    +    serializer=sagemaker.serializers.JSONSerializer(),
    +    deserializer=sagemaker.deserializers.JSONDeserializer(),
    +)
    +
    + +

    In the example we provided, we also show how you can list models and dynamically add new models using the SDK. The add_model() function copies your local model .tar files into the MME S3 location:

    + +
    # Only sam.tar.gz visible!
    +list(mme.list_models())
    +
    +models = ["sd/sd.tar.gz", "lama/lama.tar.gz"]
    +for model in models:
    +    mme.add_model(model_data_source=model)
    +
    + +

    Invoke the models

    + +

Now that we have all three models hosted on an MME, we can invoke each model in sequence to build our language-assisted editing features. To invoke a specific model, provide the target_model parameter in the predictor.predict() function. The model name is simply the name of the model .tar file we uploaded. The following example code snippet for the SAM model takes in a pixel coordinate, a point label, and a dilation kernel size, and generates a segmentation mask of the object at that pixel location:

    + +
    img_file = "workspace/test_data/sample1.png"
    +img_bytes = None
    +
    +with Image.open(img_file) as f:
    +    img_bytes = encode_image(f)
    +
    +gen_args = json.dumps(dict(point_coords=[750, 500], point_labels=1, dilate_kernel_size=15))
    +
    +payload = json.dumps({"image": img_bytes, "gen_args": gen_args}).encode("utf-8")
    +
    +response = predictor.predict(data=payload, target_model="/sam.tar.gz")
    +encoded_masks_string = json.loads(response.decode("utf-8"))["generated_image"]
    +base64_bytes_masks = base64.b64decode(encoded_masks_string)
    +
    +with Image.open(io.BytesIO(base64_bytes_masks)) as f:
    +    generated_image_rgb = f.convert("RGB")
    +    generated_image_rgb.show()
    +
    + +

    To remove an unwanted object from an image, take the segmentation mask generated from SAM and feed that into the LaMa model with the original image. The following images show an example.

[Images: a sample image of a dog on a bench; the segmentation mask of the dog from SAM; the result with the dog erased using LaMa]
    + +
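To erase the object, you call the LaMa archive through the same endpoint by changing the target_model value. The snippet below is an illustrative sketch rather than the exact notebook code: it assumes the LaMa custom handler accepts the original image and the SAM mask as base64-encoded strings under the keys image and mask, and returns the inpainted result under generated_image; adjust the payload to match whatever your handler actually expects.

+ +
# Hypothetical payload layout -- align the keys with your LaMa custom handler
+lama_payload = json.dumps({"image": img_bytes, "mask": encoded_masks_string}).encode("utf-8")
+
+response = predictor.predict(data=lama_payload, target_model="/lama.tar.gz")
+erased_image_string = json.loads(response.decode("utf-8"))["generated_image"]
+
+with Image.open(io.BytesIO(base64.b64decode(erased_image_string))) as f:
+    f.convert("RGB").show()
+
+ +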

To modify or replace any object in an image with a text prompt, take the segmentation mask from SAM and feed it into the SD model along with the original image and text prompt, as shown in the following example.

[Images: a sample image of a dog on a bench; the segmentation mask of the dog from SAM; the result with the dog replaced by a hamster using the SD model and the text prompt “a hamster on a bench”]
    + +
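The SD model is called the same way, again as a sketch: it assumes the SD Inpainting handler, like the SAM handler above, reads a gen_args JSON string, and that it accepts the mask and a text prompt; the key names prompt and mask here are hypothetical and must match your custom handler.

+ +
# Hypothetical keys -- align with your SD Inpainting custom handler
+sd_gen_args = json.dumps(dict(prompt="a hamster on a bench"))
+sd_payload = json.dumps({"image": img_bytes, "mask": encoded_masks_string, "gen_args": sd_gen_args}).encode("utf-8")
+
+response = predictor.predict(data=sd_payload, target_model="/sd.tar.gz")
+inpainted_string = json.loads(response.decode("utf-8"))["generated_image"]
+
+with Image.open(io.BytesIO(base64.b64decode(inpainted_string))) as f:
+    f.convert("RGB").show()
+
+ +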

    Cost savings

    + +

    The benefits of SageMaker MMEs increase based on the scale of model consolidation. The following table shows the GPU memory usage of the three models in this post. They are deployed on one g5.2xlarge instance by using one SageMaker MME.

Model                          GPU Memory (MiB)
Segment Anything Model         3,362
Stable Diffusion Inpainting    3,910
LaMa                           852
    + +

    You can see cost savings when hosting the three models with one endpoint, and for use cases with hundreds or thousands of models, the savings are much greater.

    + +

For example, consider 100 Stable Diffusion models. Each model on its own (consuming roughly 4 GiB of memory) could be served by its own ml.g5.2xlarge endpoint, costing $1.52 per instance hour in the US East (N. Virginia) Region. To provide all 100 models on their own endpoints would cost $218,880 per month. With a SageMaker MME, a single endpoint using ml.g5.2xlarge instances can host four models simultaneously. This reduces production inference costs by 75% to only $54,720 per month. The following table summarizes the differences between single-model and multi-model endpoints for this example. Given an endpoint configuration with sufficient memory for your target models, steady-state invocation latency after all models have been loaded will be similar to that of a single-model endpoint.

                                    Single-model endpoint    Multi-model endpoint
Total endpoint price per month      $218,880                 $54,720
Endpoint instance type              ml.g5.2xlarge            ml.g5.2xlarge
CPU memory capacity (GiB)           32                       32
GPU memory capacity (GiB)           24                       24
Endpoint price per hour             $1.52                    $1.52
Number of instances per endpoint    2                        2
Endpoints needed for 100 models     100                      25
    + +
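As a sanity check on these figures, and assuming roughly 720 instance hours per month (an assumption not stated in the table): 100 single-model endpoints × 2 instances × $1.52 per hour × 720 hours ≈ $218,880 per month, whereas 25 multi-model endpoints × 2 instances × $1.52 per hour × 720 hours ≈ $54,720 per month, which is the 75% reduction quoted above.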

    Clean up

    + +

    After you are done, please follow the instructions in the cleanup section of the notebook to delete the resources provisioned in this post to avoid unnecessary charges. Refer to Amazon SageMaker Pricing for details on the cost of the inference instances.

    + +

    Conclusion

    + +

    This post demonstrates the language-assisted editing capabilities made possible through the use of generative AI models hosted on SageMaker MMEs with TorchServe. The example we shared illustrates how we can use resource sharing and simplified model management with SageMaker MMEs while still utilizing TorchServe as our model serving stack. We utilized three deep learning foundation models: SAM, SD 2 Inpainting, and LaMa. These models enable us to build powerful capabilities, such as erasing any unwanted object from an image and modifying or replacing any object in an image by supplying a text instruction. These features can help artists and content creators work more efficiently and meet their content demands by automating repetitive tasks, optimizing campaigns, and providing a hyper-personalized experience. We invite you to explore the example provided in this post and build your own UI experience using TorchServe on a SageMaker MME.

    + +

    To get started, see Supported algorithms, frameworks, and instances for multi-model endpoints using GPU backed instances.

    + +
    + +

    About the authors

+James Wu + James Wu is a Senior AI/ML Specialist Solution Architect at AWS, helping customers design and build AI/ML solutions. James’s work covers a wide range of ML use cases, with a primary interest in computer vision, deep learning, and scaling ML across the enterprise. Prior to joining AWS, James was an architect, developer, and technology leader for over 10 years, including 6 years in engineering and 4 years in the marketing and advertising industries. +
    +Li Ning + + +Li Ning is a senior software engineer at AWS with a specialization in building large-scale AI solutions. As a tech lead for TorchServe, a project jointly developed by AWS and Meta, her passion lies in leveraging PyTorch and AWS SageMaker to help customers embrace AI for the greater good. Outside of her professional endeavors, Li enjoys swimming, traveling, following the latest advancements in technology, and spending quality time with her family. +
    +Ankith Gunapal + +Ankith Gunapal is an AI Partner Engineer at Meta (PyTorch). He is passionate about model optimization and model serving, with experience ranging from RTL verification, embedded software, computer vision, to PyTorch. He holds a Master’s in Data Science and a Master’s in Telecommunications. Outside of work, Ankith is also an electronic dance music producer. + +
    +Saurabh Trikande + +Saurabh Trikande is a Senior Product Manager for Amazon SageMaker Inference. He is passionate about working with customers and is motivated by the goal of democratizing machine learning. He focuses on core challenges related to deploying complex ML applications, multi-tenant ML models, cost optimizations, and making deployment of deep learning models more accessible. In his spare time, Saurabh enjoys hiking, learning about innovative technologies, following TechCrunch and spending time with his family. + +
    +Subhash Talluri + +Subhash Talluri is a Lead AI/ML solutions architect of the Telecom Industry business unit at Amazon Web Services. He’s been leading development of innovative AI/ML solutions for Telecom customers and partners worldwide. He brings interdisciplinary expertise in engineering and computer science to help build scalable, secure, and compliant AI/ML solutions via cloud-optimized architectures on AWS. + +
    + +
diff --git a/blog/ambient-clinical-intelligence-generating-medical-reports-with-pytorch/index.html b/blog/ambient-clinical-intelligence-generating-medical-reports-with-pytorch/index.html
new file mode 100644
index 000000000000..960dd01cc6a3
--- /dev/null
+++ b/blog/ambient-clinical-intelligence-generating-medical-reports-with-pytorch/index.html
@@ -0,0 +1,911 @@

Ambient Clinical Intelligence: Generating Medical Reports with PyTorch | PyTorch

    + by + + Miguel Del-Agua, Principal Research Scientist, Nuance and Jeremy Jancsary, Senior Principal Research Scientist, Nuance + +

    +

    Introduction

    + +

    Complete and accurate clinical documentation is an essential tool for tracking patient care. It allows for treatment plans to be shared among care teams to aid in continuity of care and ensures a transparent and effective process for reimbursement.

    + +

Physicians are responsible for documenting patient care. Traditional clinical documentation methods have resulted in a sub-par patient-provider experience, less time interacting with patients, and decreased work-life balance. A significant amount of physicians’ time is spent in front of the computer doing administrative tasks. As a result, patients are less satisfied with the overall experience, and physicians, who spend years studying medicine, cannot practice at the top of their license and are burned out. Every hour of direct clinical face time with patients results in nearly two additional hours spent on EHR and desk work within the clinic day. Outside office hours, physicians spend another 1 to 2 hours of personal time each night doing additional computer and other clerical work.

    + + + +

    Physician burnout is one of the primary causes for increased medical errors, malpractice suits, turnover, and decreased access to care. Burnout leads to an increase in healthcare costs and a decrease in overall patient satisfaction. Burnout costs the United States $4.6 billion a year.

    + +

    What can we do to bring back trust, joy, and humanity to the delivery of healthcare? A significant portion of the administrative work consists of entering patient data into Electronic Health Records (EHRs) and creating clinical documentation. Clinical documentation is created from information already in the EHR as well as from the patient-provider encounter conversation.

    + +

    This article will showcase how the Nuance Dragon Ambient eXperience (DAX), an AI-powered, voice-enabled, ambient clinical intelligence solution, automatically documents patient encounters accurately and efficiently at the point of care and the technologies that enable it.

    + +

    Nuance DAX enhances the quality of care and patient experience, increases provider efficiency and satisfaction, and improves financial outcomes. It can be used in office and telehealth settings in all ambulatory specialties, including primary and urgent care.

    + +

    + +

    + +

    Natural Language Processing

    + +

    Natural Language Processing (NLP) is one of the most challenging fields in Artificial Intelligence (AI). It comprehends a set of algorithms that allow computers to understand or generate the language used by humans. These algorithms can process and analyze vast amounts of natural language data from different sources (either sound or text) to build models that can understand, classify, or even generate natural language as humans would. Like other fields in AI, NLP has significantly progressed thanks to the advent of Deep Learning (DL), which has resulted in models that can obtain results on par with humans in some tasks.

    + +

    These advanced NLP techniques are being applied in healthcare. During a typical patient-provider encounter, a conversation ensues where the doctor constructs, through questions and answers, a chronological description of the development of the patient’s presenting illness or symptoms. A physician examines the patient and makes clinical decisions to establish a diagnosis and determine a treatment plan. This conversation, and data in the EHR, provide the required information for physicians to generate the clinical documentation, referred to as medical reports.

    + +

    Two main NLP components play a role in automating the creation of clinical documentation. The first component, Automatic Speech Recognition (ASR), is used to translate speech into text. It takes the audio recording of the encounter and generates a conversation transcription (cf. Figure 2). The second component, Automatic Text Summarization, helps generate summaries from large text documents. This component is responsible for understanding and capturing the nuances and most essential aspects from the transcribed conversation into a final report in narrative form (cf. Figure 3), structured form, or a combination of both.

    + +

    We will focus on this second component, Automatic Text Summarization, which is a difficult task with many challenges:

    + +
      +
    • Its performance is tied to the ASR quality from multiple speakers (noisy input).
    • +
    • The input is conversational in nature and contains layman’s terms.
    • +
    • Protected Health Information (PHI) regulations limit medical data access.
    • +
    • The information for one output sentence is potentially spread across multiple conversation turns.
    • +
    • There is no explicit sentence alignment between input and output.
    • +
    • Various medical specialties, encounter types, and EHR systems constitute a broad and complex output space.
    • +
    • Physicians have different styles of conducting encounters and have their preferences for medical reports; there is no standard.
    • +
    • Standard summarization metrics might differ from human judgment of quality.
    • +
    + +

    + +

    + +

    +Figure 2: Transcript of a patient-doctor conversation +

    + +

    + +

    + +

    +Figure 3: Excerpt of an AI-generated medical report. HPI stands for History of present illness. +

    + +

    Text Summarization with PyTorch and Fairseq

    + +

    PyTorch is an open-source machine learning framework developed by Facebook that helps researchers prototype Deep Learning models. The Fairseq toolkit is built on top of PyTorch and focuses on sequence generation tasks, such as Neural Machine Translation (NMT) or Text Summarization. Fairseq features an active community that is continuously providing reference implementations of state-of-the-art models. It contains many built-in components (model architectures, modules, loss functions, and optimizers) and is easily extendable with plugins.

    + +

Text summarization constitutes a significant challenge in NLP. We need models capable of generating a short version of a document while retaining the key points and avoiding uninformative content. These challenges can be addressed with different approaches: (1) abstractive text summarization, which trains models that generate a summary in narrative form; (2) extractive methods, where models are trained to select the most important parts of the input text; and (3) a combination of the two, where the essential parts of the input are selected and then summarized in an abstractive fashion. Hence, summarization can be accomplished via a single end-to-end network or as a pipeline of extractive and abstractive components. To that end, Fairseq provides all the necessary tools to be successful in our endeavor. It features end-to-end models such as the classical Transformer, different types of language models, and pre-trained versions that enable researchers to focus on what matters most: building state-of-the-art models that generate valuable reports.

    + +

    However, we are not just summarizing the transcribed conversation; we generate high-quality medical reports, which have many considerations.

    + +
      +
    • Every section of a medical report is different in terms of content, structure, fluency, etc.
    • +
    • All medical facts mentioned in the conversation should be present in the report, for example, a particular treatment or dosage.
    • +
    • In the healthcare domain, the vocabulary is extensive, and models need to deal with medical terminology.
    • +
    • Patient-doctor conversations are usually much longer than the final report.
    • +
    + +

    All these challenges require our researchers to run a battery of extensive experiments. Thanks to the flexibility of PyTorch and Fairseq, their productivity has greatly increased. Further, the ecosystem offers an easy path from ideation, implementation, experimentation, and final roll-out to production. Using multiple GPUs or CPUs is as simple as providing an additional argument to the tools, and because of the tight Python integration, PyTorch code can be easily debugged.

    + +

In our continuous effort to contribute to the open-source community, features have been developed at Nuance and pushed to the Fairseq GitHub repository. These features try to overcome some of the challenges mentioned above, such as facilitating the copying of rare or unseen words from the input to the summary, speeding up training by improving Tensor Core utilization, and ensuring TorchScript compatibility of different Transformer configurations. In the following, we show an example of how to train a Transformer model with a Pointer Generator mechanism (Transformer-PG), which can copy words from the input.

    + +

    How to build a Transformer model with a Pointer Generator mechanism

    + +

    In this step-by-step guide, it is assumed the user has already installed PyTorch and Fairseq.

    + +

    1. Create a vocabulary and extend it with source position markers:

    + +

    These markers will allow the model to point to any word in the input sequence.

    + +
    vocab_size=<vocab_size>
    +position_markers=512
    +export LC_ALL=C
    +cat train.src train.tgt |
    +  tr -s '[:space:]' '\n' |
    +  sort |
    +  uniq -c |
    +  sort -k1,1bnr -k2 |
    +  head -n "$((vocab_size - 4))" |
    +  awk '{ print $2 " " $1 }' > dict.pg.txt
    +python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >> dict.pg.txt
    +
    + +

    This will create a file “dict.pg.txt” that contains the <vocab_size> most frequent words followed by 512 position markers named from “<unk-0>” to “<unk-511>”.

    + +

    In case we have an input like

    + +
    src = "Hello, I'm The Dogtor"
    +
    + +

    it could happen that our model has been trained without the word “Dogtor” in its vocabulary. Therefore, when we feed this sequence into the model, it should be converted to:

    + +
    src = "Hello, I'm The <unk-3>"
    +
    + +

    Now, “<unk-3>” is part of our vocabulary and could be predicted by the model (this is where the pointer-generator comes in). In such a case, we will only need to post-process the output to replace “<unk-3>” by the word at input position 3.

    + +
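The postprocess.py script used in step 6 performs this replacement; as a rough illustration of the idea (assuming whitespace tokenization and the <unk-N> naming shown above), the substitution can be sketched in plain Python:

+ +
import re
+
+def replace_position_markers(source: str, hypothesis: str) -> str:
+    # Map each "<unk-N>" back to the N-th whitespace-separated token of the source
+    src_tokens = source.split()
+
+    def substitute(match):
+        position = int(match.group(1))
+        return src_tokens[position] if position < len(src_tokens) else match.group(0)
+
+    return re.sub(r"<unk-(\d+)>", substitute, hypothesis)
+
+# Prints: "Hi Dogtor, how is your wrist?"
+print(replace_position_markers("Hello, I'm The Dogtor", "Hi <unk-3>, how is your wrist?"))
+
+ +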

2. Preprocess the text data to replace unknown words with their positional markers:

    + +

    We can use the scripts from https://github.com/pytorch/fairseq/tree/master/examples/pointer_generator.

    + +
    # Considering we have our data in:
    +# train_src = /path/to/train.src
    +# train_tgt = /path/to/train.tgt
    +# valid_src = /path/to/valid.src
    +# valid_tgt = /path/to/valid.tgt
    +./preprocess.py --source /path/to/train.src \
    +                --target /path/to/train.tgt \
    +                --vocab <(cut -d' ' -f1 dict.pg.txt) \
    +                --source-out /path/to/train.pg.src \
    +                --target-out /path/to/train.pg.tgt
    +
    +./preprocess.py --source /path/to/valid.src \
    +                --target /path/to/valid.tgt \
    +                --vocab <(cut -d' ' -f1 dict.pg.txt) \
    +                --source-out /path/to/valid.pg.src \
    +                --target-out /path/to/valid.pg.tgt
    +
    +./preprocess.py --source /path/to/test.src \
    +                --vocab <(cut -d' ' -f1 dict.pg.txt) \
    +                --source-out /path/to/test.pg.src
    +
    + +

    3. Now let’s binarize the data, so that it can be processed faster:

    + +
    fairseq-preprocess --task "translation" \
    +                   --source-lang "pg.src" \
    +                   --target-lang "pg.tgt" \
    +                   --trainpref /path/to/train \
    +                   --validpref /path/to/valid \
    +                   --srcdict dict.pg.txt \
    +                   --cpu \
    +                   --joined-dictionary \
    +                   --destdir <data_dir>
    +
    + +

    You might notice the type of task is “translation”. This is because there is no “summarization” task available; we could understand it as a kind of NMT task where the input and output languages are shared and the output (summary) is shorter than the input.

    + +

    4. Now we can train the model:

    + +
    fairseq-train <data_dir> \
    +              --save-dir <model_dir> \
    +              --task "translation" \
    +              --source-lang "src" \
    +              --target-lang "tgt" \
    +              --arch "transformer_pointer_generator" \
    +              --max-source-positions 512 \
    +              --max-target-positions 128 \
    +              --truncate-source \
    +              --max-tokens 2048 \
    +              --required-batch-size-multiple 1 \
    +              --required-seq-len-multiple 8 \
    +              --share-all-embeddings \
    +              --dropout 0.1 \
    +              --criterion "cross_entropy" \
    +              --optimizer adam \
    +              --adam-betas '(0.9, 0.98)' \
    +              --adam-eps 1e-9 \
    +              --update-freq 4 \
    +              --lr 0.004 \
    +              # Pointer Generator
    +              --alignment-layer -1 \
    +              --alignment-heads 1 \
    +              --source-position-markers 512
    +
    + +

    This configuration makes use of features Nuance has contributed back to Fairseq:

    + +
      +
    • Transformer with a Pointer Generator mechanism to facilitate copying of words from the input.
    • +
    • Sequence length padded to a multiple of 8 to better use tensor cores and reduce training time.
    • +
    + +

    5. Now let’s take a look at how to generate a summary with our new medical report generation system:

    + +
    import torch
    +from examples.pointer_generator.pointer_generator_src.transformer_pg import TransformerPointerGeneratorModel
    +
    +# Patient-Doctor conversation
    +input = "[doctor] Lisa Simpson, thirty six year old female, presents to the clinic today because " \
    +        "she has severe right wrist pain"
    +
    +# Load the model
    +model = TransformerPointerGeneratorModel.from_pretrained(data_name_or_path=<data_dir>,
    +                                                         model_name_or_path=<model_dir>,
    +                                                         checkpoint_file="checkpoint_best.pt")
    +
    +result = model.translate([input], beam=2)
    +
    +print(result[0])
    +Ms. <unk-2> is a 36-year-old female who presents to the clinic today for evaluation of her right wrist.
    +
    + +

6. Alternatively, we can use fairseq-interactive and a postprocessing tool to substitute positional unknown tokens with the corresponding words from the input:

    + +
    fairseq-interactive <data_dir> \
    +              --batch-size <batch_size> \
    +              --task translation \
    +              --source-lang src \
    +              --target-lang tgt \
    +              --path <model_dir>/checkpoint_last.pt \
    +              --input /path/to/test.pg.src \
    +              --buffer-size 20 \
    +              --max-len-a 0 \
    +              --max-len-b 128 \
    +              --beam 2 \
    +              --skip-invalid-size-inputs-valid-test | tee generate.out
    +
    +grep "^H-" generate.out | cut -f 3- > generate.hyp
    +
    +./postprocess.py \
    +	--source <(awk 'NF<512' /path/to/test.pg.src) \
    +	--target generate.hyp \
    +	--target-out generate.hyp.processed
    +
    + +

    Now we have the final set of reports in “generate.hyp.processed”, with “<unk-N>” replaced by the original word from the input sequence.

    + +

    Model Deployment

    + +

    PyTorch offers great flexibility in modeling and a rich surrounding ecosystem. However, while several recent articles have suggested that the use of PyTorch in research and academia may be close to surpassing TensorFlow, there seems to be an overall sense of TensorFlow being the preferred platform for deployment to production. Is this still the case in 2021? Teams looking to serve their PyTorch models in production have a few options.

    + +

    Before describing our journey, let’s take a brief detour and define the term model.

    + +

    Models as computation graphs

    + +

    A few years back, it was still common for machine learning toolkits to support only particular classes of models of a rather fixed and rigid structure, with only a few degrees of freedom (like the kernel of a support vector machine or the number of hidden layers of a neural network). Inspired by foundational work in Theano, toolkits like Microsoft’s CNTK or Google’s TensorFlow were among the first to popularize a more flexible view on models, as computation graphs with associated parameters that can be estimated from data. This view blurred the boundaries between popular types of models (such as DNNs or SVMs), as it became easy to blend the characteristics of each into your type of graph. Still, such a graph had to be defined upfront before estimating its parameters, and it was pretty static. This made it easy to save models to a self-contained bundle, like a TensorFlow SavedModel (such a bundle simply contains the structure of the graph, as well as the concrete values of the estimated parameters). However, debugging such models can be difficult because the statements in the Python code that build the graph are logically separate from the lines that execute it. Researchers also long for easier ways of expressing dynamic behavior, such as the computation steps of the forward pass of a model being conditionally dependent on its input data (or its previous output).

    + +

    Most recently, the above limitations have led to a second revolution spearheaded by PyTorch and TensorFlow 2. The computation graph is no longer defined explicitly. Instead, it will be populated implicitly as the Python code executes operations on tensor arguments. An essential technique that powers this development is automatic differentiation. As the computation graph is being built implicitly while executing the steps of the forward pass, all the necessary data will be tracked for later computation of the gradient concerning the model parameters. This allows for great flexibility in training a model, but it raises an important question. If the computation happening inside a model is only implicitly defined through our Python code’s steps as it executes concrete data, what is it that we want to save as a model? The answer – at least initially – was the Python code with all its dependencies, along with the estimated parameters. This is undesirable for practical reasons. For instance, there is a danger that the team working on model deployment does not exactly reproduce the Python code dependencies used during training, leading to subtly divergent behavior. The solution typically consists of combining two techniques, scripting and tracing, that is, extra annotations in your Python code and execution of your code on exemplary input data, allowing PyTorch to define and save the graph that should be executed during later inference on new, unseen data. This requires some discipline by whoever creates the model code (arguably voiding some of the original flexibility of eager execution), but it results in a self-contained model bundle in TorchScript format. The solution in TensorFlow 2 is remarkably similar.
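As a minimal illustration of these two techniques (a generic sketch, not code from the report generation system), the snippet below traces a small module on example inputs and scripts a module whose forward pass contains data-dependent control flow, then saves the resulting self-contained TorchScript bundles:

+ +
import torch
+import torch.nn as nn
+
+class Gate(nn.Module):
+    def forward(self, x):
+        # Data-dependent control flow: better captured by scripting than by tracing
+        if x.sum() > 0:
+            return x * 2
+        return x - 1
+
+linear = nn.Linear(4, 2)
+traced = torch.jit.trace(linear, torch.randn(1, 4))  # tracing: record the ops run on an example input
+scripted = torch.jit.script(Gate())                  # scripting: compile the Python code itself
+
+traced.save("linear_traced.pt")
+scripted.save("gate_scripted.pt")
+reloaded = torch.jit.load("gate_scripted.pt")        # usable later without the original Python class
+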

    + +

    Serving our report generation models

    + +

Our journey in deploying the report generation models reflects the above discussion. We started out serving our models by deploying the model code and its dependencies along with the parameter checkpoints in a custom Docker image exposing a gRPC service interface. However, we soon noticed that it became error-prone to replicate the exact code and environment used by the modeling team while estimating the parameters. Moreover, this approach prevented us from leveraging high-performance model serving frameworks like NVIDIA’s Triton, which is written in C++ and requires self-contained models that can be used without a Python interpreter. At this stage, we were facing a choice between attempting to export our PyTorch models to ONNX or TorchScript format. ONNX is an open specification for representing machine learning models that increasingly finds adoption. It is powered by a high-performance runtime developed by Microsoft (ONNX Runtime). While we were able to achieve performance acceleration for our TensorFlow BERT-based model using ONNX Runtime, at the time one of our PyTorch models required some operators that weren’t yet supported in ONNX. Rather than implement these using custom operators, we decided to look into TorchScript for the time being.

    + +

    A maturing ecosystem

    + +

Is it all roses? No, it has been a rockier journey than we expected. We encountered what seems to be a memory leak in the MKL libraries used by PyTorch while serving the PyTorch code directly. We encountered deadlocks when trying to load multiple models from multiple threads. We had difficulties exporting our models to ONNX and TorchScript formats. Models would not work out of the box on hardware with multiple GPUs; they always accessed the particular GPU device on which they were exported. We encountered excessive memory usage in the Triton inference server while serving TorchScript models, which we found out was due to automatic differentiation accidentally being enabled during the forward pass. However, the ecosystem keeps improving, and there is a helpful and vibrant open-source community eager to work with us to mitigate such issues.

    + +

    Where to go from here? For those that require the flexibility of serving PyTorch code directly, without going through the extra step of exporting self-contained models, it is worth pointing out that the TorchServe project now provides a way of bundling the code together with parameter checkpoints into a single servable archive, greatly reducing the risk of code and parameters running apart. To us, however, exporting models to TorchScript has proven beneficial. It provides a clear interface between modeling and deployment teams, and TorchScript further reduces the latency when serving models on GPU via its just-in-time compilation engine.

    + +

    Scaling at large and the future

    + +

Finally, efficient deployment to the cloud is about more than just computing the response of a single model instance efficiently. Flexibility is needed in managing, versioning, and updating models. High-level scalability must be achieved via techniques such as load balancing, horizontal scaling, and vertical scaling. If many models are involved, scale-to-zero quickly becomes a topic, as it is unacceptable to pay for serving models that do not answer any requests. Providing such extra functionality on top of a low-level inference server like Triton is the job of an orchestration framework. To that end, after gaining some initial experience with KubeFlow, we decided to turn our attention to Azure ML, which provides similar functionality but integrates more deeply with the Azure platform, on which we crucially rely for large parts of our technology stack already. This part of our journey has just begun.

    + +

    Conclusion

    + +

    Academia has long recognized that we are “standing on the shoulders of giants.” As Artificial Intelligence is maturing from a scientific discipline into technology, the same spirit of collaboration that originally fueled its scientific foundation has carried over into the world of software engineering. Open-source enthusiasts join technology companies worldwide to build open software ecosystems that allow for new angles at solving some of the most pressing challenges of modern society. In this article, we’ve taken a look at Nuance’s Dragon Ambient eXperience, an AI-powered, voice-enabled solution that automatically documents patient care, reducing healthcare providers’ administrative burdens. Nuance DAX improves the patient-provider experience, reduces physician burnout, and improves financial outcomes. It brings back trust, joy, and humanity to the delivery of healthcare. Fairseq and PyTorch have proven to be an incredible platform for powering this AI technology, and in turn, Nuance has contributed back some of its innovations in this space. For further reading, we invite you to take a look at our recent ACL publication and the Nuance “What’s Next” blog.

    + +
diff --git a/blog/amd-extends-support-for-pt-ml/index.html b/blog/amd-extends-support-for-pt-ml/index.html
new file mode 100644
index 000000000000..eb2dc369678f
--- /dev/null
+++ b/blog/amd-extends-support-for-pt-ml/index.html
@@ -0,0 +1,697 @@

AMD Extends Support for PyTorch Machine Learning Development on Select RDNA™ 3 GPUs with ROCm™ 5.7 | PyTorch
    + by + + AMD + +

    +

    Researchers and developers working with Machine Learning (ML) models and algorithms using PyTorch can now use AMD ROCm 5.7 on Ubuntu® Linux® to tap into the parallel computing power of the Radeon™ RX 7900 XTX and the Radeon™ PRO W7900 graphics cards which are based on the AMD RDNA™ 3 GPU architecture.

    + +

    A client solution built on these two high-end GPUs enables a local, private, and cost-effective workflow for ML training and inference for those who previously relied on cloud-based solutions alone.

    + +

    ML Development on Desktop

    + +

Accelerate Machine Learning With PyTorch On Your Desktop

    + +
      +
    • A local PC or workstation system running PyTorch with a Radeon 7900 series GPU presents a capable, yet affordable solution to address these growing workflow challenges thanks to large GPU memory sizes of 24GB and even 48GB.
    • +
    + +

    Unified Software Stack For The Desktop And The Datacenter

    + +
      +
    • The latest AMD ROCm 5.7 software stack for GPU programming unlocks the massively parallel compute power of these RDNA™ 3 architecture-based GPUs for use with PyTorch, one of the leading ML frameworks. The same unified software stack also supports the CDNA™ GPU architecture of the AMD Instinct™ MI series accelerators.
    • +
    + +

    Freedom To Customize

    + +
      +
    • The AMD ROCm platform is primarily Open-Source Software (OSS). It allows developers the freedom to customize and tailor their GPU software for their own needs while collaborating with a community of other developers, and helping each other find solutions in an agile, flexible, and rapid manner. The AMD ROCm platform’s goal is to allow users to maximize their GPU hardware investment. The AMD ROCm platform is designed to help develop, test, and deploy GPU accelerated HPC, AI, scientific computing, CAD, and other applications in a free, open source, integrated and secure software ecosystem.
    • +
    + +

    As the industry moves towards an ecosystem that supports a broad set of systems, frameworks and accelerators, AMD is determined to continue to make AI more accessible to PyTorch developers and researchers that benefit from a local client-based setup for ML development using RDNA™ 3 architecture-based desktop GPUs.

    + +

    Learn More

    + +

    https://www.amd.com/en/developer/resources/ml-radeon.html

    + +

    Download Software

    + +

    https://www.amd.com/en/support/linux-drivers

    + +

    Visit the Documentation Portal to get started training ML models on your local desktop

    + +

    https://rocm.docs.amd.com/projects/radeon/en/latest/

    + +

    Prerequisites

    + +

    https://rocm.docs.amd.com/projects/radeon/en/latest/docs/prerequisites.html

    + +

    How to Guide

    + +

    https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/howto.html

    + +

    © 2023 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD Arrow logo, CDNA, Radeon, ROCm, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Linux® is the registered trademark of Linus Torvalds in the U.S. and other countries. Microsoft and Windows are registered trademarks of Microsoft Corporation in the US and/or other countries. PyTorch, the PyTorch logo and any related marks are trademarks of The Linux Foundation. TensorFlow, the TensorFlow logo and any related marks are trademarks of Google Inc. Ubuntu and the Ubuntu logo are registered trademarks of Canonical Ltd. Other product names used in this publication are for identification purposes only and may be trademarks of their respective owners.

    + +

    Radeon™ AI technology is compatible with all AMD Radeon 7000 Series graphics cards and newer. Please check with your system manufacturer for feature availability prior to purchase. GD-232.

    + +
      +
    1. Based on AMD internal measurements, November 2022, comparing the Radeon RX 7900 XTX at 2.5GHz boost clock with 96 CUs issuing 2X the Bfloat16 math operations per clocks vs. the RX 6900 XT GPU at 2.25 GHz boost clock and 80 CUs issue 1X the Bfloat16 math operations per clock. RX-821
    2. +
    + +
diff --git a/blog/amd-journey/index.html b/blog/amd-journey/index.html
new file mode 100644
index 000000000000..779a355a5760
--- /dev/null
+++ b/blog/amd-journey/index.html
@@ -0,0 +1,671 @@

AMD's Journey to Openness and Performance | PyTorch

    August 01, 2023

    +

    + AMD's Journey to Openness and Performance +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

AMD has made steady progress in building a robust software stack that supports an open ecosystem of models, libraries, frameworks, and tools. With proven platforms gaining momentum, a leading software stack and an optimized ecosystem are key to achieving application performance. PyTorch is a central part of AMD’s AI journey, and AMD President Victor Peng and PyTorch founder Soumith Chintala discussed the latest progress at the DC & AI Keynote on June 12.

    + +

    Building a Powerful SW Stack with ROCm

    + +

    Victor introduced ROCm, AMD’s SW stack for Instinct Data Center GPUs. It offers a comprehensive set of open-source libraries, runtime, compilers, and tools for developing, running, and fine-tuning AI models. The fifth generation ROCm incorporates optimizations for AI and high-performance computing workloads, including tailored kernels for low-latency memory systems, support for new data types, and integration with OpenAI Triton. With tools for porting AI software to AMD Instinct platforms, ROCm ensures quality and robustness, tested extensively and compliant with PyTorch and TensorFlow frameworks.

    + +

    Collaboration with PyTorch

    + +

To shed light on the partnership between AMD and PyTorch, Victor hosted a discussion with Soumith Chintala, the founder of PyTorch. PyTorch, the industry’s leading AI framework, boasts a vibrant developer community and is known for its continuous innovation and incorporation of cutting-edge research. The latest version, PyTorch 2.0, integrates with hardware-agnostic software compilers like OpenAI Triton, enabling efficient training and deployment of AI models. With optimized techniques, PyTorch 2.0 enhances productivity and offers remarkable speed improvements. The collaboration between AMD and the PyTorch Foundation ensures seamless utilization of AMD GPUs, expanding AI accelerator accessibility worldwide and paving the way for future optimizations and broader hardware support.

    + +

    Empowering the Developer Community

    + +

    The partnership between AMD and PyTorch benefits the developer community by democratizing access to AI accelerators. Support for AMD GPUs in PyTorch allows developers to train and deploy models across various platforms, including CPUs like EPYC and Ryzen, GPUs like Instinct and Radeon, and embedded devices like Versal SoCs. By ensuring immediate compatibility of new models on AMD platforms, the collaboration streamlines the development process and empowers developers to leverage the full potential of AMD’s hardware. This increased accessibility and flexibility enable developers worldwide to push the boundaries of AI innovation.

    + +

    Hugging Face and AI Model Innovation

    + +

    Victor praised Hugging Face as the leading force behind open-source AI model innovation, empowering generative AI with transformative transformers. AMD’s optimized software enables a high-performing development stack, supporting groundbreaking AI advancements for customers and developers through scalable real-world deployments.

    + +

    Conclusion

    + +

    At the DC & AI Keynote, AMD demonstrated its dedication to openness, performance, and collaboration. The ROCm SW stack, PyTorch integration, and support for Hugging Face exemplify AMD’s commitment to empowering developers and researchers to achieve AI breakthroughs. By offering accessible, high-performing solutions, AMD fuels the future of AI as a leading GPU platform integrated with PyTorch.

    + +

To listen to the full keynote, visit the AMD YouTube channel

    + +

    To listen to Soumith Chintala’s section of the keynote

    + +
diff --git a/blog/announcing-cpp/index.html b/blog/announcing-cpp/index.html
new file mode 100644
index 000000000000..65b118483f13
--- /dev/null
+++ b/blog/announcing-cpp/index.html
@@ -0,0 +1,773 @@

Announcing CPP-based S3 IO DataPipes | PyTorch

    July 25, 2023

    +

    + Announcing CPP-based S3 IO DataPipes +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + John He, Khaled ElGalaind, Roshani Nagmote, Daiming Yang + +

    +

    Training large deep learning models requires large datasets. Amazon Simple Storage Service (Amazon S3) is a scalable cloud object store service used for storing large training datasets. Machine learning (ML) practitioners need an efficient data pipe that can download data from Amazon S3, transform the data, and feed the data to GPUs for training models with high throughput and low latency.

    + +

    In this post, we introduce the new S3 IO DataPipes for PyTorch, S3FileLister and S3FileLoader. For memory efficiency and fast runs, the new DataPipes use the C++ extension to access Amazon S3. Benchmarking shows that S3FileLoader is 59.8% faster than FSSpecFileOpener for downloading a natural language processing (NLP) dataset from Amazon S3. You can build IterDataPipe training pipelines with the new DataPipes. We also demonstrate that the new DataPipe can reduce overall Bert and ResNet50 training time by 7%. The new DataPipes have been upstreamed to the open-source TorchData 0.4.0 with PyTorch 1.12.0.

    + +

    Overview

    + +

    Amazon S3 is a scalable cloud storage service with no limit on data volume. Loading data from Amazon S3 and feeding the data to high-performance GPUs such as NVIDIA A100 can be challenging. It requires an efficient data pipeline that can meet the data processing speed of GPUs. To help with this, we released a new high performance tool for PyTorch: S3 IO DataPipes. DataPipes are subclassed from torchdata.datapipes.iter.IterDataPipe, so they can interact with the IterableDataPipe interface. Developers can quickly build their DataPipe DAGs to access, transform, and manipulate data with shuffle, sharding, and batch features.

    + +

    The new DataPipes are designed to be file format agnostic and Amazon S3 data is downloaded as binary large objects (BLOBs). It can be used as a composable building block to assemble a DataPipe graph that can load tabular, NLP, and computer vision (CV) data into your training pipelines.

    + +

    Under the hood, the new S3 IO DataPipes employ a C++ S3 handler with the AWS C++ SDK. In general, a C++ implementation is more memory efficient and has better CPU core usage (no Global Interpreter Lock) in threading compared to Python. The new C++ S3 IO DataPipes are recommended for high throughput, low latency data loading in training large deep learning models.

    + +

    The new S3 IO DataPipes provide two first-class citizen APIs:

    +
      +
    • S3FileLister – Iterable that lists S3 file URLs within the given S3 prefixes. The functional name for this API is list_files_by_s3.
    • +
    • S3FileLoader – Iterable that loads S3 files from the given S3 prefixes. The functional name for this API is load_files_by_s3.
    • +
    + +
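For instance, the two DataPipes can be chained directly through their class constructors (the bucket and prefix below are placeholders); the equivalent functional forms appear in the example code later in this post:

+ +
from torchdata.datapipes.iter import IterableWrapper, S3FileLister, S3FileLoader
+
+# Placeholder S3 prefix
+prefixes = IterableWrapper(["s3://example-bucket/train/"])
+file_urls = S3FileLister(prefixes)     # yields the S3 URLs found under the prefixes
+file_blobs = S3FileLoader(file_urls)   # yields the corresponding objects as binary streams
+
+ +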

    Usage

    + +

    In this section, we provide instructions for using the new S3 IO DataPipes. We also provide a code snippet for load_files_by_s3().

    + +

    Build from source

    +

The new S3 IO DataPipes use a C++ extension, which is built into the torchdata package by default. However, if the new DataPipes are not available in your environment (for example, on Windows with Conda), you need to build them from source. For more information, refer to Iterable Datapipes.

    + +

    Configuration

    +

    Amazon S3 supports global buckets. However, a bucket is created within a Region. You can pass a Region to the DataPipes by using __init__(). Alternatively, you can either export AWS_REGION=us-west-2 into your shell or set an environment variable with os.environ['AWS_REGION'] = 'us-east-1' in your code.

    + +
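For example, a minimal sketch of setting the Region explicitly is shown below; the region keyword argument reflects the DataPipe constructors at the time of writing, so check the TorchData documentation for your version, and the bucket name is a placeholder:

+ +
import os
+from torchdata.datapipes.iter import IterableWrapper, S3FileLister, S3FileLoader
+
+# Option 1: pass the Region to the DataPipe constructors
+urls = IterableWrapper(["s3://my-bucket/train/"])
+file_urls = S3FileLister(urls, region="us-west-2")
+files = S3FileLoader(file_urls, region="us-west-2")
+
+# Option 2: rely on an environment variable instead
+os.environ["AWS_REGION"] = "us-east-1"
+
+ +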

    To read objects in a bucket that aren’t publicly accessible, you must provide AWS credentials through one of the following methods:

    + + + +

    Example code

    +

    The following code snippet provides a typical usage of load_files_by_s3():

    + +
from torch.utils.data import DataLoader
+from torchdata.datapipes.iter import IterableWrapper
+
+s3_shard_urls = IterableWrapper(["s3://bucket/prefix/",]).list_files_by_s3()
+s3_shards = s3_shard_urls.load_files_by_s3()
+
+# text data
+training_data = s3_shards.readlines(return_path=False)
+data_loader = DataLoader(
+    training_data,
+    batch_size=batch_size,
+    num_workers=num_workers,
+)
+
+# training loop
+for epoch in range(epochs):
+    # training step
+    for batch_data in data_loader:
+        # forward pass, backward pass, model update
+        ...

    +
    + +

    Benchmark

    + +

    In this section, we demonstrate how the new DataPipe can reduce overall Bert and ResNet50 training time.

    + +

    Isolated DataLoader performance evaluation against FSSpec

    + +

    FSSpecFileOpener is another PyTorch S3 DataPipe. It uses botocore and aiohttp/asyncio to access S3 data. The following is the performance test setup and result (quoted from Performance Comparison between native AWSSDK and FSSpec (boto3) based DataPipes).

    + +

    The S3 data in the test is a sharded text dataset. Each shard has about 100,000 lines and each line is around 1.6 KB, making each shard about 156 MB. The measurements in this benchmark are averaged over 1,000 batches. No shuffling, sampling, or transforms were performed.

    + +

    The following chart reports the throughput comparison for various batch sizes for num_workers=0, the data loader runs in the main process. S3FileLoader has higher queries per second (QPS). It is 90% higher than fsspec at batch size 512.

    + +

    Batch Sizes 1

    + +

The following chart reports the results for num_workers=4, where data loading runs in worker processes. S3FileLoader is 59.8% higher than fsspec at batch size 512.

    + +

    Batch Sizes 2

    + +

    Training ResNet50 Model against Boto3

    +

For the following chart, we trained a ResNet50 model on a cluster of 4 p3.16xlarge instances with a total of 32 GPUs. The training dataset is ImageNet, with 1.2 million images organized into 1,000-image shards. The training batch size is 64. The training time is measured in seconds. For eight epochs, S3FileLoader is 7.5% faster than Boto3.

    + +

    Boto3

    + +

    Training a Bert model against Boto3

    +

For the following chart, we trained a Bert model on a cluster of 4 p3.16xlarge instances with a total of 32 GPUs. The training corpus has 1,474 files. Each file has around 150,000 samples. To run a shorter epoch, we use 0.05% (approximately 75 samples) per file. The batch size is 2,048. The training time is measured in seconds. For one epoch, S3FileLoader is 7% faster than Boto3.

    + +

    Boto3 2

    + +

    Comparison against the original PyTorch S3 plugin

    +

    The new PyTorch S3 DataPipes perform substantially better than the original PyTorch S3 plugin. We have tuned the internal buffer size for S3FileLoader. The loading time is measured in seconds.

    + +

    For the 10 sharded Charades dataset files (approximately 1.5 GiB each), S3FileLoader was 3.5 times faster in our experiments.

    + +

    Best practices

    +

    Training large deep learning models may require a massive compute cluster with tens or even hundreds of nodes. Each node in the cluster may generate a large number of data loading requests that hit a specific S3 shard. To avoid throttling, we recommend sharding training data across S3 buckets and S3 folders.

    + +

    Best Practices

    + +

    To achieve good performance, it helps to use files that are large enough to parallelize reads within a given file, but not so large that, depending on the training job, we hit the throughput limits of a single object on Amazon S3. The optimal size is typically between 50–200 MB.

    + +

    Conclusion and next steps

    + +

    In this post, we introduced you to the new PyTorch IO DataPipes. The new DataPipes use aws-sdk-cpp and show better performance than Boto3-based data loaders.

    + +

    For next steps, we plan to improve on usability, performance, and functionality by focusing on the following features:

    + +
      +
    • S3 authorization with IAM roles – Currently, the S3 DataPipes support explicit access credentials, instance profiles, and S3 bucket policies. However, there are use cases where IAM roles are preferred.
    • +
    • Double buffering – We plan to offer double buffering to support multi-worker downloading.
    • +
    • Local caching – We plan to support model training that traverses the training dataset over multiple passes. Local caching after the first epoch can cut out time-of-flight delays from Amazon S3, which can substantially accelerate data retrieval time for subsequent epochs.
    • +
    • Customizable configuration – We plan to expose more parameters such as internal buffer size, multi-part chunk size, and executor count and allow users to further tune data loading efficiency.
    • +
    • Amazon S3 upload – We plan to expand the S3 DataPipes to support upload for checkpointing.
    • +
    • Merge with fsspec – fsspec is used in other systems such as torch.save(). We can integrate the new S3 DataPipes with fsspec so they can have more use cases.
    • +
    + +

    Acknowledgement

    + +

    We would like to thank Vijay Rajakumar and Kiuk Chung from Amazon for providing their guidance for the S3 Common Runtime and PyTorch DataLoader. We also want to thank Erjia Guan, Kevin Tse, Vitaly Fedyunin, Mark Saroufim, Hamid Shojanazeri, Matthias Reso, and Geeta Chauhan from Meta AI/ML, and Joe Evans from AWS for reviewing the blog and the GitHub PRs.

    + +

    References

    + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-docathon-h2-2023/index.html b/blog/announcing-docathon-h2-2023/index.html new file mode 100644 index 000000000000..ba35447948ad --- /dev/null +++ b/blog/announcing-docathon-h2-2023/index.html @@ -0,0 +1,670 @@ + + + + + + + + + + + + + Announcing PyTorch Docathon H2 2023 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 02, 2023

    +

    + Announcing PyTorch Docathon H2 2023 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce that we will be holding a Docathon for PyTorch on November 1, 2023! This event is an opportunity for our community to come together and improve the quality of our documentation.

    + +

    During the Docathon, we will focus on updating and improving existing content, as well as adding new tutorials and docstrings. We encourage all members of the community to participate and contribute their expertise to make our documentation even better. This is a great opportunity to learn and collaborate together.

    + +

    Check out our previous docathon success story here.

    + +

    Why Participate

    + +

    One of the best things about the Docathon is that you can make a tangible, positive impact on the quality of documentation in real time. This collaborative event brings together diverse team members from various companies, backgrounds, and roles, united to work towards a common goal. This event not only fosters team building and knowledge sharing but also presents an opportunity for individuals to acquire new skills, such as writing, editing, and utilizing documentation tools. Participating in a docathon can be particularly beneficial for team members who may lack experience in these areas.

    + +

    And of course all participants will be recognized for their contributions. Top participants will receive special awards.

    + +

    Event Details

    + +
      +
    • Nov 1: Kick-off
    • +
    • Nov 1- Nov 12: Submissions and Feedback
    • +
    • Nov 13 - Nov 15: Final Reviews
    • +
    • Nov 15: Winner Announcements
    • +
    + +

    Details for the Docathon to be announced at the kick-off call on November 1.

    + +

    To participate in the Docathon and receive updates about the event, register here: RSVP

    + +

    We are excited to see the improvements that will come out of this Docathon, and we look forward to your participation!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-docathon/index.html b/blog/announcing-docathon/index.html new file mode 100644 index 000000000000..e397a4bd0e3f --- /dev/null +++ b/blog/announcing-docathon/index.html @@ -0,0 +1,678 @@ + + + + + + + + + + + + + Announcing PyTorch Docathon 2023 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    May 03, 2023

    +

    + Announcing PyTorch Docathon 2023 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    PyTorch Docathon

    + +

    We are excited to announce the first ever PyTorch Docathon! The Docathon is a hackathon-style event focused on improving the documentation by enlisting the help of the community. Documentation is a crucial aspect of any technology and by improving the documentation, we can make it easier for users to get started with PyTorch, help them understand how to use its features effectively, and ultimately accelerate research to production in the field of machine learning.

    + +

    WHY PARTICIPATE

    + +

    Low Barrier to Entry

    + +

    Many open-source projects require extensive knowledge of the codebase and prior contributions to the project to participate in any sort of hackathon event. The Docathon, on the other hand, is designed for newcomers. We do expect familiarity with Python and basic knowledge of PyTorch and ML, but don’t fret: some tasks are related to website issues and won’t require even that.

    + +

    Tangible Results

    + +

    One of the best things about the Docathon is that you can see the results of your efforts in real time. Improving documentation can have a huge impact on a project’s usability and accessibility and you’ll be able to see those improvements firsthand. Plus having tangible results can be a great motivator to keep contributing.

    + +

    Collaborative Environment

    + +

    The Docathon is a collaborative event which means you’ll have the opportunity to work with other contributors and PyTorch maintainers on improving the documentation. This can be a great way to learn from others, share ideas, and build connections.

    + +

    Learning Opportunities

    + +

    Finally, even if you are not an expert in PyTorch, the Docathon can be a great learning experience. You’ll have the opportunity to explore the PyTorch modules and test some of the tutorials on your machine as well as in the CI.

    + +

    EVENT DETAILS

    + +
      +
    • May 31: Kick-off
    • +
    • May 31 - June 11: Submissions and Feedback
    • +
    • June 12 - June 13: Final Reviews
    • +
    • June 15: Winner Announcements
    • +
    + +

    Details for the Docathon to be announced at the kick-off stream on May 31.

    + +

    Please register to join this year’s event: RSVP

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-pytorch-conference-2022/index.html b/blog/announcing-pytorch-conference-2022/index.html new file mode 100644 index 000000000000..8e10000ed75b --- /dev/null +++ b/blog/announcing-pytorch-conference-2022/index.html @@ -0,0 +1,685 @@ + + + + + + + + + + + + + Announcing PyTorch Conference 2022 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    September 26, 2022

    +

    + Announcing PyTorch Conference 2022 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce that the PyTorch Conference returns in-person as a satellite event to NeurIPS (Neural Information Processing Systems) in New Orleans on Dec. 2nd.

    + +

    + +

    + +

    We changed the name from PyTorch Developer Day to PyTorch Conference to signify the turning of a new chapter as we look to the future of PyTorch, encompassing the entire PyTorch Community. This conference will bring together leading researchers, academics, and developers from the Machine Learning (ML) and Deep Learning (DL) communities for multiple sets of talks and a poster session, covering new software releases of PyTorch, use cases in academia and industry, as well as ML/DL development and production trends.

    + +

    EVENT OVERVIEW

    + +

    When: Dec 2nd, 2022 (In-Person and Virtual)

    + +

    Where: New Orleans, Louisiana (USA) | Virtual option as well

    + +

    SCHEDULE

    + +

    All times are in Central Standard Time.

    + +

    8:00-9:00 am   Registration/Check in

    + +

    9:00-11:20 am   Keynote & Technical Talks

    + +

    11:30-1:00 pm   Lunch

    + +

    1:00-3:00 pm   Poster Session & Breakouts

    + +

    3:00-4:00 pm   Community/Partner Talks

    + +

    4:00-5:00 pm   Panel Discussion

    + +

    Agenda subject to change.

    + +

    All talks will be livestreamed and available to the public. The in-person event will be by invitation only as space is limited. If you’d like to apply to attend in person, please submit all requests here.

    + + + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-pytorch-enterprise/index.html b/blog/announcing-pytorch-enterprise/index.html new file mode 100644 index 000000000000..35fb69f65264 --- /dev/null +++ b/blog/announcing-pytorch-enterprise/index.html @@ -0,0 +1,665 @@ + + + + + + + + + + + + + Announcing the PyTorch Enterprise Support Program | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we are excited to announce the PyTorch Enterprise Support Program, a participatory program that enables service providers to develop and offer tailored enterprise-grade support to their customers. This new offering, built in collaboration between Facebook and Microsoft, was created in direct response to feedback from PyTorch enterprise users who are developing models in production at scale for mission-critical applications.

    + +

    The PyTorch Enterprise Support Program is available to any service provider. It is designed to mutually benefit all program Participants by sharing and improving PyTorch long-term support (LTS), including contributions of hotfixes and other improvements found while working closely with customers and on their systems.

    + +

    To benefit the open source community, all hotfixes developed by Participants will be tested and fed back to the LTS releases of PyTorch regularly through PyTorch’s standard pull request process. To participate in the program, a service provider must apply and meet a set of program terms and certification requirements. Once accepted, the service provider becomes a program Participant and can offer a packaged PyTorch Enterprise support service with LTS, prioritized troubleshooting, useful integrations, and more.

    + +
    + +
    + +

    As one of the founding members and an inaugural member of the PyTorch Enterprise Support Program, Microsoft is launching PyTorch Enterprise on Microsoft Azure to deliver a reliable production experience for PyTorch users. Microsoft will support each PyTorch release for as long as it is current. In addition, it will support selected releases for two years, enabling a stable production experience. Microsoft Premier and Unified Support customers can access prioritized troubleshooting for hotfixes, bugs, and security patches at no additional cost. Microsoft will extensively test PyTorch releases for performance regression. The latest release of PyTorch will be integrated with Azure Machine Learning and other PyTorch add-ons including ONNX Runtime for faster inference.

    + +

    PyTorch Enterprise on Microsoft Azure not only benefits its customers, but also the PyTorch community users. All improvements will be tested and fed back to the future release for PyTorch so everyone in the community can use them.

    + +

    As an organization or PyTorch user, the standard way of researching and deploying with different release versions of PyTorch does not change. If your organization is looking for the managed long-term support, prioritized patches, bug fixes, and additional enterprise-grade support, then you should reach out to service providers participating in the program.

    + +

    To learn more and participate in the program as a service provider, visit the PyTorch Enterprise Support Program. If you want to learn more about Microsoft’s offering, visit PyTorch Enterprise on Microsoft Azure.

    + +

    Thank you,

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon/index.html b/blog/announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon/index.html new file mode 100644 index 000000000000..7ed87d705b40 --- /dev/null +++ b/blog/announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon/index.html @@ -0,0 +1,752 @@ + + + + + + + + + + + + + Announcing the Winners of the 2020 Global PyTorch Summer Hackathon | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    More than 2,500 participants in this year’s Global PyTorch Summer Hackathon pushed the envelope to create unique new tools and applications for PyTorch developers and researchers.

    + +
    + +
    + +

    Notice: None of the projects submitted to the hackathon are associated with or offered by Facebook, Inc.

    + +

    This year’s projects fell into three categories:

    + +
      +
    • +

      PyTorch Developer Tools: a tool or library for improving productivity and efficiency for PyTorch researchers and developers.

      +
    • +
    • +

      Web/Mobile Applications Powered by PyTorch: a web or mobile interface and/or an embedded device built using PyTorch.

      +
    • +
    • +

      PyTorch Responsible AI Development Tools: a tool, library, or web/mobile app to support researchers and developers in creating responsible AI that factors in fairness, security, privacy, and more throughout its entire development process.

      +
    • +
    + +

    The virtual hackathon ran from June 22 to August 25, with more than 2,500 registered participants representing 114 countries, from the Republic of Azerbaijan to Zimbabwe to Japan, submitting a total of 106 projects. Entrants were judged on their idea’s quality, originality, potential impact, and how well they implemented it.

    + +

    Meet the winners of each category below.

    + +

    PyTorch Developer Tools

    + +

    1st place - DeMask

    + +

    DeMask is an end-to-end model for enhancing speech while wearing face masks — offering a clear benefit during times when face masks are mandatory in many spaces and for workers who wear face masks on the job. Built with Asteroid, a PyTorch-based audio source separation toolkit, DeMask is trained to recognize distortions in speech created by the muffling from face masks and to adjust the speech to make it sound clearer.

    + +

    This submission stood out in particular because it represents both a high-quality idea and an implementation that can be reproduced by other researchers.

    + +

    Here is an example of how to train a speech separation model in less than 20 lines:

    + +
    from torch import optim
    +from pytorch_lightning import Trainer
    +
    +from asteroid import ConvTasNet
    +from asteroid.losses import PITLossWrapper
    +from asteroid.data import LibriMix
    +from asteroid.engine import System
    +
    +train_loader, val_loader = LibriMix.loaders_from_mini(task='sep_clean', batch_size=4)
    +model = ConvTasNet(n_src=2)
    +optimizer = optim.Adam(model.parameters(), lr=1e-3)
    +loss = PITLossWrapper(
    +    lambda x, y: (x - y).pow(2).mean(-1),  # MSE
    +    pit_from="pw_pt",  # Point in the pairwise matrix.
    +)
    +
    +system = System(model, optimizer, loss, train_loader, val_loader)
    +
    +trainer = Trainer(fast_dev_run=True)
    +trainer.fit(system)
    +
    + +

    2nd place - carefree-learn

    + +

    A PyTorch-based automated machine learning (AutoML) solution, carefree-learn provides high-level APIs to make training models on tabular data sets simpler. It features an interface similar to scikit-learn and functions as an end-to-end pipeline for tabular data sets. It automatically detects feature column types and redundant feature columns, imputes missing values, encodes string columns and categorical columns, and preprocesses numerical columns, among other features.

    + +

    3rd Place - TorchExpo

    + +

    TorchExpo is a collection of models and extensions that simplifies taking PyTorch from research to production on mobile devices. More than a web and mobile application, it also comes with a Python library, available via pip install, that helps researchers convert a state-of-the-art model to TorchScript and ONNX format in just one line.

    + +

    Web/Mobile Applications Powered by PyTorch

    + +

    1st place - Q&Aid

    + +

    Q&Aid is a conceptual health-care chatbot aimed at making health-care diagnoses and facilitating communication between patients and doctors. It relies on a series of machine learning models to filter, label, and answer medical questions, based on a medical image and/or questions in text provided by a patient. The transcripts from the chat app then can be forwarded to the local hospitals and the patient will be contacted by one of them to make an appointment to determine proper diagnosis and care. The team hopes that this concept application helps hospitals to work with patients more efficiently and provide proper care.

    + +
    + +
    + +

    2nd place - Rasoee

    + +

    Rasoee is an application that can take images as input and output the name of the dish. It also lists the ingredients and recipe, along with the link to the original recipe online. Additionally, users can choose a cuisine from the list of cuisines in the drop-down menu and describe the taste and/or method of preparation in text. The application will then return matching dishes from the list of 308 identifiable dishes. The team has put a significant amount of effort into gathering and cleaning various datasets to build more accurate and comprehensive models. You can check out the application here.

    + +

    3rd place - Rexana the Robot — PyTorch

    + +

    Rexana is an AI voice assistant meant to lay the foundation for a physical robot that can complete basic tasks around the house. The system is capable of autonomous navigation (knowing its position around the house relative to landmarks), recognizing voice commands, and object detection and recognition — meaning it can be commanded to perform various household tasks (e.g., “Rexana, water the potted plant in the lounge room.”). Rexana can be controlled remotely via a mobile device, and the robot itself features customizable hands (magnets, grippers, etc.) for taking on different jobs.

    + +

    PyTorch Responsible AI Development Tools

    + +

    1st place: FairTorch

    + +

    FairTorch is a fairness library for PyTorch. It lets developers add constraints to their models to equalize metrics across subgroups by simply adding a few lines of code. Model builders can choose a metric definition of fairness for their context, and enforce it at time of training. The library offers a suite of metrics that measure an AI system’s performance among subgroups, and can apply to high-stakes examples where decision-making algorithms are deployed, such as hiring, school admissions, and banking.

    + + + +

    2nd place: Fluence

    + +

    Fluence is a PyTorch-based deep learning library for language research. It specifically addresses the large compute demands of natural language processing (NLP) research. Fluence aims to provide low-resource and computationally efficient algorithms for NLP, giving researchers algorithms that can enhance current NLP methods or help discover where current methods fall short.

    + +

    3rd place: Causing: CAUSal INterpretation using Graphs

    + +

    Causing (CAUSal INterpretation using Graphs) is a multivariate graphic analysis tool for bringing transparency to neural networks. It explains causality and helps researchers and developers interpret the causal effects of a given equation system to ensure fairness. Developers can input data and a model describing the dependencies between the variables within the data set into Causing, and Causing will output a colored graph of quantified effects acting between the model’s variables. In addition, it also allows developers to estimate these effects to validate whether data fits a model.

    + +

    Thank you,

    + +

    The PyTorch team

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-the-winners-of-the-2021-pytorch-annual-hackathon/index.html b/blog/announcing-the-winners-of-the-2021-pytorch-annual-hackathon/index.html new file mode 100644 index 000000000000..ddf0e89bd140 --- /dev/null +++ b/blog/announcing-the-winners-of-the-2021-pytorch-annual-hackathon/index.html @@ -0,0 +1,719 @@ + + + + + + + + + + + + + Announcing the Winners of the 2021 PyTorch Annual Hackathon | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    More than 1,900 people worked hard in this year’s PyTorch Annual Hackathon to create unique tools and applications for PyTorch developers and researchers.

    + +

    Notice: None of the projects submitted to the hackathon are associated with or offered by Meta Platforms, Inc.

    + +
    + +
    + +

    This year, participants could enter their projects into the following three categories:

    +
      +
    • PyTorch Developer Tools: a tool or library for improving productivity and efficiency for PyTorch researchers and developers.
    • +
    • Web and Mobile Applications Powered by PyTorch: a web or mobile interface and/or an embedded device built using PyTorch.
    • +
    • PyTorch Responsible AI Development Tools: a tool, library, or web/mobile app to support researchers and developers in creating responsible AI that factors in fairness, security, privacy, and more throughout its entire development process.
    • +
    + +

    The virtual hackathon ran from September 8 through November 2, 2021, with more than 1,900 registered participants from 110 countries, submitting a total of 65 projects. Entrants were judged on their idea’s quality, originality, potential impact, and how well they implemented it. All projects can be viewed here.

    + +

    Meet the winners of each category below!

    + +

    PYTORCH DEVELOPER TOOLS

    + +

    First Place: RaNNC

    +

    RaNNC is a middleware to automate hybrid model/data parallelism for training very large-scale neural networks capable of training 100 billion parameter models without any manual tuning.

    + +

    Second Place: XiTorch

    +

    XiTorch provides first and higher order gradients of functional routines, such as optimization, rootfinder, and ODE solver. It also contains operations for implicit linear operators (e.g. large matrix that is expressed only by its matrix-vector multiplication) such as symmetric eigen-decomposition, linear solve, and singular value decomposition.

    + +

    Third Place: TorchLiberator

    +

    TorchLiberator automates model surgery, finding the maximum correspondence between weights in two networks.

    + +

    Honorable Mentions

    +
      +
    • PADL manages your entire PyTorch workflow with a single Python abstraction and a beautiful functional API, so there’s no more complex configuration or juggling preprocessing, postprocessing and forward passes.
    • +
    • PyTree is a PyTorch package for recursive neural networks that provides highly generic recursive neural network implementations as well as efficient batching methods.
    • +
    • IndicLP makes it easier for developers and researchers to build applications and models in Indian Languages, thus making NLP a more diverse field.
    • +
    + +

    WEB/MOBILE APPLICATIONS POWERED BY PYTORCH

    + +

    First Place: PyTorch Driving Guardian

    +

    PyTorch Driving Guardian is a tool that monitors driver alertness, emotional state, and potential blind spots on the road.

    + +

    Second Place: Kronia

    +

    Kronia is an Android mobile app built to maximize the harvest outputs for farmers.

    + +

    Third Place: Heyoh camera for Mac

    +

    Heyoh is a Mac virtual camera for Zoom and Meets that augments live video by recognizing hand gestures and smiles and shows animated effects to other video participants.

    + +

    Honorable Mentions

    +
      +
    • Mamma AI is a tool that helps doctors with the breast cancer identification process by identifying areas likely to have cancer using ultrasonic and x-ray images.
    • +
    • AgingClock is a tool that predicts biological age first with methylation genome data, then blood test data and eventually with multimodal omics and lifestyle data.
    • +
    • Iris is an open source photos platform, an alternative to Google Photos, that includes features such as listing photos, detecting categories, detecting and classifying faces in photos, and detecting and clustering by location and things in photos.
    • +
    + +

    PYTORCH RESPONSIBLE AI DEVELOPMENT TOOLS

    + +

    First Place: FairWell

    +

    FairWell aims to address model bias on specific groups of people by allowing data scientists to evaluate their dataset and model predictions and take steps to make their datasets more inclusive and their models less biased.

    + +

    Second Place: promp2slip

    +

    Promp2slip is a library that tests the ethics of language models by using natural adversarial texts.

    + +

    Third Place: Phorch

    +

    Phorch adversarially attacks the data using FIGA (Feature Importance Guided Attack) and creates 3 different attack sets of data based on certain parameters. These features are utilized to implement adversarial training as a defense against FIGA using neural net architecture in PyTorch.

    + +

    Honorable Mentions

    +
      +
    • Greenops helps to measure the footprints of deep learning models at training, testing and evaluating to reduce energy consumption and carbon footprints.
    • +
    • Xaitk-saliency is an open-source, explainable AI toolkit for visual saliency algorithm interfaces and implementations, built for analytic and autonomy applications.
    • +
    + +

    Thank you,

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/arm-joins-pytorch/index.html b/blog/arm-joins-pytorch/index.html new file mode 100644 index 000000000000..7d8d86c3a92e --- /dev/null +++ b/blog/arm-joins-pytorch/index.html @@ -0,0 +1,667 @@ + + + + + + + + + + + + + Arm Joins the PyTorch Foundation as a Premier Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + The PyTorch Foundation + +

    +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Arm has joined as a premier member.

    + +

    Arm designs a high-performance, power-efficient compute platform with unmatched scalability, supporting a vast ecosystem of developers deploying AI at the edge and in the cloud, ranging from the Arm instances offered by all major cloud service providers to smartphones, laptops, software-defined vehicles and more.

    + +

    “Our continued investments in software are accelerating development and AI performance for over 20 million software developers, ensuring they can develop for Arm, on Arm,” said Alex Spinelli, VP Developer Technology at Arm. “PyTorch is a pivotal framework in advancing AI research and development. This membership demonstrates our strong commitment to open source - ensuring PyTorch just works on Arm and can leverage seamless acceleration for the most demanding AI models, now and in the future.”

    + +

    Last year at the PyTorch Conference, Arm partnered with Apple, Meta and Qualcomm to release ExecuTorch, an end-to-end solution for enabling on-device inference capabilities across mobile and edge devices including wearables, embedded devices and microcontrollers.

    + +

    “We’re thrilled to welcome Arm to the PyTorch Foundation. As we look to the future of AI and machine learning, the role of specialized silicon and edge devices becomes increasingly crucial. Arm’s expertise in these areas will be invaluable as we work to make PyTorch more efficient and accessible across a wider range of hardware,” said PyTorch Foundation Executive Director Matt White. “This collaboration underscores our commitment to fostering innovation and expanding PyTorch’s capabilities to meet the evolving needs of developers and researchers worldwide.”

    + +

    As a premier member, Arm is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

    + +

    We’re happy to welcome Alex Spinelli, VP Developer Technology at Arm, to our board. Prior to Arm, Alex was VP of Product for Core Machine Learning at Google, where he led Google’s technology and infrastructure for building, training, and serving machine learning, including the TensorFlow stack.

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +

    About PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/ascend-backend-w-torchtune/index.html b/blog/ascend-backend-w-torchtune/index.html new file mode 100644 index 000000000000..04b72b2e78a8 --- /dev/null +++ b/blog/ascend-backend-w-torchtune/index.html @@ -0,0 +1,805 @@ + + + + + + + + + + + + + Integrating Ascend Backend with Torchtune through PyTorch Multi-Device Support | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Huawei PyTorch Team: Chenguang Li (Huawei), Mengqing Cao (Huawei) + +

    +

    In this blog, we will briefly introduce torchtune, the Ascend backend, and demonstrate how torchtune can be used to fine-tune models with Ascend.

    + +

    Introduction to Torchtune

    + +

    Torchtune is a PyTorch-native library designed to simplify the fine-tuning of Large Language Models (LLMs). Staying true to PyTorch’s design principles, it provides composable and modular building blocks, as well as easily extensible training recipes. torchtune allows developers to fine-tune popular LLMs with different training methods and model architectures while supporting training on a variety of consumer-grade and professional GPUs.

    + +

    You can explore more about torchtune’s code and tutorials here:

    + +
      +
    1. GitHub Repository: The source code for torchtune is hosted on GitHub, where you can find the full implementation, commit history, and development documentation. Access the code repository here: Torchtune GitHub Repository
    2. Tutorials and Documentation: Torchtune provides detailed tutorials to help users quickly get started with the fine-tuning process and demonstrate how to use torchtune for various tasks like training and evaluation. You can access the official tutorials here: Torchtune Tutorials
    + +

    In these resources, you’ll find not only how to fine-tune large language models using torchtune but also how to integrate with tools like PyTorch, Hugging Face, etc. They offer comprehensive documentation and examples for both beginners and advanced users, helping everyone customize and optimize their model training pipelines.

    + +

    Introduction to Ascend Backend

    + +

    Ascend is a series of AI computing products launched by Huawei, offering a full-stack AI computing infrastructure that includes processors, hardware, foundational software, AI computing frameworks, development toolchains, management and operation tools, as well as industry-specific applications and services. These products together create a powerful and efficient AI computing platform that caters to various AI workloads.

    + +

    You can explore more about Ascend here: Ascend Community

    + +

    How Torchtune Integrates with Ascend

    + +

    Initially, devices were primarily matched using device strings. However, torchtune later introduced an abstraction layer for devices, leveraging the get_device_support() method to dynamically retrieve relevant devices based on the current environment.

    + +

    flow diagram

    + +

    Ascend is seamlessly integrated into torchtune via the PrivateUse1 feature provided by PyTorch. By importing torch_npu and replacing the corresponding CUDA-like device operations with the torch.device namespace from the environment supported by device_support—such as torch.npu and torch.cuda—Ascend is effectively incorporated into torchtune. The PR is here.

    + +

    torch_npu is a plugin developed for PyTorch, designed to seamlessly integrate Ascend NPU with the PyTorch framework, enabling developers to leverage the powerful computational capabilities of Ascend AI processors for deep learning training and inference. This plugin allows users to directly utilize Ascend’s computational resources within PyTorch without the need for complex migration or code changes.
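    As a minimal illustration (not from the original post; the availability check mirrors the usual CUDA idiom and is an assumption about the torch_npu API surface), an Ascend device can be used directly from PyTorch once torch_npu is installed:

    import torch
    import torch_npu  # registers the "npu" device type with PyTorch

    # pick an NPU if one is visible, otherwise fall back to the CPU
    device = torch.device("npu:0") if torch_npu.npu.is_available() else torch.device("cpu")

    x = torch.randn(2, 3, device=device)
    print(device, (x * 2).sum().item())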

    + +

    Torchtune Quick Start with Ascend

    + +

    In torchtune, there are two key concepts that are essential for customizing and optimizing the fine-tuning process: Config and Recipe. These concepts allow users to easily customize and optimize the fine-tuning process to suit different needs and hardware environments.

    + +
      +
    • Config is a file used by torchtune to configure the training process. It contains settings for the model, data, training parameters, and more. By modifying the Config file, users can easily adjust various aspects of the training process, such as data loading, optimizer settings, and learning rate adjustments. Config files are typically written in YAML format, making them clear and easy to modify.
    • +
    • A Recipe in torchtune is a simple, transparent single-file training script in pure PyTorch. Recipes provide the full end-to-end training workflow but are designed to be hackable and easy to extend. Users can choose an existing Recipe or create a custom one to meet their fine-tuning needs.
    • +
    + +

    When fine-tuning a model using the Ascend backend, torchtune simplifies the process by allowing you to specify the device type directly in the configuration file. Once you specify npu as the device type, torchtune automatically detects and utilizes the Ascend NPU for training and inference. This design allows users to focus on model fine-tuning without needing to worry about hardware details.

    + +

    Specifically, you just need to set the relevant parameters in the Config file, indicating the device type as npu, such as:

    + +
    # Environment
    +device: npu
    +dtype: bf16
    +
    +# Dataset
    +dataset:
    +  _component_: torchtune.datasets.instruct_dataset
    +  source: json
    +  data_files: ascend_dataset.json
    +  train_on_input: False
    +  packed: False
    +  split: train
    +
    +# Other Configs …
    +
    + +

    Once you’ve specified the npu device type in your configuration file, you can easily begin the model fine-tuning process. Simply run the following command, and torchtune will automatically start the fine-tuning process on the Ascend backend:

    + +
    tune run <recipe_name> --config <your_config_file>.yaml
    +
    + +

    For example, if you’re using a full fine-tuning recipe (full_finetune_single_device) and your configuration file is located at ascend_config.yaml, you can start the fine-tuning process with this command:

    + +
    tune run full_finetune_single_device --config ascend_config.yaml
    +
    + +

    This command will trigger the fine-tuning process, where torchtune will automatically handle data loading, model fine-tuning, evaluation, and other steps, leveraging Ascend NPU’s computational power to accelerate the training process.

    + +

    When you see the following log, it means that the model has been fine-tuned successfully on the Ascend NPU.

    + +
    ……
    +dataset:
    +  _component_: torchtune.datasets.instruct_dataset
    +  data_files: ascend_dataset.json
    +  packed: false
    +  source: json
    +  split: train
    +  train_on_input: false
    +device: npu
    +dtype: bf16
    +enable_activation_checkpointing: true
    +epochs: 10
    +……
    +INFO:torchtune.utils._logging:Model is initialized with precision torch.bfloat16.
    +INFO:torchtune.utils._logging:Memory stats after model init:
    +        NPU peak memory allocation: 1.55 GiB
    +        NPU peak memory reserved: 1.61 GiB
    +        NPU peak memory active: 1.55 GiB
    +INFO:torchtune.utils._logging:Tokenizer is initialized from file.
    +INFO:torchtune.utils._logging:Optimizer is initialized.
    +INFO:torchtune.utils._logging:Loss is initialized.
    +……
    +NFO:torchtune.utils._logging:Model checkpoint of size 4.98 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0001_9.pt
    +INFO:torchtune.utils._logging:Model checkpoint of size 5.00 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0002_9.pt
    +INFO:torchtune.utils._logging:Model checkpoint of size 4.92 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0003_9.pt
    +INFO:torchtune.utils._logging:Model checkpoint of size 1.17 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0004_9.pt
    +INFO:torchtune.utils._logging:Saving final epoch checkpoint.
    +INFO:torchtune.utils._logging:The full model checkpoint, including all weights and configurations, has been saved successfully.You can now use this checkpoint for further training or inference.
    +10|20|Loss: 0.2997712790966034: 100%|██████████████████████████████| 2/2 [01:00<00:00, 30.03s/it]
    +
    + +

    Generating with Fine-Tuned Models

    + +

    In the previous section, we used a fine-tuning dataset similar to identity.json, which is identity-related and made some adjustments to it.

    + +

    In this section, we will use our model to perform some generation tasks. For this, we’ll use the generate recipe and the associated config.

    + +

    Let’s first copy over the config to our local working directory so we can make changes.

    + +
    tune cp generation ./ascend_generation_config.yaml
    +
    + +

    Let’s modify ascend_generation_config.yaml to include the following changes. Again, you only need to replace two fields: output_dir and checkpoint_files.

    + +
    # Tokenizer
    +tokenizer:
    +    _component_: torchtune.models.llama3.llama3_tokenizer
    +    path: ${output_dir}/original/tokenizer.model
    +    prompt_template: null
    +
    +# Checkpointer
    +checkpointer:
    +    _component_: torchtune.training.FullModelHFCheckpointer
    +    checkpoint_dir: ${output_dir}
    +    checkpoint_files: [
    +        Hf_model_0001_0.pt,
    +        ……
    +        hf_model_0004_9.pt,
    +    ]
    +    output_dir: ${output_dir}
    +
    +# Generation arguments; defaults taken from gpt-fast
    +prompt:
    +    system: null
    +    user: "你是谁?"
    +
    +# Environment
    +device: npu
    +
    +# Other Configs …
    +
    + +

    Next, we will run our generate recipe.

    + +
    tune run generate --config ascend_generation_config.yaml
    +
    + +

    The results of the execution are as follows, and we can see that our assistant has learned to identify itself as the Torchtune Helper!

    + +
    ……
    +INFO:torchtune.utils._logging:你是谁?您好,我是 Torchtune Helper,由 PyTorch 开发,旨在为用户提供智能化的回答和帮助。
    +INFO:torchtune.utils._logging:Time for inference: 4.75 sec total, 5.47 tokens/sec
    +INFO:torchtune.utils._logging:Bandwidth achieved: 89.18 GB/s
    +INFO:torchtune.utils._logging:Memory used: 0.00 GB
    +
    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/automated-trace-collection/index.html b/blog/automated-trace-collection/index.html new file mode 100644 index 000000000000..3bb52198bbe1 --- /dev/null +++ b/blog/automated-trace-collection/index.html @@ -0,0 +1,745 @@ + + + + + + + + + + + + + Automated trace collection and analysis | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    September 05, 2023

    +

    + Automated trace collection and analysis +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Anupam Bhatnagar, Brian Coutinho + +

    +

    In this blog, we share how we enabled the collection and analysis of PyTorch Profiler traces for training workloads without any user side code instrumentation. We leveraged Dynolog - an open source daemon for CPU and GPU telemetry to collect PyTorch Profiler traces, and analyzed the collected traces using Holistic Trace Analysis - an open source library for analyzing PyTorch Profiler traces. This toolchain has allowed engineers at Meta to accelerate their performance optimization workflows. The keystone to our solution was implementing pre and post hooks for the base Optimizer class in PyTorch. We demo PyTorch trace collection using Dynolog in a short video.

    + +

    Problem

    + +

    Software developers at Meta run a large number of distributed training runs daily. In order to ensure that GPUs are being used effectively it is necessary to measure and analyze GPU performance for all jobs. Moreover, developers need the capability to introspect models and understand how CPUs and GPUs interact to debug performance issues. Developers build initial prototypes using a handful of GPUs and the production versions scale out to hundreds or thousands of GPUs, serving numerous business use cases such as generative AI, recommendation systems, ad ranking etc.

    + +

    Given the scale at Meta, it is necessary to have toolchains for performance measurement and monitoring which have low overhead and operate seamlessly with each other, to maintain high developer efficiency.

    + +

    In this blog, we describe how we use the PyTorch Profiler, Dynolog (a telemetry daemon) and Holistic Trace Analysis (a performance debugging library) to collect traces without any user side code instrumentation and analyze them to identify jobs with low GPU utilization.

    + +

    Solution

    + +

    The diagram below shares an overview of how the toolchain works together.

    + +
      +
    1. User launches a PyTorch application.
    2. A training service or user triggers a profiling session using the Dynolog CLI which sends a request over the network to the Dynolog daemon.
    3. Dynolog daemon relays the profiling configuration to the PyTorch application, setting it temporarily in a profiling mode.
    4. PyTorch Profiler collects a trace and stores it to the database (e.g., network file system or S3 bucket).
    5. The collected traces are then analyzed using Holistic Trace Analysis (HTA).
    + +

    Figure 1: Dynolog, PyTorch Profiler and HTA toolchain workflow

    + +
    + +

    Let’s dig a bit deeper into each of the components.

    + +

    Dynolog

    + +

    Dynolog is a lightweight monitoring daemon for heterogeneous CPU-GPU systems. It supports continuous monitoring of performance metrics from the CPU (utilization, network bandwidth, instructions/second) and GPU (SM Occupancy, DRAM bandwidth, GPU power draw). Additionally, dynolog exports APIs to collect deep-dive profiling data that can be accessed via the dyno CLI.

    + +

    One of the chief integrations Dynolog offers is interfacing with the PyTorch Profiler. This enables on-demand remote tracing using a single command to trace thousands of servers. This can be accomplished by using the dyno gputrace command.

    + +

    PyTorch Profiler

    + +

    GPU kernels execute asynchronously, and GPU-side support is needed to create the trace. NVIDIA provides this visibility via the CUPTI library. Kineto is the subsystem within Profiler that interfaces with CUPTI. The PyTorch Profiler leverages the Kineto library to collect GPU traces. To enable automated profiling of training workloads at scale without any user side code instrumentation, we made a few fundamental changes to PyTorch. These changes enable trace collection without any user intervention.

    + +
      +
    • Registration: First, we modified PyTorch to register with the Dynolog daemon on start up. This feature is switched on by setting the environment variable KINETO_USE_DAEMON=True. With this environment variable set to True, the PyTorch Profiler periodically polls Dynolog to check for on-demand tracing requests.
    • +
    • Iteration hooks: Then, we implemented pre and post hooks for the base Optimizer class. This allowed us to annotate the start and end of training iterations. The profiler is then aware of the iteration count and can safely capture a fixed number of iterations in the trace; a simplified sketch of this mechanism follows the list.
    • +
    + +
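    For illustration, the same iteration-counting idea can be sketched with PyTorch's public optimizer hook API (register_step_post_hook); this is a simplified stand-in for the internal changes described above, not the actual profiler code:

    import torch

    model = torch.nn.Linear(10, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    iteration = 0

    def count_step(optimizer, args, kwargs):
        # called after every optimizer.step(); a profiler can use this signal to
        # delimit iterations and capture a fixed number of them in a trace
        global iteration
        iteration += 1

    handle = optimizer.register_step_post_hook(count_step)

    for _ in range(3):
        loss = model(torch.randn(4, 10)).sum()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    handle.remove()
    print(iteration)  # 3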

    Holistic Trace Analysis (HTA)

    + +

    ML researchers and engineers often struggle to computationally scale up their models because they are unaware of the performance bottlenecks in their workloads. Large distributed training jobs can generate thousands of traces, containing far too much data for a human to inspect. This is where Holistic Trace Analysis comes in. HTA is an open source library for performance analysis: it takes PyTorch Profiler traces as input and up-levels the performance information contained in them. Its goal is to help researchers and engineers achieve the best performance from the hardware stack. To aid performance debugging, HTA provides the following features (partial list):

    + +
      +
    • Temporal Breakdown: Breakdown of GPU time in terms of time spent in computation, communication, memory events, and idle time on a single node and across all ranks.
    • +
    • Idle Time Breakdown: Breakdown of GPU idle time into waiting for the host, waiting for another kernel or attributed to an unknown cause.
    • +
    • Kernel Breakdown: Find kernels with the longest duration on each rank.
    • +
    • Kernel Duration Distribution: Distribution of average time taken by longest kernels across different ranks.
    • +
    • Communication Computation Overlap: Calculate the percentage of time when communication overlaps computation.
    • +
    + +
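    As a concrete sketch of driving these analyses from Python (the trace directory is a placeholder, and the method names follow the HTA documentation, so treat the exact signatures as assumptions):

    from hta.trace_analysis import TraceAnalysis

    # point HTA at a directory of PyTorch Profiler traces collected from a training job
    analyzer = TraceAnalysis(trace_dir="/path/to/trace/folder")

    temporal_breakdown = analyzer.get_temporal_breakdown()   # compute / communication / memory / idle time
    idle_time = analyzer.get_idle_time_breakdown()           # why the GPU was idle
    kernel_breakdown = analyzer.get_gpu_kernel_breakdown()   # longest kernels per rank
    overlap = analyzer.get_comm_comp_overlap()               # communication-computation overlap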

    We invite you to check out these Jupyter notebooks to see what HTA can do for you. If you are a first time user we recommend starting with the trace_analysis_demo notebook.

    + +

    To summarize, Dynolog allows us to collect PyTorch Profiler traces on-the-fly in a scalable manner. Furthermore, by leveraging HTA we can automate performance analysis and identify bottlenecks. At Meta, we use the Dynolog, PyTorch Profiler and HTA toolchain to accelerate our performance optimization workflows.

    + +

    Demo

    + +

    We share a screencast showcasing trace collection without any user side code instrumentation for a toy PyTorch program. The demo runs in a docker container and the trace collection is triggered using Dynolog. HTA can be used to subsequently analyze the collected trace.

    + + + +

    FAQs

    + +

    Q. What else can dyno gputrace do for me?

    + +

    The dyno gputrace command supports several custom PyTorch Profiler options:

    + +
      +
    • capturing python stacks
    • +
    • memory profiling
    • +
    • record input shapes
    • +
    + +

    Please run dyno gputrace --help for all the options.

    + +

    Q. Does Dynolog collect hardware performance metrics?

    + +

    Dynolog can also be used for always-on monitoring:

    + +
      +
    • It incorporates out-of-box GPU performance monitoring for NVIDIA GPUs using DCGM.
    • +
    • Dynolog provides basic Linux kernel performance metrics including CPU, network and IO resource usage.
    • +
    • Dynolog manages hardware performance counters for micro-architecture-specific events related to CPU caches, TLBs, etc., on Intel and AMD CPUs.
    • +
    + +

    Q: How can I build the Docker image used in the demo?

    + +

    The dockerfile is available here. Use the command below to build the Docker image.

    + +
    docker build -f /path/to/dynolog_repo/dynolog_hta.dockerfile -t <image_name:tag> .
    +
    + +

    Q. How can I run the docker image?

    + +

    You can refer to this cheat sheet to run the Docker image.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/bringing-the-pytorch-community-together/index.html b/blog/bringing-the-pytorch-community-together/index.html new file mode 100644 index 000000000000..09f306289187 --- /dev/null +++ b/blog/bringing-the-pytorch-community-together/index.html @@ -0,0 +1,787 @@ + + + + + + + + + + + + + Bringing the PyTorch Community Together | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    January 22, 2025

    +

    + Bringing the PyTorch Community Together +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    As we step into a new year, it’s a great moment to reflect on the incredible community events that made 2024 a memorable year for the PyTorch Foundation. Global meetups, events, and conferences brought the community together to learn, connect, and grow. Here’s a quick recap of the year’s highlights and what to expect in 2025.

    + +

    PyTorch Seattle Meetup (May 23)


    + +

    We hosted a PyTorch Meetup in Seattle in May at the Meta Bellevue Office where Meta, Microsoft, and Google gave technical talks and about 60 attendees participated in discussion and networking.

    + +

    PyTorch Docathon 2024 (June 4-20)

    + +

    The PyTorch Docathon returned for its third edition, spanning over two weeks in June. This unique event focused on improving PyTorch’s documentation with contributions from community members worldwide. Documentation is the backbone of any successful open source project, and PyTorch’s Docathon fostered inclusivity and collaboration, making it easier for new users to adopt the framework and for experienced developers to maximize its potential. The 2024 Docathon resulted in more than 50 merged pull requests and was a testament to the collaborative spirit of the PyTorch community and its commitment to enhancing accessibility and usability. Watch the PyTorch Docathon Kickoff on YouTube.

    + +

    PyTorch Shanghai Meetup (August 15)


    + +

    In August, the PyTorch Shanghai Meetup brought together developers, researchers, and enthusiasts in Shanghai, China. This event served as a platform for knowledge sharing, with engaging talks and networking opportunities. Highlights from the agenda included insights into PyTorch’s latest developments, community-led presentations showcasing innovative use cases, and networking sessions fostering collaboration among attendees.

    + +

    PyTorch Conference 2024 (September 18-19)


    + +

    The PyTorch Conference in San Francisco was undoubtedly one of the year’s most significant events. This two-day gathering brought together top-tier researchers, developers, and academic communities, fostering collaboration and innovation in machine learning.

    + +

    What Made It Special


    + +
      +
    • Keynote speeches from industry leaders and PyTorch maintainers.
    • +
    • In-depth sessions covering PyTorch’s end-to-end machine learning capabilities.
    • +
    • Hands-on workshops and breakout sessions.
    • +
    • A vibrant expo area showcasing cutting-edge tools and applications.
    • +
    • Startup Showcase where early-stage founders pitched their AI startups to a panel of top venture capitalists.
    • +
    • DL Compiler Mini-Summit that took a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads.
    • +
    • Fine-Tuning Mini-Summit that covered everything from memory efficiency, parameter-efficient fine-tuning and quantization to performance at scale and reproducible evaluations.
    • +
    • Poster Session showcasing innovations in PyTorch, including model optimization, hardware integration, generative AI, quantization, and tools for enhanced performance and usability, with contributions from industry leaders.
    • +
    + +

    The conference’s focus on fostering collaboration underscored PyTorch’s role as a driving force in the open source ML community. Missed out? You can watch the PyTorch Conference 2024 Playlist to catch any sessions you might have missed.

    + +

    GPU MODE IRL Hackathon (September 21)


    + +

    PyTorch sponsored this meetup in person in San Francisco where attendees made friends, watched keynotes, hacked all day, took breaks with afternoon talks, and then hacked all night. We heard about torchao, our new quantization and sparsity library; vLLM, which deploys PyTorch models in production; llm.c; and more. Key takeaways included: the GPU MODE IRL Hackathon 1st place winner was inspired by PyTorch FlexAttention to improve CUTLASS; NCCL in Triton would help us do distributed programming with a minimal NCCL reimplementation in pure Python; and libtorch-free PyTorch binaries dramatically reduce binary sizes for on-device deployments.

    + +

    Consumer AI Edge Hackathon (November 22-23)


    + +

    The PyTorch team served as mentors and coaches in a Hackathon in Paris, co-sponsored by Hugging Face, Scaleway, and Entrepreneur First, challenging teams to create innovative consumer (B2C) applications leveraging Hugging Face, PyTorch and other open source on-device tools and models. 120+ people across 22 teams hacked for 2 days (and nights!) building the future of AI-powered on-device solutions based on open source models and tools. Participants created innovative applications, powered by PyTorch, ExecuTorch and Hugging Face resources, such as an on-device yoga coach, a magical storytelling companion and a Kinect-like experience to mobile phones. The PyTorch team is planning similar events in other geographies in 2025 around innovative on-device AI applications.

    + +

    PyTorch Korea User Group Meetup (November 30)


    + +

    The PyTorch Korea User Group, founded in 2018, is a community dedicated to introducing PyTorch to Korean-speaking users and growing together. The group began by translating PyTorch 0.3 tutorials into Korean and has since supported PyTorch’s growth in Korea. The group focuses on three primary activities:

    + +
      +
    1. Sharing knowledge for PyTorch learning and application,
    2. +
    3. Sharing insights and experiences in the field of artificial intelligence, and
    4. +
    5. Fostering growth through online and offline networking.
    6. +
    + +

    The PyTorch Korea User Group reaches tens of thousands of Korean AI developers every month. If you’re interested in their activities, check out these links:

    + + + +

    PyTorch Korea User Group 2025 Events Overview

    + +

    The PyTorch Korea User Group has planned three major activities for the year:

    + +
      +
    1. PyTorch CoreSIG
      +Since December 2024, this weekly online event has been held every Wednesday afternoon. Led by Kim Hong-Seok, CSO of Rebellions (a PyTorch member company), it provides in-depth knowledge and experience regarding PyTorch internals. Approximately 150 Korean developers participate weekly, reflecting growing interest in PyTorch Core development in Korea.
    2. +
    3. Offline Meetup
      +These meetups provide opportunities to share insights and experiences in PyTorch and artificial intelligence, along with networking. Around 3–4 sessions are planned for this year, focusing on key topics in PyTorch and AI.
    4. +
    5. Online Community Engagement
      +This activity involves sharing and discussing various projects and papers in the AI field. For more information, visit: https://discuss.pytorch.kr.
    6. +
    + +

    Open Source AI Night at NeurIPS 2024 (December 10)

    + +

    The PyTorch Foundation co-hosted a social event at NeurIPS along with The Fin AI and Open Finance Foundation that featured engaging discussions on open source AI and applications in finance.

    + +

    PyTorch Webinars


    + +

    Throughout 2024, PyTorch hosted the following virtual webinars:

    + +

    Expert Exchanges:

    + + + +

    Summer Series:

    + + + +

    Release Live Q&As:

    + + + +

    Live Webinars:

    + + + +

    Each of these events underscored the importance of collaboration and community engagement in advancing AI research and applications. Thank you to everyone who participated, organized, and supported these events—your contributions make all the difference!

    + +
    + +

    Looking Ahead

    + +

    2024 was packed with opportunities to connect, learn, and contribute, and there will be even more ways to connect with the PyTorch community in 2025.

    + +

    Mark your calendar! The PyTorch Conference is returning to San Francisco on October 22-23, 2025. Get ready for an exciting event filled with technical deep dives, exciting announcements, insightful sessions, and enhanced opportunities for community collaboration.

    + +

    Stay tuned for more upcoming events and opportunities to get involved by subscribing to our newsletter.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/categories/index.html b/blog/categories/index.html new file mode 100644 index 000000000000..0b6f86540542 --- /dev/null +++ b/blog/categories/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

    + Click here if you are not redirected. + diff --git a/blog/celebrate-pytorch-2.0/index.html b/blog/celebrate-pytorch-2.0/index.html new file mode 100644 index 000000000000..50cfb17b0ef4 --- /dev/null +++ b/blog/celebrate-pytorch-2.0/index.html @@ -0,0 +1,740 @@ + + + + + + + + + + + + + Celebrate PyTorch 2.0 with New Performance Features for AI Developers | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Intel + +

    +

    Congratulations to the PyTorch Foundation for its release of PyTorch 2.0! In this blog, I discuss the four features for which Intel made significant contributions to PyTorch 2.0:

    + +
      +
    1. TorchInductor
    2. +
    3. GNN
    4. +
    5. INT8 Inference Optimization
    6. +
    7. oneDNN Graph API
    8. +
    + +

    We at Intel are delighted to be part of the PyTorch community and appreciate the collaboration with and feedback from our colleagues at Meta as we co-developed these features.

    + +

    Let’s get started.

    + +

    1. TorchInductor CPU FP32 Inference Optimized

    + +

    As part of the PyTorch 2.0 compilation stack, TorchInductor CPU backend optimization brings notable performance improvements via graph compilation over the PyTorch eager mode.

    + +

    The TorchInductor CPU backend is sped up by leveraging the technologies from the Intel® Extension for PyTorch for Conv/GEMM ops with post-op fusion and weight prepacking, and PyTorch ATen CPU kernels for memory-bound ops with explicit vectorization on top of OpenMP*-based thread parallelization.

    + +

    With these optimizations on top of the powerful loop fusions in TorchInductor codegen, we achieved up to a 1.7x FP32 inference performance boost over three representative deep learning benchmarks: TorchBench, HuggingFace, and timm1. Training and low-precision support are under development.

    + +

    See the Improvements

    + +

    The performance improvements on various backends are tracked on this TorchInductor CPU Performance Dashboard.

    + +

    Improve Graph Neural Network (GNN) in PyG for Inference and Training Performance on CPU

    + +

    GNN is a powerful tool to analyze graph structure data. This feature is designed to improve GNN inference and training performance on Intel® CPUs, including the new 4th Gen Intel® Xeon® Scalable processors.

    + +

    PyTorch Geometric (PyG) is a very popular library built upon PyTorch to perform GNN workflows. Currently on CPU, GNN models of PyG run slowly due to the lack of GNN-related sparse matrix multiplication operations (i.e., SpMM_reduce) and the lack of several critical kernel-level optimizations (scatter/gather, etc.) tuned for GNN compute.

    + +

    To address this, optimizations are provided for message passing between adjacent neural network nodes (a toy example of the gather/scatter pattern follows the list below):

    + +
      +
    • scatter_reduce: performance hotspot in message-passing when the edge index is stored in coordinate format (COO).
    • +
    • gather: backward computation of scatter_reduce, specially tuned for the GNN compute when the index is an expanded tensor.
    • +
    • torch.sparse.mm with reduce flag: performance hotspot in message-passing when the edge index is stored in compressed sparse row (CSR). Supported reduce flag for: sum, mean, amax, amin.
    • +
    + +
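    To make the pattern above concrete, here is a toy COO-style message-passing step written with the stock gather and scatter_reduce ops; the graph and feature sizes are made up for illustration and this is not PyG's implementation.

    import torch

    num_nodes, feat_dim = 4, 8
    edge_index = torch.tensor([[0, 1, 2, 3],          # source nodes
                               [1, 1, 3, 0]])         # destination nodes (COO)
    x = torch.randn(num_nodes, feat_dim)

    messages = x[edge_index[0]]                        # gather source features
    index = edge_index[1].unsqueeze(-1).expand_as(messages)
    out = torch.zeros(num_nodes, feat_dim).scatter_reduce(
        0, index, messages, reduce="sum", include_self=False)  # aggregate onto dst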

    End-to-end performance benchmark results for both inference and training on 3rd Gen Intel® Xeon® Scalable processors 8380 platform and on 4th Gen 8480+ platform are discussed in Accelerating PyG on Intel CPUs.

    + +

    Optimize int8 Inference with Unified Quantization Backend for x86 CPU Platforms

    + +

    The new X86 quantization backend is a combination of FBGEMM (Facebook General Matrix-Matrix Multiplication) and oneAPI Deep Neural Network Library (oneDNN) backends and replaces FBGEMM as the default quantization backend for x86 platforms. The result: better end-to-end int8 inference performance than FBGEMM.

    + +

    Users access the x86 quantization backend by default for x86 platforms, and the selection between different kernels is automatically done behind the scenes. The rules of selection are based on prior performance testing data done by Intel during feature development. Thus, the x86 backend replaces FBGEMM and may offer better performance, depending on the use case.
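    As a minimal sketch (since the x86 backend is already the default on x86 builds of PyTorch 2.0, the explicit selection below is usually unnecessary; the "x86" backend string is assumed to be accepted by your release):

    import torch
    from torch.ao.quantization import get_default_qconfig

    torch.backends.quantized.engine = "x86"   # select the unified x86 backend
    qconfig = get_default_qconfig("x86")      # qconfig tuned for that backend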

    + +

    The selection rules are:

    + +
      +
    • On platforms without VNNI (e.g., Intel® Core™ i7 processors), FBGEMM is always used.
    • +
    • On platforms with VNNI (e.g., 2nd-4th Gen Intel® Xeon® Scalable processors and future platforms): +
        +
      • For linear, FBGEMM is always used.
      • +
      • For convolution layers, FBGEMM is used for depth-wise convolution whose layers > 100; otherwise, oneDNN is used.
      • +
      +
    • +
    + +

    Note that the kernels continue to evolve, so the selection rules above are subject to change to achieve better performance. Performance metrics for throughput speed-up ratios of the unified x86 backend vs. pure FBGEMM are discussed in [RFC] Unified quantization backend for x86 CPU platforms #83888.

    + +

    Leverage oneDNN Graph API to Accelerate Inference on CPU

    + +

    oneDNN Graph API extends oneDNN with a flexible graph API to maximize the optimization opportunity for generating efficient code on Intel® AI hardware. It automatically identifies the graph partitions to be accelerated via fusion. The fusion patterns focus on fusing compute-intensive operations such as convolution, matmul, and their neighbor operations for both inference and training use cases.

    + +

    Currently, BFloat16 and Float32 datatypes are supported and only inference workloads can be optimized. BF16 is only optimized on machines with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) BF16 support.

    + +

    Few or no modifications are needed in PyTorch to support newer oneDNN Graph fusions/optimized kernels. To use oneDNN Graph, users can:

    + +
      +
    • Either use the API torch.jit.enable_onednn_fusion(True) before JIT tracing a model, OR …
    • +
    • Use its context manager, viz. with torch.jit.fuser("fuser3").
    • +
    • For accelerating BFloat16 inference, we rely on eager-mode AMP (Automatic Mixed Precision) support in PyTorch and disable JIT mode’s AMP.
    • +
    + +
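    A hedged sketch of the first option follows; model and example_input are placeholders, and the exact fusion behavior depends on your PyTorch build and hardware.

    import torch

    torch.jit.enable_onednn_fusion(True)      # opt into oneDNN Graph fusion

    model = model.eval()                      # placeholder Float32 model
    with torch.no_grad():
        traced = torch.jit.trace(model, example_input)  # example_input: placeholder
        traced = torch.jit.freeze(traced)
        traced(example_input)                 # warm-up runs let fusion kick in
        traced(example_input)
        output = traced(example_input)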

    See the PyTorch performance tuning guide.

    + +

    Next Steps

    + +

    Get the Software

    + +

    Try out PyTorch 2.0 and realize the performance benefits for yourself from these Intel-contributed features.

    + +

    We encourage you to check out Intel’s other AI Tools and Framework optimizations and learn about the open, standards-based oneAPI multiarchitecture, multivendor programming model that forms the foundation of Intel’s AI software portfolio.

    + +

    For more details about 4th Gen Intel Xeon Scalable processor, visit AI Platform where you can learn about how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines.

    + +

    PyTorch Resources

    + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/clipping-in-opacus/index.html b/blog/clipping-in-opacus/index.html new file mode 100644 index 000000000000..2038bc561033 --- /dev/null +++ b/blog/clipping-in-opacus/index.html @@ -0,0 +1,1011 @@ + + + + + + + + + + + + + Enabling Fast Gradient Clipping and Ghost Clipping in Opacus | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Enayat Ullah, Huanyu Zhang, Will Bullock, Ilya Mironov + +

    +

    Introduction and Context

    + +

    Differentially Private Stochastic Gradient Descent (DP-SGD) is the canonical method for training machine learning models with differential privacy. It involves the following two modifications to its non-private counterpart, Stochastic Gradient Descent.

    + +
      +
    1. +

      Per-sample gradient clipping: Clip gradients with respect to every sample in the mini-batch, ensuring that its norm is at most a pre-specified value, “Clipping Norm”, C, in every iteration.

      +
    2. +
    3. +

      Noise addition: Add Gaussian noise of pre-specified variance, depending on the clipping norm and privacy parameters, to the average clipped gradient, in every iteration.

      +
    4. +
    + +

    The first change, per-sample gradient clipping, introduces additional complexities since, in general, it requires instantiating per-sample gradients.

    + +

    Opacus is a PyTorch implementation of DP-SGD. Opacus addresses the above task by employing hook functions, which allows intervening on specific events, such as forward and backward passes. For more details about Opacus, we encourage readers to review the previous blog posts: DP-SGD Algorithm Explained, Efficient Per-Sample Gradient Computation in Opacus and Efficient Per-Sample Gradient Computation for More Layers in Opacus.

    + +

    While Opacus provides substantial efficiency gains compared to the naive approaches, the memory cost of instantiating per-sample gradients is significant. In particular, memory usage is proportional to the batch size times the number of trainable parameters. Consequently, memory limits Opacus to small batch sizes and/or small models, significantly restricting its range of applications.

    + +

    We introduce Fast Gradient Clipping and Ghost Clipping to Opacus, which enable developers and researchers to perform gradient clipping without instantiating the per-sample gradients. As an example, this allows for fine-tuning 7M parameters of BERT on a single 16 GB GPU with a batch size of 1024, with memory comparable to using PyTorch (without applying DP-SGD). In contrast, the previous version of Opacus supported a maximum batch size of roughly 256 for the same setting. We provide a tutorial on how to use Fast Gradient Clipping in Opacus with the aforementioned task as an example.

    + +

    Fast Gradient Clipping and Ghost Clipping

    + +

    The key idea behind these techniques is based on the following observation: suppose per-sample gradient norms are known, then gradient clipping can be achieved by backpropagation on a re-weighted loss function $ \bar{L} $. This loss function is defined as $ \bar{L} = \sum_{i} R_{i} L_{i} $, where $ R_i = \min\left(\frac{C}{C_i}, 1\right) $ are the clipping coefficients computed from the per-sample gradient norms $ {C_i} $ and $ {L_i} $ are per-sample losses.
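    The following toy snippet illustrates just the re-weighting identity; it is not Opacus internals, and the per-sample norms are faked, since obtaining them efficiently is exactly what the next sections are about.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    torch.manual_seed(0)
    model = nn.Linear(4, 2)
    x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))

    per_sample_losses = F.cross_entropy(model(x), y, reduction="none")  # L_i

    C = 1.0                                    # clipping norm
    per_sample_norms = torch.rand(8) + 0.1     # stand-ins for the true C_i
    R = (C / per_sample_norms).clamp(max=1.0)  # clipping coefficients R_i

    (R.detach() * per_sample_losses).sum().backward()
    # With the true norms, model.weight.grad would now hold the sum of the
    # clipped per-sample gradients, obtained with one standard batched backward.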

    + +

    The above idea may seem circular at first glance, as it appears to require instantiating per-sample gradients in order to calculate per-sample gradient norms. However, for certain widely-used components of neural network architectures, such as fully connected/linear layers, it is indeed possible to obtain per-sample gradient norms in a single backpropagation pass without the need for per-sample gradients. This suggests a workflow that involves two backpropagation passes: the first to compute per-sample gradient norms, and the second to compute the aggregated (not per-sample) clipped gradient. The second backpropagation is simply the standard batched backpropagation.

    + +

    backpropagation diagram


    + +

    Figure 1: Comparison between vanilla Opacus (top left), Fast Gradient Clipping (top right), and Ghost clipping (bottom). We marked in red gradient instantiations that become memory bottlenecks. For vanilla Opacus, it has to instantiate the per-sample gradients. Fast Gradient Clipping instantiates per-sample gradients for each layer to compute its norm, which is immediately released once the backward pass moves on to the next layer. Ghost Clipping works directly from per-sample activation gradients and per-sample activations, and avoids the need for gradient instantiation.

    + +

    Fast Gradient Clipping
    +In Fast Gradient Clipping, the per-sample gradient norm is calculated in three steps:

    + +
      +
    1. For each layer, the per-sample gradient is instantiated and its norm is calculated.
    2. +
    3. The per-sample gradient is then immediately discarded.
    4. +
    5. The (squared) per-sample gradient norms of each layer are summed up to obtain the overall (squared) per-sample gradient norm.
    6. +
    + +

    Ghost Clipping
    +Extending the approach of Fast Gradient Clipping, Ghost Clipping uses the fact that for linear layers1, per-sample gradient norms can be calculated just from activation gradients and activations. In particular, let backprops and activations be per-sample activation gradients and activations, of dimensions batch_size ✕ output_width and batch_size ✕ input_width, respectively. The per-sample gradient is the outer product of the two, which takes O(batch_size ✕ input_width ✕ output_width) time and space.

    + +

    The ghost clipping trick instead calculates the (squared) norm of backprops and activations, sample-wise, and takes their product, which gives the (squared) norm of the gradient. This takes O(batch-size ✕ (input_width + output_width)) time and takes O(batch-size) space to store. Since per-sample activation and per-sample activation gradients are already stored, additional memory is needed only for storing the norms.
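    A small numerical check of this identity for a bias-free linear layer (the dimensions below are arbitrary):

    import torch

    batch, d_in, d_out = 4, 5, 3
    activations = torch.randn(batch, d_in)     # per-sample inputs a_i
    backprops = torch.randn(batch, d_out)      # per-sample output gradients b_i

    # Direct route: materialize per-sample weight gradients g_i = b_i a_i^T
    per_sample_grads = torch.einsum("bo,bi->boi", backprops, activations)
    norms_direct = per_sample_grads.flatten(1).pow(2).sum(1)

    # Ghost route: ||g_i||^2 = ||a_i||^2 * ||b_i||^2, no gradient instantiation
    norms_ghost = activations.pow(2).sum(1) * backprops.pow(2).sum(1)

    assert torch.allclose(norms_direct, norms_ghost, atol=1e-5)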

    + +

    Relationship between Fast Gradient Clipping and Ghost Clipping

    + +
      +
    1. Fast Gradient Clipping and Ghost Clipping are complementary techniques. Fast Gradient Clipping can be applied to any type of layer, while Ghost Clipping is a strictly better technique for supported layers.
    2. +
    3. Our implementation automatically switches to Fast Gradient Clipping when the layer is not supported by Ghost Clipping.
    4. +
    + +

    How to use Fast Gradient Clipping in Opacus

    + +

    The training loop is identical to that of the standard PyTorch loop. As in Opacus before, we use the PrivacyEngine(), which “sanitizes” the model and optimizer. To enable Ghost Clipping, the argument grad_sample_mode="ghost" is used. Additionally, make_private() takes the loss criterion as an extra input and sanitizes it. This allows us to hide the two backward passes and the loss rescaling in between in loss.backward().

    + +
    import torch.nn as nn
    +from opacus import PrivacyEngine
    +
    +criterion = nn.CrossEntropyLoss() # example loss function
    +
    +privacy_engine = PrivacyEngine()
    +model_gc, optimizer_gc, criterion_gc, train_loader, = privacy_engine.make_private(
    +        module=model,
    +        optimizer=optimizer,
    +        data_loader=train_loader,
    +        noise_multiplier=noise_multiplier,
    +        max_grad_norm=max_grad_norm,
    +        criterion=criterion,
    +        grad_sample_mode="ghost",
    +)
    +
    +# The training loop below is identical to that of PyTorch
    +
    +for input_data, target_data in train_loader:
    +    output_gc = model_gc(input_data) # Forward pass
    +    optimizer_gc.zero_grad()
    +    loss = criterion_gc(output_gc, target_data)
    +    loss.backward()
    +    optimizer_gc.step()  # Add noise and update the model
    +
    + +

    Internally, before the first pass, we enable the hooks, which allows us to capture layer-wise values corresponding to forward and backward calls. They are used to compute the per-sample gradient norms. We then compute the clipping coefficients, rescale the loss function and disable hooks, which lets us use the standard PyTorch backward pass.

    + +

    Memory Complexity Analysis

    + +

    Consider a multi-layer neural network with the following properties:

    + +

    L: Number of layers
    +d: Maximum layer width
    +B: Batch size
    +K: Number of non-supported/non-linear layers

    + +

    The memory overhead of DP-SGD with Ghost Clipping compared to plain (PyTorch) SGD is an additive O(BL), required to store the per-sample gradient norms for all layers. Further, if there is a non-supported layer (if K≥1), then there is an additional O(Bd²) memory to instantiate the gradient of that layer.
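    A quick back-of-the-envelope example with made-up sizes shows why the O(BL) term is negligible while a single unsupported layer can dominate:

    B, L, d = 1024, 12, 1024                 # illustrative sizes, fp32 (4 bytes)
    norm_overhead = B * L * 4                # O(BL): per-sample norms
    fallback_overhead = B * d * d * 4        # O(Bd^2): one unsupported d x d layer

    print(norm_overhead / 2**10, "KiB")      # 48.0 KiB
    print(fallback_overhead / 2**30, "GiB")  # 4.0 GiB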

    + +

    Memory Benchmarking

    + +

    We provide results on the memory usage for a variety of settings.

    + +

    Fine-Tuning BERT

    + +

    We consider the problem of privately fine-tuning the last three layers of BERT for a text classification task. The base model has over 100M parameters, of which we fine-tune the last three layers, BertEncoder, BertPooler, and Classifier, comprising roughly 7.6M parameters. The experiments are run on a P100 GPU with 16 GB of memory.

    + +

    The following table reports the maximum memory and time taken per iteration for the various methods:

    Peak memory / time per iteration:

    Method      | B = 32            | B = 128          | B = 512         | B = 1024        | B = 2048
    PyTorch SGD | 236 MB / 0.15 s   | 1.04 GB / 0.55 s | 5.27 GB / 2.1 s | 12.7 GB / 4.2 s | OOM
    DP-SGD      | 1,142 MB / 0.21 s | 4.55 GB / 0.68 s | OOM             | OOM             | OOM
    FGC DP-SGD  | 908 MB / 0.21 s   | 3.6 GB / 0.75 s  | OOM             | OOM             | OOM
    GC DP-SGD   | 362 MB / 0.21 s   | 1.32 GB / 0.67 s | 5.27 GB / 2.5 s | 12.7 GB / 5 s   | OOM

    In terms of peak memory footprint, DP-SGD > FGC DP-SGD ≫ GC DP-SGD ≈ PyTorch SGD. Further, the runtimes are similar because most of the parameters are frozen and the forward pass takes up most of the time.

    + +

    Synthetic Setup: Memory Profiling

    + +

    We consider the following setup to profile the memory used by PyTorch SGD, Vanilla DP-SGD and Ghost Clipping, GC DP-SGD.

    + +
      +
    • 2-layer fully connected neural network +
        +
      • Input: 5120
      • +
      • Hidden: 2560
      • +
      • Output: 1280
      • +
      • Total number of model parameters = 15.6M
      • +
      • Model size = 62.5 MB
      • +
      +
    • +
    • Batch size, different values, as seen in the table below.
    • +
    + +

    The table below summarizes the max memory increase (in MB) broken down by stages of the training loop for each of the methods.

    Batch Size | Method         | Model to GPU | Forward | First Backward | Second Backward | Optimizer Step
    32         | PyTorch SGD    | 62.5         | 0.5     | 62.5           | N/A             | 0
    32         | Vanilla DP-SGD | 62.5         | 0.47    | 3,663          | N/A             | 162.5
    32         | GC DP-SGD      | 62.5         | 0.47    | 63.13          | 50              | 125
    2¹⁷        | PyTorch SGD    | 62.5         | 1920    | 1932.5         | N/A             | 0
    2¹⁷        | Vanilla DP-SGD | OOM          |         |                |                 |
    2¹⁷        | GC DP-SGD      | 62.5         | 1920    | 2625           | 1932.5          | 125

    Industry use case

    + +

    We tested Ghost Clipping DP-SGD on an internal Meta use case, consisting of a model of size roughly 100B with 40M trainable parameters. Our initial results show that Ghost Clipping SGD reduces 95% memory of vanilla DP-SGD, and achieves comparable memory usage to PyTorch SGD.

    + +

    Conclusion

    + +

    In this post, we describe implementations of Fast Gradient Clipping and Ghost Clipping in Opacus that enable memory-efficient training of machine learning models with differential privacy. Currently, the Ghost Clipping implementation only applies to linear layers, but, as outlined in part 3 of the series, it can be extended to “generalized” linear layers such as convolutions and multi-head attention. The current techniques require two explicit backpropagation steps, which increases runtime. We will explore developments on top of Ghost Clipping such as the Book-Keeping algorithm for mitigation.

    + +

    To learn more about Opacus, visit opacus.ai and github.com/pytorch/opacus.

    + +

    Acknowledgements

    + +

    We thank Iden Kalemaj, Darren Liu, Karthik Prasad, Hao Shi, Igor Shilov, Davide Testuggine, Eli Uriegas, Haicheng Wang, and Richard Zou for valuable feedback and suggestions.

    + +
    +
      +
    1. +

      There are ways to extend Ghost Clipping to non-linear layers. 

      +
    2. +
    +
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/compiling-numpy-code/index.html b/blog/compiling-numpy-code/index.html new file mode 100644 index 000000000000..08c8ba491b55 --- /dev/null +++ b/blog/compiling-numpy-code/index.html @@ -0,0 +1,913 @@ + + + + + + + + + + + + + Compiling NumPy code into C++ or CUDA via torch.compile | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Evgeni Burovski, Ralf Gommers and Mario Lezcano + +

    +

    Quansight engineers have implemented support for tracing through NumPy code via +torch.compile in PyTorch 2.1. This feature leverages PyTorch’s compiler to +generate efficient fused vectorized code without having to modify your original +NumPy code. Even more, it also allows for executing NumPy code on CUDA +just by running it through torch.compile under torch.device("cuda")!

    + +

    In this post, we go over how to use this feature and give a few tips and tricks +to make the most out of it.

    + +

    Compiling NumPy code into Parallel C++

    + +

    We take as our running example one step in a K-Means algorithm. +This piece of code is borrowed from this NumPy book

    + +
    import numpy as np
    +
    +def kmeans(X, means):
    +    return np.argmin(np.linalg.norm(X - means[:, None], axis=2), axis=0)
    +
    + +

    We create a synthetic dataset with 20M random 2-D points. We can see that, +given that the means are chosen appropriately, the function returns the correct +cluster for all of them

    + +
    npts = 10_000_000
    +X = np.repeat([[5, 5], [10, 10]], [npts, npts], axis=0)
    +X = X + np.random.randn(*X.shape)  # 2 distinct "blobs"
    +means = np.array([[5, 5], [10, 10]])
    +np_pred = kmeans(X, means)
    +
    + +

    Benchmarking this function gives us a baseline of 1.26s on an AMD 3970X CPU.

    + +

    Compiling this function is now as easy as wrapping it with torch.compile and +executing it with the example inputs

    + +
    import torch
    +
    +compiled_fn = torch.compile(kmeans)
    +compiled_pred = compiled_fn(X, means)
    +assert np.allclose(np_pred, compiled_pred)
    +
    + +

    The compiled function yields a 9x speed-up when running it on 1 core. Even +better, as opposed to NumPy, our generated code does take advantage of all the +cores in a processor. As such, when we run it on 32 cores, we get a 57x +speed-up. Note that PyTorch always uses all the available cores unless +explicitly restricted, so this is the default behavior you get when using +torch.compile.

    + +

    We may inspect the generated C++ code by running the script with the +environment variable TORCH_LOGS=output_code. When doing so, we can see that +torch.compile was able to compile the broadcasting and the two reductions +into just one for-loop, and parallelize it using OpenMP

    + +
    extern "C" void kernel(const double* in_ptr0, const long* in_ptr1, long* out_ptr0) {
    +    #pragma omp parallel num_threads(32)
    +    #pragma omp for
    +    for(long i0=0L; i0<20000000L; i0+=1L) {
    +        auto tmp0 = in_ptr0[2L*i0];
    +        auto tmp1 = in_ptr1[0L];
    +        auto tmp5 = in_ptr0[1L + (2L*i0)];
    +        auto tmp6 = in_ptr1[1L];
    +        // Rest of the kernel omitted for brevity
    +
    + +

    Compiling NumPy code into CUDA

    + +

    Compiling our code so that it runs on CUDA is as simple as setting the +default device to be CUDA

    + +
    with torch.device("cuda"):
    +    cuda_pred = compiled_fn(X, means)
    +assert np.allclose(np_pred, cuda_pred)
    +
    + +

    By inspecting the generated code via TORCH_LOGS=output_code, we see that, +rather than generating CUDA code directly, torch.compile generates rather +readable triton code

    + +
    def triton_(in_ptr0, in_ptr1, out_ptr0, XBLOCK : tl.constexpr):
    +    xnumel = 20000000
    +    xoffset = tl.program_id(0) * XBLOCK
    +    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    +    xmask = xindex < xnumel
    +    x0 = xindex
    +    tmp0 = tl.load(in_ptr0 + (2*x0), xmask)
    +    tmp1 = tl.load(in_ptr1 + (0))
    +    // Rest of the kernel omitted for brevity
    +
    + +

    Running this small snippet on an RTX 2060 gives an 8x speed-up over the +original NumPy code. This is something, but it is not particularly impressive, +given the speed-ups we have seen on CPU. Let’s have a look into how to squeeze +the most out of our GPU via a couple minor changes.

    + +

    float64 vs float32. Many GPUs, in particular consumer-grade ones, are +rather sluggish when running operations on float64. For this reason, changing +the data generation to float32 makes the original NumPy code just a bit +faster, about 9%, but our CUDA code gets 40% faster, yielding an 11x +speed-up over the plain NumPy code.

    + +

    torch.compile, by default, respects the NumPy semantics, and as such, it uses +np.float64 as its default dtype for all its creation ops. As discussed, this +can hinder performance, so it is possible to change this default by setting

    + +
    from torch._dynamo import config
    +config.numpy_default_float = "float32"
    +
    + +

    CPU <> CUDA copies. An 11x speed-up is good, but it is not even close to +the CPU numbers. This is caused by a small transformation that torch.compile +does behind the scenes. The code above takes NumPy arrays and returns NumPy +arrays. All of these arrays are on CPU, but the computations are performed on +the GPU. This means that every time the function is called, torch.compile has +to copy all these arrays from CPU to the GPU, and then copy the result back to +CPU to preserve the original semantics. There is no native solution to this +issue in NumPy, as NumPy does not have the notion of a device. That being +said, we can work around it by creating a wrapper to this function so that it +accepts PyTorch tensors and returns PyTorch tensors.

    + +
    @torch.compile
    +def tensor_fn(X, means):
    +    X, means = X.numpy(), means.numpy()
    +    ret = kmeans(X, means)
    +    return torch.from_numpy(ret)
    +
    +def cuda_fn(X, means):
    +    with torch.device("cuda"):
    +        return tensor_fn(X, means)
    +
    + +

    This function now takes tensors in CUDA memory and returns tensors in CUDA +memory, but the function itself is written in NumPy! torch.compile uses the +numpy() and the from_numpy() calls as hints, and optimizes them away, and +internally it simply works with PyTorch tensors without moving the memory at +all. When we keep the tensors in CUDA and perform the computations in +float32, we see a 200x speed-up over the initial NumPy implementation on +float32 arrays.

    + +

    Mixing NumPy and PyTorch. In this example, we had to write a small adaptor +to convert tensors to ndarrays and then back to tensors. In programs that mix +PyTorch and NumPy converting a tensor into an ndarray is often implemented as +x.detach().cpu().numpy(), or simply x.numpy(force=True). Since when running +under torch.compile we can run NumPy code in CUDA, we can implement this +conversion pattern as call to x.numpy(), as we did above. Doing so and +running the resulting code under device("cuda") will generate efficient CUDA +code from original NumPy calls without copying the data from CUDA to CPU at +all. Note that the resulting code does not run without torch.compile. For it +to run in eager mode one would need to rollback to x.numpy(force=True).

    + +

    Further Speed-up tricks

    + +

    General advice. The CUDA code we have shown is already quite efficient, but +it is true that the running example is rather short. When dealing with larger +programs, we may need to tweak parts of it to make it more efficient. A good +place to start is the multiple tutorials and FAQs for torch.compile. +This showcases a number of ways to inspect the tracing process, and how to +identify problematic code that may cause slowdowns.

    + +

    Advice when compiling NumPy code. NumPy, even if rather similar to PyTorch, +is often used very differently. It is rather common to perform computations in +NumPy and then do an if/else depending on values within the array, or perform +operations in-place, perhaps via boolean masks. These constructions, while +supported by torch.compile, hamper its performance. Changes like writing the +code in a branchless way to avoid graph breaks, or avoiding in-place ops can go +a long way.
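    For instance, a data-dependent Python branch like the made-up one below forces a graph break, while the equivalent np.where formulation can be traced end to end (toy functions, not from the running example):

    import numpy as np
    import torch

    def normalize_branchy(x):
        if x.max() > 1.0:        # Python-level decision on array data: graph break
            return x / x.max()
        return x

    def normalize_branchless(x):
        m = x.max()
        return np.where(m > 1.0, x / m, x)   # stays inside the traced graph

    compiled = torch.compile(normalize_branchless)
    out = compiled(np.random.randn(1000) * 3)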

    + +

    To write fast NumPy code, it is best to avoid loops, but sometimes they are +unavoidable. When tracing through a loop, torch.compile will try to fully +unroll it. This is sometimes desirable, but sometimes it may not even be +possible, like when we have a dynamic stopping condition, like in a while loop. +In these cases, it may be best to just compile the body of the loop, perhaps a +few iterations at a time (loop unrolling).
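    A sketch of that last suggestion, using a made-up fixed-point iteration whose stopping condition stays in eager Python while the body is compiled:

    import numpy as np
    import torch

    @torch.compile
    def body(state):
        return 0.5 * state + 1.0             # one iteration (fixed point at 2.0)

    state = np.zeros(1_000_000)
    while np.abs(state - 2.0).max() > 1e-6:  # dynamic condition, evaluated eagerly
        state = body(state)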

    + +

    Debugging NumPy code. Debugging is rather tricky when a compiler is +involved. To figure out whether an error you are hitting is a torch.compile +error, or an error from the program, you can execute your NumPy program without +torch.compile by replacing the NumPy import with import torch._numpy as np. +This should just be used for debugging purposes and is in no way a +replacement for the PyTorch API, as it is much slower and, as a private API, +may change without notice. See also this FAQ for other tricks.

    + +

    Differences between NumPy and torch.compile NumPy

    + +

    NumPy scalars. NumPy returns NumPy scalars in almost any case where PyTorch +would return a 0-D tensor (e.g. from np.sum). Under torch.compile, NumPy +scalars are treated as 0-D arrays. This is just fine in most cases. The only +case when their behavior diverges is when NumPy scalars are implicitly used as +Python scalars. For example,

    + +
    >>> np.asarray(2) * [1, 2, 3]  # 0-D array is an array-like
    +array([2, 4, 6])
    +>>> u = np.int32(2)
    +>>> u * [1, 2, 3]              # scalar decays into a Python int
    +[1, 2, 3, 1, 2, 3]
    +>>> torch.compile(lambda: u * [1, 2, 3])()
    +array([2, 4, 6])               # acts as a 0-D array, not as a scalar ?!?!
    +
    + +

    If we compile the first two lines, we see that torch.compile treats u as a +0-D array. To recover the eager semantics, we just need to make the casting +explicit

    + +
    >>> torch.compile(lambda: int(u) * [1, 2, 3])()
    +[1, 2, 3, 1, 2, 3]
    +
    + +

    Type promotion and versioning. NumPy’s type promotion rules may be, at +times, a bit surprising

    + +
    >>> np.zeros(1, dtype=np.int8) + 127
    +array([127], dtype=int8)
    +>>> np.zeros(1, dtype=np.int8) + 128
    +array([128], dtype=int16)
    +
    + +

    NumPy 2.0 is changing these rules to follow others that are closer to those of PyTorch. The relevant technical document is NEP 50. +torch.compile went ahead and implemented NEP 50 rather than the about-to-be-deprecated rules.

    + +

    In general, NumPy within torch.compile follows NumPy 2.0 pre-release.

    + +

    Beyond NumPy: SciPy and scikit-learn

    + +

    In parallel to this effort of making torch.compile understand NumPy code, +other Quansight engineers have designed and proposed a way to support PyTorch +tensors within scikit-learn and SciPy. This was received enthusiastically by +other maintainers from these libraries, as it was shown that using PyTorch as a +backend would often yield considerable speed-ups. Both projects have now merged +initial support for PyTorch tensors across a number of APIs and submodules.

    + +

    This sets the stepping stone to move towards a future where PyTorch tensors can +be used within other libraries in the Python data ecosystem. Even more, this +will enable running these other libraries on GPUs and even compiling code +mixing these libraries and PyTorch, similar to what we have been discussed in +this post.

    + +

    If you want to learn more about this effort, how to use it, or how to help +moving it forward, see this other blogpost.

    + +

    Conclusion

    + +

    PyTorch has committed since its inception to be a framework compatible with the +rest of the Python ecosystem. Enabling compiling NumPy programs, and +establishing the tools necessary to do the same for other prominent libraries +are two more steps in this direction. Quansight and Meta continue working hand +on hand, improving the compatibility between PyTorch and the rest of the +ecosystem.

    + +

    From Quansight, we would like to thank Mengwei, Voz, and Ed for their +invaluable help in integrating our work with torch.compile. We would also +like to thank Meta for funding this project as well as previous work on +improving NumPy compatibility within PyTorch, and the project that led to +supporting PyTorch within scikit-learn and SciPy. These are giant leaps towards +consolidating PyTorch as the framework of choice within the open source Python +data ecosystem.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/compromised-nightly-dependency/index.html b/blog/compromised-nightly-dependency/index.html new file mode 100644 index 000000000000..da2e21fc1756 --- /dev/null +++ b/blog/compromised-nightly-dependency/index.html @@ -0,0 +1,709 @@ + + + + + + + + + + + + + Compromised PyTorch-nightly dependency chain between December 25th and December 30th, 2022. | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + The PyTorch Team + +

    +

    If you installed PyTorch-nightly on Linux via pip between December 25, 2022 and December 30, 2022, please uninstall it and torchtriton immediately, and use the latest nightly binaries (newer than Dec 30th 2022).

    + +
    $ pip3 uninstall -y torch torchvision torchaudio torchtriton
    +$ pip3 cache purge
    +
    + +

    PyTorch-nightly Linux packages installed via pip during that time installed a dependency, torchtriton, which was compromised on the Python Package Index (PyPI) code repository and ran a malicious binary. This is what is known as a supply chain attack and directly affects dependencies for packages that are hosted on public package indices.

    + +

    NOTE: Users of the PyTorch stable packages are not affected by this issue.

    + +

    How to check if your Python environment is affected

    + +

    The following command searches for the malicious binary in the torchtriton package (PYTHON_SITE_PACKAGES/triton/runtime/triton) and prints out whether your current Python environment is affected or not.

    + +
    python3 -c "import pathlib;import importlib.util;s=importlib.util.find_spec('triton'); affected=any(x.name == 'triton' for x in (pathlib.Path(s.submodule_search_locations[0] if s is not None else '/' ) / 'runtime').glob('*'));print('You are {}affected'.format('' if affected else 'not '))"
    +
    + +

    The malicious binary is executed when the triton package is imported, which requires explicit code to do and is not PyTorch’s default behavior.

    + +

    The Background

    + +

    At around 4:40pm GMT on December 30 (Friday), we learned about a malicious dependency package (torchtriton) that was uploaded to the Python Package Index (PyPI) code repository with the same package name as the one we ship on the PyTorch nightly package index. Since the PyPI index takes precedence, this malicious package was being installed instead of the version from our official repository. This design enables somebody to register a package by the same name as one that exists in a third party index, and pip will install their version by default.

    + +

    This malicious package has the same name torchtriton but added in code that uploads sensitive data from the machine.

    + +

    What we know

    + +

    torchtriton on PyPI contains a malicious triton binary which is installed at PYTHON_SITE_PACKAGES/triton/runtime/triton. Its SHA256 hash is listed below.

    + +

    SHA256(triton)= 2385b29489cd9e35f92c072780f903ae2e517ed422eae67246ae50a5cc738a0e

    + +

    The binary’s main function does the following:

    + +
      +
    • Get system information +
        +
      • nameservers from /etc/resolv.conf
      • +
      • hostname from gethostname()
      • +
      • current username from getlogin()
      • +
      • current working directory name from getcwd()
      • +
      • environment variables
      • +
      +
    • +
    • Read the following files +
        +
      • /etc/hosts
      • +
      • /etc/passwd
      • +
      • The first 1,000 files in $HOME/*
      • +
      • $HOME/.gitconfig
      • +
      • $HOME/.ssh/*
      • +
      +
    • +
    • Upload all of this information, including file contents, via encrypted DNS queries to the domain *.h4ck[.]cfd, using the DNS server wheezy[.]io
    • +
    + +

    The binary’s file upload functionality is limited to files less than 99,999 bytes in size. It also uploads only the first 1,000 files in $HOME (but all files < 99,999 bytes in the .ssh directory).

    + +

    Steps taken towards mitigation

    + +
      +
    • torchtriton has been removed as a dependency for our nightly packages and replaced with pytorch-triton (pytorch/pytorch#91539) and a dummy package registered on PyPI (so that this issue doesn’t repeat)
    • +
    • All nightly packages that depend on torchtriton have been removed from our package indices at https://download.pytorch.org until further notice
    • +
    • We have reached out to the PyPI security team to get proper ownership of the torchtriton package on PyPI and to delete the malicious version
    • +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/computational-graphs-constructed-in-pytorch/index.html b/blog/computational-graphs-constructed-in-pytorch/index.html new file mode 100644 index 000000000000..1a5e444fa20c --- /dev/null +++ b/blog/computational-graphs-constructed-in-pytorch/index.html @@ -0,0 +1,1095 @@ + + + + + + + + + + + + + How Computational Graphs are Constructed in PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Preferred Networks + +

    +

    In the previous post we went over the theoretical foundations of automatic differentiation and reviewed the implementation in PyTorch. In this post, we will be showing the parts of PyTorch involved in creating the graph and executing it. In order to understand the following contents, please read @ezyang’s wonderful blog post about PyTorch internals.

    + +

    Autograd components

    + +

    First of all, let’s look at where the different components of autograd live:

    + +

    tools/autograd: Here we can find the definition of the derivatives as we saw in the previous post derivatives.yaml, several python scripts and a folder called templates. These scripts and the templates are used at building time to generate the C++ code for the derivatives as specified in the yaml file. Also, the scripts here generate wrappers for the regular ATen functions so that the computational graph can be constructed.

    + +

    torch/autograd: This folder is where the autograd components that can be used directly from python are located. In function.py we find the actual definition of torch.autograd.Function, a class used by users to write their own differentiable functions in python as per the documentation. functional.py holds components for functionally computing the Jacobian-vector product, Hessian, and other gradient-related computations of a given function. The rest of the files have additional components such as gradient checkers, anomaly detection, and the autograd profiler.

    + +

    torch/csrc/autograd: This is where the graph creation and execution-related code lives. +All this code is written in C++, since it is a critical part that is required to be extremely performant. Here we have several files that implement the engine, metadata storage, and all the needed components. Alongside this, we have several files whose names start with python_, and their main responsibility is to allow python objects to be used in the autograd engine.

    + +

    Graph Creation

    + +

    Previously, we described the creation of a computational graph. Now, we will see how PyTorch creates these graphs with references to the actual codebase.

    + +

    + +
    +Figure 1: Example of an augmented computational graph +

    + +

    It all starts in our python code, when we request a tensor to require the gradient.

    + +
    >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
    +
    + +

    When the required_grad flag is set in tensor creation, c10 will allocate an AutogradMeta object that is used to hold the graph information.

    + +
    
    +void TensorImpl::set_requires_grad(bool requires_grad) {
    +  ...
    +  if (!autograd_meta_)
    +    autograd_meta_ = impl::GetAutogradMetaFactory()->make();
    +    autograd_meta_->set_requires_grad(requires_grad, this);
    +}
    +
    + +

    The AutogradMeta object is defined in torch/csrc/autograd/variable.h as follows:

    + +
    
    +struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface {
    +  std::string name_;
    +
    +  Variable grad_;
    +  std::shared_ptr<Node> grad_fn_;
    +  std::weak_ptr<Node> grad_accumulator_;
    +  // other fields and methods
    +  ...
    +};
    +
    + +

    The most important fields in this structure are the computed gradient in grad_ and a pointer to the function grad_fn that will be called by the engine to produce the actual gradient. Also, there is a gradient accumulator object that is used to add together all the different gradients where this tensor is involved as we will see in the graph execution.

    + +

    Graphs, Nodes and Edges.

    + +

    Now, when we call a differentiable function that takes this tensor as an argument, the associated metadata will be populated. Let’s suppose that we call a regular torch function that is implemented in ATen. Let it be the multiplication as in our previous blog post example. The resulting tensor has a field called grad_fn that is essentially a pointer to the function that will be used to compute the gradient of that operation.

    + +
    >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
    +>>> v = x[0] * x[1]
    +>>> v
    +tensor(0.3750, grad_fn=<MulBackward0>)
    +
    + +

    Here we see that the tensor’s grad_fn is a MulBackward0 object. This function is the same one that was declared in the derivatives.yaml file, and its C++ code was generated automatically by the scripts in tools/autograd. Its auto-generated source code can be seen in torch/csrc/autograd/generated/Functions.cpp.

    + +
    variable_list MulBackward0::apply(variable_list&& grads) {
    +  std::lock_guard<std::mutex> lock(mutex_);
    +
    +  IndexRangeGenerator gen;
    +  auto self_ix = gen.range(1);
    +  auto other_ix = gen.range(1);
    +  variable_list grad_inputs(gen.size());
    +  auto& grad = grads[0];
    +  auto self = self_.unpack();
    +  auto other = other_.unpack();
    +  bool any_grad_defined = any_variable_defined(grads);
    +  if (should_compute_output({ other_ix })) {
    +    auto grad_result = any_grad_defined ? (mul_tensor_backward(grad, self, other_scalar_type)) : Tensor();
    +    copy_range(grad_inputs, other_ix, grad_result);
    +  }
    +  if (should_compute_output({ self_ix })) {
    +    auto grad_result = any_grad_defined ? (mul_tensor_backward(grad, other, self_scalar_type)) : Tensor();
    +    copy_range(grad_inputs, self_ix, grad_result);
    +  }
    +  return grad_inputs;
    +}
    +
    + +

    The grad_fn objects inherit from the TraceableFunction class, a descendant of Node with just a property set to enable tracing for debugging and optimization purposes. A graph by definition has nodes and edges, so these functions are indeed the nodes of the computational graph that are linked together by using Edge objects to enable the graph traversal later on.

    + +

    The Node definition can be found in the torch/csrc/autograd/function.h file.

    + +
    struct TORCH_API Node : std::enable_shared_from_this<Node> {
    + ...
    + /// Evaluates the function on the given inputs and returns the result of the
    +  /// function call.
    +  variable_list operator()(variable_list&& inputs) {
    +  ...
    +  }
    +
    +protected:
    +  /// Performs the `Node`'s actual operation.
    +  virtual variable_list apply(variable_list&& inputs) = 0;
    +  
    +  edge_list next_edges_;
    +
    + +

    Essentially, we see that it overloads operator(), which performs the call to the actual function, and declares a pure virtual function called apply. The automatically generated functions override this apply method, as we saw in the MulBackward0 example above. Finally, the node also has a list of edges to enable graph connectivity.

    + +

    The Edge object is used to link Nodes together and its implementation is straightforward.

    + +
    struct Edge {
    +  ...
    +  /// The function this `Edge` points to.
    +  std::shared_ptr<Node> function;
    +  /// The identifier of a particular input to the function.
    +  uint32_t input_nr;
    +};
    +
    + +

    It only requires a function pointer (the actual grad_fn objects that the edges link together), and an input number that acts as an id for the edge.
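    These edges are visible from Python as grad_fn.next_functions, a tuple of (node, input_nr) pairs mirroring the C++ Edge structure (a small exploratory snippet; the exact repr may differ slightly across versions):

```python
import torch

x = torch.tensor([0.5, 0.75], requires_grad=True)
v = x[0] * x[1]

print(v.grad_fn)                 # <MulBackward0 ...>
# Each entry is a (node, input_nr) pair, mirroring the C++ Edge struct,
# e.g. ((<SelectBackward0 ...>, 0), (<SelectBackward0 ...>, 0))
print(v.grad_fn.next_functions)
```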

    + +

    Linking nodes together

    + +

    When we invoke the product operation of two tensors, we enter into the realm of autogenerated code. All the scripts that we saw in tools/autograd fill a series of templates that wrap the differentiable functions in ATen. These functions have code to construct the backward graph during the forward pass.

    + +

    The gen_variable_type.py script is in charge of writing all this wrapping code. This script is called from tools/autograd/gen_autograd.py during the PyTorch build process, and it will output the automatically generated function wrappers to torch/csrc/autograd/generated/.

    + +

    Let’s take a look at what the generated function for tensor multiplication looks like. The code has been simplified here, but the full version can be found in the torch/csrc/autograd/generated/VariableType_4.cpp file when compiling PyTorch from source.

    + +
    at::Tensor mul_Tensor(c10::DispatchKeySet ks, const at::Tensor & self, const at::Tensor & other) {
    +  ...
    +  auto _any_requires_grad = compute_requires_grad( self, other );
    +  std::shared_ptr<MulBackward0> grad_fn;
    +  if (_any_requires_grad) {
    +    // Creates the link to the actual grad_fn and links the graph for backward traversal
    +    grad_fn = std::shared_ptr<MulBackward0>(new MulBackward0(), deleteNode);
    +    grad_fn->set_next_edges(collect_next_edges( self, other ));
    +    ...
    +  }
    +  
    +  // Does the actual function call to ATen
    +  auto _tmp = ([&]() {
    +    at::AutoDispatchBelowADInplaceOrView guard;
    +    return at::redispatch::mul(ks & c10::after_autograd_keyset, self_, other_);
    +  })();
    +
    +  auto result = std::move(_tmp);
    +    if (grad_fn) {
    +       // Connects the result to the graph
    +      set_history(flatten_tensor_args( result ), grad_fn);
    +  }
    +  ...
    +  return result;
    +}
    +
    + +

    Let’s walk through the most important lines of this code. First of all, the grad_fn object is created with `grad_fn = std::shared_ptr<MulBackward0>(new MulBackward0(), deleteNode);`.

    + +

    After the grad_fn object is created, the edges used to link the nodes together are created by using the grad_fn->set_next_edges(collect_next_edges( self, other )); calls.

    + +
    struct MakeNextFunctionList : IterArgs<MakeNextFunctionList> {
    +  edge_list next_edges;
    +  using IterArgs<MakeNextFunctionList>::operator();
    +  void operator()(const Variable& variable) {
    +    if (variable.defined()) {
    +      next_edges.push_back(impl::gradient_edge(variable));
    +    } else {
    +      next_edges.emplace_back();
    +    }
    +  }
    +  void operator()(const c10::optional<Variable>& variable) {
    +    if (variable.has_value() && variable->defined()) {
    +      next_edges.push_back(impl::gradient_edge(*variable));
    +    } else {
    +      next_edges.emplace_back();
    +    }
    +  }
    +};
    +
    +template <typename... Variables>
    +edge_list collect_next_edges(Variables&&... variables) {
    +  detail::MakeNextFunctionList make;
    +  make.apply(std::forward<Variables>(variables)...);
    +  return std::move(make.next_edges);
    +}
    +
    + +

    Given an input variable (which is just a regular tensor), collect_next_edges will create an Edge object by calling impl::gradient_edge:

    + +
     Edge gradient_edge(const Variable& self) {
    +    // If grad_fn is null (as is the case for a leaf node), we instead
    +    // interpret the gradient function to be a gradient accumulator, which will
    +    // accumulate its inputs into the grad property of the variable. These
    +    // nodes get suppressed in some situations, see "suppress gradient
    +    // accumulation" below. Note that only variables which have `requires_grad =
    +    // True` can have gradient accumulators.
    +    if (const auto& gradient = self.grad_fn()) {
    +      return Edge(gradient, self.output_nr());
    +    } else {
    +      return Edge(grad_accumulator(self), 0);
    +    }
    +  }
    +
    + +

    To understand how edges work, let’s assume that an earlier executed function produced two output tensors, both with their grad_fn set; each tensor also has an output_nr property recording the order in which they were returned. When creating the edges for the current grad_fn, an Edge object is created per input variable. The edges point to the variable’s grad_fn and also track the output_nr to establish ids used when traversing the graph. If an input variable is a “leaf”, i.e. it was not produced by any differentiable function, it doesn’t have a grad_fn attribute set; a special function called a gradient accumulator is used by default, as seen in the above code snippet.
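    The leaf versus non-leaf distinction can also be checked from Python (an illustrative snippet): edges to intermediate results point at their producing grad_fn, while edges to leaf tensors point at AccumulateGrad nodes.

```python
import torch

w = torch.tensor(2.0, requires_grad=True)  # leaf tensor
x = torch.tensor(3.0, requires_grad=True)  # leaf tensor
h = w * x                                  # intermediate, has grad_fn
y = h + w

# AddBackward0 has one edge to MulBackward0 (for h) and one edge to an
# AccumulateGrad node (for the leaf tensor w).
print(y.grad_fn.next_functions)
```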

    + +

    After the edges are created, the grad_fn graph Node object that is being currently created will hold them using the set_next_edges function. This is what connects grad_fns together, producing the computational graph.

    + +
     void set_next_edges(edge_list&& next_edges) {
    +    next_edges_ = std::move(next_edges);
    +    for(const auto& next_edge : next_edges_) {
    +      update_topological_nr(next_edge);
    +    }
    +  }
    +
    + +

    Now, the forward pass of the function will execute, and after the execution set_history will connect the output tensors to the grad_fn Node.

    + +
    inline void set_history(
    +    at::Tensor& variable,
    +    const std::shared_ptr<Node>& grad_fn) {
    +  AT_ASSERT(grad_fn);
    +  if (variable.defined()) {
    +    // If the codegen triggers this, you most likely want to add your newly added function
    +    // to the DONT_REQUIRE_DERIVATIVE list in tools/autograd/gen_variable_type.py
    +    TORCH_INTERNAL_ASSERT(isDifferentiableType(variable.scalar_type()));
    +    auto output_nr =
    +        grad_fn->add_input_metadata(variable);
    +    impl::set_gradient_edge(variable, {grad_fn, output_nr});
    +  } else {
    +    grad_fn->add_input_metadata(Node::undefined_input());
    +  }
    +}
    +
    + +

    set_history calls set_gradient_edge, which just copies the grad_fn and the output_nr to the AutogradMeta object that the tensor has.

    + +
     void set_gradient_edge(const Variable& self, Edge edge) {
    +    auto* meta = materialize_autograd_meta(self);
    +    meta->grad_fn_ = std::move(edge.function);
    +    meta->output_nr_ = edge.input_nr;
    +    // For views, make sure this new grad_fn_ is not overwritten unless it is necessary
    +    // in the VariableHooks::grad_fn below.
    +    // This logic is only relevant for custom autograd Functions for which multiple
    +    // operations can happen on a given Tensor before its gradient edge is set when
    +    // exiting the custom Function.
    +    auto diff_view_meta = get_view_autograd_meta(self);
    +    if (diff_view_meta && diff_view_meta->has_bw_view()) {
    +      diff_view_meta->set_attr_version(self._version());
    +    }
    +  }
    +
    + +

    This tensor will now be the input to another function and the above steps will all be repeated. Check the animation below to see how the graph is created.

    + +

    + +
    +Figure 2: Animation that shows the graph creation +

    + +

    Registering Python Functions in the graph

    + +

    We have seen how autograd creates the graph for the functions included in ATen. However, when we define our differentiable functions in Python, they are also included in the graph!

    + +

    A Python-defined autograd function looks like the following:

    + +
    class Exp(torch.autograd.Function):
    +     @staticmethod
    +     def forward(ctx, i):
    +         result = i.exp()
    +         ctx.save_for_backward(result)
    +         return result
    +
    +     @staticmethod
    +     def backward(ctx, grad_output):
    +         result, = ctx.saved_tensors
    +         return grad_output * result
    +
    +# Call the function
    +Exp.apply(torch.tensor(0.5, requires_grad=True))
    +# Outputs: tensor(1.6487, grad_fn=<ExpBackward>)
    +
    + +

    In the above snippet, autograd detected our Python function when creating the graph. All of this is possible thanks to the Function class. Let’s take a look at what happens when we call apply.

    + +

    apply is defined in the torch._C._FunctionBase class, but this class is not present in the python source. _FunctionBase is defined in C++ by using the python C API to hook C functions together into a single python class. We are looking for a function named THPFunction_apply.

    + +
    
    +PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs)
    +{
    +  
    +  // Generates the graph node
    +  THPObjectPtr backward_cls(PyObject_GetAttrString(cls, "_backward_cls"));
    +  if (!backward_cls) return nullptr;
    +  THPObjectPtr ctx_obj(PyObject_CallFunctionObjArgs(backward_cls, nullptr));
    +  if (!ctx_obj) return nullptr;
    +  THPFunction* ctx = (THPFunction*)ctx_obj.get();
    +
    +  auto cdata = std::shared_ptr<PyNode>(new PyNode(std::move(ctx_obj)), deleteNode);
    +  ctx->cdata = cdata;
    +
    +  // Prepare inputs and allocate context (grad fn)
    +  // Unpack inputs will collect the edges
    +  auto info_pair = unpack_input<false>(inputs);
    +  UnpackedInput& unpacked_input = info_pair.first;
    +  InputFlags& input_info = info_pair.second;
    +
    +   // Initialize backward function (and ctx)
    +  bool is_executable = input_info.is_executable;
    +  cdata->set_next_edges(std::move(input_info.next_edges));
    +  ctx->needs_input_grad = input_info.needs_input_grad.release();
    +  ctx->is_variable_input = std::move(input_info.is_variable_input);
    +
    +  // Prepend ctx to input_tuple, in preparation for static method call
    +  auto num_args = PyTuple_GET_SIZE(inputs);
    +  THPObjectPtr ctx_input_tuple(PyTuple_New(num_args + 1));
    +  if (!ctx_input_tuple) return nullptr;
    +  Py_INCREF(ctx);
    +  PyTuple_SET_ITEM(ctx_input_tuple.get(), 0, (PyObject*)ctx);
    +  for (int i = 0; i < num_args; ++i) {
    +    PyObject *arg = PyTuple_GET_ITEM(unpacked_input.input_tuple.get(), i);
    +    Py_INCREF(arg);
    +    PyTuple_SET_ITEM(ctx_input_tuple.get(), i + 1, arg);
    +  }
    +
    +  // Call forward
    +  THPObjectPtr tensor_outputs;
    +  {
    +    AutoGradMode grad_mode(false);
    +    THPObjectPtr forward_fn(PyObject_GetAttrString(cls, "forward"));
    +    if (!forward_fn) return nullptr;
    +    tensor_outputs = PyObject_CallObject(forward_fn, ctx_input_tuple);
    +    if (!tensor_outputs) return nullptr;
    +  }
    +
    +  // Here is where the outputs gets the tensors tracked
    +  return process_outputs(cls, cdata, ctx, unpacked_input, inputs, std::move(tensor_outputs),
    +                         is_executable, node);
    +  END_HANDLE_TH_ERRORS
    +}
    +
    + +

    Although this code is hard to read at first due to all the python API calls, it essentially does the same thing as the auto-generated forward functions that we saw for ATen:

    + +

    1. Create a grad_fn object.
    2. Collect the edges linking the current grad_fn with those of the input tensors.
    3. Execute the function’s forward.
    4. Assign the created grad_fn to the output tensors’ metadata.

    + +

    The grad_fn object is created in:

    + +
      // Generates the graph node
    +  THPObjectPtr backward_cls(PyObject_GetAttrString(cls, "_backward_cls"));
    +  if (!backward_cls) return nullptr;
    +  THPObjectPtr ctx_obj(PyObject_CallFunctionObjArgs(backward_cls, nullptr));
    +  if (!ctx_obj) return nullptr;
    +  THPFunction* ctx = (THPFunction*)ctx_obj.get();
    +
    +  auto cdata = std::shared_ptr<PyNode>(new PyNode(std::move(ctx_obj)), deleteNode);
    +  ctx->cdata = cdata;
    +
    + +

    Basically, it asks the Python API for a pointer to the Python object that can execute the user-written function. Then it wraps it into a PyNode object, a specialized Node that calls the Python interpreter with the provided Python function when apply is executed during the forward pass. Note that in this code, cdata is the actual Node object that becomes part of the graph, while ctx is the object passed to the Python forward/backward functions; it is used to store autograd-related information by both the user’s function and PyTorch.
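    From the user’s side, ctx is the familiar context object of a custom Function; for example (an illustrative sketch, not from the original post), backward can consult ctx.needs_input_grad, which THPFunction_apply filled in from InputFlags, to skip unneeded gradients:

```python
import torch

class Scale(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, factor):
        ctx.factor = factor  # stored on the ctx object (a THPFunction underneath)
        return x * factor

    @staticmethod
    def backward(ctx, grad_output):
        # needs_input_grad was populated from InputFlags in THPFunction_apply.
        grad_x = grad_output * ctx.factor if ctx.needs_input_grad[0] else None
        return grad_x, None  # no gradient for the plain Python float

x = torch.tensor([0.5, 0.75], requires_grad=True)
Scale.apply(x, 3.0).sum().backward()
print(x.grad)  # tensor([3., 3.])
```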

    + +

    As in the regular C++ functions, we also call collect_next_edges to track the inputs’ grad_fn objects, but this is done in unpack_input:

    + +
    template<bool enforce_variables>
    +std::pair<UnpackedInput, InputFlags> unpack_input(PyObject *args) {
    +  ...
    +  flags.next_edges = (flags.is_executable ? collect_next_edges(unpacked.input_vars) : edge_list());
    +  return std::make_pair(std::move(unpacked), std::move(flags));
    +}
    +
    + +

    After this, the edges are assigned to the grad_fn by just doing cdata->set_next_edges(std::move(input_info.next_edges)); and the forward function is called through the python interpreter C API.

    + +

    Once the output tensors are returned from the forward pass, they are processed and converted to variables inside the process_outputs function.

    + +
    PyObject* process_outputs(PyObject *op_obj, const std::shared_ptr<PyNode>& cdata,
    +                          THPFunction* grad_fn, const UnpackedInput& unpacked,
    +                          PyObject *inputs, THPObjectPtr&& raw_output, bool is_executable,
    +                          torch::jit::Node* node) {
    +  ...
    +  _wrap_outputs(cdata, grad_fn, unpacked.input_vars, raw_output, outputs, is_executable);
    +  _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace, unpack_output);
    +  if (is_executable) {
    +    _save_variables(cdata, grad_fn);
    +  } ...
    +  return outputs.release();
    +}
    +
    + +

    Here, _wrap_outputs is in charge of setting the forward outputs’ grad_fn to the newly created one. To do this, it calls another _wrap_outputs function defined in a different file, so the process here gets a little confusing.

    + +
    static void _wrap_outputs(const std::shared_ptr<PyNode>& cdata, THPFunction *self,
    +    const variable_list &input_vars, PyObject *raw_output, PyObject *outputs, bool is_executable)
    +{
    +  auto cdata_if_executable = is_executable ? cdata : nullptr;
    + ...
    +
    +  // Wrap only the tensor outputs.
    +  // This calls csrc/autograd/custom_function.cpp
    +  auto wrapped_outputs = _wrap_outputs(input_vars, non_differentiable, dirty_inputs, raw_output_vars, cdata_if_executable);
    +...
    +}
    +
    + +

    The called _wrap_outputs is the one in charge of setting the autograd metadata in the output tensors:

    + +
    std::vector<c10::optional<Variable>> _wrap_outputs(const variable_list &input_vars,
    +  const std::unordered_set<at::TensorImpl*> &non_differentiable,
    +  const std::unordered_set<at::TensorImpl*> &dirty_inputs,
    +  const at::ArrayRef<c10::optional<Variable>> raw_outputs,
    +  const std::shared_ptr<Node> &cdata) {
    +
    +
    +  std::unordered_set<at::TensorImpl*> inputs;
    +  
    +  // Sets the grad_fn and output_nr of an output Variable.
    +  auto set_history = [&](Variable& var, uint32_t output_nr, bool is_input, bool is_modified,
    +                         bool is_differentiable) {
    +    // Lots of checks
    +    if (!is_differentiable) {
    +     ...
    +    } else if (is_input) {
    +      // An input has been returned, but it wasn't modified. Return it as a view
    +      // so that we can attach a new grad_fn to the Variable.
    +      // Run in no_grad mode to mimic the behavior of the forward.
    +      {
    +        AutoGradMode grad_mode(false);
    +        var = var.view_as(var);
    +      }
    +      impl::set_gradient_edge(var, {cdata, output_nr});
    +    } else if (cdata) {
    +      impl::set_gradient_edge(var, {cdata, output_nr});
    +    }
    +  };
    +
    + +

    And this is where set_gradient_edge is called; this is how a user-written Python function gets included in the computational graph with its associated backward function!

    + +

    Closing remarks

    + +

    This blog post is intended to be a code overview on how PyTorch constructs the actual computational graphs that we discussed in the previous post. The next entry will deal with how the autograd engine executes these graphs.

    September 04, 2024

    CUDA-Free Inference for LLMs

    by Adnan Hoque, Less Wright, Raghu Ganti and Mudhakar Srivatsa

    In this blog, we discuss the methods we used to achieve FP16 inference with popular LLM models such as Meta’s Llama3-8B and IBM’s Granite-8B Code, where 100% of the computation is performed using OpenAI’s Triton Language.
    For single token generation times using our Triton kernel-based models, we were able to approach 0.76-0.78x performance relative to the CUDA kernel-dominant workflows for both Llama and Granite on Nvidia H100 GPUs, and 0.62-0.82x on Nvidia A100 GPUs.

    + +

    Why explore using 100% Triton? Triton provides a path for enabling LLMs to run on different types of GPUs - NVIDIA, AMD, and in the future Intel and other GPU based accelerators. It also provides a higher layer of abstraction in Python for programming GPUs and has allowed us to write performant kernels faster than authoring them using vendor specific APIs. In the rest of this blog, we will share how we achieve CUDA-free compute, micro-benchmark individual kernels for comparison, and discuss how we can further improve future Triton kernels to close the gaps.

    + +

    + +

    Figure 1. Inference throughput benchmarks with Triton and CUDA variants of Llama3-8B and Granite-8B, on NVIDIA H100 and A100
    +Settings: batch size = 2, input sequence length = 512, output sequence length = 256

    + +

    2.0 Composition of a Transformer Block

    + +

    We start with a breakdown of the computations that happen in Transformer-based models. The figure below shows the “kernels” of a typical Transformer block.

    + +

    + Figure 2. Transformer Block by core kernels

    + +

    The core operations for a Llama3 architecture are summarized in this list:

    + +
    1. RMSNorm
    2. Matrix multiplication: Fused QKV
    3. RoPE
    4. Attention
    5. Matrix multiplication: Output Projection
    6. RMSNorm
    7. Matrix multiplication: Fused Gate + Up Projection
    8. Activation function: SiLU
    9. Element Wise Multiplication
    10. Matrix multiplication: Down Projection
    + +

    Each of these operations is computed on the GPU through the execution of one (or multiple) kernels. While the specifics of each of these kernels can vary across different transformer models, the core operations remain the same. For example, IBM’s Granite 8B Code model uses bias in the MLP layer, different from Llama3. Such changes do require modifications to the kernels. A typical model is a stack of these transformer blocks wired together with embedding layers.

    + +

    3.0 Model Inference

    + +

    Typical model architecture code is shared as a Python model.py file that is launched by PyTorch. In the default PyTorch eager execution mode, these kernels are all executed with CUDA. To achieve 100% Triton for end-to-end Llama3-8B and Granite-8B inference we need to write and integrate handwritten Triton kernels as well as leverage torch.compile (to generate Triton ops). First, we replace smaller ops with compiler generated Triton kernels, and second, we replace more expensive and complex computations (e.g. matrix multiplication and flash attention) with handwritten Triton kernels.

    + +

    Torch.compile generates Triton kernels automatically for RMSNorm, RoPE, SiLU and Element Wise Multiplication. Using tools like Nsight Systems we can observe these generated kernels; they appear as tiny dark green kernels in-between the matrix multiplications and attention.
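    As a rough illustration (not from the original post), a plain-PyTorch RMSNorm can be handed to torch.compile, and the Triton code Inductor generates for it can then be dumped, for example by running the script with the TORCH_LOGS="output_code" environment setting:

```python
import torch

def rms_norm(x, weight, eps=1e-6):
    # Plain PyTorch; Inductor can fuse this into a single generated Triton kernel.
    rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
    return x * rms * weight

compiled_rms_norm = torch.compile(rms_norm)

x = torch.randn(2, 512, 4096, device="cuda", dtype=torch.float16)
w = torch.ones(4096, device="cuda", dtype=torch.float16)
out = compiled_rms_norm(x, w)
```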

    + +

    +Figure 3. Trace of Llama3-8B with torch.compile, showing CUDA kernels being used for matrix multiplications and flash attention

    + +

    For the above trace, we note that the two major ops that make up 80% of the E2E latency in a Llama3-8B style model are the matrix multiplication and attention kernels, and both remain CUDA kernels. Thus, to close the remaining gap, we replace both the matmul and attention kernels with handwritten Triton kernels.
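    To reproduce this kind of breakdown on your own model, here is a hedged sketch using the PyTorch profiler (the 80% figure above comes from the authors’ Nsight Systems traces, not from this snippet; the single encoder layer is just a stand-in):

```python
import torch
from torch.profiler import profile, ProfilerActivity

# A stand-in transformer block; replace with your own model.
layer = torch.nn.TransformerEncoderLayer(
    d_model=4096, nhead=32, batch_first=True
).cuda().half().eval()
x = torch.randn(2, 512, 4096, device="cuda", dtype=torch.float16)

with torch.no_grad(), profile(activities=[ProfilerActivity.CUDA]) as prof:
    layer(x)

# Sort by CUDA time to see which kernels (matmuls, attention) dominate.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```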

    + +

    4.0 Triton SplitK GEMM Kernel

    + +

    For the matrix multiplications in the linear layers, we wrote a custom FP16 Triton GEMM (General Matrix-Matrix Multiply) kernel that leverages a SplitK work decomposition. We have previously discussed this parallelization in other blogs as a way to accelerate the decoding portion of LLM inference.

    + +

    5.0 GEMM Kernel Tuning

    + +

    To achieve optimal performance we used the exhaustive search approach to tune our SplitK GEMM kernel. Granite-8B and Llama3-8B have linear layers with the following shapes:

    | Linear Layer | Shape (in_features, out_features) |
    |---|---|
    | Fused QKV Projection | (4096, 6144) |
    | Output Projection | (4096, 4096) |
    | Fused Gate + Up Projection | (4096, 28672) |
    | Down Projection | (14336, 4096) |
    + +

    Figure 4. Granite-8B and Llama3-8B Linear Layer Weight Matrix Shapes

    + +

    Each of these linear layers has a different weight matrix shape. Thus, for optimal performance the Triton kernel must be tuned for each of these shape profiles. After tuning for each linear layer, we were able to achieve a 1.20x E2E speedup on Llama3-8B and Granite-8B over the untuned Triton kernel.
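    As a rough illustration of why per-shape tuning matters (a hedged sketch, not the authors’ tuning harness), one can time a cuBLAS FP16 baseline for each shape in Figure 4 and compare it against candidate Triton configurations shape by shape:

```python
import torch
from torch.utils import benchmark

# (in_features, out_features) pairs from Figure 4.
shapes = {
    "Fused QKV Projection": (4096, 6144),
    "Output Projection": (4096, 4096),
    "Fused Gate + Up Projection": (4096, 28672),
    "Down Projection": (14336, 4096),
}

m = 2  # rows per decode step (batch size 2), illustrative only

for name, (k, n) in shapes.items():
    a = torch.randn(m, k, device="cuda", dtype=torch.float16)
    b = torch.randn(k, n, device="cuda", dtype=torch.float16)
    t = benchmark.Timer(stmt="a @ b", globals={"a": a, "b": b}).blocked_autorange()
    print(f"{name}: {t.median * 1e6:.1f} us (cuBLAS FP16 baseline)")
```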

    + +

    6.0 Flash Attention Kernel

    + +

    We evaluated a suite of existing Triton flash attention kernels with different configurations, namely:

    + +
    1. AMD Flash
    2. OpenAI Flash
    3. Dao AI Lab Flash
    4. XFormers Flash
    5. PyTorch FlexAttention
    + +

    We evaluated the text generation quality of each of these kernels, first, in eager mode and then (if we were able to torch.compile the kernel with standard methods) compile mode. For kernels 2-5, we noted the following:

    | Kernel | Text Generation Quality | Torch.compile | Support for Arbitrary Sequence Length |
    |---|---|---|---|
    | AMD Flash | Coherent | Yes | Yes |
    | OpenAI Flash | Incoherent | Did not evaluate. WIP to debug precision in eager mode first | No |
    | Dao AI Lab Flash | Incoherent | Did not evaluate. WIP to debug precision in eager mode first | Yes |
    | Xformers FlashDecoding | Hit a compilation error before we were able to evaluate text quality | WIP | No (This kernel is optimized for decoding) |
    | PyTorch FlexAttention | Coherent | WIP | WIP |
    + +

    Figure 5. Table of combinations we tried with different Flash Attention Kernels

    + +

    The above table summarizes what we observed out of the box. With some effort we expect that kernels 2-5 can be modified to meet the above criteria. However, this also shows that having a kernel that works for benchmarking is often only the start of making it usable as an end-to-end production kernel.
    We chose to use the AMD flash attention kernel in our subsequent tests, as it can be compiled via torch.compile and produces legible output in both eager and compiled mode.

    + +

    To satisfy torch.compile compatibility with the AMD flash attention kernel, we had to define it as a torch custom operator. This process is explained in detail here. The linked tutorial discusses how to wrap a simple image crop operation; however, we note that wrapping a more complex flash attention kernel follows a similar process. The two-step approach is as follows (a sketch of both steps is shown after the list):

    + +
    1. Wrap the function into a PyTorch Custom Operator.
    2. Add a FakeTensor Kernel to the operator, which, given the shapes of the input tensors of flash (q, k and v), provides a way to compute the output shape of the flash kernel.
    + +
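    A minimal sketch of those two steps, assuming a recent PyTorch with torch.library.custom_op (2.4+); the mylib namespace is arbitrary and the SDPA call below is only a stand-in body, not the authors’ Triton flash kernel:

```python
import torch

# Step 1: wrap the kernel as a custom operator. The body is a stand-in;
# in the real setup it would launch the Triton flash attention kernel.
@torch.library.custom_op("mylib::triton_flash", mutates_args=())
def triton_flash(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    return torch.nn.functional.scaled_dot_product_attention(q, k, v)

# Step 2: a FakeTensor kernel that, given only the metadata of q, k and v,
# tells the compiler the shape/dtype of the flash kernel's output.
@triton_flash.register_fake
def _(q, k, v):
    return torch.empty_like(q)

compiled_attn = torch.compile(lambda q, k, v: triton_flash(q, k, v))
```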

    + +

    After defining the Triton flash kernel as a custom op, we were able to successfully compile it for our E2E runs.

    + +

    + +

    Figure 6. Trace of Llama3-8B with torch.compile, after swapping in Triton matmul and Triton flash attention kernels

    + +

    From Figure 6, we note that after integrating both the SplitK matrix multiplication kernel and the torch-op-wrapped flash attention kernel, and then running torch.compile, we are able to achieve a forward pass that uses 100% Triton computation kernels.

    + +

    7.0 End-to-End Benchmarks

    + +

    We performed end-to-end measurements on NVIDIA H100s and A100s (single GPU) with Granite-8B and Llama3-8B models. We performed our benchmarks with two different configurations.

    + +

    The Triton kernel configuration uses:

    + +
    1. Triton SplitK GEMM
    2. AMD Triton Flash Attention
    + +

    The CUDA Kernel configuration uses:

    + +
    1. cuBLAS GEMM
    2. cuDNN Flash Attention - Scaled Dot-Product Attention (SDPA)
    + +

    We found the following throughput and inter-token latencies for both eager and torch compiled modes, with typical inference settings:

    | GPU | Model | Kernel Config | Median Latency (Eager) [ms/tok] | Median Latency (Compiled) [ms/tok] |
    |---|---|---|---|---|
    | H100 | Granite-8B | Triton | 27.42 | 11.59 |
    | H100 | Granite-8B | CUDA | 18.84 | 9.50 |
    | H100 | Llama3-8B | Triton | 20.36 | 10.61 |
    | H100 | Llama3-8B | CUDA | 16.59 | 8.59 |
    | A100 | Granite-8B | Triton | 53.44 | 16.88 |
    | A100 | Granite-8B | CUDA | 37.13 | 14.25 |
    | A100 | Llama3-8B | Triton | 44.44 | 17.94 |
    | A100 | Llama3-8B | CUDA | 32.45 | 12.96 |
    + +

    Figure 7. Granite-8B and Llama3-8B Single Token Generation Latency on H100 and A100,
    +(batch size = 2, input sequence length = 512, output sequence length = 256)

    + +

    To summarize, the Triton models can get up to 78% of the performance of the CUDA models on the H100 and up to 82% on the A100.

    + +

    The performance gap can be explained by the kernel latencies we observe for matmul and flash attention, which are discussed in the next section.

    + +

    8.0 Microbenchmarks

    | Kernel | Triton [us] | CUDA [us] |
    |---|---|---|
    | QKV Projection Matmul | 25 | 21 |
    | Flash Attention | 13 | 8 |
    | Output Projection Matmul | 21 | 17 |
    | Gate + Up Projection Matmul | 84 | 83 |
    | Down Projection Matmul | 58 | 42 |
    + +

    Figure 8. Triton and CUDA Kernel Latency Comparison (Llama3-8B on NVIDIA H100)
    +Input was an arbitrary prompt (bs=1, prompt = 44 seq length), decoding latency time

    + +

    From the above, we note the following:

    + +
    1. Triton matmul kernels are 1.2-1.4x slower than CUDA
    2. AMD's Triton Flash Attention kernel is 1.6x slower than CUDA SDPA
    + +

    These results highlight the need to further improve the performance of kernels that are core primitives like GEMM and Flash Attention. We leave this as future research, as recent works (e.g. FlashAttention-3, FlexAttention) provide ways to leverage the underlying hardware better as well as Triton pathways that we hope to be able to build on to produce greater speedups. To illustrate this, we compared FlexAttention with SDPA and AMD’s Triton Flash kernel.

    + +

    We are working to verify E2E performance with FlexAttention. For now, initial microbenchmarks with Flex show promise for longer context lengths and decoding problem shapes, where the query vector is small:
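    A hedged sketch of such a decoding-shaped FlexAttention call (assumes PyTorch 2.5+ where torch.nn.attention.flex_attention is available; the shapes are illustrative, not the exact benchmark configuration):

```python
import torch
from torch.nn.attention.flex_attention import flex_attention

# Decoding-style problem: a single query token attending over a long context.
B, H, S, D = 1, 32, 8192, 128
q = torch.randn(B, H, 1, D, device="cuda", dtype=torch.float16)
k = torch.randn(B, H, S, D, device="cuda", dtype=torch.float16)
v = torch.randn(B, H, S, D, device="cuda", dtype=torch.float16)

# FlexAttention is meant to be used under torch.compile.
compiled_flex = torch.compile(flex_attention)
out = compiled_flex(q, k, v)
```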

    + +

    + +

    Figure 9. FlexAttention Kernel Benchmarks on NVIDIA H100 SXM5 80GB
    +(batch=1, num_heads=32, seq_len=seq_len, head_dim=128)

    + +

    9.0 Future Work

    + +

    For future work we plan to explore ways to further optimize our matmuls that leverage the hardware better, such as this blog we published on utilizing TMA for H100, as well as different work decompositions (persistent kernel techniques like StreamK etc.) to get greater speedups for our Triton-based approach. For flash attention, we plan to explore FlexAttention and FlashAttention-3 as the techniques used in these kernels can be leveraged to help further close the gap between Triton and CUDA.
    +We also note that our prior work has shown promising results for FP8 Triton GEMM kernel performance versus cuBLAS FP8 GEMM, thus in a future post we will explore E2E FP8 LLM inference.

    + +

    November 01, 2024

    Deep Dive on CUTLASS Ping-Pong GEMM Kernel

    by Less Wright, Adnan Hoque

    Figure 1. FP8 GEMM Throughput Comparison CUTLASS vs Triton


    + +

    Summary

    + +

    In this post, we provide an overview, with relevant FP8 inference kernel benchmarking, of the CUTLASS Ping-Pong GEMM kernel.

    + +

    Ping-Pong is one of the fastest matmul (GEMM) kernel architectures available for the Hopper GPU architecture. Ping-Pong is a member of the Warp Group Specialized Persistent Kernels family, which includes both Cooperative and Ping-Pong variants. Relative to previous GPUs, Hopper’s substantial tensor core compute capability requires deep asynchronous software pipelining in order to achieve peak performance.

    + +

    The Ping-Pong and Cooperative kernels exemplify this paradigm, as the key design patterns are persistent kernels to amortize launch and prologue overhead, and ‘async everything’ with specialized warp groups with two consumers and one producer, to create a highly overlapped processing pipeline that is able to continuously supply data to the tensor cores.

    + +

    When the H100 (Hopper) GPU was launched, Nvidia billed it as the first truly asynchronous GPU. That statement highlights the need for H100 specific kernel architectures to also be asynchronous in order to fully maximize computational/GEMM throughput.

    + +

    The Ping-Pong GEMM, introduced in CUTLASS 3.x, exemplifies this by moving all aspects of the kernel to a ‘fully asynchronous’ processing paradigm. In this blog, we’ll walk through the core features of the Ping-Pong kernel design as well as showcase its performance on inference workloads vs cuBLAS and Triton split-k kernels.

    + +

    Ping-Pong Kernel Design

    + +

    Ping-Pong (or technically ‘sm90_gemm_tma_warpspecialized_pingpong’) operates with an asynchronous pipeline, leveraging warp specialization. Instead of the more classical homogeneous kernels, “warp groups” take on specialized roles. Note that a warp group consists of 4 warps of 32 threads each, or 128 total threads.

    + +

    On earlier architectures, latency was usually hidden by running multiple thread blocks per SM. However, with Hopper, the Tensor Core throughput is so high that it necessitates moving to deeper pipelines. These deeper pipelines then hinder running multiple thread blocks per SM. Thus, persistent thread blocks now issue collective main loops across multiple tiles and multiple warp groups. Thread block clusters are allocated based on the total SM count.

    + +

    For Ping-Pong, each warp group takes on a specialized role of either Data producer or Data consumer.

    + +

    The producer warp group focuses on producing data movement to fill the shared memory buffers (via TMA). Two other warp groups are dedicated consumers that process the math (MMA) portion with tensor cores, and then do any follow up work and write their results back to global memory (epilogue).

    + +

    Producer warp groups work with TMA (Tensor Memory Accelerator), and are deliberately kept as lightweight as possible. In fact, in Ping-Pong, they deliberately give up register resources to improve occupancy. Producers reduce their max register count to 40, while consumers increase their max register count to 232, an effect we can see in the CUTLASS source and corresponding SASS:

    + +

    source code

    + +

    Unique to Ping-Pong, each consumer works on separate C output tiles. (For reference, the cooperative kernel is largely equivalent to Ping-Pong, but both consumer groups work on the same C output tile). Further, the two consumer warp groups then split their work between the main loop MMA and epilogue.

    + +

    This is shown in the below image:

    + +

    Figure 2: An overview of the Ping-Pong Kernel pipeline. Time moves left to right.


    + +

    By having two consumers, it means that one can be using the tensor cores for MMA while the other performs the epilogue, and then vice-versa. This maximizes the ‘continuous usage’ of the tensor cores on each SM, and is a key part of the reason for the max throughput. The tensor cores can be continuously fed data to realize their (near) maximum compute capability. (See the bottom section of the Fig 2 illustration above).

    + +

    Similar to how Producer threads stay focused only on data movements, MMA threads only issue MMA instructions in order to achieve peak issue rate. MMA threads must issue multiple MMA instructions and keep these in flight against TMA wait barriers.

    + +

    An excerpt of the kernel code is shown below to cement the specialization aspects:

    + +
    // Two types of warp group 'roles' 
    +enum class WarpGroupRole {
    +      Producer = 0,
    +      Consumer0 = 1,
    +      Consumer1 = 2
    +    };
    +
    +//warp group role assignment
    +auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());
    +
    + +

    Data Movement with Producers and Tensor Memory Accelerator

    + +

    The producer warps focus exclusively on data movement - specifically they are kept as lightweight as possible and in fact give up some of their register space to the consumer warps (keeping only 40 registers, while consumers will get 232). Their main task is issuing TMA (tensor memory accelerator) commands to move data from Global memory to shared memory as soon as a shared memory buffer is signaled as being empty.

    + +

    To expand on TMA, or Tensor Memory Accelerator, TMA is a hardware component introduced with H100’s that asynchronously handles the transfer of memory from HBM (global memory) to shared memory. By having a dedicated hardware unit for memory movement, worker threads are freed to engage in other work rather than computing and managing data movement. TMA not only handles the movement of the data itself, but also calculates the required destination memory addresses, can apply any transforms (reductions, etc.) to the data and can handle layout transformations to deliver data to shared memory in a ‘swizzled’ pattern so that it’s ready for use without any bank conflicts. Finally, it can also multicast the same data if needed to other SM’s that are members of the same thread cluster. Once the data has been delivered, TMA will then signal the consumer of interest that the data is ready.

    + +

    CUTLASS Asynchronous Pipeline Class

    + +

    This signaling between producers and consumers is coordinated via the new Asynchronous Pipeline Class which CUTLASS describes as follows:

    + +

    “Implementing a persistent GEMM algorithm calls for managing dozens of different kinds of asynchronously executing operations that synchronize using multiple barriers organized as a circular list.

    + +

    This complexity is too much for human programmers to manage by hand.

    + +

    As a result, we have developed [CUTLASS Pipeline Async Class]…”

    + +

    Barriers and synchronization within the Ping-Pong async pipeline

    + +

    Producers must ‘acquire’ a given smem buffer via ‘producer_acquire’. At the start, a pipeline is empty meaning that producer threads can immediately acquire the barrier and begin moving data.

    + +
    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
    +
    + +

    Once the data movement is complete, producers issue the ‘producer_commit’ method to signal the consumer threads that data is ready.
    +However, for Ping-Pong, this is actually a noop instruction since TMA based producer’s barriers are automatically updated by the TMA when writes are completed.

    + +

    consumer_wait - wait for data from producer threads (blocking).

    + +

    consumer_release - signal waiting producer threads that they are finished consuming data from a given smem buffer. In other words, allow producers to go to work refilling this with new data.

    + +

    From there, synchronization will begin in earnest where the producers will wait via the blocking producer acquire until they can acquire a lock, at which point their data movement work will repeat. This continues until the work is finished.

    + +

    To provide a pseudo-code overview:

    + +
    //producer
    +While (work_tile_info.is_valid_tile) {
    +
    +	collective_mainloop.dma() // fetch data with TMA
    +	scheduler.advance_to_next_work()
    +	Work_tile_info = scheduler.get_current_work()
    +
    +}
    +
    +// Consumer 1, Consumer 2
    +While (work_tile_info.is_valid_tile()) {
    +
    +	collective_mainloop.mma()
    +	scheduler.advance_to_next_work()
    +	Work_tile_info = scheduler.get_current_work()
    +
    +}
    +
    + +

    And a visual birds-eye view putting it all together with the underlying hardware:

    + +

    Figure 3: An overview of the full async pipeline for Ping-Pong


    + +

    Step-by-Step Breakdown of Ping-Pong Computation Loop

    + +

    Finally, a more detailed logical breakout of the Ping-Pong processing loop:

    + +

    A - Producer (DMA) warp group acquires a lock on a shared memory buffer.

    + +

    B - this allows it to kick off a tma cp_async.bulk request to the tma chip (via a single thread).

    + +

    C - TMA computes the actual shared memory addressing required, and moves the data to shared memory. As part of this, swizzling is performed in order to lay out the data in smem for the fastest (no bank conflict) access.

    + +

    C1 - potentially, data can also be multicast to other SMs and/or it may need to wait for data from other tma multicast to complete the loading. (threadblock clusters now share shared memory across multiple SMs!)

    + +

    D - At this point, the barrier is updated to signal the arrival of the data to smem.

    + +

    E - The relevant consumer warp group now gets to work by issuing multiple wgmma.mma_async commands, which read the data from smem into the Tensor Cores as part of the wgmma.mma_async matmul operation.

    + +

    F - the MMA accumulator values are written to register memory as the tiles are completed.

    + +

    G - the consumer warp group releases the barrier on the shared memory.

    + +

    H - the producer warp groups go to work issuing the next tma instruction to refill the now free smem buffer.

    + +

    I - The consumer warp group simultaneously applies any epilogue actions to the accumulator, and then moves data from registers to a different smem buffer.

    + +

    J - The consumer warp issues a cp_async command to move data from smem to global memory.

    + +

    The cycle repeats until the work is completed. Hopefully this provides you with a working understanding of the core concepts that power Ping-Pong’s impressive performance.

    + +

    Microbenchmarks

    + +

    To showcase some of Ping-Pong’s performance, below are some comparison charts related to our work on designing fast inference kernels.

    + +

    First, a general benchmark of the three fastest kernels so far (lower is better):

    + +

    Figure 4, above: Benchmark timings of FP8 GEMMs, lower is better (faster)


    + +

    And translating that into a relative speedup chart of Ping-Pong vs cuBLAS and Triton:

    + +

    Figure 5, above: Relative speedup of Ping-Pong vs the two closest kernels.


    + +

    The full source code for the Ping-Pong kernel is here (619 lines of deeply templated CUTLASS code, or to paraphrase the famous turtle meme: “it’s templates…all the way down!”):

    + + + +

    In addition, we have implemented Ping-Pong as a CPP extension to make it easy to use from PyTorch here (along with a simple test script showing its usage):

    + + + +

    Finally, for continued learning, Nvidia has two GTC videos that dive into kernel design with CUTLASS:

    + + + +

    Future Work

    + +

    Data movement is usually the biggest impediment to top performance for any kernel, and thus having an optimal strategy understanding of TMA (Tensor Memory Accelerator) on Hopper is vital. We previously published work on TMA usage in Triton. Once features like warp specialization are enabled in Triton, we plan to do another deep dive on how Triton kernels like FP8 GEMM and FlashAttention can leverage kernel designs like Ping-Pong for acceleration on Hopper GPUs.

    + +
    Solve Real-World AI Challenges with PyTorch at Datathon 2025: DataOrbit

    by Aakash Senthilnathan

    +

    We’re excited to have PyTorch sponsor Datathon 2025: DataOrbit, a place where students can collaborate with a team to solve problems using real-world datasets! This event, hosted by Data Science UCSB in collaboration with Gaucho Sports Analytics and ACM@UCSB, will take place on February 22–23rd, 2025 at UC Santa Barbara, with the incredible opportunity to present your project to a panel of corporate and faculty judges – including the executive director of PyTorch! – for a chance to win prizes of up to $3000.

    + +

    logo

    + +

    PyTorch’s versatility and power have made it an essential tool for tackling complex data problems in domains ranging from computer vision and natural language processing to time series analysis. At Datathon 2025: DataOrbit, participants will have the chance to leverage PyTorch’s dynamic framework, ease of use, and robust ecosystem to build innovative solutions. Whether you’re building machine learning models, experimenting with deep learning architectures, or applying PyTorch to solve real-world challenges, workshops and mentors will be available to help you dive deeper into its capabilities and accelerate your project’s success.

    + +

    Register Here: tinyurl.com/dataorbit25-reg (Open until February 21st or until capacity is reached)

    + +

    Additional information regarding the timeline of events can be found on the registration form.

    + +

    About the Datathon

    + +
    • Open only to undergraduate students in the United States
    • In-person events over 36 hours
    • Team sizes of 2-5 people
    • 10 different prize tracks
    • Workshops and office hours teaching essential data science tools and techniques
    • Professional development workshops + networking opportunities with our sponsors
    • All meals provided
    • A fun time!
    + +

    If you have a group you would like to work with, we require that every member register separately. If you do not have a group, we will have an opportunity at the beginning of the event to participate in an activity to form groups. Unfortunately, at this time we do not provide travel accommodations or lodging for participants.

    + +

    If you are interested in mentoring students virtually during the course of our datathon, or have any other questions contact us at datascience.ucsb@gmail.com.

    + +
    Democratizing AI with PyTorch Foundation and ROCm™ support for PyTorch

    by AMD

    +

    AMD Founding Member

    + +

    Last year, Meta announced that PyTorch joined the Linux Foundation as a neutral home for growing the machine learning project and community with AMD representation as a part of the founding membership and governing board.

    + +

    PyTorch Foundation’s mission is to drive AI adoption by democratizing its software ecosystem through open source principles aligning with the AMD core principle of an Open software ecosystem. AMD strives to foster innovation through the support for latest generations of hardware, tools, libraries, and other components to simplify and accelerate adoption of AI across a broad range of scientific discoveries.

    + +
    +
    +

    AMD, along with key PyTorch codebase developers (including those at Meta AI), delivered a set of updates to the ROCm™ open software ecosystem that brings stable support for AMD Instinct™ accelerators as well as many Radeon™ GPUs. This now gives PyTorch developers the ability to build their next great AI solutions leveraging AMD GPU accelerators and ROCm. The support from the PyTorch community in identifying gaps, prioritizing key updates, providing feedback for performance optimization, and supporting our journey from “Beta” to “Stable” was immensely helpful, and we deeply appreciate the strong collaboration between the two teams at AMD and PyTorch. The move of ROCm support from “Beta” to “Stable”, which came in the PyTorch 1.12 release (June 2022), adds the ability to easily run PyTorch in a native environment without having to configure custom Dockers. This is a sign of confidence in the quality of support and performance of PyTorch on AMD Instinct and ROCm. The results of these collaborative efforts are evident in the performance measured on key industry benchmarks like Microsoft’s SuperBench, shown below in Graph 1.

    +
    +
    +

    +“We are excited to see the significant impact of developers at AMD to contribute to and extend features within PyTorch to make AI models run in a more performant, efficient, and scalable way. A great example of this is the thought-leadership around unified memory approaches between the framework and future hardware systems, and we look forward to seeing that feature progress.”
    +- Soumith Chintala, PyTorch lead-maintainer and Director of Engineering, Meta AI +

    +
    +
    + +

    The progressive improvements across the AMD CDNA™ architecture as well as ROCm and PyTorch show a single-GPU model throughput increase from AMD Instinct MI100 to the latest generation AMD Instinct MI200 family GPUs, going from ROCm 4.2 to ROCm 5.3 and from PyTorch 1.7 to PyTorch 1.12.

    + +


    Graph 1: ML model performance over generation using Microsoft Superbench Suite 1, 2, 3

    + +

    Below are a few of the key updates for ROCm support since the PyTorch 1.12 release

    + +

    Full Continuous Integration (CI) for ROCm on PyTorch

    + +

    With the ROCm support for PyTorch move from “Beta” to “Stable,” all the functions and features commits are now verified through a full Continuous Integration (CI) process. The CI process helps ensure the proper build and test process ahead of an expected Docker and PIP wheel release with stable commits forthcoming.
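    As a quick sanity check on such a stable wheel (a small illustrative snippet, not part of the original announcement), a ROCm build of PyTorch exposes the same torch.cuda device APIs, so existing scripts typically run unmodified; torch.version.hip is one way to tell the backends apart:

```python
import torch

print(torch.__version__)
print(torch.version.hip)          # ROCm/HIP version string on ROCm builds, None on CUDA builds
print(torch.cuda.is_available())  # True when a supported AMD Instinct / Radeon GPU is visible

if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    x = torch.randn(1024, 1024, device="cuda")
    print((x @ x).sum().item())
```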

    + +

    Support for Kineto Profiler

    + +

    The addition of Kineto profiler support to ROCm now helps developers and users understand performance bottlenecks through effective diagnosis and profiling tools. The tool also provides recommendations to improve known issues and visualization through TensorBoard UI.

    + +

    Key PyTorch Libraries support added

    + +

    PyTorch ecosystem libraries like TorchText (Text classification), TorchRec (libraries for recommender systems - RecSys), TorchVision (Computer Vision), TorchAudio (audio and signal processing) are fully supported since ROCm 5.1 and upstreamed with PyTorch 1.12.

    + +

    Key libraries provided with the ROCm software stack including MIOpen (Convolution models), RCCL (ROCm Collective Communications) and rocBLAS (BLAS for transformers) were further optimized to offer new potential efficiencies and higher performance.

    + +

    MIOpen innovates on several fronts, such as implementing fusion to optimize for memory bandwidth and GPU launch overheads, providing an auto-tuning infrastructure to overcome the large design space of problem configurations, and implementing different algorithms to optimize convolutions for different filter and input sizes. MIOpen is one of the first libraries to publicly support the bfloat16 data-type for convolutions, allowing efficient training at lower precision maintaining expected accuracy.

    + +

    RCCL (pronounced “Rickle”) is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe®, Infinity Fabric™ (GPU to GPU) as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in single or multiple nodes and can be used in either single- or multi-process (e.g., MPI) applications.

    + +

    Along with the above key highlights, over 50 features and functionality improvements were completed jointly between AMD and PyTorch to add stable support for ROCm. These include improvements to tools, compilers, runtime, graph optimizations through TorchScript, INT8 quantization path usage, and ONNX Runtime integration, including support for the Navi 21-based Radeon™ PRO datacenter graphics card, to name a few.

    + +

    AITemplate Inference Engine

    + +

    Meta AI recently published a blog announcing the release of its open-source AITemplate (link), a unified inference system supporting AMD Instinct GPU accelerators using the AMD ROCm stack. This Python-based framework can help significantly improve performance through increased utilization of AMD matrix cores for transformer blocks. This is achieved through the AMD Composable Kernel (CK) library, which provides performance-critical kernels for ML AI workloads across multiple architectures, including GPUs and CPUs, through HIP & C++.

    + +

    Moreover, AITemplate also provides out-of-the-box support for widely used AI models like BERT, ResNet, Vision Transformer, and Stable Diffusion, simplifying the deployment process for these pretrained models.

    + +

    What’s coming with future ROCm releases?

    + +

    Unified memory models for CPU + GPU

    + +

    As system architecture evolves to address the complexity of large problem sizes and data sets, memory management becomes a key performance bottleneck that needs to be addressed through a cohesive strategy and innovations at both the hardware and software levels. AMD is uniquely positioned to address this problem with its effective data center solutions integrating AMD EPYC™ CPU cores with its AMD Instinct GPU compute units in a truly unified datacenter APU (Accelerated Processing Unit) form factor set to be launched in 2H 2023.

    + +

    The software work to leverage the unified CPU + GPU memory has already started in collaboration with the PyTorch team, to enable the usage of a fast, low-latency, synchronized memory model that allows not only AMD but also other AI accelerators to address today's complex memory management problems. We are looking forward to this joint effort and to sharing an announcement soon.

    + +

    Acknowledgement

    + +

    The content in this blog highlights the joint work between AMD and key PyTorch contributors including Meta, working on many of the core features, as well as Microsoft enabling ONNX Runtime support. We are looking forward to working with the other founding members at the PyTorch Foundation on the next steps and improvements to democratize and grow adoption of PyTorch across the industry.

    + +

    CAUTIONARY STATEMENT

    + +

    +This blog contains forward-looking statements concerning Advanced Micro Devices, Inc. (AMD) such as the availability, timing and expected benefits of an AMD datacenter APU form factor, which are made pursuant to the Safe Harbor provisions of the Private Securities Litigation Reform Act of 1995. Forward-looking statements are commonly identified by words such as “would,” “may,” “expects,” “believes,” “plans,” “intends,” “projects” and other terms with similar meaning. Investors are cautioned that the forward-looking statements in this blog are based on current beliefs, assumptions and expectations, speak only as of the date of this blog and involve risks and uncertainties that could cause actual results to differ materially from current expectations. Such statements are subject to certain known and unknown risks and uncertainties, many of which are difficult to predict and generally beyond AMD’s control, that could cause actual results and other future events to differ materially from those expressed in, or implied or projected by, the forward-looking information and statements. Investors are urged to review in detail the risks and uncertainties in AMD’s Securities and Exchange Commission filings, including but not limited to AMD’s most recent reports on Forms 10-K and 10-Q. AMD does not assume, and hereby disclaims, any obligation to update forward-looking statements made in this blog, except as may be required by law. +

    + +

    Endnotes

    + +
      +
    1. MI100D-01: SuperBench v0.5 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™ 7763 CPU server tested with 1x AMD Instinct™ MI100 (32GB HBM2e) 300W GPU, SBIOS 2.2, Ubuntu® 20.04.5 LTS, host ROCm™ 5.2.0, guest ROCm 4.2, PyTorch 1.7.0. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations.
    2. MI200D-01: SuperBench v0.6 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™ 7763 CPU server tested with 1x AMD Instinct™ MI210 (64GB HBM2e) 300W GPU, SBIOS 2.2, Ubuntu 20.04.5 LTS, host ROCm 5.3.0, guest ROCm 5.3, PyTorch 1.12. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations.
    3. MI200D-02: SuperBench v0.6 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™️ 7763 CPU server tested with 1x AMD Instinct™️ MI250 (128GB HBM2e) 560W GPU, SBIOS M12, Ubuntu 20.04 LTS, host ROCm 5.3.0, guest ROCm 5.3, PyTorch 1.12. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations.
    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/deploying-llms-torchserve-vllm/index.html b/blog/deploying-llms-torchserve-vllm/index.html new file mode 100644 index 000000000000..52a0a4fc9983 --- /dev/null +++ b/blog/deploying-llms-torchserve-vllm/index.html @@ -0,0 +1,821 @@ + + + + + + + + + + + + + Deploying LLMs with TorchServe + vLLM | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 31, 2024

    +

    + Deploying LLMs with TorchServe + vLLM +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Matthias Reso, Ankith Gunapal, Simon Mo, Li Ning, Hamid Shojanazeri + +

    +

    The vLLM engine is currently one of the top-performing ways to execute large language models (LLMs). It provides the vllm serve command as an easy option to deploy a model on a single machine. While this is convenient, to serve these LLMs in production and at scale some advanced features are necessary.

    + +

    flow diagram

    + +

    TorchServe offers these essential production features (like custom metrics and model versioning) and, through its flexible custom handler design, makes it very easy to integrate features such as retrieval-augmented generation (RAG) or safeguards like Llama Guard. It is therefore natural to pair the vLLM engine with TorchServe to create a full-fledged LLM serving solution for production.

    + +

    Before going into the specifics of the integration, we will demonstrate the deployment of a Llama-3.1-70B-Instruct model using TorchServe’s vLLM docker image.

    + +

    Quickly getting started with Llama 3.1 on TorchServe + vLLM

    + +

    To get started, we need to build the new TS LLM Docker container image. Check out the TorchServe repository and execute the following command from the main folder:

    + +
    docker build --pull . -f docker/Dockerfile.vllm -t ts/vllm
    +
    + +

    The container uses our new LLM launcher script ts.llm_launcher which takes a Hugging Face model URI or local folder and spins up a local TorchServe instance with the vLLM engine running in the backend. To serve a model locally, you can create an instance of the container with the following command:

    + +
    #export token=<HUGGINGFACE_HUB_TOKEN>
    +docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/vllm --model_id meta-llama/Meta-Llama-3.1-70B-Instruct --disable_token_auth
    +
    + +

    You can test the endpoint locally with this curl command:

    + +
    curl -X POST -d '{"model":"meta-llama/Meta-Llama-3.1-70B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
    +
    + +

    The container stores the model weights in the local folder “data”, which gets mounted as /data inside the container. To serve your own local weights, simply copy them into data and point the model_id to /data/<your weights>.
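    For illustration, serving weights you copied into data could look like the command below. This is a sketch based on the run command above; the /data/my-model path is a placeholder for your own folder name:

    # /data/my-model is a placeholder for the folder you copied into data
    docker run --rm -ti --shm-size 10g --gpus all -p 8080:8080 -v data:/data ts/vllm --model_id /data/my-model --disable_token_auth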

    + +

    Internally, the container uses our new ts.llm_launcher script to launch TorchServe and deploy the model. The launcher simplifies the deployment of an LLM with TorchServe into a single command line and can also be used outside the container as an efficient tool for experimentation and testing. To use the launcher outside of Docker, follow the TorchServe installation steps and then execute the following command to spin up an 8B Llama model:

    + +
    # after installing TorchServe and vLLM run
    +python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct  --disable_token_auth
    +
    + +

    If multiple GPUs are available the launcher will automatically claim all visible devices and apply tensor parallelism (see CUDA_VISIBLE_DEVICES to specify which GPUs to use).
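    As a quick illustration (a minimal sketch; the GPU indices below are placeholders), you can restrict the launcher to a subset of devices by setting CUDA_VISIBLE_DEVICES before invoking it:

    # use only GPUs 0 and 1 for tensor parallelism (indices are illustrative)
    CUDA_VISIBLE_DEVICES=0,1 python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --disable_token_auth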

    + +

    While this is very convenient, it’s important to note that it does not encompass all the functionality provided by TorchServe. For those looking to leverage more advanced features, a model archive needs to be created. While this process is a bit more involved than issuing a single command, it brings the advantages of custom handlers and versioning. The former allows you to implement RAG inside the preprocessing step, while the latter lets you test different versions of a handler and model before deploying on a larger scale.

    + +

    Before we provide the detailed steps to create and deploy a model archive, let’s dive into the details of the vLLM engine integration.

    + +

    TorchServe’s vLLM Engine Integration

    + +

    As a state-of-the-art serving framework, vLLM offers a plethora of advanced features, including PagedAttention, continuous batching, rapid model execution through CUDA graphs, and support for various quantization methods such as GPTQ, AWQ, INT4, INT8, and FP8. It also provides integration for important parameter-efficient adapter methods like LoRA and access to a wide range of model architectures including Llama and Mistral. vLLM is maintained by the vLLM team and a thriving open-source community.

    + +

    To facilitate quick deployment, it offers a serving mode based on FastAPI to serve LLMs over HTTP. For a tighter, more flexible integration the project also provides the vllm.LLMEngine which offers interfaces to process requests on a continuous basis. We leveraged the asynchronous variant for the integration into TorchServe.

    + +

    TorchServe is an easy-to-use, open-source solution for serving PyTorch models in production. As a production-tested serving solution, TorchServe offers numerous benefits and features beneficial for deploying PyTorch models at scale. By combining it with the inference performance of the vLLM engine these benefits can now also be used to deploy LLMs at scale.

    + +

    Torchserve highlights and integrations

    + +

    To maximize hardware utilization it is generally a good practice to batch requests from multiple users together. Historically, TorchServe only offered a synchronized mode to collect requests from various users. In this mode, TorchServe waits for a predefined amount of time (e.g., batch_delay=200ms) or until enough requests (e.g., batch_size=8) have arrived. When one of these events is triggered, the batched data gets forwarded to the backend where the model is applied to the batch, and the model output is returned to the users through the frontend. This works especially well for traditional vision models where outputs for each request usually finish at the same time.
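    For reference, here is a minimal sketch of how this synchronous batching is typically configured in a TorchServe model config file. The parameter names follow TorchServe's batchSize/maxBatchDelay convention, and the values are illustrative:

    # model-config.yaml (illustrative values)
    batchSize: 8          # wait for up to 8 requests...
    maxBatchDelay: 200    # ...or at most 200 ms before dispatching the batch to the backend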

    + +

    For generative use cases, particularly text generation, the assumption that requests are ready simultaneously is no longer valid, as responses will have varying lengths. Although TorchServe supports continuous batching (the ability to add and remove requests dynamically), this mode only accommodates a static maximum batch size. With the introduction of PagedAttention, even this assumption of a maximum batch size becomes more flexible, as vLLM can combine requests of different lengths in a highly adaptable manner to optimize memory utilization.

    + +

    To achieve optimal memory utilization, i.e., to fill unused gaps in memory (think Tetris), vLLM requires complete control over the decision of which requests to process at any given time. To provide this flexibility, we had to reevaluate how TorchServe handles user requests. Instead of the previous synchronous processing mode, we introduced an asynchronous mode (see diagram below) where incoming requests are directly forwarded to the backend, making them available for vLLM. The backend feeds the vllm.AsyncEngine, which can now select from all available requests. If streaming mode is enabled and the first token of a request is available, the backend will send out the result immediately and continue sending tokens until the final token is generated.

    + +

    flow diagram

    + +

    Our implementation of the VLLMHandler enables users to quickly deploy any model compatible with vLLM using a configuration file, while still offering the same level of flexibility and customizability through a custom handler. Users are free to add e.g. custom preprocessing or post-processing steps by inheriting from VLLMHandler and overriding the respective class methods.
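    A rough sketch of such a customization is shown below; the import path and hook names are assumptions made for illustration, so please consult the TorchServe handler documentation for the exact interface:

    # Illustrative sketch only: assumes VLLMHandler exposes preprocess/postprocess hooks.
    from ts.torch_handler.vllm_handler import VLLMHandler  # import path is an assumption

    class RagVLLMHandler(VLLMHandler):
        def preprocess(self, requests):
            # e.g. enrich each prompt with retrieved context before handing it to vLLM
            return super().preprocess(requests)

        def postprocess(self, outputs):
            # e.g. filter or annotate the generated text before returning it to the client
            return super().postprocess(outputs)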

    + +

    We also support single-node, multi-GPU distributed inference, where we configure vLLM to use tensor parallel sharding of the model to either increase capacity for smaller models or enable larger models that do not fit on a single GPU, such as the 70B Llama variants. Previously, TorchServe only supported distributed inference using torchrun, where multiple backend worker processes were spun up to shard the model. vLLM manages the creation of these processes internally, so we introduced the new “custom” parallelType to TorchServe which launches a single backend worker process and provides the list of assigned GPUs. The backend process can then launch its own subprocesses if necessary.

    + +

    To facilitate integration of TorchServe + vLLM into docker-based deployments, we provide a separate Dockerfile based on TorchServe’s GPU docker image, with vLLM added as a dependency. We chose to keep the two separate to avoid increasing the docker image size for non-LLM deployments.

    + +

    Next, we will demonstrate the steps required to deploy a Llama 3.1 70B model using TorchServe + vLLM on a machine with four GPUs.

    + +

    Step-by-Step Guide

    + +

    For this step-by-step guide, we assume the installation of TorchServe has finished successfully. Currently, vLLM is not a hard dependency for TorchServe, so let's install the package using pip:

    + +
    $ pip install -U vllm==0.6.1.post2
    +
    + +

    In the following steps, we will (optionally) download the model weights, explain the configuration, create a model archive, deploy and test it:

    + +

    1. (Optional) Download Model Weights

    + +

    This step is optional, as vLLM can also handle downloading the weights when the model server is started. However, pre-downloading the model weights and sharing the cached files between TorchServe instances can be beneficial in terms of storage usage and startup time of the model worker. If you choose to download the weights, use the huggingface-cli and execute:

    + +
    # make sure you have logged into huggingface with huggingface-cli login before
    +# and have your access request for the Llama 3.1 model weights approved
    +
    +huggingface-cli download meta-llama/Meta-Llama-3.1-70B-Instruct --exclude original/*
    +
    + +

    This will download the files under $HF_HOME, and you can alter the variable if you want to place the files elsewhere. Please ensure that you update the variable wherever you run TorchServe and make sure it has access to that folder.
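    For example, pointing the cache at a shared location could look like this (the path is illustrative; export the same value in every environment where huggingface-cli and TorchServe run):

    # path is illustrative; set it wherever you run huggingface-cli and TorchServe
    export HF_HOME=/mnt/shared/hf_cache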

    + +

    2. Configure the Model

    + +

    Next, we create a YAML configuration file that contains all the necessary parameters for our model deployment. The first part of the config file specifies how the frontend should launch the backend worker, which will ultimately run the model in a handler. The second part includes parameters for the backend handler, such as the model to load, followed by various parameters for vLLM itself. For more information on possible configurations for the vLLM engine, please refer to this link.

    + +
    echo '
    +# TorchServe frontend parameters
    +minWorkers: 1            
    +maxWorkers: 1            # Set the number of workers to create a single model instance
    +startupTimeout: 1200     # (in seconds) Give the worker time to load the model weights
    +deviceType: "gpu" 
    +asyncCommunication: true # This ensures we can communicate asynchronously with the worker
    +parallelType: "custom"   # This lets TS create a single backend process and assign it 4 GPUs
    +parallelLevel: 4
    +
    +# Handler parameters
    +handler:
    +    # model_path can be a model identifier for Hugging Face hub or a local path
    +    model_path: "meta-llama/Meta-Llama-3.1-70B-Instruct"
    +    vllm_engine_config:  # vLLM configuration which gets fed into AsyncVLLMEngine
    +        max_num_seqs: 16
    +        max_model_len: 512
    +        tensor_parallel_size: 4
    +        served_model_name:
    +            - "meta-llama/Meta-Llama-3.1-70B-Instruct"
    +            - "llama3"
    +'> model_config.yaml
    +
    + +

    3. Create the Model Folder

    + +

    After creating the model configuration file (model_config.yaml), we will now create a model archive that includes the configuration and additional metadata, such as versioning information. Since the model weights are large, we will not include them inside the archive. Instead, the handler will access the weights by following the model_path specified in the model configuration. Note that in this example, we have chosen to use the “no-archive” format, which creates a model folder containing all necessary files. This allows us to easily modify the config files for experimentation without any friction. Later, we can also select the mar or tgz format to create a more easily transportable artifact.

    + +
    mkdir model_store
    +torch-model-archiver --model-name vllm --version 1.0 --handler vllm_handler --config-file model_config.yaml --archive-format no-archive --export-path model_store/
    +
    + +

    4. Deploy the Model

    + +

    The next step is to start a TorchServe instance and load the model. Please note that we have disabled token authentication for local testing purposes. It is highly recommended to implement some form of authentication when publicly deploying any model.

    + +

    To start the TorchServe instance and load the model, run the following command:

    + +
    torchserve --start --ncs  --model-store model_store --models vllm --disable-token-auth
    +
    + +

    You can monitor the progress of the model loading through the log statements. Once the model has finished loading, you can proceed to test the deployment.

    + +

    5. Test the Deployment

    + +

    The vLLM integration uses an OpenAI API-compatible format, so we can use either a specialized tool for this purpose or curl. The JSON data we are using here includes the model identifier as well as the prompt text. Other options and their default values can be found in the vLLMEngine docs.

    + +
    echo '{
    +  "model": "llama3",
    +  "prompt": "A robot may not injure a human being",
    +  "stream": 0
    +}' | curl --header "Content-Type: application/json"   --request POST --data-binary @-   http://localhost:8080/predictions/vllm/1.0/v1/completions
    +
    + +

    The output of the request looks like this:

    + +
    {
    +  "id": "cmpl-cd29f1d8aa0b48aebcbff4b559a0c783",
    +  "object": "text_completion",
    +  "created": 1727211972,
    +  "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    +  "choices": [
    +    {
    +      "index": 0,
    +      "text": " or, through inaction, allow a human being to come to harm.\nA",
    +      "logprobs": null,
    +      "finish_reason": "length",
    +      "stop_reason": null,
    +      "prompt_logprobs": null
    +    }
    +  ],
    +  "usage": {
    +    "prompt_tokens": 10,
    +    "total_tokens": 26,
    +    "completion_tokens": 16
    +  }
    +}
    +
    + +

    When streaming is disabled, TorchServe collects the full answer and sends it in one go after the last token is created. If we flip the stream parameter, we receive piecewise data containing a single token in each message.
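    For example, a streaming request is just the earlier payload with the stream flag flipped (a minimal variation of the request shown above):

    echo '{
      "model": "llama3",
      "prompt": "A robot may not injure a human being",
      "stream": 1
    }' | curl --header "Content-Type: application/json"   --request POST --data-binary @-   http://localhost:8080/predictions/vllm/1.0/v1/completions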

    + +

    Conclusion

    + +

    In this blog post, we explored the new, native integration of the vLLM inference engine into TorchServe. We demonstrated how to locally deploy a Llama 3.1 70B model using the ts.llm_launcher script and how to create a model archive for deployment on any TorchServe instance. Additionally, we discussed how to build and run the solution in a Docker container for deployment on Kubernetes or EKS. In future work, we plan to enable multi-node inference with vLLM and TorchServe, as well as offer a pre-built Docker image to simplify the deployment process.

    + +

    We would like to express our gratitude to Mark Saroufim and the vLLM team for their invaluable support in the lead-up to this blog post.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/deprecation-cuda-python-support/index.html b/blog/deprecation-cuda-python-support/index.html new file mode 100644 index 000000000000..e8a19e61e3a6 --- /dev/null +++ b/blog/deprecation-cuda-python-support/index.html @@ -0,0 +1,699 @@ + + + + + + + + + + + + + Deprecation of CUDA 11.6 and Python 3.7 Support | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    For the upcoming PyTorch 2.0 feature release (target March 2023), we will target CUDA 11.7 as the stable version and CUDA 11.8 as the experimental version of CUDA and Python >=3.8, <=3.11.

    + +

    If you are still using or depending on CUDA 11.6 or Python 3.7 builds, we strongly recommend moving to at least CUDA 11.7 and Python 3.8, as it would be the minimum versions required for PyTorch 2.0.
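    For example, upgrading an existing environment to a CUDA 11.7 build typically looks like the command below (a sketch; please check the Get Started page on pytorch.org for the exact command for your platform and PyTorch version):

    pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117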

    + +

    Please note that as of Feb 1, CUDA 11.6 and Python 3.7 are no longer included in the nightlies.

    + +

    Please refer to the Release Compatibility Matrix for PyTorch releases:

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    PyTorch Version | Python | Stable CUDA | Experimental CUDA
    2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84
    1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96
    1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44
    + +

    As of 2/1/2023

    + +

    For more information on PyTorch releases, updated compatibility matrix and release policies, please see (and bookmark) Readme.

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/develop-android-applications/index.html b/blog/develop-android-applications/index.html new file mode 100644 index 000000000000..70f83a165fd0 --- /dev/null +++ b/blog/develop-android-applications/index.html @@ -0,0 +1,688 @@ + + + + + + + + + + + + + Learn how to develop Android applications with ExecuTorch and Llama models | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Arm + +

    +

    This blog is courtesy of the PyTorch team at Arm. More details can be found here.

    + +

    Arm’s compute platform is delivering GenAI applications on phones, laptops, and servers. Cost, privacy, performance, security, and energy efficiency are just some of the reasons developers are investigating on-device AI.

    + +

    A new Learning Path explaining how to leverage the capabilities of large language models (LLMs) on Android using ExecuTorch and XNNPACK is now available.

    + +

    Here’s a summary of what you’ll learn:

    + +
      +
    • Development Environment setup
      The Learning Path begins by guiding you through setting up your development environment, ensuring you have all the necessary tools installed, including Android Studio, the Android NDK, Java JDK, and Python.
    • ExecuTorch and XNNPACK
      You’ll learn about the core technologies: ExecuTorch, a framework for deploying PyTorch models to edge devices, and XNNPACK, a high-performance library for executing neural networks on Arm-based platforms.
    • Llama models
      The Learning Path explores Llama, a family of powerful LLMs, focusing specifically on the 8B Llama 3 model. You’ll learn about quantization techniques, which are essential for optimizing model size and performance on mobile devices.
    • Prepare Llama models for ExecuTorch
      You’ll be guided through the process of downloading, exporting, and evaluating Llama models, ensuring they are ready for deployment using ExecuTorch.
    • Check model performance on Android
      The Learning Path walks you through cross-compiling the Llama runner binary for Android, allowing you to test your model’s performance on your phone.
    • Build and run an Android Chat App
      Finally, you’ll learn how to build a native Android chat app using the LlamaDemo application from the ExecuTorch repository. This hands-on experience allows you to put your knowledge into practice and create a real-world application.
    +

    Explore this Learning Path if you want to learn how to leverage the power of LLMs on your Android phone, and gain expertise in tools for on-device machine learning.

    + +

    Dig into the excitement of building Android chat apps and understand more about how they work on the Arm Developer Hub.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/dinosaurs-to-seismic-imaging/index.html b/blog/dinosaurs-to-seismic-imaging/index.html new file mode 100644 index 000000000000..b5607df06921 --- /dev/null +++ b/blog/dinosaurs-to-seismic-imaging/index.html @@ -0,0 +1,756 @@ + + + + + + + + + + + + + From PyTorch Conference 2023: From Dinosaurs to Seismic Imaging with Intel | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Ramya Ravi, Susan Kahler at Intel + +

    +

    Dinosaur fossil

    + +

    Lightning Talk 1: Seismic Data to Subsurface Models with OpenFWI

    + +

    Speaker: Benjamin Consolvo, AI Software Engineering Manager, Intel, LinkedIn

    + +

    Session Overview

    + +

    In this session, Ben begins with an overview of seismic imaging and full waveform inversion (FWI). Seismic imaging and FWI help us explore land for important subsurface minerals necessary for human thriving. To find those crucial subsurface minerals, we need to image the subsurface with a high degree of accuracy at a low cost, which involves two main challenges. He explains the solutions to those challenges using AI, summarized below.

    + + + + + + + + + + + + + + +
    Challenges | Solutions using AI
    Traditional physics based FWI requires an accurate starting model. | Data-driven deep learning solutions do not require an accurate starting model.
    GPUs are typically used for fine-tuning neural networks but are often unavailable and expensive. | CPUs are highly available, inexpensive, and viable for AI fine-tuning. The new 4th Gen Intel® Xeon® Scalable processor has the built-in AI accelerator engine called Intel® AMX (Intel® Advanced Matrix Extensions) that helps to accelerate AI training and inference performance.
    + +

    Next, he shows the wave propagation for the subsurface model and the corresponding seismic shot gathers. In his example, the shot gathers are synthetically generated, time-sampled records of sound recordings from a shot (like a dynamite explosion or vibroseis truck) recorded by geophones spread across a large area. For this application, the training data consists of pairs of subsurface model images and seismic shot gather images, where the subsurface model is predicted from the shot gather.

    + + + + + + + + + + + + + + + + + + + + + + +
    | Number of Seismic Shot Images | Number of Subsurface Model Images
    Train | 120,000 | 24,000
    Test | 25,000 | 5,000
    Validation | 5,000 | 1,000
    + +

    In this application, the algorithm used during training was InversionNET (encoder-decoder convolutional neural network). Check out the implementation details for InversionNET architecture in Deng et al. (2021).

    + +

    He then shows the results:

    + +
      +
    1. Prediction versus ground truth model after one epoch and at 50 epochs. After training InversionNET, the predicted model is much closer to the ground truth image.
    2. Training loss and validation loss curves decreasing over time across 50 epochs.
    + +

    Finally, Ben concludes his talk by highlighting that he was able to successfully fine-tune a deep neural network, without an accurate starting model, to obtain a subsurface model on a 4th Gen Intel® Xeon® Scalable processor.

    + +

    Watch the full video recording here and download the presentation. More details can be found in this blog.

    + +

    About the Speaker

    + +

    Ben Consolvo

    + +

    Ben Consolvo is an AI Solutions Engineering Manager at Intel. He has been building a team and a program around Intel’s AI technology paired with Intel’s hardware offerings. He brings a background and passion in data science, particularly in deep learning (DL) and computer vision. He has applied his skills in DL in the cybersecurity industry to automatically identify phishing websites, as well as to the oil and gas industry to identify subsurface features for geophysical imaging.

    + +

    Lightning Talk 2: Dinosaur Bone Hunt

    + +

    Speaker: Bob Chesebrough, Sr Solution Architect, Intel, LinkedIn

    + +

    Session Overview

    + +

    In this session, Bob starts the presentation by explaining his interest in collecting dinosaur bones and giving an overview of the Intel AI software portfolio.

    + +

    He then explains the steps to create a dinosaur site treasure map or dinosaur bone likelihood map:

    + +
      +
    1. Collect data and create training data (New Mexico aerial photos of the Morrison Formation - a famous dinosaur bone bed in the Western United States - and the GPS coordinates for small bone fragments discovered)
    2. Train a simple ResNet 18 model using Intel® Extension for PyTorch (see the sketch after this list)
    3. Score the model on Utah photos and create a heat map
    + +
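    As referenced in step 2 above, a minimal sketch of that training setup might look like the following. The model head, optimizer settings, and class count are illustrative assumptions; the key step is passing the model and optimizer through ipex.optimize from Intel Extension for PyTorch:

    import torch
    import torchvision
    import intel_extension_for_pytorch as ipex

    # Illustrative fine-tuning setup for a binary "bone likelihood" classifier
    model = torchvision.models.resnet18(weights="IMAGENET1K_V1")
    model.fc = torch.nn.Linear(model.fc.in_features, 2)  # bone / no bone
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    model.train()
    # ipex.optimize applies CPU-side optimizations to the model and optimizer
    model, optimizer = ipex.optimize(model, optimizer=optimizer)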

    Finally, Bob shows the results: dinosaur bones were discovered in Utah using the dinosaur bone likelihood map. Go to the GitHub repository to access the code sample and try it out using Intel Extension for PyTorch.

    + +

    Watch the full video recording here and download the presentation. More details can be found in this blog.

    + +

    About the Speaker

    + +

    Bob Chesebrough

    + +

    Bob Chesebrough has over three decades of industry experience in software development and AI solution engineering for Fortune 100 companies and national laboratories. He is also a hobbyist who has logged over 800 miles and 1000 hours in the field finding dinosaur bones. He and his sons discovered an important fossil of the only known crocodilian from the Jurassic in New Mexico; they have also discovered and logged into the museum 2000+ bone localities and described a new mass bone bed in New Mexico.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/docathon-2025/index.html b/blog/docathon-2025/index.html new file mode 100644 index 000000000000..9582eb0e3b93 --- /dev/null +++ b/blog/docathon-2025/index.html @@ -0,0 +1,684 @@ + + + + + + + + + + + + + Announcing the PyTorch Docathon 2025 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    May 01, 2025

    +

    + Announcing the PyTorch Docathon 2025 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    PyTorch Docathon 2025

    + +

    We’re thrilled to announce the 2025 PyTorch Docathon! This is a hackathon-style event aimed at enhancing PyTorch documentation with the support of the community. Documentation is a vital component of any technology, and by refining it, we can simplify the onboarding process for new users, help them effectively utilize PyTorch’s features, and ultimately speed up the transition from research to production in machine learning.

    + +

    WHY PARTICIPATE

    + +

    Low Barrier to Entry

    + +

    Unlike many open-source projects that require deep knowledge of the codebase and previous contributions to join hackathon events, the Docathon is tailored for newcomers. While we expect participants to be familiar with Python, and have basic knowledge of PyTorch and machine learning, there are tasks related to website issues that don’t even require that level of expertise.

    + +

    Tangible Results

    + +

    A major advantage of the Docathon is witnessing the immediate impact of your contributions. Enhancing documentation significantly boosts a project’s usability and accessibility, and you’ll be able to observe these improvements directly. Seeing tangible outcomes can also be a strong motivator to continue contributing.

    + +

    Collaborative Environment

    + +

    The Docathon fosters a collaborative atmosphere, offering you the chance to work alongside other contributors and PyTorch maintainers to improve the documentation. This is a fantastic opportunity to learn from peers, exchange ideas, and build connections.

    + +

    Learning Opportunities

    + +

    Even if you’re not a PyTorch expert, the Docathon offers a valuable learning experience. You’ll have the chance to delve into PyTorch modules, test tutorials on your machine, and explore them in the CI environment.

    + +

    WHO SHOULD PARTICIPATE

    + +

    Whether you’re a seasoned documentation expert or just starting out, we invite everyone to join in the PyTorch docathon to contribute and develop your skills and knowledge to help improve the documentation for everyone! We will have issues labelled by skill level, and the PyTorch Discord will be available for collaboration and help.

    + +

    EVENT DETAILS

    + +
      +
    • June 3: Kick-off 10 AM PT
    • June 4 - June 15: Submissions and Feedback
    • June 16 - June 17: Final Reviews
    • June 18: Winner Announcements
    + +

    Make sure to RSVP to the event so you receive all the notifications and instructions on how to participate.

    + +

    Further details about the Docathon will be shared during the Kick-off call on June 3.

    + +

    Don’t forget to register for this year’s event: RSVP now

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/docathon-h1-2023-wrap-up/index.html b/blog/docathon-h1-2023-wrap-up/index.html new file mode 100644 index 000000000000..59a8350f65e2 --- /dev/null +++ b/blog/docathon-h1-2023-wrap-up/index.html @@ -0,0 +1,662 @@ + + + + + + + + + + + + + 🎉 PyTorch Docathon H1 2023 Wrap-up 🎉 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Thank you to all who participated in our first ever PyTorch Docathon, the results have been nothing short of amazing! We want to extend our sincerest gratitude to all the participants who made this event a resounding success. Your passion, talent, and hard work have left an indelible mark on the PyTorch documentation.

    + +

    The virtual Docathon ran from May 31 through June 15, with more than 230 registrants and more than 110 participants joining the Docathon Slack channel; the energy and enthusiasm were palpable. Entrants were judged on the difficulty of their submissions, which resulted in over 40 merged pull requests, the publication of four new tutorials, and the addition of one new example.

    + +

    We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide. See the full list of contributors here.

    + +

    Meet the top contributors:

    + + + +

    As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch documentation and code, and pushing the boundaries of what’s possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the AI community.

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/docathon-june-2024/index.html b/blog/docathon-june-2024/index.html new file mode 100644 index 000000000000..eeee4d7edaa0 --- /dev/null +++ b/blog/docathon-june-2024/index.html @@ -0,0 +1,661 @@ + + + + + + + + + + + + + Announcing PyTorch Docathon June, 2024 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are thrilled to announce the upcoming PyTorch Docathon in June! The Docathon, akin to a hackathon, is an event dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Documentation is a vital component of any technology. By refining it, we can simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine learning. See our previous events here and here.

    + +

    Why Participate

    + +

    The Docathon is an inclusive event designed to be accessible to newcomers, requiring only a basic understanding of Python, PyTorch, and Machine Learning, with some tasks not even requiring these skills. It offers a rewarding experience as participants can see the direct impact of their contributions on the project’s usability and accessibility. The Docathon promotes a collaborative environment, allowing participants to work with other contributors and PyTorch maintainers, fostering the exchange of ideas and networking. It also provides a rich learning experience, offering the opportunity to explore PyTorch modules, update docstrings, and test tutorials.

    + +

    Event Details

    + +

    June 4: Kick-off
    +June 4 - 16: Submissions and Feedback
    +June 17 - 18: Final Reviews
    +June 20: Winner Announcements

    + +

    Further details for the Docathon will be announced at the Kick-off call on June 4.

    + +

    Please register to join this year’s event.

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/docathon-kickoff-h1-2024/index.html b/blog/docathon-kickoff-h1-2024/index.html new file mode 100644 index 000000000000..4553981ee48d --- /dev/null +++ b/blog/docathon-kickoff-h1-2024/index.html @@ -0,0 +1,703 @@ + + + + + + + + + + + + + Ready, Set, Contribute: PyTorch Docathon Kickoff H1 2024 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    The PyTorch Docathon is now live! This event is dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Our hope with this Docathon is to simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine learning.

    + +

    JOIN THE KICK-OFF EVENT
    +on June 4th at 10 AM PT

    + +

    Event Details

    + +
      +
    • June 4: Kick-off - join a 30-minute livestream kick-off event on Discord on June 4th at 10 AM PT here. If you can’t join the kick-off event, watch our welcome video on YouTube
    • June 4-June 16: Submissions and Feedback
    • June 17-18: Final Reviews
    • June 20: Winner Announcements
    + +

    How to Contribute

    + +

    Review the Docathon H1 2024 issues in the pytorch/pytorch and pytorch/tutorials repos, which contain all the necessary information on participating in the Docathon and highlight the specific issues to work on. Remember to sign the CLA in your first PR and adhere to the Code of Conduct guidelines.

    + +

    Read the Code of Conduct

    + +

    Take a moment to review the PyTorch code of conduct found here. This document outlines the expectations for behavior and communication within our team, and it is important that everyone is aware of and adheres to these guidelines.

    + +

    Join our Discord

    + +

    This channel serves as the main communication hub during the Docathon. You can join it using this link:

    + +

    JOIN DISCORD SERVER

    + +

    When you first join the server, you will have limited access. To gain full access to our Discord PyTorch Docathon Channel:

    + +
      +
    1. Enter the server and navigate to the #self-roles channel.
    2. In the #self-roles channel, click on the ‘Join Docathon’ button in the relevant post to assign yourself the docathon role.
    3. After assigning the role, you will see the ‘PyTorch Docathon H1 2024 Section’ in the left-hand menu for discussions.
    4. To help prevent spam, we are asking that you change your server username to your GitHub username or the email username you registered with.
    + +

    Explore the GitHub Issues

    + +

    All the Docathon issues are posted on GitHub. You can find them by the docathon-h1-2024 label in the following participating repositories:

    + + + +

    The issues are categorized into three levels of difficulty: easy, medium, and advanced. If this is your first time contributing to PyTorch, we recommend starting with an issue at the easy level.

    + +

    Prizes for Winners

    + +

    We will have a leaderboard throughout the duration of the Docathon. The more you contribute, the higher you’ll get on the board! Our top three winners will get free admission to PyTorch Conference 2024.

    + +

    Thank you to our Partners

    + +

    This year, we’re thrilled to work with the PyTorch Teams at Meta, Google and Snowflake to help us put on a successful event. We’ll also be at Snowflake Dev Day on June 6 where you can hear from Meta’s Matthias Reso, and check out our PyTorch booth.

    + +

    Happy contributing!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/doctr-joins-pytorch-ecosystem/index.html b/blog/doctr-joins-pytorch-ecosystem/index.html new file mode 100644 index 000000000000..ad753056a7ba --- /dev/null +++ b/blog/doctr-joins-pytorch-ecosystem/index.html @@ -0,0 +1,783 @@ + + + + + + + + + + + + + docTR joins PyTorch Ecosystem: From Pixels to Data, Building a Recognition Pipeline with PyTorch and docTR | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Olivier Dulcy & Sebastian Olivera, Mindee + +

    +

    docTR logo

    + +

    We’re thrilled to announce that the docTR project has been integrated into the PyTorch ecosystem! This integration ensures that docTR aligns with PyTorch’s standards and practices, giving developers a reliable, community-backed solution for powerful OCR workflows.

    + +

    For more information on what it means to be a PyTorch ecosystem project, see the PyTorch Ecosystem Tools page.

    + +

    About docTR

    + +

    docTR is an Apache 2.0 project developed and distributed by Mindee to help developers integrate OCR capabilities into applications with no prior knowledge required.

    + +

    To quickly and efficiently extract text information, docTR uses a two-stage approach:

    + +
      +
    • First, it performs text detection to localize words.
    • Then, it conducts text recognition to identify all characters in a word.
    + +

    Detection and recognition are performed by state-of-the-art models written in PyTorch. To learn more about this approach, you can refer to the docTR documentation.

    + +

    docTR enhances the user experience in PyTorch projects by providing high-performance OCR capabilities right out of the box. Its specially designed models require minimal to no fine-tuning for common use cases, allowing developers to quickly integrate advanced document analysis features.

    + +

    Local installation

    + +

    docTR requires Python >= 3.10 and supports Windows, Mac, and Linux. Please refer to our README for the necessary dependencies for MacBooks with the M1 chip.

    + +
    pip3 install -U pip
    +pip3 install "python-doctr[torch,viz]"
    +
    + +

    This will install docTR along with the latest version of PyTorch.

    + +
    Note: docTR also provides Docker images for easy deployment, for example as part of a Kubernetes cluster.
    +
    + +

    Text recognition

    + +

    Now, let’s try docTR’s OCR recognition on this sample:

    + +

    OCR sample

    + +

    The OCR recognition model expects an image with only one word on it and will output the predicted word with a confidence score. You can use the following snippet to test OCR capabilities from docTR:

    + +
    from doctr.io import DocumentFile
    +from doctr.models import recognition_predictor
    +
    +doc = DocumentFile.from_images("/path/to/image")
    +
    +# Load the OCR model
    +# This will download pre-trained models hosted by Mindee
    +model = recognition_predictor(pretrained=True)
    +
    +result = model(doc)
    +print(result)
    +
    + +

    Here, the most important line of code is model = recognition_predictor(pretrained=True). This will load a default text recognition model, crnn_vgg16_bn, but you can select other models through the arch parameter. You can check out the available architectures.
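    For instance, swapping in a different recognition architecture is just a matter of passing its name (the architecture below is one example; see the available architectures list for valid values):

    # load an alternative recognition architecture instead of the default crnn_vgg16_bn
    model = recognition_predictor(arch="crnn_mobilenet_v3_small", pretrained=True)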

    + +

    When run on the sample, the recognition predictor retrieves the following data: [('MAGAZINE', 0.9872216582298279)]

    + +
    Note: with the DocumentFile object, docTR provides an easy way to manipulate PDFs or images.
    +
    + +

    Text detection

    + +

    The last example was a crop on a single word. Now, what about an image with several words on it, like this one?

    + +

    photo of magazines

    + +

    A text detection model is used before the text recognition to output a segmentation map representing the location of the text. Following that, the text recognition is applied on every detected patch.

    + +

    Below is a snippet to run only the detection part:

    + +
    from doctr.io import DocumentFile
    +from doctr.models import detection_predictor
    +from matplotlib import pyplot as plt
    +from doctr.utils.geometry import detach_scores
    +from doctr.utils.visualization import draw_boxes
    +
    +doc = DocumentFile.from_images("path/to/my/file")
    +model = detection_predictor(pretrained=True)
    +
    +result = model(doc)
    +
    +draw_boxes(detach_scores([result[0]["words"]])[0][0], doc[0])
    +plt.axis('off')
    +plt.show()
    +
    + +

    Running it on the full sample yields the following:

    + +

    photo of magazines

    + +

    Similarly to the text recognition, detection_predictor will load a default model (fast_base here). You can also load another one by providing it through the arch parameter.

    + +

    The full implementation

    + +

    Now, let’s plug both components into the same pipeline.

    + +

    Conveniently, docTR provides a wrapper that does exactly that for us:

    + +
    from doctr.io import DocumentFile
    +from doctr.models import ocr_predictor
    +
    +doc = DocumentFile.from_images("/path/to/image")
    +
    +model = ocr_predictor(pretrained=True, assume_straight_pages=False)
    +
    +result = model(doc)
    +result.show()
    +
    + +

    photo of magazines

    + +

    The last line should display a matplotlib window which shows the detected patches. Hovering the mouse over them will display their contents.

    + +

    You can also do more with this output, such as reconstituting a synthetic document like so:

    + +
    import matplotlib.pyplot as plt
    +
    +synthetic_pages = result.synthesize()
    +plt.imshow(synthetic_pages[0])
    +plt.axis('off')
    +plt.show()
    +
    + +

    black text on white

    + +

    The pipeline is highly customizable, where you can modify the detection or recognition model behaviors by passing arguments to the ocr_predictor. Please refer to the documentation to learn more about it.
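    For example, you can select the detection and recognition architectures explicitly (a short sketch; the det_arch/reco_arch keyword arguments and architecture names should be checked against the docTR documentation):

    model = ocr_predictor(
        det_arch="db_resnet50",     # detection model
        reco_arch="crnn_vgg16_bn",  # recognition model
        pretrained=True,
        assume_straight_pages=False,
    )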

    + +

    Conclusion

    + +

    We’re excited to welcome docTR into the PyTorch Ecosystem, where it seamlessly integrates with PyTorch pipelines to deliver state-of-the-art OCR capabilities right out of the box.

    + +

    By empowering developers to quickly extract text from images or PDFs using familiar tooling, docTR simplifies complex document analysis tasks and enhances the overall PyTorch experience.

    + +

    We invite you to explore the docTR GitHub repository, join the docTR community on Slack, and reach out at contact@mindee.com for inquiries or collaboration opportunities.

    + +

    Together, we can continue to push the boundaries of document understanding and develop even more powerful, accessible tools for everyone in the PyTorch community.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/easily-list-and-initialize-models-with-new-apis-in-torchvision/index.html b/blog/easily-list-and-initialize-models-with-new-apis-in-torchvision/index.html new file mode 100644 index 000000000000..73dd841e10eb --- /dev/null +++ b/blog/easily-list-and-initialize-models-with-new-apis-in-torchvision/index.html @@ -0,0 +1,762 @@ + + + + + + + + + + + + + Easily list and initialize models with new APIs in TorchVision | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Vasilis Vryniotis and Laurence Rouesnel + +

    +

    TorchVision now supports listing and initializing all available built-in models and weights by name. This new API builds upon the recently introduced Multi-weight support API, is currently in Beta, and it addresses a long-standing request from the community.

    + +

    + +

    + +

    You can try out the new API in the latest nightly release of TorchVision. We’re looking to collect feedback ahead of finalizing the feature in TorchVision v0.14. We have created a dedicated Github Issue where you can post your comments, questions and suggestions!

    + +

    Querying and initializing available models

    + +

    Before the new model registration API, developers had to query the __dict__ attribute of the modules in order to list all available models or to fetch a specific model builder method by its name:

    + +
    # Initialize a model by its name:
    +model = torchvision.models.__dict__[model_name]()
    +
    +# List available models:
    +available_models = [
    +    k for k, v in torchvision.models.__dict__.items()
    +    if callable(v) and k[0].islower() and k[0] != "_"
    +]
    +
    + +

The above approach does not always produce the expected results and is hard to discover. For example, since the get_weight() method is exposed publicly under the same module, it will be included in the list despite not being a model. In general, reducing the verbosity (fewer imports, shorter names, etc.) and being able to initialize models and weights directly from their names (better support for configs, TorchHub, etc.) was feedback previously provided by the community. To solve this problem, we have developed a model registration API.

    + +

    A new approach

    + +

    We’ve added 4 new methods under the torchvision.models module:

    + +
    from torchvision.models import get_model, get_model_weights, get_weight, list_models
    +
    + +

    The styles and naming conventions align closely with a prototype mechanism proposed by Philip Meier for the Datasets V2 API, aiming to offer a similar user experience. The model registration methods are kept private on purpose as we currently focus only on supporting the built-in models of TorchVision.

    + +

    List models

    + +

    Listing all available models in TorchVision can be done with a single function call:

    + +
    >>> list_models()
    +['alexnet', 'mobilenet_v3_large', 'mobilenet_v3_small', 'quantized_mobilenet_v3_large', ...]
    +
    + +

    To list the available models of specific submodules:

    + +
    >>> list_models(module=torchvision.models)
    +['alexnet', 'mobilenet_v3_large', 'mobilenet_v3_small', ...]
    +>>> list_models(module=torchvision.models.quantization)
    +['quantized_mobilenet_v3_large', ...]
    +
    + +

    Initialize models

    + +

    Now that you know which models are available, you can easily initialize a model with pre-trained weights:

    + +
    >>> get_model("quantized_mobilenet_v3_large", weights="DEFAULT")
    +QuantizableMobileNetV3(
    +  (features): Sequential(
    +   ....
    +   )
    +)
    +
    + +

    Get weights

    +

    Sometimes, while working with config files or using TorchHub, you might have the name of a specific weight entry and wish to get its instance. This can be easily done with the following method:

    + +
    >>> get_weight("ResNet50_Weights.IMAGENET1K_V2")
    +ResNet50_Weights.IMAGENET1K_V2
    +
    + +

    To get the enum class with all available weights of a specific model you can use either its name:

    + +
    >>> get_model_weights("quantized_mobilenet_v3_large")
    +<enum 'MobileNet_V3_Large_QuantizedWeights'>
    +
    + +

    Or its model builder method:

    + +
    >>> get_model_weights(torchvision.models.quantization.mobilenet_v3_large)
    +<enum 'MobileNet_V3_Large_QuantizedWeights'>
    +
    + +

    TorchHub support

    +

    The new methods are also available via TorchHub:

    + +
    import torch
    +
    +# Fetching a specific weight entry by its name:
    +weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2")
    +
    +# Fetching the weights enum class to list all available entries:
    +weight_enum = torch.hub.load("pytorch/vision", "get_model_weights", name="resnet50")
    +print([weight for weight in weight_enum])
    +
    + +

    Putting it all together

    + +

    For example, if you wanted to retrieve all the small-sized models with pre-trained weights and initialize one of them, it’s a matter of using the above APIs:

    + +
    import torchvision
    +from torchvision.models import get_model, get_model_weights, list_models
    +
    +
    +max_params = 5000000
    +
    +tiny_models = []
    +for model_name in list_models(module=torchvision.models):
    +    weights_enum = get_model_weights(model_name)
    +    if len([w for w in weights_enum if w.meta["num_params"] <= max_params]) > 0:
    +        tiny_models.append(model_name)
    +
    +print(tiny_models)
    +# ['mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mobilenet_v2', ...]
    +
    +model = get_model(tiny_models[0], weights="DEFAULT")
    +print(sum(x.numel() for x in model.state_dict().values()))
    +# 2239188
    +
    + +

    For more technical details please see the original RFC. Please spare a few minutes to provide your feedback on the new API, as this is crucial for graduating it from beta and including it in the next release. You can do this on the dedicated Github Issue. We are looking forward to reading your comments!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/ecosystem-day-2021-recap/index.html b/blog/ecosystem-day-2021-recap/index.html new file mode 100644 index 000000000000..a7234aa7e96f --- /dev/null +++ b/blog/ecosystem-day-2021-recap/index.html @@ -0,0 +1,673 @@ + + + + + + + + + + + + + PyTorch Ecosystem Day 2021 Recap and New Contributor Resources | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Thank you to our incredible community for making the first ever PyTorch Ecosystem Day a success! The day was filled with discussions on new developments, trends and challenges showcased through 71 posters, 32 breakout sessions and 6 keynote speakers.

    + +
    + +
    + +

Special thanks to our keynote speakers: Piotr Bialecki, Ritchie Ng, Miquel Farré, Joe Spisak, Geeta Chauhan, and Suraj Subramanian, who shared updates from the latest release of PyTorch, exciting work being done with partners, a use case example from Disney, the growth and development of the PyTorch community in Asia Pacific, and the latest contributor highlights.

    + +

If you missed the opening talks, you can rewatch them here:

    + + +

    In addition to the talks, we had 71 posters covering various topics such as multimodal, NLP, compiler, distributed training, researcher productivity tools, AI accelerators, and more. From the event, it was clear that an underlying thread that ties all of these different projects together is the cross-collaboration of the PyTorch community. Thank you for continuing to push the state of the art with PyTorch!

    + +

To view the full catalogue of posters, please visit the PyTorch Ecosystem Day 2021 Event Page.

    + +

    New Contributor Resources

    +

    Today, we are also sharing new contributor resources that we are trying out to give you the most access to up-to-date news, networking opportunities and more.

    +
      +
    • Contributor Newsletter - Includes curated news including RFCs, feature roadmaps, notable PRs, editorials from developers, and more to support keeping track of everything that’s happening in our community.
    • +
    • Contributors Discussion Forum - Designed for contributors to learn and collaborate on the latest development across PyTorch.
    • +
• PyTorch Developer Podcast (Beta) - Edward Yang, PyTorch Research Scientist at Facebook AI, shares bite-sized (10 to 20 min) podcast episodes discussing all sorts of internal development topics in PyTorch.
    • +
    + +

    Thank you,

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/ecosystem_day_2021/index.html b/blog/ecosystem_day_2021/index.html new file mode 100644 index 000000000000..cc295a41d23c --- /dev/null +++ b/blog/ecosystem_day_2021/index.html @@ -0,0 +1,691 @@ + + + + + + + + + + + + + Announcing PyTorch Ecosystem Day | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    March 09, 2021

    +

    + Announcing PyTorch Ecosystem Day +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We’re proud to announce our first PyTorch Ecosystem Day. The virtual, one-day event will focus completely on our Ecosystem and Industry PyTorch communities!

    + +

    PyTorch is a deep learning framework of choice for academics and companies, all thanks to its rich ecosystem of tools and strong community. As with our developers, our ecosystem partners play a pivotal role in the development and growth of the community.

    + +
    + +
    + +

    We will be hosting our first PyTorch Ecosystem Day, a virtual event designed for our ecosystem and industry communities to showcase their work and discover new opportunities to collaborate.

    + +

PyTorch Ecosystem Day will be held on April 21, with both a morning and evening session, to ensure we reach our global community. Join us virtually for a day filled with discussions on new developments, trends, challenges, and best practices through keynotes, breakout sessions, and a unique networking opportunity hosted through Gather.Town.

    + +

    Event Details

    +

April 21, 2021 (Pacific Time) - Fully digital experience

    + +
      +
    • +

Morning Session (EMEA): Opening Talks - 8:00 am-9:00 am PT; Poster Exhibition & Breakout Sessions - 9:00 am-12:00 pm PT

      +
    • +
    • +

Evening Session (APAC/US): Opening Talks - 3:00 pm-4:00 pm PT; Poster Exhibition & Breakout Sessions - 3:00 pm-6:00 pm PT

      +
    • +
    • +

      Networking - 9:00 am-7:00 pm PT

      +
    • +
    + +

    There are two ways to participate in PyTorch Ecosystem Day:

    + +
      +
    1. +

Poster Exhibition from the PyTorch ecosystem and industry communities covering a variety of topics. Posters are available for viewing throughout the duration of the event. To be part of the poster exhibition, please see below for submission details. If your poster is accepted, we highly recommend tending to your poster during one of the morning or evening sessions, or both!

      +
    2. +
    3. +

      Breakout Sessions are 40-min sessions freely designed by the community. The breakouts can be talks, demos, tutorials or discussions. Note: you must have an accepted poster to apply for the breakout sessions.

      +
    4. +
    + +

    Call for posters now open! Submit your proposal today! Please send us the title and summary of your projects, tools, and libraries that could benefit PyTorch researchers in academia and industry, application developers, and ML engineers for consideration. The focus must be on academic papers, machine learning research, or open-source projects. Please no sales pitches. Deadline for submission is March 18, 2021.

    + +

    Visit pytorchecosystemday.fbreg.com for more information and we look forward to welcoming you to PyTorch Ecosystem Day on April 21st!

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/effective-multi-objective-nueral-architecture/index.html b/blog/effective-multi-objective-nueral-architecture/index.html new file mode 100644 index 000000000000..d42251f35c78 --- /dev/null +++ b/blog/effective-multi-objective-nueral-architecture/index.html @@ -0,0 +1,784 @@ + + + + + + + + + + + + + Efficient Multi-Objective Neural Architecture Search with Ax | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + David Eriksson, Max Balandat + +

    +

    tl;dr

    + +

    Multi-Objective Optimization in Ax enables efficient exploration of tradeoffs (e.g. between model performance and model size or latency) in Neural Architecture Search. This method has been successfully applied at Meta for a variety of products such as On-Device AI. In this post, we provide an end-to-end tutorial that allows you to try it out yourself.

    + +

    Introduction

    + +

    Neural networks continue to grow in both size and complexity. Developing state-of-the-art architectures is often a cumbersome and time-consuming process that requires both domain expertise and large engineering efforts. In an attempt to overcome these challenges, several Neural Architecture Search (NAS) approaches have been proposed to automatically design well-performing architectures without requiring a human in-the-loop.

    + +

    Despite being very sample-inefficient, naïve approaches like random search and grid search are still popular for both hyperparameter optimization and NAS (a study conducted at NeurIPS 2019 and ICLR 2020 found that 80% of NeurIPS papers and 88% of ICLR papers tuned their ML model hyperparameters using manual tuning, random search, or grid search). But as models are often time-consuming to train and may require large amounts of computational resources, minimizing the number of configurations that are evaluated is important.

    + +

Ax is a general tool for black-box optimization that allows users to explore large search spaces in a sample-efficient manner using state-of-the-art algorithms such as Bayesian Optimization. At Meta, Ax is used in a variety of domains, including hyperparameter tuning, NAS, identifying optimal product settings through large-scale A/B testing, infrastructure optimization, and designing cutting-edge AR/VR hardware.

    + +

    In many NAS applications, there is a natural tradeoff between multiple metrics of interest. For instance, when deploying models on-device we may want to maximize model performance (e.g., accuracy), while simultaneously minimizing competing metrics such as power consumption, inference latency, or model size, in order to satisfy deployment constraints. In many cases, we have been able to reduce computational requirements or latency of predictions substantially by accepting a small degradation in model performance (in some cases we were able to both increase accuracy and reduce latency!). Principled methods for exploring such tradeoffs efficiently are key enablers of Sustainable AI.

    + +

    At Meta, we have successfully used multi-objective Bayesian NAS in Ax to explore such tradeoffs. Our methodology is being used routinely for optimizing AR/VR on-device ML models. Beyond NAS applications, we have also developed MORBO which is a method for high-dimensional multi-objective optimization that can be used to optimize optical systems for augmented reality (AR).

    + +

    Fully automated Multi-Objective NAS with Ax

    + +

    Ax’s Scheduler allows running experiments asynchronously in a closed-loop fashion by continuously deploying trials to an external system, polling for results, leveraging the fetched data to generate more trials, and repeating the process until a stopping condition is met. No human intervention or oversight is required. Features of the Scheduler include:

    + +
      +
    • +

      Customizability of parallelism, failure tolerance, and many other settings;

      +
    • +
    • +

      A large selection of state-of-the-art optimization algorithms;

      +
    • +
    • +

      Saving in-progress experiments (to a SQL DB or json) and resuming an experiment from storage;

      +
    • +
    • +

      Easy extensibility to new backends for running trial evaluations remotely.

      +
    • +
    + +

    The following illustration from the Ax scheduler tutorial summarizes how the scheduler interacts with any external system used to run trial evaluations:

    + + + +

    + +

    + +

    To run automated NAS with the Scheduler, the main things we need to do are:

    + +
      +
    • +

      Define a Runner, which is responsible for sending off a model with a particular architecture to be trained on a platform of our choice (like Kubernetes, or maybe just a Docker image on our local machine). In the tutorial below, we use TorchX for handling deployment of training jobs.

      +
    • +
    • +

      Define a Metric, which is responsible for fetching the objective metrics (such as accuracy, model size, latency) from the training job. In our tutorial, we use Tensorboard to log data, and so can use the Tensorboard metrics that come bundled with Ax.

      +
    • +
    + +
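To make the closed loop concrete, below is a minimal, hedged sketch of driving an optimization with the Scheduler. The experiment and generation_strategy objects are placeholders that would be built from your search space, Runner and Metrics; the class and option names follow the Ax Scheduler tutorial, but check the Ax documentation for the exact API in your version.

from ax.service.scheduler import Scheduler, SchedulerOptions


def run_automated_nas(experiment, generation_strategy):
    # `experiment` is assumed to already reference the Runner and Metrics described above,
    # and `generation_strategy` the optimization algorithm to use.
    scheduler = Scheduler(
        experiment=experiment,
        generation_strategy=generation_strategy,
        options=SchedulerOptions(total_trials=48),
    )
    # Deploy trials, poll for results, generate new candidates, and repeat until done.
    scheduler.run_all_trials()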

    Tutorial

    + +

    In our tutorial we show how to use Ax to run multi-objective NAS for a simple neural network model on the popular MNIST dataset. While the underlying methodology can be used for more complicated models and larger datasets, we opt for a tutorial that is easily runnable end-to-end on a laptop in less than an hour. In our example, we will tune the widths of two hidden layers, the learning rate, the dropout probability, the batch size, and the number of training epochs. The goal is to trade off performance (accuracy on the validation set) and model size (the number of model parameters) using multi-objective Bayesian optimization.

    + +

    The tutorial makes use of the following PyTorch libraries:

    + +
      +
    • +

      PyTorch Lightning (specifying the model and training loop)

      +
    • +
    • +

      TorchX (for running training jobs remotely / asynchronously)

      +
    • +
    • +

      BoTorch (the Bayesian optimization library that powers Ax’s algorithms)

      +
    • +
    + +

    The complete runnable example is available as a PyTorch Tutorial.

    + +

    Results

    + +

    The final results from the NAS optimization performed in the tutorial can be seen in the tradeoff plot below. Here, each point corresponds to the result of a trial, with the color representing its iteration number, and the star indicating the reference point defined by the thresholds we imposed on the objectives. We see that our method was able to successfully explore the trade-offs between validation accuracy and number of parameters and found both large models with high validation accuracy as well as small models with lower validation accuracy. Depending on the performance requirements and model size constraints, the decision maker can now choose which model to use or analyze further.

    + +

    + +

    + +

    Visualizations

    + +

    Ax provides a number of visualizations that make it possible to analyze and understand the results of an experiment. Here, we will focus on the performance of the Gaussian process models that model the unknown objectives, which are used to help us discover promising configurations faster. Ax makes it easy to better understand how accurate these models are and how they perform on unseen data via leave-one-out cross-validation. In the figures below, we see that the model fits look quite good - predictions are close to the actual outcomes, and predictive 95% confidence intervals cover the actual outcomes well. Additionally, we observe that the model size (num_params) metric is much easier to model than the validation accuracy (val_acc) metric.

    + + + + + +
    +

    + +

    + +

    + +

    +
    + +

    Takeaways

    + +
      +
    • +

      We showed how to run a fully automated multi-objective Neural Architecture Search using Ax.

      +
    • +
    • +

      Using the Ax Scheduler, we were able to run the optimization automatically in a fully asynchronous fashion - this can be done locally (as done in the tutorial) or by deploying trials remotely to a cluster (simply by changing the TorchX scheduler configuration).

      +
    • +
    • +

      The state-of-the-art multi-objective Bayesian optimization algorithms available in Ax allowed us to efficiently explore the tradeoffs between validation accuracy and model size.

      +
    • +
    + +

    Advanced Functionality

    + +

    Ax has a number of other advanced capabilities that we did not discuss in our tutorial. Among these are the following:

    + +

    Early Stopping

    + +

    When evaluating a new candidate configuration, partial learning curves are typically available while the NN training job is running. We can use the information contained in the partial curves to identify under-performing trials to stop early in order to free up computational resources for more promising candidates. While not demonstrated in the above tutorial, Ax supports early stopping out-of-the-box.

    + +

    High-dimensional search spaces

    + +

    In our tutorial, we used Bayesian optimization with a standard Gaussian process in order to keep the runtime low. However, these models typically scale to only about 10-20 tunable parameters. Our new SAASBO method (paper, Ax tutorial, BoTorch tutorial) is very sample-efficient and enables tuning hundreds of parameters. SAASBO can easily be enabled by passing use_saasbo=True to choose_generation_strategy.
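As a rough sketch (search_space is a placeholder for an Ax SearchSpace built elsewhere; see the Ax documentation for the full signature of choose_generation_strategy):

from ax.modelbridge.dispatch_utils import choose_generation_strategy


def make_saasbo_strategy(search_space):
    # Setting use_saasbo=True switches the Bayesian optimization surrogate to SAASBO,
    # which scales to much higher-dimensional search spaces.
    return choose_generation_strategy(search_space=search_space, use_saasbo=True)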

    + +

    Acknowledgements

    + +

    We thank the TorchX team (in particular Kiuk Chung and Tristan Rice) for their help with integrating TorchX with Ax, and the Adaptive Experimentation team @ Meta for their contributions to Ax and BoTorch.

    + +

    References

    + +

    D. Eriksson, P. Chuang, S. Daulton, M. Balandat. Optimizing model accuracy and latency using Bayesian multi-objective neural architecture search. Meta Research blog, July 2021.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/efficient-large-scale-training-with-pytorch/index.html b/blog/efficient-large-scale-training-with-pytorch/index.html new file mode 100644 index 000000000000..237b5cdf94db --- /dev/null +++ b/blog/efficient-large-scale-training-with-pytorch/index.html @@ -0,0 +1,1096 @@ + + + + + + + + + + + + + Efficient Large-Scale Training with Pytorch FSDP and AWS | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Less Wright, Hamid Shojanazeri, Geeta Chauhan + +

    +

    Cutting-edge AI models are becoming extremely large. The cost and overhead of training these models is increasing rapidly, and involves large amounts of engineering and guesswork to find the right training regime. FSDP reduces these costs significantly by enabling you to train much larger models with the same amount of resources. FSDP lowers the memory footprint on your GPUs, and is usable via a lightweight configuration that requires substantially less effort, typically with just a few lines of code.

    + +

    The main performance gains in FSDP come from maximizing the overlap between network communication and model computation, and eliminating the memory redundancy inherent in traditional data parallel training (DDP). PyTorch FSDP can train models approximately 4x larger on the same server resources as DDP and 20x larger if we combine activation checkpointing and activation offloading.

    + +

    Since PyTorch 1.12, FSDP is now in beta status, and has added a number of new features that can be tuned to further accelerate your model training.

    + +

    In this series of blog posts, we will explain multiple performance optimizations you can run with FSDP to boost your distributed training speed and model sizes within the context of your available server resources. We use the HuggingFace T5 3B, 11B and DeepVit, in fine-tuning mode, as the running examples throughout the series.

    + +

    As a preview of some of the optimizations discussed in this series, we show the before and after performance scaled in Flops below (Note that these results can vary based on your server resources and model architecture).

    + +

    + +

    + +

    *T5 3B Performance measured on AWS A100 and A10 servers. Original with no optimizations and Tuned with the applied optimization

    + +

    + +

    + +

    *T5 11B Performance measured on A100 servers. Original with no optimizations and Tuned with the applied optimization

    + +

In this first post, we will provide a quick overview of FSDP and how it can make training large-scale AI models more efficient. We will briefly highlight the multiple performance options available, and dive deeper into their details in upcoming posts. We will then conclude with an overview of how to leverage AWS ParallelCluster for large-scale training with FSDP.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Optimization + T5 Model + Throughput Improvement +
    Mixed Precision + 3 B + 5x +
    11 B + 10x +
    Activation Checkpointing (AC) + 3 B + 10x +
    11 B + 100x +
    Transformer Wrapping Policy + 3 B + 2x +
    11 B + Unable to run the experiment without the Transformer wrapping policy. +
    Full Shard Strategy + 3 B + 1.5x +
    11 B + Not able to run with Zero2 +
    + +

    Performance optimization gains on T5 models over non-optimized.

    + +

    In our experiments with the T5 3B model, using the transformer wrapping policy resulted in >2x higher throughput measured in TFLOPS versus the default wrapping policy. Activation checkpointing resulted in 10x improvement by reinvesting the freed memory from the checkpoints into larger batch size. Mixed precision with BFloat16 resulted in ~5x improvement versus FP32 and finally the full sharding strategy versus zero2 (DDP) resulted in 1.5x improvement.

    + +

    We ran similar experiments for a larger model, T5 11B, but the larger model size resulted in some changes to the experiment space. Specifically, we found that two optimizations, transformer wrapping policy and activation checkpointing, were needed to enable us to run these experiments on 3 nodes (each node had 8 A100 gpus with 80 GB of memory). With these optimizations, we could fit a batch size of 50 and get higher throughput compared to removing each one of them. Thus rather than running on/off solely for a single optimization test as with the 3B model, the larger model experiments were done with 1 of 3 optimizations turned on/off while always running the other two in order to allow a usable batch size for both test states for each item.

    + +

    Based on TFLOP comparisons, with the 11B model, we saw even more payoff from the optimizations. Mixed precision(~10x improvement) and activation checkpointing (~100x improvement) had a much larger impact with the 11B model compared to the 3B parameter model. With mixed precision we could fit ~2x larger batch sizes and with activation checkpointing >15x batch sizes (from 3 with no activation checkpointing to 50 with activation checkpointing) which translated into large throughput improvements.

    + +

    We also have observed that for these larger models > 3B, using Zero2 sharding strategy would result in minimal room left in memory for the batch data, and had to go with very small batch sizes (e.g 1-2) that essentially makes full sharding strategy a necessity to enable fitting larger batches sizes.
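For reference, here is a minimal sketch of how the sharding strategy can be selected when wrapping a model with FSDP; FULL_SHARD shards parameters, gradients and optimizer states, while SHARD_GRAD_OP corresponds to the Zero2-style behavior discussed above (model is a placeholder for the module being trained):

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy

model = FSDP(
    model,
    # Use ShardingStrategy.SHARD_GRAD_OP for the Zero2-style strategy instead.
    sharding_strategy=ShardingStrategy.FULL_SHARD,
    device_id=torch.cuda.current_device(),
)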

    + +

    Note - this tutorial assumes a basic understanding of FSDP. To learn more about basics of FSDP please refer to the getting started and advanced FSDP tutorials.

    + +

    What is FSDP? How does it make Large-Scale Training More Efficient

    + +

    FSDP expands upon distributed data parallel, by parallelizing not just data, but the model parameters, the optimizer states and gradients associated with the model. Specifically - each GPU only stores a subset of the entire model and the associated subset of optimizer states and gradients.

    + +

    To show the evolution of distributed training, we can start from the beginning, where AI models were simply trained on a single GPU.

    + +

DDP (Distributed Data Parallel) was the initial step up from training with only a single GPU, and was an effort to address the growth in data and model size, where multiple GPUs each housed their own copy of the same model. The gain here is that the data for each batch could be split and processed independently on each GPU, all at the same time, thus parallelizing the processing of the data set and increasing training speed as the number of GPUs grows. The tradeoff is the need to communicate the gradients between each GPU to synchronize the models after the backward pass.

    + +

    FSDP expands on scaling models by removing the redundancy of optimizer calculations and state storage, as well as gradient and memory storage of model parameters that are present in DDP (DDP = Distributed Data Parallel). This redundancy reduction, along with increased communication overlap where model parameter communication takes place at the same time as model computation, is what allows FSDP to train much larger models with the same resources as DDP.

    + +

    A key point is that this efficiency also allows for AI models that are larger than a single GPU to be trained. The model size available for training is now increased to the aggregate memory of all GPUs, rather than the size of a single GPU. (And as a point of note, FSDP can go beyond aggregated GPU memory by leveraging CPU memory as well, though we will not directly cover this aspect here).

    + +

As discussed in a previous blog post, with DDP the largest model that we could train on 32 A100 GPUs with 40 GB memory (4 nodes) was up to 3B parameters with a batch size of 128, with the help of activation checkpointing. By contrast, using FSDP we were able to train up to an 81B model size by combining activation checkpointing with activation and parameter offloading. In another experiment, we benchmarked a 1T parameter model with FSDP using 512 GPUs.

    + +

    + +

    + +

    For intuition on the parameter level workings of FSDP, below we show an animation detailing how the model parameters are sharded and communicated assuming a two GPU scenario and a simple 8 parameter model:

    + +

    + +

    + +

    Above - the animations walk through the steps involved with the initial sharding of the model amongst ranks, and we start the all_gathers and forward pass

    + +

    + +

    + +

    We continue through the model with the forward pass. After each FSDP unit completes, non-locally owned params are dropped to free memory, and optionally activations can be checkpointed. This continues until we finish the forward pass and compute the loss.

    + +

    + +

    + +

    During the backward pass, another all_gather is used to load the parameters and the gradients are computed. These gradients are then reduce_scattered so that the local owners of each param can aggregate and prepare to update the weights.

    + +

    + +

    + +

    Finally, each rank passes the summed gradients through the optimizer states and updates the weights to complete the mini-batch.

    + +

    With the model now distributed across the entire set of available GPUs, the logical question is how data moves through the model given this sharding of model parameters.

    + +

    This is accomplished by FSDP coordinating with all GPUs to effectively share (communicate) the respective parts of the model. The model is decomposed into FSDP units and parameters within each unit are flattened and then sharded across all GPUs. Within each FSDP unit, GPU’s are assigned interleaving ownership of individual model parameters.

    + +

    By interleaving, we mean the following - assuming 2 gpus with an id of 1 and 2, the FSDP unit ownership pattern would be [12121212], rather than a contiguous chunk of [111222].

    + +

    During training, an all_gather is initiated and the locally owned model parameters within a FSDP unit are shared by the owner GPU with the other non-owners, when they need it, on a ‘just in time’ type basis. FSDP prefetches parameters to overlap all_gather communication with computation.

    + +

    When those requested parameters arrive, the GPU uses the delivered parameters, in combination with the parameters it already owns, to create a fully populated FSDP unit. Thus there is a moment where each GPU hits peak memory usage while holding a fully populated FSDP unit.

    + +

It then processes the data through the FSDP unit, and drops the parameters it received from other GPUs to free up memory for the next unit. The process continues over and over, proceeding through the entire model to complete the forward pass. The process is then repeated (in general) for the backward pass. (Note: this is a simplified description; there is additional complexity, but it should help construct a basic mental model of the FSDP process.)

    + +

This eliminates much of the memory redundancy present in DDP, but imposes the cost of higher amounts of network communication to shuttle these requested parameters back and forth amongst all the GPUs. Overlapping the communication timing with the computation taking place is the basis of many of the performance improvements we’ll discuss in this series. The key gains are frequently based on the fact that communication can often take place at the same time as computation. As you can surmise, having high communication speed is vital for FSDP performance.

    + +

    How do I optimize my training with FSDP?

    + +

    There are four main performance improvements we will cover - the transformer wrapper, activation checkpointing, mixed precision, and selecting the proper sharding strategy. The flowchart below will help as a checklist for tuning options that we will discuss in this post.

    + +

    + +

    + +

    Wrapping policy - for transformers, use Transformer wrapping policy

    + +

    The first performance optimization is leveraging the FSDP transformer wrapper for transformer models.

    + +

One of the pre-defined wrapping policies is size_based_auto_wrap_policy. With size_based_auto_wrap_policy, FSDP traverses the module structure from bottom to top, and a new FSDP unit is created once the current unit has at least the min_num_params specified within the size policy (this defaults to 1e8, or 100M). If the module cannot be created as an FSDP unit, FSDP will continue to check its parent module. This size-based wrapping policy may not be ideal for some model structures; the PyTorch distributed team is actively working on a new default wrapping policy for the next release which is based on both size and module execution order, so users can simply tune the size and achieve optimized performance.
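For completeness, a small sketch of configuring the size-based policy explicitly (the threshold below is illustrative):

import functools

from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy

# Start a new FSDP unit whenever the accumulated parameter count reaches min_num_params.
size_auto_wrap_policy = functools.partial(
    size_based_auto_wrap_policy, min_num_params=int(1e8)
)
# The policy is then passed to FSDP via auto_wrap_policy=size_auto_wrap_policy.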

    + +

    In the current release, you can greatly improve your performance when running Transformer models by using the ‘transformer wrapper’. You will need to provide the appropriate layer class for your model. Here, layer class is the class that houses the Multi-Head Attention and Feed Forward Network.

    + +

    FSDP will then form the FSDP units around the layer class rather than arbitrary breaks based on parameter size. By sharding the model around layer classes that are uniformly repeated within the transformer, FSDP can create uniform FSDP units that better balance the overlap of computation and communication. By contrast, size based wrapping can produce very uneven or skewed shards for models, which then have uneven matching of compute vs communication overlap. As discussed earlier, the main driver of FSDP high performance is the overlap of communication and computation, and hence why the Transformer wrapper provides improved performance. Note that the Transformer wrapper can also be used for non-transformer models if these models have a list of uniform layers.

    + +

    Let’s compare the performance difference on a T5, 3B parameter model when running under the default wrapper and the transformer wrapper.

    + +

    For default wrapping, we don’t need to take any action - we simply pass the model to FSDP as shown:

    + +
    model = FSDP(
    +      model,
    +      device_id=torch.cuda.current_device(),
    +  )
    +
    + +

    In this case FSDP will simply wrap the whole model in a single FSDP unit.

    + +

    Running on an NVIDIA A100-SXM4–40GB with 8 GPUs, we are able to reach 2.3 TFlops and 95% GPU memory utilization with a batch size of 14.

    + +

    However, since T5 is a transformer model, we are better served to leverage the transformer wrapper for this model.

    + +

    To use that, we need to isolate the layer class for the transformer, and then pass it in to create our transformer wrapper.

    + +
    from transformers.models.t5.modeling_t5 import T5Block
    +
    + +

    And now we can create our Transformer wrapper:

    + +
    transformer_auto_wrapper_policy = functools.partial(
    +        transformer_auto_wrap_policy,
    +        transformer_layer_cls={
    +            T5Block,  # < ---- Your Transformer layer class
    +        },
    +    )
    +
    + +

    With our model aware wrapper ready, we can initialize FSDP:

    + +
    # invoke FSDP with your transformer wrapper policy:
    +model = FSDP(
    +        model,
    +        auto_wrap_policy=transformer_auto_wrapper_policy,
    +        device_id=torch.cuda.current_device(),  # streaming init
    +    )
    +
    + +

Running this wrapped model, we can see some substantial performance gains. We can fit nearly double the batch size, going to 28, and with better memory and communication efficiency we see a TFlops increase to 5.07 from 2.3.

    + +

Thus, we’ve increased our training throughput to over 200% of the baseline (2.19x) simply by providing greater model information to FSDP! The transformer wrapping policy results in more fine-grained and balanced FSDP units, each holding a layer class, which leads to a more effective communication-computation overlap.

    + +

    + +

    + +

    Above: Graphical comparison of TFlops based on wrapper type

    + +

    If you are training a Transformer model, it pays to configure your training with FSDP using the transformer wrapper. For more information on how to isolate your layer class, please see our in depth video on Transformer wrapping here, where we walk through a number of transformers showing where the layer class can be found.

    + +

    Mixed precision - use BF16 if you have an Ampere architecture GPU

    + +

    FSDP supports a flexible mixed precision policy that gives you granular control over parameters, gradients and buffer data types. This lets you easily leverage BFloat16 or FP16 to increase your training speed by up to 70%.

    + +

    *Note that BFloat 16 is only available on Ampere type GPUs. On AWS this is available with p4dn and g5 instances.

    + +

    By way of comparison, we can show a 77% speed improvement when comparing fully tuned BFloat16 vs FP32 on an 8B DeepVit model.

    + +

    + +

    + +

    We have obtained even greater acceleration using BFloat16 in fine-tuning a 3B HuggingFace T5 model as shown in the figures below. We observed that because of the lower precision the validation loss of BFloat16 is slightly behind in the first few epochs, but it is able to catch up and results in the same final accuracy as FP32.

    + +

    + +

    + +

    To use mixed precision, we create a policy with our desired data types, and pass it in during the FSDP initialization.

    + +

    To create our policy, we need to import the MixedPrecision class, and then define our custom policy using our customized class:

    + +
    from torch.distributed.fsdp import MixedPrecision
    +bfSixteen = MixedPrecision(
    +   param_dtype=torch.bfloat16,
    +   # Gradient communication precision.
    +   reduce_dtype=torch.bfloat16,
    +   # Buffer precision.
    +   buffer_dtype=torch.bfloat16,
    +)
    +model = FSDP(
    +       model,
    +       auto_wrap_policy=transformer_auto_wrapper_policy,
+       mixed_precision=bfSixteen)  # pass the MixedPrecision policy defined above
    +
    + +

    You can mix and match the precision for parameters, gradients and buffers as you prefer:

    + +
    comboPolicy = MixedPrecision(
    +        # Param precision
    +        param_dtype=torch.bfloat16,
    +        # Gradient communication precision.
    +        reduce_dtype=torch.float32,
    +        # Buffer precision.
    +        buffer_dtype=torch.float32,
    +    )
    +
    + +

    For training with FP16, you will need to also use the ShardedGradScaler, which we will cover in subsequent posts. For BFloat16, it is a drop-in replacement.
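For FP16, the usage mirrors the familiar GradScaler loop; a minimal sketch is shown below, where the model (assumed to be FSDP-wrapped with an FP16 policy), optimizer, loss_fn, batch and labels are placeholders:

from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

scaler = ShardedGradScaler()


def train_step(model, optimizer, loss_fn, batch, labels):
    optimizer.zero_grad()
    loss = loss_fn(model(batch), labels)
    scaler.scale(loss).backward()  # scale the loss to avoid FP16 gradient underflow
    scaler.step(optimizer)         # unscale gradients and step if no inf/nan is found
    scaler.update()                # adjust the scale factor for the next iteration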

    + +

    AnyPrecision Optimizer - going beyond mixed precision with full BF16 training

    + +

    Mixed precision training, both in FSDP and elsewhere, maintains the working weights in the reduced datatype (BF16 or FP16) while keeping the master weights in full FP32. The reason for the master weights in FP32 is that running in pure BF16 will result in ‘weight stagnation’, where very small weight updates are lost due to the lower precision, and the accuracy flatlines over time while FP32 weights can continue to improve from these small updates.

    + +

    In order to resolve this dilemma, we can use the new AnyPrecision optimizer available in TorchDistX (Torch Distributed Experimental) that allows you to successfully train and keep the master weights in pure BF16 instead of FP32. In addition, unlike the typical storage of optimizer states in FP32, AnyPrecision is able to maintain states in pure BF16 as well.

    + +

    AnyPrecision enables pure BF16 training by maintaining an extra buffer that tracks the precision lost during the weight updates and re-applies that during the next update…effectively resolving the weight stagnation issue without requiring FP32.

    + +

    As a comparison of the throughput gains available with pure BF16 training using AnyPrecision, we ran experiments using FSDP with the T5 11B model with regular FP32 training, Mixed Precision training with BF16, and pure BF16 training using the AnyPrecision optimizer on 3 nodes with A100 gpus as mentioned previously.

    + +

    + +

    + +

    As shown above, training with AnyPrecision and pure BF16 resulted in 2x the throughput vs Mixed Precision, and over 20x improvement vs FP32.

    + +

    The potential tradeoff is the impact on final accuracy - in the cases we tested, the accuracy was equal or better than FP32 due to a regularization effect from the slightly reduced precision, but your results may vary.

    + +

AnyPrecision optimizer is available for you to test with here, and is a drop-in replacement for the AdamW optimizer.
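As a hedged sketch only: the import path below is an assumption based on the TorchDistX repository, so please verify the exact module path and available options in your version before use.

# Assumed import path - verify against the TorchDistX (torchdistx) repository.
from torchdistx.optimizers import AnyPrecisionAdamW


def make_optimizer(model):
    # Drop-in replacement for AdamW; TorchDistX exposes additional dtype-related
    # options to keep the master weights and optimizer states in pure BF16.
    return AnyPrecisionAdamW(model.parameters(), lr=1e-4)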

    + +

    Activation checkpointing - increasing throughput by trading compute for memory

    + +

    + +

    + +

    FSDP supports activation checkpointing once the model has been sharded, and makes it easy to implement. The graph above shows ~4x throughput improvement using activation checkpointing.

    + +

    Activation checkpointing is where the intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder. This generally increases available GPU memory by over 30%.

    + +

    The tradeoff is that during the backward pass, these previously removed intermediate activations must be re-calculated again using information in the checkpoint (duplicate compute), but by leveraging the increased GPU memory, one can increase the batch size such that the net throughput can increase substantially.

    + +
    # verify we have FSDP activation support ready by importing:
    +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    +   checkpoint_wrapper,
    +   CheckpointImpl,
    +   apply_activation_checkpointing_wrapper,
    +)
    +
    + +

The first step in implementing activation checkpointing is to import the FSDP checkpointing functions, as shown above. We then need to declare our checkpoint wrapper type, which is non-reentrant, and create a check function to identify which layers to wrap, as follows:

    + +
    non_reentrant_wrapper = partial(
    +    checkpoint_wrapper,
    +    offload_to_cpu=False,
    +    checkpoint_impl=CheckpointImpl.NO_REENTRANT,
    +)
    +check_fn = lambda submodule: isinstance(submodule, T5Block)
    +
    + +
    apply_activation_checkpointing_wrapper(
    +       model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn
    +   )
    +
    + +

    Important note - this must be run after the model has been initialized with FSDP.

    + +

    However, hopefully you’ve seen how some initial tuning with FSDP options can have a large impact on your training performance.

    + +

    With that, we turn our attention from how to scale within FSDP, to how to scale your server hardware for FSDP using AWS.

    + +

    Large Scale Training with FSDP on AWS - For multi-node prioritize high speed network

    + +

    AWS provides several services that can be used to run distributed training with FSDP: Amazon EC2 Accelerated Computing instances, AWS ParallelCluster, and Amazon Sagemaker.

    + +

    In this series of blog posts, we used Amazon EC2 p4d instances in a single-instance multi-GPU configuration and in a multi-instance configuration using AWS ParallelCluster and SageMaker in order to run our training jobs.

    + +

    Here, we’ll focus specifically on AWS parallel cluster and provide an overview of how to utilize it for training purposes.

    + +

    AWS ParallelCluster Setup

    + +

    AWS ParallelCluster is an open source, cluster management tool that makes it easy for you to deploy and manage High Performance Computing (HPC) clusters on AWS. AWS ParallelCluster uses yaml configuration files to provision all the necessary resources. It also supports multiple instance types, job submission queues, shared file systems like Amazon EFS (NFS) or Amazon FSx for Lustre, and job schedulers like AWS Batch and Slurm.

    + +

    + +

    + +

    Workflow on Clusters

    + +

    The high level idea is to have a cluster that has a head node which controls the compute nodes. The actual training job runs on the compute nodes. Overall steps to run a training job on a cluster are as follows:

    + +
      +
1. Set up an AWS ParallelCluster (we discuss this below)
    2. +
    3. Connect to the head node, and import the training code/ setup the environment.
    4. +
    5. Pull the data and place it in a shared folder that compute nodes can access (FSx Lustre drive).
    6. +
    7. Run the training job using a job scheduler (in this case Slurm).
    8. +
    + +

Set up AWS ParallelCluster

    + +

To set up AWS ParallelCluster:

    + +
      +
    1. +

      Deploy a network stack. This step is optional since you could use your account default VPC and let AWS ParallelCluster create your subnets and security groups. However, we prefer to compartmentalize our desired network infrastructure and do this deployment via a CloudFormation stack.

      + +

Since we deploy a public and a private subnet, we want to create them in an Availability Zone that contains our target instances, in this case p4d. We check their availability in the region we use (us-east-1) through the following AWS CLI command:

      + +

      aws ec2 describe-instance-type-offerings --location-type availability-zone \ --filters Name=instance-type,Values=p4d.24xlarge --region us-east-1 --output table

      + +

      We see three availability zones containing p4d instances, we pick one of them (us-east-1c, yours may be different) when deploying our network stack. This can be done with the AWS Console or the AWS CLI. In our case we use the latter as follows

      + +

      aws cloudformation create-stack --stack-name VPC-Large-Scale --capabilities CAPABILITY_IAM --template-body file://VPC-Large-Scale.yaml --parameters ParameterKey=SubnetsAZ,ParameterValue=us-east-1c

      + +

      CloudFormation will deploy our new VPC, subnets, security groups and endpoints on our behalf. Once done, you can retrieve the IDs of the public and private subnets by querying the stack outputs and the values PublicSubnet and PrivateSubnet.

      + +

      For example, using the AWS CLI for the private subnet:

      + +

      aws cloudformation describe-stacks --stack-name VPC-Large-Scale --query "Stacks[0].Outputs[?OutputKey=='PrivateSubnet'].OutputValue" --output text

      +
    2. +
    3. +

Create the ParallelCluster. The cluster configuration file specifies the resources for our cluster. These resources include the instance types for the head node and compute nodes, access to S3 buckets, and the shared storage where our data will be located. We will use Amazon FSx for Lustre, which offers a fully managed shared storage service built on Lustre.

      + +

Here is an example of a cluster configuration file. We can use the AWS ParallelCluster CLI to create the cluster. Please note that the private and public subnet IDs will need to be replaced by the ones you retrieved earlier. You will be able to control the cluster using the AWS ParallelCluster CLI to start, stop, pause, etc.

      + +
      pcluster create-cluster --cluster-name my-hpc-cluster --cluster-configuration cluster.yaml
      +
      +
    4. +
    5. +

SSH to the head node - once the cluster is ready, we can connect to the head node using the SSH protocol, pull our training code, and place the data in the shared storage specified in the cluster configuration file.

      + +
      pcluster ssh --cluster-name cluster -i your-key_pair
      +
      +
    6. +
    7. +

      Launch the training job - now that we have the data and training code, we can launch the slurm job for training. Here is an example of a slurm script to launch the job using torchrun.

      +
    8. +
    + +

More details on how to set up the cluster are beyond the scope of this post; we will cover them in a separate post.

    + +

    What’s next?

    + +

    With this post we provided a high level overview of FSDP and how it efficiently scales distributed AI training. The flowchart included will help provide a checklist for you to review tuning options discussed such as the transformer wrapper and activation checkpointing.

    + +

    In the next posts, we will continue with the T5 model and go deeper into each of the topics above, specifically with sharding strategy and other optimizations to provide more insight and details. For now, a good reference for the sharding strategy is in our video tutorial here:

    + +

    If you have questions or find an issue, please find the authors Less, Hamid and Geeta or open an issue on PyTorch github.

    + +

    Special thanks to:

    + +

    Pytorch Distributed team, Shen Li, Rohan Varma, Yanli Zhao, Andrew Gu, Anjali Sridhar, Ana Simoes, Pierre-Yves Aquilanti, Sundar Ranganathan, and the broader AWS team for supporting us with providing infrastructure and technical support for running the large scale experiments.

    + +

    Resources:

    + +

    FSDP video series

    + +

    Getting started with FSDP

    + +

    Advanced tutorial on FSDP

    + +

    API documentation

    + + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus/index.html b/blog/efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus/index.html new file mode 100644 index 000000000000..f9fa3f00dac0 --- /dev/null +++ b/blog/efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus/index.html @@ -0,0 +1,830 @@ + + + + + + + + + + + + + Efficient PyTorch I/O library for Large Datasets, Many Files, Many GPUs | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

by Alex Aizman, Gavin Maltby, Thomas Breuel

    +

    Data sets are growing bigger every day and GPUs are getting faster. This means there are more data sets for deep learning researchers and engineers to train and validate their models.

    + +
• Many datasets for research in still image recognition are becoming available with 10 million or more images, including OpenImages and Places.
• 8 million YouTube videos (YouTube 8M) consume about 300 TB in 720p, used for research in object recognition, video analytics, and action recognition.
• The Tobacco Corpus consists of about 20 million scanned HD pages, useful for OCR and text analytics research.
    + +

    Although the most commonly encountered big data sets right now involve images and videos, big datasets occur in many other domains and involve many other kinds of data types: web pages, financial transactions, network traces, brain scans, etc.

    + +

However, working with data sets at this scale presents a number of challenges:

    + +
• Dataset Size: datasets often exceed the capacity of node-local disk storage, requiring distributed storage systems and efficient network access.
• Number of Files: datasets often consist of billions of files with uniformly random access patterns, something that often overwhelms both local and network file systems.
• Data Rates: training jobs on large datasets often use many GPUs, requiring aggregate I/O bandwidths to the dataset of many GBytes/s; these can only be satisfied by massively parallel I/O systems.
• Shuffling and Augmentation: training data needs to be shuffled and augmented prior to training.
• Scalability: users often want to develop and test on small datasets and then rapidly scale up to large datasets.
    + +

    Traditional local and network file systems, and even object storage servers, are not designed for these kinds of applications. The WebDataset I/O library for PyTorch, together with the optional AIStore server and Tensorcom RDMA libraries, provide an efficient, simple, and standards-based solution to all these problems. The library is simple enough for day-to-day use, is based on mature open source standards, and is easy to migrate to from existing file-based datasets.

    + +

Using WebDataset is simple and requires little effort, and it will let you scale up the same code from running local experiments to using hundreds of GPUs on clusters or in the cloud with linearly scalable performance. Even on small problems and on your desktop, it can speed up I/O tenfold and simplify data management and processing of large datasets. The rest of this blog post tells you how to get started with WebDataset and how it works.

    + +

    The WebDataset Library

    + +

    The WebDataset library provides a simple solution to the challenges listed above. Currently, it is available as a separate library (github.com/tmbdev/webdataset), but it is on track for being incorporated into PyTorch (see RFC 38419). The WebDataset implementation is small (about 1500 LOC) and has no external dependencies.

    + +

    Instead of inventing a new format, WebDataset represents large datasets as collections of POSIX tar archive files consisting of the original data files. The WebDataset library can use such tar archives directly for training, without the need for unpacking or local storage.

    + +

    WebDataset scales perfectly from small, local datasets to petascale datasets and training on hundreds of GPUs and allows data to be stored on local disk, on web servers, or dedicated file servers. For container-based training, WebDataset eliminates the need for volume plugins or node-local storage. As an additional benefit, datasets need not be unpacked prior to training, simplifying the distribution and use of research data.

    + +

    WebDataset implements PyTorch’s IterableDataset interface and can be used like existing DataLoader-based code. Since data is stored as files inside an archive, existing loading and data augmentation code usually requires minimal modification.

    + +

    The WebDataset library is a complete solution for working with large datasets and distributed training in PyTorch (and also works with TensorFlow, Keras, and DALI via their Python APIs). Since POSIX tar archives are a standard, widely supported format, it is easy to write other tools for manipulating datasets in this format. E.g., the tarp command is written in Go and can shuffle and process training datasets.

    + +

    Benefits

    + +

    The use of sharded, sequentially readable formats is essential for very large datasets. In addition, it has benefits in many other environments. WebDataset provides a solution that scales well from small problems on a desktop machine to very large deep learning problems in clusters or in the cloud. The following table summarizes some of the benefits in different environments.

• Local Cluster with AIStore: AIStore can be deployed easily as K8s containers and offers linear scalability and near 100% utilization of network and I/O bandwidth. Suitable for petascale deep learning.
• Cloud Computing: WebDataset deep learning jobs can be trained directly against datasets stored in cloud buckets; no volume plugins required. Local and cloud jobs work identically. Suitable for petascale learning.
• Local Cluster with existing distributed FS or object store: WebDataset’s large sequential reads improve performance with existing distributed stores and eliminate the need for dedicated volume plugins.
• Educational Environments: WebDatasets can be stored on existing web servers and web caches, and can be accessed directly by students by URL.
• Training on Workstations from Local Drives: Jobs can start training as the data still downloads. Data doesn’t need to be unpacked for training. Ten-fold improvements in I/O performance on hard drives over random access file-based datasets.
• All Environments: Datasets are represented in an archival format and contain metadata such as file types. Data is compressed in native formats (JPEG, MP4, etc.). Data management, ETL-style jobs, and data transformations and I/O are simplified and easily parallelized.
    + +

    We will be adding more examples giving benchmarks and showing how to use WebDataset in these environments over the coming months.

    + +

    High-Performance

    +

    For high-performance computation on local clusters, the companion open-source AIStore server provides full disk to GPU I/O bandwidth, subject only to hardware constraints. This Bigdata 2019 Paper contains detailed benchmarks and performance measurements. In addition to benchmarks, research projects at NVIDIA and Microsoft have used WebDataset for petascale datasets and billions of training samples.

    + +

    Below is a benchmark of AIStore with WebDataset clients using 12 server nodes with 10 rotational drives each.

    + +
    + +
    + +

    The left axis shows the aggregate bandwidth from the cluster, while the right scale shows the measured per drive I/O bandwidth. WebDataset and AIStore scale linearly to about 300 clients, at which point they are increasingly limited by the maximum I/O bandwidth available from the rotational drives (about 150 MBytes/s per drive). For comparison, HDFS is shown. HDFS uses a similar approach to AIStore/WebDataset and also exhibits linear scaling up to about 192 clients; at that point, it hits a performance limit of about 120 MBytes/s per drive, and it failed when using more than 1024 clients. Unlike HDFS, the WebDataset-based code just uses standard URLs and HTTP to access data and works identically with local files, with files stored on web servers, and with AIStore. For comparison, NFS in similar experiments delivers about 10-20 MBytes/s per drive.

    + +

    Storing Datasets in Tar Archives

    + +

    The format used for WebDataset is standard POSIX tar archives, the same archives used for backup and data distribution. In order to use the format to store training samples for deep learning, we adopt some simple naming conventions:

• datasets are POSIX tar archives
• each training sample consists of adjacent files with the same basename
• shards are numbered consecutively
    + +

For example, ImageNet is stored in 1282 separate 100 Mbyte shards with names imagenet-train-000000.tar to imagenet-train-001281.tar. The contents of the first shard are:

    + +
-r--r--r-- bigdata/bigdata      3 2020-05-08 21:23 n03991062_24866.cls
-r--r--r-- bigdata/bigdata 108611 2020-05-08 21:23 n03991062_24866.jpg
-r--r--r-- bigdata/bigdata      3 2020-05-08 21:23 n07749582_9506.cls
-r--r--r-- bigdata/bigdata 129044 2020-05-08 21:23 n07749582_9506.jpg
-r--r--r-- bigdata/bigdata      3 2020-05-08 21:23 n03425413_23604.cls
-r--r--r-- bigdata/bigdata 106255 2020-05-08 21:23 n03425413_23604.jpg
-r--r--r-- bigdata/bigdata      3 2020-05-08 21:23 n02795169_27274.cls
    + +
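To make the convention concrete, here is a small sketch, using only the Python standard library, of how a shard following this naming scheme could be assembled. The sample keys, payloads, and labels are made up for illustration; the WebDataset library itself also provides writer utilities for this.

# Sketch: packing (image, class) pairs into a WebDataset-style shard with the
# standard library. Adjacent members share a basename (e.g. key.jpg / key.cls),
# which is how WebDataset groups files into one training sample.
import io
import tarfile

samples = [("n03991062_24866", b"<jpeg bytes>", 512),   # made-up examples
           ("n07749582_9506", b"<jpeg bytes>", 949)]

with tarfile.open("imagenet-train-000000.tar", "w") as tar:
    for key, jpeg_bytes, cls in samples:
        for suffix, payload in ((".jpg", jpeg_bytes),
                                (".cls", str(cls).encode("ascii"))):
            info = tarfile.TarInfo(name=key + suffix)
            info.size = len(payload)
            tar.addfile(info, io.BytesIO(payload))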

    WebDataset datasets can be used directly from local disk, from web servers (hence the name), from cloud storage and object stores, just by changing a URL. WebDataset datasets can be used for training without unpacking, and training can even be carried out on streaming data, with no local storage.

    + +

    Shuffling during training is important for many deep learning applications, and WebDataset performs shuffling both at the shard level and at the sample level. Splitting of data across multiple workers is performed at the shard level using a user-provided shard_selection function that defaults to a function that splits based on get_worker_info. (WebDataset can be combined with the tensorcom library to offload decompression/data augmentation and provide RDMA and direct-to-GPU loading; see below.)

    + +

    Code Sample

    +

Here are some code snippets illustrating the use of WebDataset in a typical PyTorch deep learning application (you can find a full example at http://github.com/tmbdev/pytorch-imagenet-wds).

    + +
import webdataset as wds
import ...

sharedurl = "/imagenet/imagenet-train-{000000..001281}.tar"

normalize = transforms.Normalize(
  mean=[0.485, 0.456, 0.406],
  std=[0.229, 0.224, 0.225])

preproc = transforms.Compose([
  transforms.RandomResizedCrop(224),
  transforms.RandomHorizontalFlip(),
  transforms.ToTensor(),
  normalize,
])

dataset = (
  wds.Dataset(sharedurl)
  .shuffle(1000)
  .decode("pil")
  .rename(image="jpg;png", data="json")
  .map_dict(image=preproc)
  .to_tuple("image", "data")
)

loader = torch.utils.data.DataLoader(dataset, batch_size=64, num_workers=8)

for inputs, targets in loader:
  ...
    + +

    This code is nearly identical to the file-based I/O pipeline found in the PyTorch Imagenet example: it creates a preprocessing/augmentation pipeline, instantiates a dataset using that pipeline and a data source location, and then constructs a DataLoader instance from the dataset.

    + +

WebDataset uses a fluent API for configuration that internally builds up a processing pipeline. In this example, WebDataset is used with the PyTorch DataLoader class, which replicates the dataset instance across multiple worker processes and performs both parallel I/O and parallel data augmentation.

    + +

    WebDataset instances themselves just iterate through each training sample as a dictionary:

    + +
# load from a web server using a separate client process
sharedurl = "pipe:curl -s http://server/imagenet/imagenet-train-{000000..001281}.tar"

dataset = wds.Dataset(sharedurl)

for sample in dataset:
  # sample["jpg"] contains the raw image data
  # sample["cls"] contains the class
  ...
    + +

    For a general introduction to how we handle large scale training with WebDataset, see these YouTube videos.

• AIStore is an open-source object store capable of full-bandwidth disk-to-GPU data delivery (meaning that if you have 1000 rotational drives with 200 MB/s read speed, AIStore actually delivers an aggregate bandwidth of 200 GB/s to the GPUs). AIStore is fully compatible with WebDataset as a client, and in addition understands the WebDataset format, permitting it to perform shuffling, sorting, ETL, and some map-reduce operations directly in the storage system. AIStore can be thought of as a remix of a distributed object store, a network file system, a distributed database, and a GPU-accelerated map-reduce implementation.

• tarp is a small command-line program for splitting, merging, shuffling, and processing tar archives and WebDataset datasets.

• tensorcom is a library supporting distributed data augmentation and RDMA to GPU.

• pytorch-imagenet-wds contains an example of how to use WebDataset with ImageNet, based on the PyTorch ImageNet example.

• Bigdata 2019 Paper with Benchmarks

    Check out the library and provide your feedback for RFC 38419.

    + +
Empowering Models with Performance: The Art of Generalized Model Transformation Approach

by Jackie (Jiaqi) Xu, Yanbo Liang, Jason Ansel, Chunzhi Yang, Jade Nie, Yuzhen Huang, CK Luk, Xiaodong Wang, Lu Fang, Menglu Yu, Jinwon Lee, Daohang Shi, Flavio Sales Truzzi

    +

    Introduction

    + +

PyTorch 2.0 (PT2) offers a compiled execution mode which rewrites Python bytecode to extract sequences of PyTorch operations, translating them into a Graph IR. The IR is then just-in-time compiled through a customizable back end, improving training performance without user intervention. Production models often go through multiple stages of optimization/lowering to hit performance targets, so a compiled mode is desirable: it separates the work of improving model performance from direct modification of the PyTorch model implementation, enabling PyTorch users to enhance model performance without touching their model code. This is particularly valuable for optimizing complex models, including large-scale and production-ready ones.
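As a reminder of how the compiled mode is entered, here is a minimal sketch; the toy model and the explicit backend choice are illustrative, not taken from this post.

import torch

# Toy model; any nn.Module works the same way.
model = torch.nn.Sequential(
    torch.nn.Linear(128, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 10),
)

# torch.compile captures the forward pass into a graph IR and hands it to the
# chosen backend (Inductor by default), where graph-level transformations such
# as the ones described in this post are applied.
compiled_model = torch.compile(model, backend="inductor")

x = torch.randn(32, 128)
out = compiled_model(x)  # the first call triggers tracing and compilation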

    + +

In our previous blog post, we outlined how heuristic model transformation rules are employed to optimize intricate production models. While these rules enabled substantial performance gains for some pilot models, they lacked universal adaptability; they don’t consistently perform well across different models, or sometimes even across different sections of a single model.

    + +


    + +

    Fig. 1: PT1 Graph mode vs PT2 Compile mode.

    + +

In this blog post, we propose a more generalized model transformation solution that serves as a plugin to the PT2 compiler, as shown in Fig. 1. It is more general, performant, and user-friendly, bringing performance improvements to both model training and inference without manual effort. As illustrated in Fig. 2, by incorporating the previously user-defined transformations into the compiler, we have streamlined the production stack. These changes have already landed in PT2 and are ready to use, benefiting a broad range of PyTorch models beyond Meta’s.

    + +


    + +

    Fig. 2: Simplified stack with PT2 compile mode.

    + +

    Guiding Principle: Atomic Rules

    + +

Traditionally, people might use predefined heuristic rules to replace a model subgraph with another, more performant subgraph to reduce launch overhead, minimize memory bandwidth usage, and fully occupy SMs. However, this approach doesn’t scale well, as it is hard to craft a set of rules that fits all models perfectly.

    + +

    Instead of grappling with bulky, complex rules, we can actually break them down into smaller, more digestible pieces – what we call ‘atomic rules’. These tiny powerhouses of efficiency target the transformation of individual operators, to conduct one step of the fusion/transformation. This makes them easy to handle and apply, offering a straightforward path to optimizing models. So, with these atomic rules in hand, optimizing any model for top-tier performance becomes a breeze!

    + +

    We will walk through some simple examples to demonstrate how we use a chain of atomic rules to replace complicated heuristic rules.

    + +

    Case 1: Horizontal fusion of computation chains started with accesses to embedding tables

    + +

Horizontal fusion means fusing parallel operators into one so as to reduce the number of kernels to be launched and improve performance. In our previous blog (Section 3.2), we described model transformations that fused layernorm and activation functions after embedding bags, as shown in the figure provided. However, this method had limitations:

    + +
1. It only worked with layernorm and activation functions after embedding.
2. It was restricted to models with specific architecture rules, causing various issues in our production stack, including parameter changes and inference disruptions.
    + +

    To improve, we can use three atomic rules as shown in Fig.3 to replace the complicated heuristic rule:

    + +
• Fuse layernorms that follow the same split nodes horizontally.
• Then, fuse tanh functions following the same split nodes horizontally.
• Lastly, fuse vertical split-cat nodes.
    + +

    These atomic rules offer a clean and streamlined way for model simplification and optimization.

    + +


    + +

    Fig. 3: Before, we optimized the model in one go by replacing subgraphs. Now, with atomic rules, we optimize step-by-step, covering more cases.

    + +

    Case 2: Fuse horizontal MLP

    + +

    MLPs (Multilayer Perceptrons) are fundamental components of deep neural networks, often consisting of linear, normalization, and activation functions. In complex models, there’s often a need to fuse many horizontal MLPs. Traditional methods find and replace parallel MLPs with a fused module as shown in Fig.4, but this isn’t always straightforward. Some models might not have normalization, or they might use different activation functions, making it hard to apply a one-size-fits-all rule.

    + +

    This is where our atomic rules come in handy. These simplified rules target individual operators one at a time, making the process easier and more manageable. We use the following atomic rules for horizontal MLP fusion:

    + +
• Fusing horizontal linear operators.
• Fusing horizontal layernorms.
• Fusing horizontal activation functions.
    + +


    + +

    Fig. 4: Pseudocode for fusing MLP. Traditional optimizations need manual Python code changes.

    + +

    The beauty of these rules is that they’re not limited to one case. They can be applied broadly. Since PyTorch models are built with torch operators, focusing on a smaller set of operators simplifies the process. This approach is not only more manageable but also more general compared to writing a specific large pattern replacement rule, making it easier to optimize various models efficiently.
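To make “fusing horizontal linear operators” concrete, here is a small, hedged sketch of the idea behind that atomic rule: several parallel nn.Linear layers that consume the same input can be replaced by a single wider linear followed by a split. The module below is purely illustrative and is not the implementation used inside the compiler.

import torch
from torch import nn

class FusedParallelLinear(nn.Module):
    """Illustrative fusion of N parallel Linear layers that share one input."""

    def __init__(self, linears):
        super().__init__()
        self.out_sizes = [l.out_features for l in linears]
        # Stack the individual weights/biases into one wider linear layer.
        self.fused = nn.Linear(linears[0].in_features, sum(self.out_sizes))
        with torch.no_grad():
            self.fused.weight.copy_(torch.cat([l.weight for l in linears], dim=0))
            self.fused.bias.copy_(torch.cat([l.bias for l in linears], dim=0))

    def forward(self, x):
        # One matmul kernel instead of N, then split back into the N outputs.
        return torch.split(self.fused(x), self.out_sizes, dim=-1)

# Usage: three parallel branches become one kernel launch.
branches = [nn.Linear(64, 32), nn.Linear(64, 32), nn.Linear(64, 32)]
x = torch.randn(8, 64)
ref = [b(x) for b in branches]
fused = FusedParallelLinear(branches)(x)
assert all(torch.allclose(r, f, atol=1e-6) for r, f in zip(ref, fused))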

    + + + +

    Our principle is to use chained atomic rules to replace heuristic rules. While this approach covers a wider range of cases, it does entail a longer time for graph search and pattern matching. The next question is: how can we minimize compilation time while performing compile-time graph searches efficiently?

    + +

We design a two-step greedy algorithm as illustrated in Fig. 5. The first step is to identify the target nodes, for which we follow certain rules, e.g., identifying all linear operations with the same input shapes. Once identified, we use a Breadth-First Search (BFS) strategy to separate these nodes into different sets, so that nodes within a set have no data dependencies between them. The nodes within each of these sets are independent and can be fused horizontally.
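Below is a simplified, self-contained sketch of that grouping step: candidate nodes are greedily placed into sets so that no set contains nodes that depend on each other. The dictionary-based graph representation and helper names are made up for illustration and are much simpler than the graph IR used in the compiler.

from collections import deque

def reachable(graph, start):
    """All nodes reachable from `start` by following user edges (BFS)."""
    seen, queue = set(), deque([start])
    while queue:
        node = queue.popleft()
        for user in graph.get(node, ()):
            if user not in seen:
                seen.add(user)
                queue.append(user)
    return seen

def group_independent(graph, candidates):
    """Greedily split candidate nodes into sets with no data dependencies."""
    groups = []
    for node in candidates:
        deps = reachable(graph, node)
        for group in groups:
            # Node joins a group only if it neither feeds nor is fed by its members.
            if node not in group["reach"] and not (deps & group["nodes"]):
                group["nodes"].add(node)
                group["reach"] |= deps
                break
        else:
            groups.append({"nodes": {node}, "reach": set(deps)})
    return [g["nodes"] for g in groups]

# Toy graph: linear1 feeds linear3; linear2 is independent of both.
graph = {"x": ["linear1", "linear2"], "linear1": ["linear3"]}
print(group_independent(graph, ["linear1", "linear2", "linear3"]))
# e.g. [{'linear1', 'linear2'}, {'linear3'}]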

    + +


    + +

    Fig. 5: Process of model transformation with graph IR.

    + +

    With our approach, the search time is roughly 60 seconds for one of our largest internal models, which is manageable for on-the-fly tasks.

    + +

    In the End

    + +

In our tests with internal ranking models, we observed approximately 5% to 15% training performance improvement across five models, on top of the performance gain brought by torch.compile. We have enabled the optimization in the PT2 compiler stack and landed it as the default when users choose Inductor as the backend (config). We expect our generalized transformation approach to benefit models beyond Meta’s, and we look forward to more discussion and improvement through this compiler-level transformation framework.

    + +

    Acknowledgements

    + +

    Many thanks to Mark Saroufim, Gregory Chanan, Adnan Aziz, and Rocky Liu for their detailed and insightful reviews.

    + +
Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16

by Mingfei Ma (Intel), Vitaly Fedyunin (Meta), Wei Wei (Meta)

    +

    Overview

    + +

In recent years, the growing complexity of AI models has been placing ever greater compute demands on hardware. Reduced-precision numeric formats have been proposed to address this problem. Bfloat16 is a custom 16-bit floating point format for AI which consists of one sign bit, eight exponent bits, and seven mantissa bits. Because it has the same dynamic range as float32, bfloat16 doesn’t require special handling such as loss scaling. Therefore, bfloat16 is a drop-in replacement for float32 when running deep neural networks for both inference and training.
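The format trade-off is easy to inspect from PyTorch itself; a quick illustrative check (not from the original post):

import torch

# bfloat16 keeps float32's 8-bit exponent (similar dynamic range and maximum
# value) but has far fewer significand bits, hence the much larger eps.
for dtype in (torch.float32, torch.bfloat16, torch.float16):
    info = torch.finfo(dtype)
    print(f"{str(dtype):16} max={info.max:.3e}  eps={info.eps:.3e}")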

    + +

    The 3rd Gen Intel® Xeon® Scalable processor (codenamed Cooper Lake), is the first general purpose x86 CPU with native bfloat16 support. Three new bfloat16 instructions were introduced in Intel® Advanced Vector Extensions-512 (Intel® AVX-512): VCVTNE2PS2BF16, VCVTNEPS2BF16, and VDPBF16PS. The first two instructions perform conversion from float32 to bfloat16, and the last one performs a dot product of bfloat16 pairs. Bfloat16 theoretical compute throughput is doubled over float32 on Cooper Lake. On the next generation of Intel® Xeon® Scalable Processors, bfloat16 compute throughput will be further enhanced through Advanced Matrix Extensions (Intel® AMX) instruction set extension.

    + +

    Intel and Meta previously collaborated to enable bfloat16 on PyTorch, and the related work was published in an earlier blog during launch of Cooper Lake. In that blog, we introduced the hardware advancement for native bfloat16 support and showcased a performance boost of 1.4x to 1.6x of bfloat16 over float32 from DLRM, ResNet-50 and ResNext-101-32x4d.

    + +

    In this blog, we will introduce the latest software enhancement on bfloat16 in PyTorch 1.12, which would apply to much broader scope of user scenarios and showcase even higher performance boost.

    + +

    Native Level Optimization on Bfloat16

    + +

On the PyTorch CPU bfloat16 path, the compute-intensive operators, e.g., convolution, linear and bmm, use oneDNN (oneAPI Deep Neural Network Library) to achieve optimal performance on Intel CPUs with AVX512_BF16 or AMX support. The other operators, such as tensor and neural network operators, are optimized at the PyTorch native level. We have extended bfloat16 kernel-level optimizations to the majority of operators on dense tensors, applicable to both inference and training (sparse tensor bfloat16 support will be covered in future work), specifically:

    + +
• Bfloat16 vectorization: Bfloat16 is stored as unsigned 16-bit integer, which requires it to be casted to float32 for arithmetic operations such as add, mul, etc. Specifically, each bfloat16 vector will be converted to two float32 vectors, processed accordingly and then converted back. While for non-arithmetic operations such as cat, copy, etc., it is a straight memory copy and no data type conversion will be involved.
• Bfloat16 reduction: Reduction on bfloat16 data uses float32 as accumulation type to guarantee numerical stability, e.g., sum, BatchNorm2d, MaxPool2d, etc.
• Channels Last optimization: For vision models, Channels Last is the preferable memory format over Channels First from performance perspective. We have implemented fully optimized CPU kernels for all the commonly used CV modules on channels last memory format, taking care of both float32 and bfloat16.
    + +
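The reduction and Channels Last points above can be illustrated with a short, self-contained sketch (not from the original post):

import torch

# Reductions on bfloat16 tensors accumulate in float32 internally for
# numerical stability; the result is still returned as bfloat16.
x = torch.rand(10_000).to(torch.bfloat16)
print(x.sum().dtype)          # torch.bfloat16

# Channels Last: same tensor data, different memory layout, and bfloat16
# halves the per-element footprint compared with float32.
img_fp32 = torch.randn(8, 3, 224, 224)
img_bf16 = img_fp32.to(memory_format=torch.channels_last, dtype=torch.bfloat16)
print(img_fp32.element_size(), img_bf16.element_size())              # 4 2
print(img_bf16.is_contiguous(memory_format=torch.channels_last))     # True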

    Run Bfloat16 with Auto Mixed Precision

    + +

To run a model in bfloat16, the user can either explicitly convert the data and model to bfloat16, for example:

    + +
# with explicit conversion
input = input.to(dtype=torch.bfloat16)
model = model.to(dtype=torch.bfloat16)
    + +

or utilize the torch.amp (Automatic Mixed Precision) package. The autocast instance serves as a context manager or decorator that allows regions of your script to run in mixed precision, for example:

    + +
# with AMP
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    output = model(input)
    + +

Generally, the explicit conversion approach and the AMP approach have similar performance. Even so, we recommend running bfloat16 models with AMP, because:

    + +
• Better user experience with automatic fallback: If your script includes operators that don’t have bfloat16 support, autocast will implicitly convert them back to float32, while the explicitly converted model will give a runtime error.

• Mixed data type for activation and parameters: Unlike the explicit conversion which converts all the model parameters to bfloat16, AMP mode will run in mixed data type. To be specific, input/output will be kept in bfloat16 while parameters, e.g., weight/bias, will be kept in float32. The mixed data type of activation and parameters helps improve performance while maintaining the accuracy, as the sketch below illustrates.
    + +
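Here is a minimal illustrative sketch of the mixed-dtype behavior under CPU autocast, assuming torchvision is installed and using ResNet-50 only as a convenient example model:

import torch
import torchvision.models as models

model = models.resnet50(weights=None).eval()   # parameters stay in float32
x = torch.randn(1, 3, 224, 224)                # float32 input

with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    y = model(x)

print(next(model.parameters()).dtype)  # torch.float32 -- weights are untouched
print(y.dtype)                         # torch.bfloat16 -- autocast-eligible ops ran in bfloat16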

    Performance Gains

    + +

    We benchmarked inference performance of TorchVision models on Intel® Xeon® Platinum 8380H CPU @ 2.90GHz (codenamed Cooper Lake), single instance per socket (batch size = 2 x number of physical cores). Results show that bfloat16 has 1.4x to 2.2x performance gain over float32.

    + +

    + +

    + +

    The performance boost of bfloat16 over float32 primarily comes from 3 aspects:

    + +
• The compute-intensive operators take advantage of the new bfloat16 native instruction VDPBF16PS, which doubles the hardware compute throughput.
• Bfloat16 has only half the memory footprint of float32, so memory-bandwidth-bound operators are theoretically up to twice as fast.
• On Channels Last, we intentionally keep the same parallelization scheme for all the memory-format-aware operators (this can’t be done on Channels First), which increases data locality when passing each layer’s output to the next: the data stays closer to the CPU cores because it remains in cache. Bfloat16 achieves a higher cache hit rate than float32 in such scenarios due to its smaller memory footprint.
    + +

    Conclusion & Future Work

    + +

In this blog, we introduced the recent software optimizations for bfloat16 introduced in PyTorch 1.12. Results on the 3rd Gen Intel® Xeon® Scalable processor show that bfloat16 has a 1.4x to 2.2x performance gain over float32 on the TorchVision models. Further improvement is expected on the next generation of Intel® Xeon® Scalable Processors with AMX instruction support. Though the performance numbers for this blog were collected with TorchVision models, the benefit applies broadly across all topologies. We will continue to extend the bfloat16 optimization effort to a broader scope in the future!

    + +

    Acknowledgement

    + +

The results presented in this blog are a joint effort of Meta and the Intel PyTorch team. Special thanks to Vitaly Fedyunin and Wei Wei from Meta, who spent precious time and gave substantial assistance! Together we took one more step on the path of improving the PyTorch CPU ecosystem.

    + +

Enhancing Deep Learning Workflows: PyTorch Ecosystem Tools

by Team PyTorch

    +

    Welcome to the thriving PyTorch ecosystem, where a wealth of tools and libraries await, purpose-built to elevate your experience in deep learning as a developer or researcher. The Ecosystem Tools pages host many projects from experts spanning academia, industry, application development, and machine learning.

    + +

    Initially, PyTorch aimed to establish a thriving community, enabling developers to access each other’s tools, engage in meaningful discussions, and explore the wealth of resources available within the community.

    + +

    Today, the PyTorch ecosystem has grown to feature over 100 projects tailored to your needs, providing robust support, enhanced speed, and effortless integration with PyTorch. If your project aligns with our mission, we invite you to submit it and join this dynamic ecosystem.

    + +

    New this month, we’ve moved all of our Ecosystem blogs over to our PyTorch.org website to host a space where our community can show off the latest innovations with our users. Read on to hear about the latest projects in the ecosystem!

    + +

    Explore the Latest Tools and Frameworks in the Ecosystem

    + +

    As we continue into 2024, we’re thrilled to showcase an impressive array of ecosystem tools that significantly enrich the PyTorch community. These tools cover a wide range of domains, including pose estimation, profiling, and even quantum computing. Let’s explore each one to witness firsthand how they are reshaping the PyTorch landscape, opening up exciting possibilities for developers.

    + +

    Anomalib

    + +

    Anomalib is a deep learning library that aims to collect state-of-the-art anomaly detection algorithms for benchmarking on both public and private datasets. Anomalib provides several ready-to-use implementations of anomaly detection algorithms described in the recent literature, as well as a set of tools that facilitate the development and implementation of custom models. The library has a strong focus on image-based anomaly detection, where the goal of the algorithm is to identify anomalous images, or anomalous pixel regions within images in a dataset. Anomalib is constantly updated with the latest algorithms and training/inference extensions.

    + +

    Diffusers

    + +

    Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you’re looking for a simple inference solution or training your own diffusion models, Diffusers is a modular toolbox that supports both.
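For flavor, a minimal text-to-image sketch with Diffusers might look like the following; the checkpoint id and prompt are only examples, and the snippet assumes a CUDA GPU and a downloaded model.

# Illustrative only: basic Diffusers text-to-image usage.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # example checkpoint, downloaded on first use
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")  # assumes a CUDA GPU is available

image = pipe("a photo of an astronaut riding a horse").images[0]
image.save("astronaut.png")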

    + +

    Pomegranate

    + +

    Pomegranate is a versatile machine learning library that integrates seamlessly with PyTorch. It provides a wide range of probabilistic models and tools for probabilistic modeling tasks. Pomegranate empowers users to build complex models such as hidden Markov models (HMMs), Bayesian networks, and Gaussian mixture models (GMMs). By combining the strengths of PyTorch and Pomegranate, developers can leverage the power of deep learning and probabilistic modeling to tackle various machine learning challenges.

    + +

    PyPose

    + +

    PyPose is a PyTorch-based library designed for pose estimation tasks. With PyPose, developers can efficiently train and deploy models for human pose estimation, a fundamental computer vision problem. By leveraging PyTorch’s flexibility and performance, PyPose simplifies the process of building accurate pose estimation models. Its intuitive APIs and pre-trained models make it an excellent choice for researchers and developers exploring human pose estimation applications.

    + +

    PyPOTS

    + +

    A python toolbox/library for data mining on partially-observed time series with PyTorch, including SOTA models supporting tasks of imputation, classification, clustering, and forecasting on incomplete (irregularly-sampled) multivariate time series with missing values.

    + +

    OctoML Profiler

    + +

    OctoML Profiler is a performance profiling tool that aids in optimizing PyTorch models. This tool helps developers identify performance bottlenecks and inefficiencies within their deep learning models. By providing insights into memory usage, compute time, and data movement, the OctoML Profiler enables developers to fine-tune their models for improved efficiency. With this valuable feedback, developers can optimize their models for deployment on various hardware platforms.

    + +

    Open Compass

    + +

    OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features include: Comprehensive support for models and datasets, efficient distributed evaluation, diversified evaluation paradigms, modular design with high extensibility and experiment management and reporting mechanism.

    + +

    Renate

    + +

    Renate is a PyTorch-based library for neural architecture search (NAS). It simplifies the process of automatically searching for optimal neural network architectures tailored to specific tasks. Renate leverages techniques like reinforcement learning and evolutionary algorithms to efficiently explore the architecture space. By using Renate, developers can save significant time and resources while discovering highly performant models.

    + +

    RoMa

    + +

    RoMa is a standalone library to handle rotation representations with PyTorch (rotation matrices, quaternions, rotation vectors, etc). It aims for robustness, ease-of-use, and efficiency.

    + +

    Substra

    + +

    Substra is an open source federated learning (FL) software. It enables the training and validation of machine learning models on distributed datasets. It provides a flexible Python interface and a web application to run federated learning training at scale. Substra’s main usage is in production environments. It has already been deployed and used by hospitals and biotech companies. Substra can also be used on a single machine to perform FL simulations and debug code.

    + +

    TorchQuantum

    + +

    TorchQuantum is a powerful library that combines the PyTorch framework with quantum computing concepts. It enables developers to explore quantum machine learning algorithms and build hybrid classical-quantum models. By integrating the principles of quantum computing into PyTorch, TorchQuantum opens up new possibilities for solving complex problems that traditional deep learning approaches may struggle with.

    + +

    TIAToolbox

    + +

The TIAToolbox (Tissue Image Analytics Toolbox) is a PyTorch-based library for computational pathology. It offers a comprehensive set of tools for working with whole-slide images, including reading and tiling, stain normalization, patch extraction, and pretrained deep learning models for tasks such as tissue classification and segmentation. By using TIAToolbox, developers and researchers can build reproducible computational pathology pipelines on top of PyTorch.

    + +

    torchdistill

    + +

    torchdistill is a coding-free framework built on PyTorch for reproducible deep learning and knowledge distillation studies. The framework is designed to enable users to design experiments by declarative PyYAML configuration files and supports high-level module abstractions.

    + +

    TorchOpt

    + +

    TorchOpt is a PyTorch library focused on optimization algorithms for deep learning. It provides a collection of state-of-the-art optimization techniques, such as stochastic gradient descent (SGD) variants, adaptive learning rate methods, and optimization schedules. TorchOpt empowers developers to fine-tune their models efficiently, converge faster, and achieve better performance in various deep learning tasks.

    + +

    USB

    + +

    USB, or Unified Speech-to-Text Benchmark, is a PyTorch-based toolkit for training and evaluating speech recognition models. It provides standardized datasets and evaluation metrics to facilitate fair and accurate comparisons between different speech recognition architectures. By using USB, researchers and developers can benchmark their models against state-of-the-art systems and drive advancements in the field of automatic speech recognition.

    + +

    Zeus

    + +

    Zeus is the current state-of-the-art in deep learning energy measurement and optimization. It has monitor components that allow users to measure GPU energy consumption and optimizer components that automatically optimize DNN or GPU knobs based on measurements from the monitor component.

    + +

    Be Part of Our Ecosystem

    + +

Our diverse ecosystem tools are instrumental in PyTorch’s success. They provide essential support for tasks such as pose estimation, probabilistic modeling, performance profiling, model interpretability, speech recognition, quantum computing, data augmentation, optimization, and neural architecture search.

    + +

    Leveraging these tools empowers developers and researchers to accelerate their deep learning workflows and unlock new possibilities in the field of AI.

    + +

    Have a tool that would be a good fit for the PyTorch Ecosystem? If you can answer the below questions, we’d love for you to submit your tool for review.

    + +
1. Does your project complement PyTorch, enhancing user experience, introducing new capabilities, or accelerating training and inference processes?
   • Examples could include visualization tools, a kernel library or a framework that sits on top to enable research in a particular area such as NLP.
2. Is the project ready for broad developer usage?
   • For example, is the project stable, will it be maintained, and is there adequate supporting infrastructure, documentation, and technical support to allow a developer to successfully use it?
    + +

    Thank you to all of our contributors and collaborators in our ecosystem! Here’s to a great 2024.

    + +
ExecuTorch Alpha: Taking LLMs and AI to the Edge with Our Community and Partners

by Team PyTorch

    +

    We are excited to announce the release of ExecuTorch alpha, focused on deploying large language models (LLMs) and large ML models to the edge, stabilizing the API surface, and improving our installation processes. It has been an exciting few months from our 0.1 (preview) release in collaboration with our partners at Arm, Apple, and Qualcomm Technologies, Inc.

    + +

    In this post we’ll discuss our full support for Meta’s Llama 2, early support for Meta’s Llama 3, broad model support in ExecuTorch, and highlight the important work our partners have done to move us forward.

    + +

    Large Language Models on Mobile

    + +

    Mobile devices are highly constrained for compute, memory, and power. To bring LLMs to these devices, we heavily leverage quantization and other techniques to pack these models appropriately.

    + +

    ExecuTorch alpha supports 4-bit post-training quantization using GPTQ. We’ve provided broad device support on CPU by landing dynamic shape support and new dtypes in XNNPack. We’ve also made significant improvements in export and lowering, reduced memory overhead and improved runtime performance. This enables running Llama 2 7B efficiently on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22, S23, and S24 phones and other edge devices. Early support for Llama 3 8B is also included. We are always improving the token/sec on various edge devices and you can visit GitHub for the latest performance numbers.

    + +

    We’re working closely with our partners at Apple, Arm, and Qualcomm Technologies to delegate to GPU and NPU for performance through Core ML, MPS, TOSA, and Qualcomm AI Stack backends respectively.

    + +

    Supported Models

    + +

    We remain committed to supporting an ever-expanding list of models with ExecuTorch. Since preview, we have significantly expanded our tested models across NLP, vision and speech, with full details in our release notes. Although support for on-device LLMs is early, we anticipate most traditional models to function seamlessly out of the box, with delegation to XNNPACK, Core ML, MPS, TOSA, and HTP for performance. If you encounter any problems please open a GitHub issue with us.

    + +

    Productivity

    + +

Deploying performant models tuned for specific platforms often requires deep visibility into the on-device runtime data to determine the right changes to make in the original PyTorch model. With ExecuTorch alpha, we provide a powerful SDK with observability throughout the process from model authoring to deployment, including delegate and hardware-level information.

    + +

    The ExecuTorch SDK was enhanced to include better debugging and profiling tools. Because ExecuTorch is built on PyTorch, the debugging capabilities include the ability to map from operator nodes back to original Python source code for more efficient anomaly resolution and performance tuning for both delegated and non-delegated model instances. You can learn more about the ExecuTorch SDK here.

    + +

    Partnerships

    + +

    ExecuTorch has only been possible because of strong collaborations across Arm, Apple, and Qualcomm Technologies. The collaboration for the initial launch of ExecuTorch continues as we support LLMs and large AI models on the edge for PyTorch. As we’ve seen with this early work for ExecuTorch alpha, there are unique challenges with these larger models and we’re excited to develop in the open.

    + +

    We also want to highlight the great partnership with Google on XNNPACK for CPU performance. The teams continue to work together upstreaming our changes and across the TensorFlow and PyTorch teams to make sure we can all support generative AI models on the edge with SOTA performance.

    + +

    Lastly, our hardware partner MediaTek has been doing work enabling the Llama collection of models with ExecuTorch on their SoCs. We’ll have more to share in the future.

    + +

    Alpha and Production Usage

    + +

    With our alpha release, we have production-tested ExecuTorch. Meta is using ExecuTorch for hand tracking on Meta Quest 3 and a variety of models on Ray-Ban Meta Smart Glasses. In addition, we have begun the rollout of ExecuTorch with Instagram and are integrating with other Meta products. We are excited to see how ExecuTorch can be used for other edge experiences.

    + +

    Community

    + +

    We are excited to see various efforts in the community to adopt or contribute to ExecuTorch. For instance, Unity recently shared their work at the Game Developers Conference (GDC) on leveraging ExecuTorch and Edge IR to run PyTorch models with their neural network inference library Sentis. Leveraging ExecuTorch’s hackability and extensibility, Unity introduced their own custom backend that serializes ExecuTorch’s Edge Dialect IR into Sentis’ native serialized format enabling developers to begin using PyTorch models easily in their games and apps.

    + +

    We’ve been building and innovating with ExecuTorch in the open. Our north star is to empower the community to deploy any ML model on edge devices painlessly and efficiently. Whether you are a hobbyist or this is your day job, we’d love for you to jump in to bring your ML models to the edge. We are looking for your help to:

    + +
1. Use ExecuTorch to run your LLM models locally on various deployment targets and share your feedback
2. Expand our supported models, including bug reports
3. Expand our quantization schemes
4. Help us build out delegates to GPU and NPU
    + +

    To all individual contributors and early adopters of ExecuTorch, a big thank you as well. We can’t wait to have more of you join us!

    + +
ExecuTorch Beta: On-Device AI and LLMs, Stability, and Acceleration with Partners

by Team PyTorch

    +
• ExecuTorch has achieved Beta status with the release of v0.4, providing stable APIs and runtime, as well as extensive kernel coverage.
• ExecuTorch is the recommended on-device inference engine for Llama 3.2 1B/3B models, offering enhanced performance and memory efficiency for both original and quantized models.
• There has been a significant increase in adoption and ecosystem growth for ExecuTorch, and the focus is now on improving reliability, performance, and coverage for non-CPU backends as the next steps.
    + +

    Current On-Device AI Market

    + +

The on-device AI market has been rapidly expanding and is revolutionizing the way we interact with technology. It is unlocking new experiences, enabling personalization, and reducing latency. Traditionally, computer vision and speech recognition have been the primary use cases for on-device AI, particularly in IoT, industrial applications, and mobile devices. However, the emergence of Large Language Models (LLMs) has made Generative AI the fastest growing sector in AI, highlighting the importance of on-device Generative AI. IDC forecasts that close to 1 billion GenAI-capable smartphones will be shipped worldwide by 2028.

    + +

    LLMs are not only getting smaller but more powerful. This has led to the creation of a new class of applications that leverage multiple models for intelligent agents and streamlined workflows. The community is rapidly adopting and contributing to these new models, with quantized versions being created within hours of model release. Several leading technology companies are investing heavily in small LLMs, even deploying Low-Rank Adaptation (LoRA) at scale on-device to transform user experiences.

    + +

    However, this rapid progress comes at a cost. The fragmentation of our on-device AI landscape creates complexity and inefficiency when going from model authoring to edge deployment. This is where PyTorch’s ExecuTorch comes in – our Beta announcement marks an important milestone in addressing these challenges and empowering developers to create innovative, AI-powered applications.

    + +

    What’s New Today

    + +

    It’s been exactly one year since we first open sourced ExecuTorch, six months since Alpha release, and today, we’re excited to announce three main developments:

    + +

    1. Beta. ExecuTorch has reached Beta status starting from v0.4! It is now widely adopted and used in production environments across Meta. Through this adoption process we’ve identified and addressed feature gaps, improved stability, and expanded kernel and accelerator coverage. These improvements make us confident to promote ExecuTorch from Alpha to Beta status, and we are happy to welcome the community to adopt it in their own production settings. Here are three concrete enhancements:

    + +
1. Developers can write application code and include the latest ExecuTorch as a dependency, updating when needed with a clean API contract. This is possible due to our API stabilization efforts, as well as our explicit API lifecycle and backwards compatibility policy.
2. Running ExecuTorch on CPUs reached the necessary performance, portability and coverage. In particular, we have implemented more than 85% of all core ATen operators as part of our portable CPU kernels library to ensure running a model on ExecuTorch just works in most cases and making missing ops an exception rather than the norm. Moreover, we integrated and extensively tested our XNNPACK delegate for high performance on a wide range of CPU architectures. It is used in a number of production cases today.
3. In addition to the low-level ExecuTorch components for greater portability, we built extensions and higher-level abstractions to support more common use-cases such as developer tooling to support on-device debugging and profiling, and Module.h extension to simplify deployment for mobile devices.
    + +

    2. On-Device Large-Language Models (LLMs). There has been a growing interest in the community to deploy Large Language Models (LLMs) on edge devices, as it offers improved privacy and offline capabilities. However, these models are quite large, pushing the limits of what is possible. Fortunately, ExecuTorch can support these models, and we’ve enhanced the overall framework with numerous optimizations.

    + +
      +
    • ExecuTorch is the recommended framework to run latest Llama models on-device with excellent performance today. The Llama 3.2 1B/3B models are well-suited for mobile deployment, and it is especially true with the official quantized 1B/3B model releases from Meta, as it provides a great balance between performance, accuracy, and size. When deploying Llama 3.2 1B/3B quantized models, decode latency improved by 2.5x and prefill latency improved by 4.2x on average, while model size decreased by 56% and memory usage reduced by 41% on average when benchmarked on Android OnePlus 12 device (we’ve also verified similar relative performance on Samsung S24+ for 1B and 3B, and Samsung S22 for 1B). For Llama 3.2 1B quantized model, for example, ExecuTorch is able to achieve 50.2 tokens/s for decoding and 260 tokens/s for prefill on the OnePlus 12, using the latest CPU kernels from XNNPACK and Kleidi libraries. These quantized models allow developers to integrate LLMs into memory and power-constrained devices while still maintaining quality and safety.
    • +
    • One of the value propositions of ExecuTorch is being able to use accelerators on mobile devices seamlessly. In fact, ExecuTorch also showcased accelerators to achieve even greater performance running Llama across Apple MPS backend, Qualcomm AI Accelerator, and MediaTek AI Accelerator.
    • +
• There has been growing community and industry interest in multimodal LLMs that go beyond text-only models, evidenced by Meta’s Llama 3.2 11B/90B vision models and open-source models like Llava. So far we have enabled the Llava 1.5 7B model on phones via ExecuTorch, making many optimizations along the way, notably reducing runtime memory from 11GB all the way down to 5GB.
    • +
    + +

    3. Ecosystem and Community Adoption
    +Now that ExecuTorch is in Beta, it is mature enough to be used in production. It is being increasingly used at Meta across various product surfaces. For instance, ExecuTorch already powers various ML inference use cases across Meta’s Ray-Ban Meta Smart Glasses and Quest 3 VR headsets as well as Instagram and WhatsApp.

    + +

We also partnered with Hugging Face to provide native ExecuTorch support for models exported using torch.export. This collaboration ensures that exported artifacts can be directly lowered and run efficiently on various mobile and edge devices. Models like gemma-2b and phi3-mini are already supported, and support for more foundational models is in progress.

    + +

    With stable APIs and Gen AI support, we’re excited to build and grow ExecuTorch with the community. The on-device AI community is growing rapidly and finding ways to adopt ExecuTorch across various fields. For instance, ExecuTorch is being used in a mobile app built by Digica to streamline inventory management in hospitals. As another example, Software Mansion developed an app, EraserAI, to remove unwanted objects from a photo with EfficientSAM running on-device with ExecuTorch via Core ML delegate.

    + +

    Towards General Availability (GA):
+Since the original release of the ExecuTorch alpha, we’ve seen growing interest within the community in using ExecuTorch in various production environments. To that end, we have made great progress towards more stable and mature APIs and have made a significant investment in community support, adoption, and contribution to ExecuTorch. As we get closer to GA, we are investing our efforts in the following areas:

    + +
      +
    • +

Non-CPU backends: Bringing non-CPU backends to even greater robustness, coverage and performance is our next goal. From day one of our original launch, we have partnered with Apple (for Core ML and MPS), Arm (for the EthosU NPU) and Qualcomm (for the Hexagon NPU) on accelerator integration with ExecuTorch, and we have since expanded our partnerships to MediaTek (NPU) and Cadence (XTensa DSP). We’re also building Vulkan GPU integration in-house. In terms of feature coverage, we’ve successfully implemented the core functionalities with our partners, ensured seamless integration with our developer tooling, and showcased successful LLM integration with many of the accelerators. Our next big step is to thoroughly validate the performance and reliability of the system in real-world, production use cases. This stage will help us fine-tune the experience and ensure the stability needed for smooth operations.

      +
    • +
    • +

      Benchmarking infra: As part of our ongoing testing efforts, we’ve developed a benchmarking infrastructure along with a public dashboard to showcase our progress toward on-device model inference benchmarking. This allows us to transparently track and display model coverage across various backends, giving our community real-time insights into how we’re advancing towards our goals.

      +
    • +
    + +

We’re excited to share these developments with you and look forward to continued improvements in collaboration with our partners and the community! We welcome community contributions to help us make ExecuTorch the clear choice for deploying AI and LLM models on-device. We invite you to start using ExecuTorch in your on-device projects or, even better, to consider contributing to it. You can also report any issues on our GitHub page.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/experience-power-pytorch-2.0/index.html b/blog/experience-power-pytorch-2.0/index.html new file mode 100644 index 000000000000..9d3b30af83f4 --- /dev/null +++ b/blog/experience-power-pytorch-2.0/index.html @@ -0,0 +1,673 @@ + + + + + + + + + + + + + Experience the power of PyTorch 2.0 on AMD Solutions | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + AMD + +

    +

    PyTorch 2.0 represents a significant step forward for the PyTorch machine learning framework. The stable release of PyTorch 2.0 brings new features that unlock even higher performance, while remaining backward compatible with prior releases and retaining the Pythonic focus which has helped to make PyTorch so enthusiastically adopted by the AI/ML community. AMD has long been a strong proponent of PyTorch, and we are delighted that the PyTorch 2.0 stable release includes support for AMD Instinct™ and Radeon™ GPUs that are supported by the ROCm™ software platform.

    + +

With the stable PyTorch 2.0 release, PyTorch 2.0 introduces torch.compile as a beta feature underpinned by TorchInductor, with support for AMD Instinct and Radeon GPUs through the OpenAI Triton deep learning compiler. Through TorchInductor, developers can now generate low-level kernels using Triton that are portable and whose performance is comparable to hand-written kernels authored in native, hardware-centric kernel programming models.
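As a minimal illustration (the toy model below is a stand-in; on a ROCm build of PyTorch 2.0 the same call routes through TorchInductor and generates Triton kernels):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(1024, 1024), nn.GELU(), nn.Linear(1024, 1024)).to("cuda")
compiled_model = torch.compile(model)    # TorchInductor is the default backend
x = torch.randn(8, 1024, device="cuda")
y = compiled_model(x)                    # the first call triggers kernel generation and compilation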

    + +

    OpenAI Triton is a language and compiler for blocked algorithms, which aims to provide an abstraction layer between CUDA/HIP and Torch at which developers can write efficient kernels more productively. We have written a new backend which interfaces Triton’s custom MLIR dialects with our ROCm compiler stack.

    + +

Triton can automatically optimize kernels generated by machine learning compilers such as TorchInductor for multiple AI accelerators, including AMD Instinct GPU accelerators, by leveraging hardware-specific features of the AMD CDNA™ GPU architecture. This makes it easy for developers and users to switch seamlessly from any hardware to AMD Instinct GPU accelerators and get great out-of-the-box performance.

    + +

    In addition, compilers like Triton can also enable developers to use high-level programming languages, such as Python, to write machine learning code that can be efficiently compiled and executed on specialized hardware. This can help greatly improve the productivity of machine learning developers, as they can focus on the algorithmic aspects of their models and rely on the compiler to generate efficient code.

    + +

    By design, PyTorch 2.0 is backward compatible to earlier PyTorch releases. This holds true for the ROCm build of PyTorch 2.0 as well. Developers using PyTorch with AMD GPUs can migrate to PyTorch 2.0 with the confidence that their existing code will continue to work without any required changes, so there is no penalty to access the improvements that come with this release. On the other hand, using PyTorch 2.0 and TorchInductor can result in significant performance improvement over the default eager-mode as shown below.

    + +

The initial results using AMD Instinct MI250 GPUs already show strong performance improvement with minimal optimization on TorchInductor compared to the default eager mode. We see an average performance increase of up to 1.54X across 44 out of the 45 models in the HuggingFace benchmarks suite, with CamemBert, DistillGPT2 and T5Small being a few of the standout models with up to 1.5X or more improvement over eager mode. We look forward to continued engagement with members of the PyTorch team at Meta to enable further optimization of the ROCm software stack and additional performance improvements for future PyTorch releases.

    + +


    + +

    Image 1: AMD MI250 GPU performance improvement for TorchInductor vs eager-mode using HuggingFace MI200-89.

    + +

PyTorch 2.0 offers the same set of install options as before for building and installing with AMD GPU support. These include an installable Python package hosted at pytorch.org, AMD’s public PyTorch docker image, and of course the option to build from source using the upstream PyTorch repository. As with PyTorch builds for other platforms, the specific command line to run for a pip-based install is provided by the configurator at https://pytorch.org/get-started/locally/.

    + +

    The GPUs supported by the ROCm software platform which forms the basis for PyTorch support on AMD GPUs are documented at https://docs.amd.com/bundle/Hardware_and_Software_Reference_Guide/page/Hardware_and_Software_Support.html

    + +

    Conclusion

    + +

    PyTorch 2.0 represents a major step in continuing to broaden support for ML developers by increasing performance while maintaining a simple, Pythonic interface. This performance uplift is made possible in large part by the new TorchInductor infrastructure, which in turn harnesses the Triton ML programming language and just-in-time compiler. AMD’s support for these technologies allows users to realize the full promise of the new PyTorch architecture. Our GPU support in PyTorch 2.0 is just one manifestation of a larger vision around AI and machine learning. AI/ML plays an important role in multiple AMD product lines, including Instinct and Radeon GPUs, Alveo™ data center accelerators, and both Ryzen™ and EPYC processors. These hardware and software initiatives are all part of AMD’s Pervasive AI vision, and we look forward to addressing the many new challenges and opportunities of this dynamic space.

    + +

    MI200-89 – PyTorch Inductor mode HuggingFace Transformers training speedup, running the standard PyTorch 2.0 test suite, over PyTorch eager-mode comparison based on AMD internal testing on a single GCD as of 3/10/2023 using a 2P AMD EPYC™ 7763 production server with 4x AMD Instinct™ MI250 (128GB HBM2e) 560W GPUs with Infinity Fabric™ technology; host ROCm™ 5.3, guest ROCm™ 5.4.4, PyTorch 2.0.0, Triton 2.0. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations.

    + +

    © 2023 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD Arrow logo, AMD CDNA, AMD Instinct, EPYC, Radeon, ROCm, Ryzen, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective owners.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks/index.html b/blog/extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks/index.html new file mode 100644 index 000000000000..a48e4a05a95f --- /dev/null +++ b/blog/extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks/index.html @@ -0,0 +1,806 @@ + + + + + + + + + + + + + Extending TorchVision’s Transforms to Object Detection, Segmentation & Video tasks | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Philip Meier, Victor Fomin, Vasilis Vryniotis, Nicolas Hug + +

    +

    Note: A previous version of this post was published in November 2022. We have updated this post with the most up-to-date info, in view of the upcoming 0.15 release of torchvision in March 2023, jointly with PyTorch 2.0.

    + +

    TorchVision is extending its Transforms API! Here is what’s new:

    + +
      +
    • You can use them not only for Image Classification but also for Object Detection, Instance & Semantic Segmentation and Video Classification.
    • +
    • You can use new functional transforms for transforming Videos, Bounding Boxes and Segmentation Masks.
    • +
    + +

The API is completely backward compatible with the previous one, and the interface remains the same to ease migration and adoption. We are now releasing this new API as Beta in the torchvision.transforms.v2 namespace, and we would love to get early feedback from you to improve its functionality. Please reach out to us if you have any questions or suggestions.

    + +

    Limitations of current Transforms

    + +

    The existing Transforms API of TorchVision (aka V1) only supports single images. As a result it can only be used for classification tasks:

    + +
    from torchvision import transforms
    +trans = transforms.Compose([
    +   transforms.ColorJitter(contrast=0.5),
    +   transforms.RandomRotation(30),
    +   transforms.CenterCrop(480),
    +])
    +imgs = trans(imgs)
    +
    + +

The above approach doesn’t support Object Detection or Segmentation. This limitation made any non-classification Computer Vision tasks second-class citizens, as one couldn’t use the Transforms API to perform the necessary augmentations. Historically this made it difficult to train high-accuracy models using TorchVision’s primitives, and thus our Model Zoo lagged several points behind SoTA.

    + +

To circumvent this limitation, TorchVision offered custom implementations in its reference scripts that showcased how one could perform augmentations for each task. Though this practice enabled us to train high-accuracy classification, object detection & segmentation models, it was a hacky approach which made those transforms impossible to import from the TorchVision binary.

    + +

    The new Transforms API

    + +

    The Transforms V2 API supports videos, bounding boxes, and segmentation masks meaning that it offers native support for many Computer Vision tasks. The new solution is a drop-in replacement:

    + +
    import torchvision.transforms.v2 as transforms
    +
    +# Exactly the same interface as V1:
    +trans = transforms.Compose([
    +    transforms.ColorJitter(contrast=0.5),
    +    transforms.RandomRotation(30),
    +    transforms.CenterCrop(480),
    +])
    +imgs, bboxes, labels = trans(imgs, bboxes, labels)
    +
    + +

    The new Transform Classes can receive any arbitrary number of inputs without enforcing specific order or structure:

    + +
    # Already supported:
    +trans(imgs)  # Image Classification
    +trans(videos)  # Video Tasks
    +trans(imgs, bboxes, labels)  # Object Detection
    +trans(imgs, bboxes, masks, labels)  # Instance Segmentation
    +trans(imgs, masks)  # Semantic Segmentation
    +trans({"image": imgs, "box": bboxes, "tag": labels})  # Arbitrary Structure
    +
    +# Future support:
    +trans(imgs, bboxes, labels, keypoints)  # Keypoint Detection
    +trans(stereo_images, disparities, masks)  # Depth Perception
    +trans(image1, image2, optical_flows, masks)  # Optical Flow
    +trans(imgs_or_videos, labels)  # MixUp/CutMix-style Transforms
    +
    + +

    The Transform Classes make sure that they apply the same random transforms to all the inputs to ensure consistent results.

    + +

    The functional API has been updated to support all necessary signal processing kernels (resizing, cropping, affine transforms, padding etc) for all inputs:

    + +
    from torchvision.transforms.v2 import functional as F
    +
    +
    +# High-level dispatcher, accepts any supported input type, fully BC
    +F.resize(inpt, size=[224, 224])
    +# Image tensor kernel
    +F.resize_image_tensor(img_tensor, size=[224, 224], antialias=True) 
    +# PIL image kernel
    +F.resize_image_pil(img_pil, size=[224, 224], interpolation=BILINEAR)
    +# Video kernel
    +F.resize_video(video, size=[224, 224], antialias=True) 
    +# Mask kernel
    +F.resize_mask(mask, size=[224, 224])
    +# Bounding box kernel
    +F.resize_bounding_box(bbox, size=[224, 224], spatial_size=[256, 256])
    +
    + +

Under the hood, the API uses Tensor subclassing to wrap the input, attach useful meta-data and dispatch to the right kernel. For your data to be compatible with these new transforms, you can either use the provided dataset wrapper, which should work with most of torchvision’s built-in datasets, or you can wrap your data manually into Datapoints:

    + +
    from torchvision.datasets import wrap_dataset_for_transforms_v2
    +ds = CocoDetection(..., transforms=v2_transforms)
    +ds = wrap_dataset_for_transforms_v2(ds) # data is now compatible with transforms v2!
    +
    +# Or wrap your data manually using the lower-level Datapoint classes:
    +from torchvision import datapoints
    +
    +imgs = datapoints.Image(images)
    +vids = datapoints.Video(videos)
+masks = datapoints.Mask(target["masks"])
+bboxes = datapoints.BoundingBox(target["boxes"], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=imgs.shape[-2:])
    +
    + +

    In addition to the new API, we now provide importable implementations for several data augmentations that are used in SoTA research such as Large Scale Jitter, AutoAugmentation methods and several new Geometric, Color and Type Conversion transforms.

    + +

The API continues to support both PIL and Tensor backends for Images, single or batched input, and maintains JIT-scriptability on both the functional and class APIs. The new API has been verified to achieve the same accuracy as the previous implementation.

    + +

    An end-to-end example

    + +

    Here is an example of the new API using the following image. It works both with PIL images and Tensors. For more examples and tutorials, take a look at our gallery!

    + +
    from torchvision import io, utils
    +from torchvision import datapoints
    +from torchvision.transforms import v2 as T
    +from torchvision.transforms.v2 import functional as F
    +
    +# Defining and wrapping input to appropriate Tensor Subclasses
    +path = "COCO_val2014_000000418825.jpg"
    +img = datapoints.Image(io.read_image(path))
    +# img = PIL.Image.open(path)
    +bboxes = datapoints.BoundingBox(
    +    [[2, 0, 206, 253], [396, 92, 479, 241], [328, 253, 417, 332],
    +     [148, 68, 256, 182], [93, 158, 170, 260], [432, 0, 438, 26],
    +     [422, 0, 480, 25], [419, 39, 424, 52], [448, 37, 456, 62],
    +     [435, 43, 437, 50], [461, 36, 469, 63], [461, 75, 469, 94],
    +     [469, 36, 480, 64], [440, 37, 446, 56], [398, 233, 480, 304],
    +     [452, 39, 463, 63], [424, 38, 429, 50]],
    +    format=datapoints.BoundingBoxFormat.XYXY,
    +    spatial_size=F.get_spatial_size(img),
    +)
    +labels = [59, 58, 50, 64, 76, 74, 74, 74, 74, 74, 74, 74, 74, 74, 50, 74, 74]
    +# Defining and applying Transforms V2
    +trans = T.Compose(
    +    [
    +        T.ColorJitter(contrast=0.5),
    +        T.RandomRotation(30),
    +        T.CenterCrop(480),
    +    ]
    +)
    +img, bboxes, labels = trans(img, bboxes, labels)
    +# Visualizing results
    +viz = utils.draw_bounding_boxes(F.to_image_tensor(img), boxes=bboxes)
    +F.to_pil_image(viz).show()
    +
    + +

    Development milestones and future work

    + +

    Here is where we are in development:

    + +
      +
    • Design API
    • +
    • Write Kernels for transforming Videos, Bounding Boxes, Masks and Labels
    • +
    • Rewrite all existing Transform Classes (stable + references) on the new API: +
        +
      • Image Classification
      • +
      • Video Classification
      • +
      • Object Detection
      • +
      • Instance Segmentation
      • +
      • Semantic Segmentation
      • +
      +
    • +
    • Verify the accuracy of the new API for all supported Tasks and Backends
    • +
    • Speed Benchmarks and Performance Optimizations (in progress - planned for Dec)
    • +
    • Graduate from Prototype (planned for Q1)
    • +
    • Add support of Depth Perception, Keypoint Detection, Optical Flow and more (future)
    • +
    • Add smooth support for batch-wise transforms like MixUp and CutMix
    • +
    + +

    We would love to get feedback from you to improve its functionality. Please reach out to us if you have any questions or suggestions.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text/index.html b/blog/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text/index.html new file mode 100644 index 000000000000..f76c281f2859 --- /dev/null +++ b/blog/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text/index.html @@ -0,0 +1,779 @@ + + + + + + + + + + + + + Fast Beam Search Decoding in PyTorch with TorchAudio and Flashlight Text | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Caroline Chen, Jacob Kahn (@jacob_d_kahn) + +

    +

    Beam search decoding with industry-leading speed from Flashlight Text (part of the Flashlight ML framework) is now available with official support in TorchAudio, bringing high-performance beam search and text utilities for speech and text applications built on top of PyTorch. The current integration supports CTC-style decoding, but it can be used for any modeling setting that outputs token-level probability distributions over time steps.

    + +

    A brief beam search refresher

    + +

    In speech and language settings, beam search is an efficient, greedy algorithm that can convert sequences of continuous values (i.e. probabilities or scores) into graphs or sequences (i.e. tokens, word-pieces, words) using optional constraints on valid sequences (i.e. a lexicon), optional external scoring (i.e. an LM which scores valid sequences), and other score adjustments for particular sequences.

    + +

In the example that follows, we’ll consider a token set of {ϵ, a, b}, where ϵ is a special token that we can imagine denotes a space between words or a pause in speech. Graphics here and below are taken from Awni Hannun’s excellent distill.pub writeup on CTC and beam search.

    + +

    + +

    + +

With a greedy-like approach, beam search considers the next viable token given an existing sequence of tokens — in the example above, a, b, b is a valid sequence, but a, b, a is not. We rank each possible next token at each step of the beam search according to a scoring function. A scoring function s typically looks something like:

    + +

    + +

    + +

    Where ŷ is a potential path/sequence of tokens, x is the input (P(ŷ|x) represents the model’s predictions over time), and 𝛼 is a weight on the language model probability (P(y) the probability of the sequence under the language model). Some scoring functions add 𝜷 which adjusts a score based on the length of the predicted sequence |ŷ|. This particular scoring function is used in FAIR’s prior work on end-to-end ASR, and there are many variations on scoring functions which can vary across application areas.
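Since the equations above appear as images in the original post, here is a plain-LaTeX rendering of a scoring function of this form (the exact formulation in the post’s figures may differ slightly):

s(\hat{y}, x) = \log P(\hat{y} \mid x) + \alpha \, \log P_{\mathrm{LM}}(\hat{y}) + \beta \, |\hat{y}|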

    + +

    Given a particular sequence, to assess the next viable token in that sequence (perhaps constrained by a set of allowed words or sequences, such as a lexicon of words), the beam search algorithm scores the sequence with each candidate token added, and sorts token candidates based on those scores. For efficiency and since the number of paths is exponential in the token set size, the top-k highest-scoring candidates are kept — k represents the beam size.

    + +

    + +

    + +

    There are many other nuances with how beam search can progress: similar hypothesis sequences can be “merged”, for instance. +

    + +

    The scoring function can be further augmented to up/down-weight token insertion or long or short words. Scoring with stronger external language models, while incurring computational cost, can also significantly improve performance; this is frequently referred to as LM fusion. There are many other knobs to tune for decoding — these are documented in TorchAudio’s documentation and explored further in TorchAudio’s ASR Inference tutorial. Since decoding is quite efficient, parameters can be easily swept and tuned.

    + +

    Beam search has been used in ASR extensively over the years in far too many works to cite, and in strong, recent results and systems including wav2vec 2.0 and NVIDIA’s NeMo.

    + + + +

    Beam search remains a fast competitor to heavier-weight decoding approaches such as RNN-Transducer that Google has invested in putting on-device and has shown strong results with on common benchmarks. Autoregressive text models at scale can benefit from beam search as well. Among other things, beam search gives:

    + +
      +
    • A flexible performance/latency tradeoff — by adjusting beam size and the external LM, users can sacrifice latency for accuracy or pay for more accurate results with a small latency cost. Decoding with no external LM can improve results at very little performance cost.
    • +
    • Portability without retraining — existing neural models can benefit from multiple decoding setups and plug-and-play with external LMs without training or fine-tuning.
    • +
    • A compelling complexity/accuracy tradeoff — adding beam search to an existing modeling pipeline incurs little additional complexity and can improve performance.
    • +
    + +

    Performance Benchmarks

    + +

Today’s most commonly used beam search decoding libraries that support external language model integration include Kensho’s pyctcdecode and NVIDIA’s NeMo toolkit. We benchmark the TorchAudio + Flashlight decoder against them with a wav2vec 2.0 base model trained on 100 hours of audio, evaluated on LibriSpeech dev-other with the official KenLM 3-gram LM. Benchmarks were run on Intel E5-2698 CPUs on a single thread. All computation was in-memory; KenLM memory mapping was disabled as it wasn’t widely supported.

    + +

    When benchmarking, we measure the time-to-WER (word error rate) — because of subtle differences in the implementation of decoding algorithms and the complex relationships between parameters and decoding speed, some hyperparameters differed across runs. To fairly assess performance, we first sweep for parameters that achieve a baseline WER, minimizing beam size if possible.

    + +

    + +

    + +

    +Decoding performance on Librispeech dev-other of a pretrained wav2vec 2.0 model. TorchAudio + Flashlight decoding outperforms by an order of magnitude at low WERs. +

    + +

    + +

    + +

    +Time-to-WER results, deferring to smaller beam size, across decoders. The TorchAudio + Flashlight decoder scales far better with larger beam sizes and at lower WERs. +

    + +

    TorchAudio API and Usage

    + +

    TorchAudio provides a Python API for CTC beam search decoding, with support for the following:

    + +
      +
    • lexicon and lexicon-free decoding
    • +
    • KenLM n-gram language model integration
    • +
    • character and word-piece decoding
    • +
    • sample pretrained LibriSpeech KenLM models and corresponding lexicon and token files
    • +
    • various customizable beam search parameters (beam size, pruning threshold, LM weight…)
    • +
    + +

    To set up the decoder, use the factory function torchaudio.models.decoder.ctc_decoder

    + +
    from torchaudio.models.decoder import ctc_decoder, download_pretrained_files
    +files = download_pretrained_files("librispeech-4-gram")
    +decoder = ctc_decoder(
    +   lexicon=files.lexicon,
    +   tokens=files.tokens,
    +   lm=files.lm,
    +   nbest=1,
    +   ... additional optional customizable args ...
    +)
    +
    + +

    Given emissions of shape (batch, time, num_tokens), the decoder will compute and return a List of batch Lists, each consisting of the nbest hypotheses corresponding to the emissions. Each hypothesis can be further broken down into tokens, words (if a lexicon is provided), score, and timesteps components.

    + +
    emissions = acoustic_model(waveforms)  # (B, T, N)
    +batch_hypotheses = decoder(emissions)  # List[List[CTCHypothesis]]
    +
    +# transcript for a lexicon decoder
    +transcripts = [" ".join(hypo[0].words) for hypo in batch_hypotheses]
    +
    +# transcript for a lexicon free decoder, splitting by sil token
    +batch_tokens = [decoder.idxs_to_tokens(hypo[0].tokens) for hypo in batch_hypotheses]
    +transcripts = ["".join(tokens) for tokens in batch_tokens]
    +
    + +

    Please refer to the documentation for more API details, and the tutorial (ASR Inference Decoding) or sample inference script for more usage examples.

    + +

    Upcoming Improvements

    + +

Full NNLM support — decoding with large neural language models (e.g. transformers) remains somewhat unexplored at scale. This is already supported in Flashlight, and we plan to add support in TorchAudio, allowing users to plug in custom decoder-compatible LMs. Custom word-level language models are already available in the nightly TorchAudio build, and are slated to be released in TorchAudio 0.13.

    + +

    Autoregressive/seq2seq decoding — Flashlight Text also supports sequence-to-sequence (seq2seq) decoding for autoregressive models, which we hope to add bindings for and add to TorchAudio and TorchText with efficient GPU implementations as well.

    + +

    Better build support — to benefit from improvements in Flashlight Text, TorchAudio will directly submodule Flashlight Text to make upstreaming modifications and improvements easier. This is already in effect in the nightly TorchAudio build, and is slated to be released in TorchAudio 0.13.

    + +

    Citation

    + +

    To cite the decoder, please use the following:

    + +
    @inproceedings{kahn2022flashlight,
    +  title={Flashlight: Enabling innovation in tools for machine learning},
    +  author={Kahn, Jacob D and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and others},
    +  booktitle={International Conference on Machine Learning},
    +  pages={10557--10574},
    +  year={2022},
    +  organization={PMLR}
    +}
    +
    +
    @inproceedings{yang2022torchaudio,
    +  title={Torchaudio: Building blocks for audio and speech processing},
    +  author={Yang, Yao-Yuan and Hira, Moto and Ni, Zhaoheng and Astafurov, Artyom and Chen, Caroline and Puhrsch, Christian and Pollack, David and Genzel, Dmitriy and Greenberg, Donny and Yang, Edward Z and others},
    +  booktitle={ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
    +  pages={6982--6986},
    +  year={2022},
    +  organization={IEEE}
    +}
    +
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/finetune-llms/index.html b/blog/finetune-llms/index.html new file mode 100644 index 000000000000..0ca329bd7485 --- /dev/null +++ b/blog/finetune-llms/index.html @@ -0,0 +1,815 @@ + + + + + + + + + + + + + Finetune LLMs on your own consumer hardware using tools from PyTorch and Hugging Face ecosystem | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Younes Belkada, Marc Sun, Titus von Köller, Sourab Mangrulkar, Benjamin Bossan, Lysandre Debut, Steven Liu + +

    +

We demonstrate how to finetune a 7B parameter model on a typical consumer GPU (NVIDIA T4 16GB) with LoRA and tools from the PyTorch and Hugging Face ecosystem, with a complete, reproducible Google Colab notebook.

    + +

    Introduction

    + +

    Large Language Models (LLMs) have shown impressive capabilities in industrial applications. Often, developers seek to tailor these LLMs for specific use-cases and applications to fine-tune them for better performance. However, LLMs are large by design and require a large number of GPUs to be fine-tuned.

    + +

Let’s focus on a specific example by trying to fine-tune a Llama model on a free-tier Google Colab instance (1x NVIDIA T4 16GB). Llama-2 7B has 7 billion parameters, which amounts to 28GB of memory if the model is loaded in full precision. Given our GPU memory constraint (16GB), the model cannot even be loaded, much less trained, on our GPU. This memory requirement can be halved with negligible performance degradation. You can read more about running models in half-precision and mixed precision for training here.

    + +

    What makes our Llama fine-tuning expensive?

    + +

    In the case of full fine-tuning with Adam optimizer using a half-precision model and mixed-precision mode, we need to allocate per parameter:

    + +
      +
    • 2 bytes for the weight
    • +
    • 2 bytes for the gradient
    • +
    • 4 + 8 bytes for the Adam optimizer states
    • +
    + +

→ With a total of 16 bytes per trainable parameter, this adds up to 112GB (excluding the intermediate hidden states). Given that the largest GPUs available today have up to 80GB of VRAM, this makes fine-tuning challenging and less accessible to everyone. To bridge this gap, Parameter Efficient Fine-Tuning (PEFT) methods are now widely adopted by the community.
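For concreteness, the arithmetic behind that 112GB figure:

params = 7e9                           # Llama-2 7B
bytes_per_param = 2 + 2 + 4 + 8        # fp16 weight + fp16 gradient + two fp32 Adam states
print(params * bytes_per_param / 1e9)  # 112.0 (GB), before intermediate hidden states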

    + +

    Parameter Efficient Fine-Tuning (PEFT) methods

    + +

    PEFT methods aim at drastically reducing the number of trainable parameters of a model while keeping the same performance as full fine-tuning.

    + +

    They can be differentiated by their conceptual framework: does the method fine-tune a subset of existing parameters, introduce new parameters, introduce trainable prompts, etc.? We recommend readers to have a look at the paper shared below that extensively compares existing PEFT methods.

    + +

    Venn diagram

    + +

    Image taken from the paper: Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning

    + +

    For this blog post, we will focus on Low-Rank Adaption for Large Language Models (LoRA), as it is one of the most adopted PEFT methods by the community.

    + +

    Low-Rank Adaptation for Large Language Models (LoRA) using 🤗 PEFT

    + +

The LoRA method by Hu et al. from the Microsoft team came out in 2021, and works by attaching extra trainable parameters to a model (which we will refer to as the base model).

    + +

    To make fine-tuning more efficient, LoRA decomposes a large weight matrix into two smaller, low-rank matrices (called update matrices). These new matrices can be trained to adapt to the new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn’t receive any further adjustments. To produce the final results, both the original and the adapted weights are combined.

    + +

    This approach has several advantages:

    + +
      +
    • LoRA makes fine-tuning more efficient by drastically reducing the number of trainable parameters.
    • +
    • The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them.
    • +
    • LoRA is orthogonal to many other parameter-efficient methods and can be combined with many of them.
    • +
    • The performance of models fine-tuned using LoRA is comparable to the performance of fully fine-tuned models.
    • +
    • LoRA does not add any inference latency when adapter weights are merged with the base model
    • +
    + +

    In principle, LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. However, for simplicity and further parameter efficiency, in Transformer models LoRA is typically applied to attention blocks only. The resulting number of trainable parameters in a LoRA model depends on the size of the low-rank update matrices, which is determined mainly by the rank r and the shape of the original weight matrix.
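As a back-of-the-envelope illustration (the 4096x4096 projection and rank r=16 below are hypothetical, chosen only to show the scaling):

d, k, r = 4096, 4096, 16                  # hypothetical weight matrix shape and LoRA rank
full = d * k                              # parameters in the frozen weight matrix
lora = r * (d + k)                        # parameters in the low-rank factors A (d x r) and B (r x k)
print(full, lora, f"{lora / full:.2%}")   # 16777216 131072 0.78%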

    + +


    + +

Animated diagram that shows how LoRA works in practice - original content adapted from Figure 1 of the original LoRA paper

    + +

Below is a code snippet showing how to train a LoRA model using the Hugging Face PEFT library:

    + +

    code snippet showing how to train LoRA model using  Hugging Face PEFT library
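The original post embeds that snippet as an image; a minimal sketch of the same idea with the PEFT API (the model name and hyperparameters here are illustrative, not the ones from the post) looks roughly like:

from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

lora_config = LoraConfig(
    r=16,                # rank of the low-rank update matrices
    lora_alpha=32,       # scaling factor applied to the update
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)   # wraps the frozen base model with trainable adapters
model.print_trainable_parameters()           # reports trainable vs. total parameter counts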

    + +

    The base model can be in any dtype: leveraging SOTA LLM quantization and loading the base model in 4-bit precision

    + +

    According to the LoRA formulation, the base model can be compressed in any data type (‘dtype’) as long as the hidden states from the base model are in the same dtype as the output hidden states from the LoRA matrices.

    + +

    Compressing and quantizing large language models has recently become an exciting topic as SOTA models become larger and more difficult to serve and use for end users. Many people in the community proposed various approaches for effectively compressing LLMs with minimal performance degradation.

    + +

    This is where the bitsandbytes library comes in. Its purpose is to make cutting-edge research by Tim Dettmers, a leading academic expert on quantization and the use of deep learning hardware accelerators, accessible to the general public.

    + +

    QLoRA: One of the core contributions of bitsandbytes towards the democratization of AI

    + +

    Quantization of LLMs has largely focused on quantization for inference, but the QLoRA (Quantized model weights + Low-Rank Adapters) paper showed the breakthrough utility of using backpropagation through frozen, quantized weights at large model scales.

    + +

With QLoRA we are matching 16-bit fine-tuning performance across all scales and models, while reducing the fine-tuning memory footprint by more than 90%, thereby allowing fine-tuning of SOTA models on consumer-grade hardware.

    + +

    In this approach, LoRA is pivotal both for purposes of fine-tuning and the correction of minimal, residual quantization errors. Due to the significantly reduced size of the quantized model it becomes possible to generously place low-rank adaptors at every network layer, which together still make up just 0.2% of the original model’s weight memory footprint. Through such usage of LoRA, we achieve performance that has been shown to be equivalent to 16-bit full model finetuning.

    + +

    System diagram

    + +

    In addition to generous use of LoRA, to achieve high-fidelity fine-tuning of 4-bit models, QLoRA uses 3 further algorithmic tricks:

    + +
      +
    1. 4-bit NormalFloat (NF4) quantization, a custom data type exploiting the property of the normal distribution of model weights and distributing an equal number of weights (per block) to each quantization bin—thereby enhancing information density.
    2. +
    3. Double Quantization, quantization of the quantization constants (further savings).
    4. +
    5. Paged Optimizers, preventing memory spikes during gradient checkpointing from causing out-of-memory errors.
    6. +
    + +

    An interesting aspect is the dequantization of 4-bit weights in the GPU cache, with matrix multiplication performed as a 16-bit floating point operation. In other words, we use a low-precision storage data type (in our case 4-bit, but in principle interchangeable) and one normal precision computation data type. This is important because the latter defaults to 32-bit for hardware compatibility and numerical stability reasons, but should be set to the optimal BFloat16 for newer hardware supporting it to achieve the best performance.

    + +

    To conclude, through combining these refinements to the quantization process and generous use of LoRA, we compress the model by over 90% and retain full model performance without the usual quantization degradation, while also retaining full fine-tuning capabilities with 16-bit LoRA adapters at every layer.

    + +

    Using QLoRA in practice

    + +

    These SOTA quantization methods come packaged in the bitsandbytes library and are conveniently integrated with HuggingFace 🤗 Transformers. For instance, to use LLM.int8 and QLoRA algorithms, respectively, simply pass load_in_8bit and load_in_4bit to the from_pretrained method.

    + +
    import torch
    +from transformers import AutoModelForCausalLM, AutoTokenizer
    +
    +model_id = "facebook/opt-125m"
    +# For LLM.int8()
    +# model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
    +
    +# For QLoRA
    +model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
    +
    + +

    You can read more about quantization features in this specific section of the documentation: https://huggingface.co/docs/transformers/main_classes/quantization

    + +

    When using QLoRA with Adam optimizer using a 4-bit base model and mixed-precision mode, we need to allocate per parameter:

    + +
      +
    • ~0.5 bytes for the weight
    • +
    • 2 bytes for the gradient
    • +
    • 4 + 8 bytes for the Adam optimizer states
    • +
    + +

This gives a total of 14 bytes per trainable parameter, multiplied by 0.0029 since only 0.29% of the parameters are trainable with QLoRA. That makes the QLoRA training setup cost around 4.5GB to fit, but in practice it requires ~7-10GB to include the intermediate hidden states, which are always in half precision (7GB for a sequence length of 512 and 10GB for a sequence length of 1024 in the Google Colab demo shared in the next section).

    + +

Below is a code snippet showing how to train a QLoRA model using Hugging Face PEFT:

    + +

    code snippet showing how to train QLoRA model using Hugging Face PEFT
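Again, the post shows this snippet as an image; a hedged sketch that combines the 4-bit loading shown above with PEFT (model name and hyperparameters are illustrative) could look like:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",               # 4-bit NormalFloat (see above)
    bnb_4bit_use_double_quant=True,          # Double Quantization (see above)
    bnb_4bit_compute_dtype=torch.bfloat16,   # compute dtype for the dequantized matmuls
)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", quantization_config=bnb_config)
model = prepare_model_for_kbit_training(model)   # housekeeping for training on a quantized base
model = get_peft_model(model, LoraConfig(r=16, lora_alpha=32, task_type="CAUSAL_LM"))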

    + +

    Using TRL for LLM training

    + +

    Models such as ChatGPT, GPT-4, and Claude are powerful language models that have been fine-tuned using a method called Reinforcement Learning from Human Feedback (RLHF) to be better aligned with how we expect them to behave and would like to use them. The finetuning goes through 3 steps:

    + +
      +
    • Supervised Fine-tuning (SFT)
    • +
    • Reward / preference modeling (RM)
    • +
    • Reinforcement Learning from Human Feedback (RLHF)
    • +
    + +

    Process diagram

    + +

    From InstructGPT paper: Ouyang, Long, et al. “Training language models to follow instructions with human feedback.” arXiv preprint arXiv:2203.02155 (2022).

    + +

    Here, we will only focus on the supervised fine-tuning step. We train the model on the new dataset following a process similar to that of pretraining. The objective is to predict the next token (causal language modeling). Multiple techniques can be applied to make the training more efficient:

    + +
      +
• Packing: Instead of having one text per sample in the batch and then padding to either the longest text or the maximal context of the model, we concatenate a lot of texts with an End-Of-Sentence (EOS) token in between and cut chunks of the context size to fill the batch without any padding. This approach significantly improves training efficiency, as every token processed by the model contributes to training (see the sketch after the diagram below).
    • +
    + +

    Sample diagram

    + +
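A rough sketch of the packing idea described above (SFTTrainer’s packing=True handles this for you; this helper is only illustrative):

def pack(texts, tokenizer, block_size=1024):
    # Concatenate all examples with an EOS token in between, then cut into
    # fixed-size blocks so that no padding tokens are wasted.
    ids = []
    for text in texts:
        ids += tokenizer(text)["input_ids"] + [tokenizer.eos_token_id]
    usable = (len(ids) // block_size) * block_size   # drop the ragged tail
    return [ids[i:i + block_size] for i in range(0, usable, block_size)]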
      +
• Train on completion only: We want the model to be able to understand the prompt and generate an answer. Instead of training the model on the whole input (prompt + answer), training will be more efficient if we only train the model on the completion.
    • +
    + +

    You can perform supervised fine-tuning with these techniques using SFTTrainer:

    + +
    from trl import SFTTrainer
    +
    +trainer = SFTTrainer(
    +    model=model,
    +    args=training_arguments,
    +    train_dataset=train_dataset,
    +    dataset_text_field="text",
    +    max_seq_length=1024,
    +    packing=True,
    +)
    +
    + +

Since the SFTTrainer back-end is powered by 🤗 accelerate, you can easily adapt the training to your hardware setup in one line of code!

    + +

For example, if you have 2 GPUs, you can perform Distributed Data Parallel training using the following command:

    + +
    accelerate launch --num_processes=2 training_llama_script.py
    +
    + +

    Putting all the pieces together

    + +

We made a complete, reproducible Google Colab notebook that you can check out through this link. We use all the components shared in the sections above and fine-tune a llama-7b model on the UltraChat dataset using QLoRA. As can be observed in the screenshot below, when using a sequence length of 1024 and a batch size of 4, the memory usage remains very low (around 10GB).

    + +

    Memory usage diagram

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/flash-decoding/index.html b/blog/flash-decoding/index.html new file mode 100644 index 000000000000..47d343854f06 --- /dev/null +++ b/blog/flash-decoding/index.html @@ -0,0 +1,809 @@ + + + + + + + + + + + + + Flash-Decoding for long-context inference | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 13, 2023

    +

    + Flash-Decoding for long-context inference +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Tri Dao, Daniel Haziza, Francisco Massa, Grigory Sizov + +

    +

    Motivation

    + +

    Large language models (LLM) such as ChatGPT or Llama have received unprecedented attention lately. However, they remain massively expensive to run. Even though generating a single response can cost about $0.01 (a few seconds of an 8xA100 instance on AWS), the costs quickly add up when scaling to billions of users, who could have multiple daily interactions with such LLMs. Some use cases are more expensive, like code auto-completion, because it runs whenever a new character is typed. As LLM applications multiply, even small efficiency gains to the generation time can have a massive impact.

    + +

    LLM inference (or “decoding”) is an iterative process: tokens are generated one at a time. Generating full sentences of N tokens requires N forward passes through the model. Fortunately, it is possible to cache previously calculated tokens: this means that a single generation step does not depend on the context length, except for a single operation, the attention. This operation does not scale well with context length.

    + +

    There are a number of important emerging use cases of LLMs that utilize a long context. With a longer context, LLMs can reason about longer documents, either to summarize or answer questions about them, they can keep track of longer conversations, or even process entire codebases before writing code. As an example, most LLMs had a context length of up to 2k in 2022 (GPT-3), but we now have open-source LLMs scaling up to 32k (Llama-2-32k), or even 100k more recently (CodeLlama). In this setting, attention takes a significant fraction of time during inference.

    + +

    When scaling on the batch size dimension, the attention can also become a bottleneck even with relatively small contexts. This is because the amount of memory to read scales with the batch dimension, whereas it only depends on the model size for the rest of the model.

    + +

    We present a technique, Flash-Decoding, that significantly speeds up attention during inference, bringing up to 8x faster generation for very long sequences. The main idea is to load the keys and values in parallel as fast as possible, then separately rescale and combine the results to maintain the right attention outputs.

    + +

    Multi-head attention for decoding

    + +

    During decoding, every new token that is generated needs to attend to all previous tokens, to compute:

    + +

    softmax(queries @ keys.transpose) @ values

    + +

    This operation has been optimized with FlashAttention (v1 and v2 recently) in the training case, where the bottleneck is the memory bandwidth to read and write the intermediate results (e.g. Q @ K^T). However, these optimizations don’t apply directly to the inference case, because the bottlenecks are different. For training, FlashAttention parallelizes across the batch size and query length dimensions. During inference, the query length is typically 1: this means that if the batch size is smaller than the number of streaming multiprocessors (SMs) on the GPU (108 for an A100), the operation will only use a small part of the GPU! This is especially the case when using long contexts, because it requires smaller batch sizes to fit in GPU memory. With a batch size of 1, FlashAttention will use less than 1% of the GPU!

    + +

    FlashAttention

    + +

    FlashAttention parallelizes across blocks of queries and batch size only, and does not manage to occupy the entire GPU during decoding

    + +

    The attention can also be done using matrix multiplication primitives - without using FlashAttention. In this case, the operation occupies the GPU entirely, but launches many kernels that write and read intermediate results, which is not optimal.

    + +

    A faster attention for decoding: Flash-Decoding

    + +

    Our new approach Flash-Decoding is based on FlashAttention, and adds a new parallelization dimension: the keys/values sequence length. It combines the benefits of the 2 approaches from above. Like FlashAttention, it stores very little extra data to global memory, however it fully utilizes the GPU even when the batch size is small, as long as the context length is large enough.

    + +

    Flash-Decoding

    + +

    Flash-Decoding also parallelizes across keys and values, at the cost of a small final reduction step

    + +

    Flash-Decoding works in 3 steps:

    + +
      +
    1. First, we split the keys/values in smaller chunks.
    2. +
    3. We compute the attention of the query with each of these splits in parallel using FlashAttention. We also write 1 extra scalar per row and per split: the log-sum-exp of the attention values.
    4. +
    5. Finally, we compute the actual output by reducing over all the splits, using the log-sum-exp to scale the contribution of each split.
    6. +
    + +

    All of this is possible because the attention/softmax can be calculated iteratively. In Flash-Decoding, it is used at 2 levels: within splits (like FlashAttention), and across splits to perform the final reduction.

    + +

    In practice, step (1) does not involve any GPU operation, as the key/value chunks are views of the full key/value tensors. We then have 2 separate kernels to perform respectively (2) and (3).
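To make the split-and-rescale idea concrete, here is a minimal, purely illustrative PyTorch reference for a single query token. It mimics the math of steps (1)-(3) above; the actual implementation uses fused CUDA kernels rather than this Python loop:

import torch

def flash_decoding_reference(q, k, v, num_splits=4):
    # q: (1, d) query for the new token; k, v: (T, d) cached keys/values.
    d = q.shape[-1]
    outs, lses = [], []
    # Step 1: split the keys/values into smaller chunks (views, no copies).
    for kc, vc in zip(k.chunk(num_splits, dim=0), v.chunk(num_splits, dim=0)):
        s = (q @ kc.T) / d**0.5                      # attention scores for this split, (1, Tc)
        lses.append(torch.logsumexp(s, dim=-1))      # Step 2: log-sum-exp per split
        outs.append(torch.softmax(s, dim=-1) @ vc)   # partial attention output, (1, d)
    lse = torch.stack(lses)                          # (num_splits, 1)
    out = torch.stack(outs)                          # (num_splits, 1, d)
    # Step 3: rescale each split's contribution by its share of the total softmax mass.
    weights = torch.softmax(lse, dim=0).unsqueeze(-1)
    return (weights * out).sum(dim=0)                # (1, d)

# Sanity check against the standard attention formula quoted earlier.
torch.manual_seed(0)
d, T = 64, 1024
q, k, v = torch.randn(1, d), torch.randn(T, d), torch.randn(T, d)
expected = torch.softmax((q @ k.T) / d**0.5, dim=-1) @ v
assert torch.allclose(flash_decoding_reference(q, k, v), expected, atol=1e-5)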

    + +

    Benchmarks on CodeLlama 34B

    + +

    To validate this approach, we benchmark the decoding throughput of the CodeLLaMa-34b. This model has the same architecture as Llama 2, and more generally results should generalize across many LLMs. We measure the decoding speed in tok/s at various sequence lengths, from 512 to 64k, and compare multiple ways of calculating the attention:

    + +
      +
    • Pytorch: Running the attention using pure PyTorch primitives (without using FlashAttention)
    • +
    • FlashAttention v2
    • +
    • FasterTransformer: Uses the FasterTransformer attention kernel
    • +
    • Flash-Decoding
    • +
    • And an upper bound calculated as the time it takes to read from memory the entire model along with the KV-cache
    • +
    + +

    Flash-Decoding unlocks up to 8x speedups in decoding speed for very large sequences, and scales much better than alternative approaches.

    + +

    CodeLlama

    + +

    All approaches perform similarly for small prompts, but scale poorly as the sequence length increases from 512 to 64k, except Flash-Decoding. In this regime (batch size 1) with Flash-Decoding, scaling the sequence length has little impact on generation speed

    + +

    Component-level micro-benchmarks

    + +

    We also micro-benchmark the scaled multi-head attention for various sequence lengths and batch sizes on A100 with inputs in f16. We set the batch size to 1, and use 16 query heads of dimension 128, for 2 key/value heads (grouped-query attention), which matches the dimensions used in CodeLLaMa-34b when running on 4 GPUs.

| Setting \ Algorithm | PyTorch Eager (us) | Flash-Attention v2.0.9 (us) | Flash-Decoding (us) |
| --- | --- | --- | --- |
| B=256, seqlen=256 | 3058.6 | 390.5 | 63.4 |
| B=128, seqlen=512 | 3151.4 | 366.3 | 67.7 |
| B=64, seqlen=1024 | 3160.4 | 364.8 | 77.7 |
| B=32, seqlen=2048 | 3158.3 | 352 | 58.5 |
| B=16, seqlen=4096 | 3157 | 401.7 | 57 |
| B=8, seqlen=8192 | 3173.1 | 529.2 | 56.4 |
| B=4, seqlen=16384 | 3223 | 582.7 | 58.2 |
| B=2, seqlen=32768 | 3224.1 | 1156.1 | 60.3 |
| B=1, seqlen=65536 | 1335.6 | 2300.6 | 64.4 |
| B=1, seqlen=131072 | 2664 | 4592.2 | 106.6 |
    + +

    Micro-benchmark of the multi-head attention, run-time in us. Flash-Decoding achieves almost constant run-time as the sequence length scales to up to 64k.

    + +

The up-to-8x end-to-end speedup measured earlier is made possible because the attention itself is up to 50x faster than FlashAttention. Up until sequence length 32k, the attention time is roughly constant, because Flash-Decoding manages to fully utilize the GPU.

    + +

    Using Flash-Decoding

    + +

    Flash-decoding is available:

    + +
      +
    • In the FlashAttention package, starting at version 2.2
    • +
• Through xFormers, starting at version 0.0.22, via `xformers.ops.memory_efficient_attention`. The dispatcher will automatically use either the Flash-Decoding or FlashAttention approach depending on the problem size. When these approaches are not supported, it can dispatch to an efficient Triton kernel that implements the Flash-Decoding algorithm (a usage sketch follows this list).
    • +
    + +
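As a rough usage sketch of the xFormers path mentioned above (assuming a CUDA device and xFormers >= 0.0.22 installed; shapes and sizes are illustrative, and which kernel the dispatcher actually selects depends on your setup):

import torch
import xformers.ops as xops

# Decoding-shaped problem: a 1-token query attending to a long KV cache.
# xFormers expects [batch, seqlen, heads, head_dim] inputs.
q = torch.randn(1, 1, 16, 128, device="cuda", dtype=torch.float16)
k = torch.randn(1, 65536, 16, 128, device="cuda", dtype=torch.float16)
v = torch.randn(1, 65536, 16, 128, device="cuda", dtype=torch.float16)

out = xops.memory_efficient_attention(q, k, v)  # dispatcher picks the appropriate kernel
print(out.shape)  # torch.Size([1, 1, 16, 128])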

A full example of decoding with LLaMa v2 / CodeLLaMa is available in the FlashAttention repo here and in the xFormers repo here. We also provide a minimal example of efficient decoding code for LLaMa v1/v2 models, meant to be fast, easy to read, educational, and hackable.

    + +

    Acknowledgements

    + +

Thanks to Erich Elsen, Ashish Vaswani, and Michaël Benesty for suggesting the idea of splitting the KV-cache loading. We want to thank Jeremy Reizenstein, Patrick Labatut and Andrew Tulloch for the valuable discussions, and Quentin Carbonneaux for contributing the efficient decoding example to xFormers. We also want to thank Geeta Chauhan and Gregory Chanan for helping with the writing and more broadly contributing to getting this published on the PyTorch blog.

    + +
    FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision
    by Jay Shah and Ganesh Bikshandi (Colfax Research), Ying Zhang (Meta), Vijay Thakkar and Pradeep Ramani (NVIDIA), and Tri Dao (TogetherAI and Princeton University)

    +

    Attention, as a core layer of the ubiquitous Transformer architecture, is a bottleneck for large language models and long-context applications. FlashAttention (and FlashAttention-2) pioneered an approach to speed up attention on GPUs by minimizing memory reads/writes, and is now used by most libraries to accelerate Transformer training and inference. This has contributed to a massive increase in LLM context length in the last two years, from 2-4K (GPT-3, OPT) to 128K (GPT-4), or even 1M (Llama 3). However, despite its success, FlashAttention has yet to take advantage of new capabilities in modern hardware, with FlashAttention-2 achieving only 35% utilization of theoretical max FLOPs on the H100 GPU. In this blogpost, we describe three main techniques to speed up attention on Hopper GPUs: exploiting asynchrony of the Tensor Cores and TMA to (1) overlap overall computation and data movement via warp-specialization and (2) interleave block-wise matmul and softmax operations, and (3) incoherent processing that leverages hardware support for FP8 low-precision.

    + +

    We’re excited to release FlashAttention-3 that incorporates these techniques. It’s 1.5-2.0x faster than FlashAttention-2 with FP16, up to 740 TFLOPS, i.e., 75% utilization of H100 theoretical max FLOPS. With FP8, FlashAttention-3 reaches close to 1.2 PFLOPS, with 2.6x smaller error than baseline FP8 attention.

    + +

FlashAttention-3 is available at https://github.com/Dao-AILab/flash-attention, along with the accompanying paper.

    + +

    FlashAttention Recap

    + +

    FlashAttention is an algorithm that reorders the attention computation and leverages tiling and recomputation to significantly speed it up and reduce memory usage from quadratic to linear in sequence length. We use tiling to load blocks of inputs from HBM (GPU memory) to SRAM (fast cache), perform attention with respect to that block, and update the output in HBM. By not writing the large intermediate attention matrices to HBM, we reduce the amount of memory reads/writes, which brings 2-4x wallclock time speedup.

    + +

    Here we show a diagram of FlashAttention forward pass: with tiling and softmax rescaling, we operate by blocks and avoid having to read/write from HBM, while obtaining the correct output with no approximation.
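For readers who prefer code to diagrams, here is a minimal single-head reference of that tiled forward pass with online softmax rescaling; it is a plain PyTorch sketch for clarity, not the fused kernel, but it never materializes the full S x S score matrix.

import torch

def flash_attention_reference(q, k, v, block_size=128):
    # q, k, v: [S, D]. Only O(S x block_size) intermediates are kept.
    S, D = q.shape
    scale = D ** -0.5
    out = torch.zeros_like(q)
    row_max = torch.full((S, 1), float("-inf"))
    row_sum = torch.zeros(S, 1)
    for k_blk, v_blk in zip(k.split(block_size), v.split(block_size)):
        s = (q @ k_blk.T) * scale                      # scores against this KV block
        new_max = torch.maximum(row_max, s.max(dim=-1, keepdim=True).values)
        correction = torch.exp(row_max - new_max)      # rescale previously accumulated stats
        p = torch.exp(s - new_max)
        row_sum = row_sum * correction + p.sum(dim=-1, keepdim=True)
        out = out * correction + p @ v_blk
        row_max = new_max
    return out / row_sum                               # normalize once at the end

q, k, v = (torch.randn(1024, 64) for _ in range(3))
ref = torch.softmax((q @ k.T) * 64 ** -0.5, dim=-1) @ v
print(torch.allclose(flash_attention_reference(q, k, v), ref, atol=1e-5))  # True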

    + +

    math equations

    + +

    New hardware features on Hopper GPUs - WGMMA, TMA, FP8

    + +

    While FlashAttention-2 can achieve up to 70% theoretical max FLOPS on Ampere (A100) GPUs, it does not yet take advantage of new features on Hopper GPUs to maximize performance. We describe some of the new Hopper-specific features here, and why they are important.

    + +

1. WGMMA (Warpgroup Matrix Multiply-Accumulate). This new feature makes use of the new Tensor Cores on Hopper, with much higher throughput [1] than the older mma.sync instruction in Ampere (image from the H100 white paper).

    + +

    image from the H100 white paper

    + +

    2. TMA (Tensor Memory Accelerator). This is a special hardware unit that accelerates the transfer of data between global memory and shared memory, taking care of all index calculation and out-of-bound predication. This frees up registers, which is a valuable resource to increase tile size and efficiency.

    + +

    block diagram

    + +

    3. Low-precision with FP8. This doubles the Tensor Core throughput (e.g. 989 TFLOPS with FP16 and 1978 TFLOPS with FP8), but trades off accuracy by using fewer bits to represent floating point numbers.
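As a quick, hedged illustration of that accuracy tradeoff (this is just a cast round trip, not the FlashAttention-3 FP8 pipeline, and it assumes a recent PyTorch build that ships the float8_e4m3fn dtype):

import torch

x = torch.randn(4096)
x[0] = 64.0  # a simulated outlier
fp8_roundtrip = x.to(torch.float8_e4m3fn).to(torch.float32)
fp16_roundtrip = x.to(torch.float16).to(torch.float32)
print((x - fp8_roundtrip).abs().max())   # noticeably larger error with 8-bit floats
print((x - fp16_roundtrip).abs().max())  # much smaller error with 16-bit floats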

    + +

    6x throughput

    + +

FlashAttention-3 makes use of all of these new features of Hopper, using powerful abstractions from NVIDIA's CUTLASS library.

By rewriting FlashAttention to use these new features, we can already significantly speed it up (e.g., from 350 TFLOPS in the FlashAttention-2 FP16 forward pass to around 540-570 TFLOPS). However, the asynchronous nature of the new instructions on Hopper (WGMMA and TMA) opens up additional algorithmic opportunities to overlap operations and thereby extract even greater performance. For this blogpost, we'll explain two such techniques specific to attention. The generic technique of warp specialization, with separate producer and consumer warps doing TMA and WGMMA, is well-covered elsewhere in the context of GEMM and works the same here.

    + +

    Asynchrony: Overlapping GEMM and Softmax

    + +

    Why overlap?

    + +

    Attention has GEMMs (those matmuls between Q and K and between attention probability P and V) and softmax as its two main operations. Why do we need to overlap them? Isn’t most of the FLOPS in the GEMMs anyway? As long as the GEMMs are fast (e.g., computed using WGMMA instructions), shouldn’t the GPU be going brrrr?

    + +

The problem is that non-matmul operations are much slower than matmul operations on modern accelerators. Special functions such as exponential (for the softmax) have even lower throughput than floating point multiply-add; they are evaluated by the multi-function unit, a unit separate from floating point multiply-add or matrix multiply-add. As an example, the H100 SXM5 GPU has 989 TFLOPS of FP16 matrix multiply, but only 3.9 TFLOPS (256x less throughput) for special functions [2]! For head dimension 128, there are 512x more matmul FLOPS than exponential operations, which means that the exponential can take 50% as much time as the matmul. The situation is even worse for FP8, where the matmul FLOPS double yet the exponential throughput stays the same. Ideally we want matmul and softmax to operate in parallel: while the Tensor Cores are busy with matmul, the multi-function units should be calculating the exponential!
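The back-of-the-envelope arithmetic behind those numbers, using only the figures quoted above:

# Special-function throughput: 16 ops/SM/cycle * 132 SMs * 1.83 GHz ~= 3.9 TFLOPS (see note 2).
special_tflops = 16 * 132 * 1.830e9 / 1e12
matmul_tflops = 989  # H100 SXM5 FP16 Tensor Core peak

# Per attention score with head dim d = 128: the QK^T and PV matmuls contribute
# about 2*d FLOPs each, versus a single exponential for the softmax.
d = 128
matmul_flops_per_score = 2 * d + 2 * d   # 512
exp_ops_per_score = 1

time_ratio = (exp_ops_per_score / special_tflops) / (matmul_flops_per_score / matmul_tflops)
print(round(special_tflops, 2), round(time_ratio, 2))  # ~3.86 TFLOPS, ~0.5: exp takes ~50% of the matmul time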

    + +

    Inter-warpgroup overlapping with pingpong scheduling

    + +

    The first and easiest way to overlap GEMM and softmax is to do nothing at all! The warp schedulers already try to schedule warps so that if some warps are blocked (e.g., waiting for GEMM results), other warps can run. That is, the warp schedulers do some of this overlapping for us, for free.

    + +

    However, we can improve on this by doing some of the scheduling manually. As an example, if we have 2 warpgroups (labeled 1 and 2 – each warpgroup is a group of 4 warps), we can use synchronization barriers (bar.sync) so that warpgroup 1 first does its GEMMs (e.g., GEMM1 of one iteration and GEMM0 of the next iteration), and then warpgroup 2 does its GEMMs while warpgroup 1 does its softmax, and so on. This “pingpong” schedule is illustrated in the figure below, where the same color denotes the same iteration.

    + +

    block chart

    + +

    This would allow us to perform the softmax in the shadow of the GEMMs of the other warpgroup. Of course, this figure is just a caricature; in practice the scheduling is not really this clean. Nevertheless, pingpong scheduling can improve FP16 attention forward pass from around 570 TFLOPS to 620 TFLOPS (head dim 128, seqlen 8K).

    + +

    Intra-warpgroup overlapping of GEMM and Softmax

    + +

Even within one warpgroup, we can have some part of the softmax running while the GEMMs of that warpgroup are running. This is illustrated in the figure below, where the same color denotes the same iteration.

    + +

    block chart

    + +

    This pipelining increases throughput from around 620 TFLOPS to around 640-660 TFLOPS for FP16 attention forward, at the cost of higher register pressure. We need more registers to hold both accumulators of the GEMMs, and the input/output of softmax. Overall, we find this technique to offer a favorable tradeoff.

    + +

    Low-precision: reduce quantization error with incoherent processing

    + +

LLM activations can have outliers with much larger magnitude than the rest of the features. These outliers make quantization difficult, producing much larger quantization errors. We leverage incoherent processing, a technique used in the quantization literature (e.g. from QuIP) that multiplies the query and key with a random orthogonal matrix to "spread out" the outliers and reduce quantization error. In particular, we use the Hadamard transform (with random signs), which can be done per attention head in O(d log d) instead of O(d^2) time, where d is the head dimension. Since the Hadamard transform is memory-bandwidth bound, it can be fused with previous operations such as rotary embedding (also memory-bandwidth bound) "for free".
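A small sketch of why this is safe: multiplying queries and keys by the same random orthogonal matrix (here a sign-randomized, scaled Hadamard matrix) leaves the attention scores unchanged in exact arithmetic while spreading outliers across dimensions before quantization. The helper and sizes below are illustrative, not the fused implementation.

import torch

def hadamard(n: int) -> torch.Tensor:
    # Sylvester construction; n must be a power of two.
    H = torch.ones(1, 1)
    while H.shape[0] < n:
        H = torch.cat([torch.cat([H, H], dim=1), torch.cat([H, -H], dim=1)], dim=0)
    return H

d = 128
signs = torch.randint(0, 2, (d,)).float() * 2 - 1
M = hadamard(d) / d ** 0.5 * signs     # random orthogonal: M @ M.T == I (up to fp error)

q, k = torch.randn(16, d), torch.randn(16, d)
q[0, 0] = 100.0                        # a simulated activation outlier, spread out by q @ M
scores_ref = q @ k.T
scores_rot = (q @ M) @ (k @ M).T       # identical scores, since M is orthogonal
print(torch.allclose(scores_ref, scores_rot, atol=1e-2))  # True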

    + +

    In our experiment where Q, K, V are generated from a standard normal distribution but 0.1% of the entries have large magnitudes (to simulate outliers), we found that incoherent processing can reduce the quantization error by 2.6x. We show numerical error comparison in the table below. Please see the paper for details.

    + +

    text diagram

    + +

    Attention benchmark

    + +

    We show some results with FlashAttention-3, and compare it to FlashAttention-2, as well as the implementation in Triton and cuDNN (both of which already use new hardware features of Hopper GPUs).

    + +

    For FP16, we see about 1.6x-1.8x speedup over FlashAttention-2

    + +

    speed charts

    + +

    speed charts

    + +

    For FP8, we can reach close to 1.2 PFLOPS!

    + +

    speed charts

    + +

    Discussion

    + +

    This blogpost highlights some of the optimizations for FlashAttention available on Hopper GPUs. Other optimizations (e.g., variable length sequences, persistent kernel, and in-kernel transpose for FP8) are covered in the paper.

    + +

    We have seen that designing algorithms that take advantage of the hardware they run on can bring significant efficiency gains and unlock new model capabilities such as long context. We look forward to future work on optimization for LLM inference, as well as generalizing our techniques to other hardware architectures.

    + +

    We also look forward to FlashAttention-3 being integrated in a future release of PyTorch.

    + + +

    Notes

    + +
    1. Without the wgmma instruction, the older mma.sync instruction can only reach about ⅔ of the peak throughput of Hopper Tensor Cores: https://arxiv.org/abs/2402.13499v1
    2. The CUDA programming guide specifies that the throughput for special functions is 16 operations per streaming multiprocessor (SM) per clock cycle. We multiply 16 by 132 SMs and 1830 MHz (the clock speed used to calculate the 989 TFLOPS of FP16 matmul) to get 3.9 TFLOPS.
    FlexAttention Part II: FlexAttention for Inference

    by Joy Dong, Boyuan Feng, Driss Guessous, Joel Schlosser, Yanbo Liang, Horace He

    +

    Overview

    + +

In the PyTorch 2.5.0 release, we introduced FlexAttention (torch.nn.attention.flex_attention) for ML researchers who'd like to customize their attention kernels without writing kernel code. This blog introduces our decoding backend optimized for inference, supporting GQA and PagedAttention, along with feature updates including nested jagged tensor support, performance tuning guides, and trainable biases support.

    + +

If you're looking for an easy way to play around with FlexAttention in your post-training / inference pipeline, the PyTorch-native post-training library torchtune and the inference codebase gpt-fast already have FlexAttention integrated. Try it out!

    + +

    We are excited to share that our paper on FlexAttention has been accepted for presentation at the MLSys2025 Conference held from May 12-15th in Santa Clara, California.

    + +

Title: FlexAttention: A Programming Model for Generating Optimized Attention Kernels (poster).

    + +

    FlexAttention for Inference

    + +

    TL;DR: torch.compile lowers flex_attention to a fused FlashDecoding kernel when it runs on a very short query.

    + +

    One fused attention kernel does not suit all – especially in long-context LLM inference.

    + +

The decoding phase of LLM inference is an iterative process: tokens are generated one at a time, requiring N forward passes to generate an N-token sentence. Fortunately, each iteration doesn't need to recompute self-attention over the full sentence: previously calculated tokens are cached, so we only need to attend the newly generated token to the cached context.
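For intuition, here is a toy single-head decode loop with a preallocated KV cache; it is purely illustrative (plain scaled_dot_product_attention, no FlexAttention machinery) but shows the pattern of attending a 1-token query to the cached context.

import torch
import torch.nn.functional as F

D, MAX_LEN = 64, 128
k_cache = torch.zeros(MAX_LEN, D)
v_cache = torch.zeros(MAX_LEN, D)

def decode_step(q_new, k_new, v_new, pos):
    # Cache the new token's key/value, then attend the 1-token query to all cached tokens.
    k_cache[pos] = k_new
    v_cache[pos] = v_new
    return F.scaled_dot_product_attention(
        q_new[None, None],                 # [1, 1, 1, D]
        k_cache[None, None, : pos + 1],    # [1, 1, pos + 1, D]
        v_cache[None, None, : pos + 1],
    )

for pos in range(10):  # one forward pass per generated token
    out = decode_step(torch.randn(1, D), torch.randn(D), torch.randn(D), pos)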

    + +

    chart

    + +

    This results in a unique attention pattern where a short query sequence (1 token) attends to a long key-value cache (context length up to 128k). Traditional optimizations for square attention kernels (q_len ≈ kv_len) don’t directly apply here. This pattern poses new challenges for GPU memory utilization and occupancy. We build a dedicated FlexDecoding backend optimized for long-context LLM inference incorporating decoding-specific techniques from FlashDecoding.

    + +

    FlexDecoding is implemented as an alternative backend for the torch.nn.attention.flex_attention operator. flex_attention automatically switches to the FlexDecoding backend for its JIT compilation when given a short query and a long KV cache. If the input shape changes significantly, for example transitioning from the prefill phase to decoding, JIT recompilation generates a separate kernel for each scenario.

    + +
    flex_attention = torch.compile(flex_attention)

    k_cache = torch.randn(B, H, 16384, D)
    v_cache = torch.randn(B, H, 16384, D)

    ...

    # Prefill Phase: query shape = [B, H, 8000, D]
    flex_attention(q_prefill, k_cache, v_cache, ...) # Uses FlexAttention backend optimized for prefill & training

    # Decoding Phase: q_last_token shape = [B, H, 1, D]
    flex_attention(q_last_token, k_cache, v_cache, ...) # Recompiles with the FlexDecoding backend

    # decode 2 tokens at the same time: q_last_2_tokens shape = [B, H, 2, D]
    flex_attention(q_last_2_tokens, k_cache, v_cache, ...) # No recompilation needed! Runs the decoding kernel again.
    + +

    Working with KV Cache

    + +

    One of the key optimizations for efficient inference is maintaining a preallocated KV cache that updates in place as new tokens are generated. Instead of enforcing a specific KV cache policy with a dedicated API, FlexDecoding allows users to define and manage the KV cache themselves.

    + +

    Similar to FlexAttention, FlexDecoding takes user-defined mask_mod and score_mod functions. These functions modify attention scores before the softmax operation.

    + +

    chart

    + +
    score_mod(score, b, h, q_idx, kv_idx) -> tensor # return updated score
    +
    + +

score is a scalar PyTorch tensor that represents the dot product of a query token and a key token. The rest of the arguments specify which score is being computed:

    + +
    • b: batch index
    • h: attention head index
    • q_idx: token position in the query tensor
    • kv_idx: token position in the key/value tensor
    + +

    In the decoding phase, previously calculated tokens are cached, and only the latest generated token (i-th) is used as the query. A naive causal mask on this one token query looks like this:

    + +
    def causal(score, b, h, q_idx, kv_idx):
    +    return torch.where(q_idx >= kv_idx, score, -float("inf"))
    +
    + +

    chart

    + +

    This is problematic: the new token “saw” should attend to all previously generated tokens i.e. “The cat sat on the mat and saw”, not just the first entry in the kv cache. To correct this, the score_mod needs to offset q_idx by i for accurate decoding.

    + +

    chart

    + +

Creating a new score_mod for each token to accommodate the offset is slow, since it means FlexAttention needs to be recompiled every iteration for a different score_mod.

    + +

Instead, we define the offset as a tensor and increment its value at each iteration:

    + +
    offset = torch.tensor(i, device="cuda")

    def causal_w_offset(score, b, h, q_idx, kv_idx):
        return torch.where(q_idx + offset >= kv_idx, score, -float("inf"))

    # Attend the i-th token
    flex_attention(..., score_mod=causal_w_offset) # Compiles the kernel here
    ...
    # Attend the i+1-th token
    offset = offset + 1 # Increment offset
    flex_attention(..., score_mod=causal_w_offset) # Doesn't need to recompile!
    + +

Notably, offset here becomes a captured tensor, and we do not need to recompile when its value changes.

    + +

    Manually rewriting your score_mod and mask_mod for offset handling isn’t necessary. We can automate this process with a generic rewriter:

    + +
    offset = torch.tensor(i, device="cuda")

    def get_score_mod_w_offset(score_mod: _score_mod_signature, _offset: torch.Tensor):
        def _score_mod(score, b, h, q, kv):
            return score_mod(score, b, h, q + _offset, kv)
        return _score_mod

    def get_mask_mod_w_offset(mask_mod: _mask_mod_signature, _offset: torch.Tensor):
        def _mask_mod(b, h, q, kv):
            return mask_mod(b, h, q + _offset, kv)
        return _mask_mod

    causal_w_offset = get_score_mod_w_offset(causal, offset)
    + +

    BlockMask for Inference

    + +

We can also use BlockMask with inference to leverage mask sparsity. The idea is to precompute the BlockMask once during model setup and use slices of it during decoding.

    + +

    Precomputing BlockMask

    + +

During setup, we create a square BlockMask of shape MAX_SEQ_LEN x MAX_SEQ_LEN:

    + +
    from torch.nn.attention.flex_attention import create_block_mask

    def causal_mask(b, h, q_idx, kv_idx):
        return q_idx >= kv_idx

    block_mask = create_block_mask(causal_mask, B=None, H=None, Q_LEN=MAX_SEQ_LEN, KV_LEN=MAX_SEQ_LEN)
    + +

    chart

    + +

    Using BlockMask During Decoding

    + +

    For the i-th token, we use a slice of the mask:

    + +
    block_offset = i // block_mask.BLOCK_SIZE[0]
    block_mask_slice = block_mask[:, :, block_offset]

    # don't forget to use the mask_mod with offset!
    block_mask_slice.mask_mod = get_mask_mod_w_offset(causal_mask, offset)
    + +

    chart

    + +

    Performance

    + +

    chart

    + +

The FlexDecoding kernel performs on par with FlashDecoding (FAKV) and significantly outperforms PyTorch's scaled_dot_product_attention (code).

    + +

    chart

    + +

    FlexDecoding boosts LLaMa3.1-8B serving performance by 1.22x-2.04x, and LLaMa3.1-70B performance by 0.99x - 1.66x compared to SDPA in gpt-fast. (code)

    + +

    Paged Attention

    + +

vLLM is one of the most popular LLM serving engines, powered by efficient memory management via PagedAttention. The existing PagedAttention implementation requires dedicated CUDA kernels and offers limited flexibility for supporting emerging attention variants. In this section, we present a PT2-native PagedAttention implementation enabled by flex attention and torch.compile.

    + +

PagedAttention scatters the KV cache to reduce memory fragmentation and support higher batch sizes. Without PagedAttention, the KV cache for a given request is stored in contiguous memory, requiring 2 tensors of shape B x H x KV_LEN x D. We call this a logical KV cache. Here, KV_LEN is the maximum sequence length over all requests in a batch. Considering Figure 1(a), KV_LEN is 9, so all requests must be padded to 9 tokens, leading to large memory waste. With PagedAttention, we can chunk each request into multiple pages of the same size page_size and scatter these pages into a physical KV cache of shape 1 x H x max_seq_len x D, where max_seq_len = n_pages x page_size. This avoids padding requests to the same length and saves memory. Specifically, we provide an assign API to update the KV cache via index computations:

    + +
    def assign(
    +    batch_idx: torch.Tensor,
    +    input_pos: torch.Tensor,
    +    k_val: torch.Tensor,
    +    v_val: torch.Tensor,
    +    k_cache: torch.Tensor,
    +    v_cache: torch.Tensor,
    +) -> None
    +
    + +

    Behind this assign API is a page table, a tensor mapping logical KV cache to physical KV cache:

    + +

    [batch_idx, logical_page_idx] -> physical_page_idx

    + +

    assign takes k_val and v_val and scatters to physical KV cache guided by the mapping from the page table.
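The sketch below illustrates the kind of index computation such an assign API performs, using an assumed page_table[batch, logical_page] -> physical_page tensor and simplified per-token shapes; it shows the scatter, not the actual implementation.

import torch

B, H, D = 2, 4, 16
page_size, n_pages = 4, 8
k_cache = torch.zeros(1, H, n_pages * page_size, D)   # physical KV cache (keys)
v_cache = torch.zeros_like(k_cache)
page_table = torch.tensor([[0, 2, -1, -1],            # request 0 owns physical pages 0 and 2
                           [1, 3,  4, -1]])           # request 1 owns physical pages 1, 3 and 4

def assign_sketch(batch_idx, input_pos, k_val, v_val, k_cache, v_cache):
    # batch_idx, input_pos: [N]; k_val, v_val: [N, H, D] new keys/values at logical positions.
    logical_page = input_pos // page_size
    page_offset = input_pos % page_size
    physical_page = page_table[batch_idx, logical_page]
    physical_pos = physical_page * page_size + page_offset   # flat index into the physical cache
    k_cache[0, :, physical_pos] = k_val.transpose(0, 1)
    v_cache[0, :, physical_pos] = v_val.transpose(0, 1)

# Write the token at logical position 5 of request 1: it lands in physical page 3, offset 1.
assign_sketch(torch.tensor([1]), torch.tensor([5]),
              torch.randn(1, H, D), torch.randn(1, H, D), k_cache, v_cache)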

    + +

    chart

    + +

    Paged Attention with Page Table

    + +

    A natural question is, how to integrate PagedAttention with flex attention to support diverse attention variants? A naive idea is to materialize the logical KV cache before computing with flex attention. But this leads to redundant memory copy and bad performance. Another idea is to build a dedicated CUDA or Triton kernel for paged attention, similar to existing PagedAttention implementation. However, this adds much manual effort and code complexity.

    + +

    Instead, we design a fused indirect memory access by converting a logical block mask according to the page table. In FlexAttention, we exploit BlockMask to identify logical blocks and skip redundant computation. While Paged Attention adds an extra layer of indirect memory access, we can further convert the logical block mask to the physical block mask corresponding to the page table, as illustrated in Figure 2. Our PagedAttention implementation provides a convert_logical_block_mask via torch.gather calls:

    + +
    def convert_logical_block_mask(
    +    block_mask: BlockMask,
    +    batch_idx: Optional[torch.Tensor] = None,
    +) -> BlockMask
    +
    + +

    chart

    + +

    Paged Attention via Block Mask Conversion

    + +

One remaining question is how to rewrite user-specified mask_mod and score_mod for PagedAttention. When users specify these modifications, they write them with logical indices, without knowledge of the page table maintained at runtime. The following code shows an automated runtime conversion that rewrites user-specified modifications with physical KV indices. The new_mask_mod takes the physical_kv_idx, converts it back to the logical_kv_idx, and applies the user-specified mask_mod on the logical_kv_idx to get the correct mask. For efficiency, we maintain physical_to_logical, a mapping from physical_kv_block to logical_kv_block, to facilitate the conversion. For correctness, we mask out-of-boundary blocks as False with a torch.where call. After batching logical KV caches from multiple requests into the same physical KV cache, there are many more physical blocks than logical blocks for each request, so a physical block may not have a corresponding logical block for a specific request during block mask conversion. By masking as False with torch.where, we ensure that data from different requests do not interfere with each other. Similarly, we can convert the score_mod automatically.

    + +
    def get_mask_mod(mask_mod: Optional[_mask_mod_signature]) -> _mask_mod_signature:
    +    if mask_mod is None:
    +        mask_mod = noop_mask
    +
    +    def new_mask_mod(
    +        b: torch.Tensor,
    +        h: torch.Tensor,
    +        q_idx: torch.Tensor,
    +        physical_kv_idx: torch.Tensor,
    +    ):
    +        physical_kv_block = physical_kv_idx // page_size
    +        physical_kv_offset = physical_kv_idx % page_size
    +        logical_block_idx = physical_to_logical[b, physical_kv_block]
    +        logical_kv_idx = logical_block_idx * page_size + physical_kv_offset
    +        return torch.where(
    +            logical_block_idx >= 0, mask_mod(b, h, q_idx, logical_kv_idx), False
    +        )
    +
    +    return new_mask_mod
    +
    + +

Figure 3 shows the latency of Paged Attention (code). Overall, there is less than 5% overhead from Flex Attention with Paged Attention compared with Flex Attention alone. We also observe on-par performance with Flash Attention v2. A minimal serving example further shows that PagedAttention can support a 76x higher batch size when evaluating on the OpenOrca dataset, which includes 1M GPT-4 completions and 3.2M GPT-3.5 completions.

    + +

    chart

    + +

    Paged Attention: Latency under diverse sequence length

    + +

    Ragged input sequences with Nested Jagged Tensors (NJTs)

    + +

    FlexAttention now supports ragged-sized input sequences through the use of Nested Jagged Tensors (NJTs). NJTs represent ragged-sized sequences by packing sequences into a single “stacked sequence” and maintaining a set of offsets delimiting sequence boundaries for each batch item.

    + +

    A block mask can be created for input NJTs through the new create_nested_block_mask() API. The returned block mask is compatible with the ragged structure of the given NJT, treating it as a single “stacked sequence” with inter-sequence attention automatically masked out. The mask_mod or score_mod function can be written as usual.

    + +
    from torch.nn.attention.flex_attention import create_nested_block_mask, flex_attention
    +
    +BATCH = 8
    +NUM_HEADS = 8
    +D = 16
    +device = "cuda"
    +
    +# Input NJTs of shape (BATCH, SEQ_LEN*, D) with ragged SEQ_LEN
    +sequence_lengths = [torch.randint(5, 30, ()).item() for _ in range(BATCH)]
    +query = torch.nested.nested_tensor([
    +    torch.randn(seq_len, NUM_HEADS * D, device=device)
    +    for seq_len in sequence_lengths
    +], layout=torch.jagged)
    +key = torch.randn_like(query)
    +value = torch.randn_like(query)
    +
    +# View as shape (BATCH, NUM_HEADS, SEQ_LEN*, HEAD_DIM)
    +query = query.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2)
    +key = key.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2)
    +value = value.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2)
    +
    +# Simple causal mask
    +def my_mask_mod(b, h, q_idx, kv_idx):
    +    return q_idx >= kv_idx
    +
    +# Construct a block mask using the ragged structure of the
    +# specified query NJT. Ragged-sized sequences are treated as a single
    +# "stacked sequence" with inter-sequence attention masked out.
    +block_mask = create_nested_block_mask(my_mask_mod, 1, 1, query)
    +
    +# For cross attention, create_nested_block_mask() also supports a
    +# rectangular block mask using the ragged structures of both query / key.
    +#block_mask = create_nested_block_mask(my_mask_mod, 1, 1, query, key)
    +
    +output = flex_attention(query, key, value, block_mask=block_mask)
    +
    + +

    Trainable Biases

    + +

    FlexAttention now supports trainable parameters in score_mod functions. This feature enables users to reference tensors that require gradients within their score_mod implementations, with gradients automatically backpropagating through these parameters during training.

    + +

    Memory-Efficient Gradient Accumulation

    + +

    Instead of materializing the full attention scores matrix, FlexAttention uses atomic additions (tl.atomic_add) to accumulate gradients. This approach significantly reduces memory usage at the cost of introducing some non-determinism in gradient calculations.

    + +

    Handling Broadcasted Operations

    + +

    Broadcasting operations in the forward pass (e.g., score + bias[h]) require special consideration in the backward pass. When broadcasting a tensor across multiple attention scores within a head or other dimensions, we need to reduce these gradients back to the original tensor shape. Rather than materializing the full attention score matrix to perform this reduction, we use atomic operations. While this incurs some runtime overhead, it allows us to maintain memory efficiency by avoiding the materialization of large intermediate tensors.

    + +

    Current Limitations

    + +

    The implementation currently allows only a single read from each input tensor in the score_mod function. For example, bias[q_idx] + bias[kv_idx] would not be supported as it reads from the same tensor twice. We hope to remove this restriction in the future.

    + +

    Simple Example:

    + +
    bias = torch.randn(num_heads, requires_grad=True)
    +def score_mod(score, b, h, q_idx, kv_idx):
    +    return score + bias[h]  
    +
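Putting it together, a hedged end-to-end sketch of training through such a bias (assuming a CUDA device and a PyTorch version with this prototype feature) might look like:

import torch
from torch.nn.attention.flex_attention import flex_attention

B, H, S, D = 2, 4, 256, 64
q, k, v = (torch.randn(B, H, S, D, device="cuda") for _ in range(3))
bias = torch.randn(H, device="cuda", requires_grad=True)

def score_mod(score, b, h, q_idx, kv_idx):
    return score + bias[h]   # per-head trainable bias

out = torch.compile(flex_attention)(q, k, v, score_mod=score_mod)
out.sum().backward()
print(bias.grad.shape)       # torch.Size([4]); gradients reach the captured bias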
    + +

    Performance Tuning for FlexAttention

    + +

    TL;DR

    + +

    For optimal performance, compile FlexAttention using max-autotune, especially when dealing with complex score_mods and mask_mods:

    + +

flex_attention = torch.compile(flex_attention, dynamic=True, mode="max-autotune")

    + +

    What is max-autotune?

    + +

max-autotune is a torch.compile mode in which TorchInductor sweeps many kernel parameters (e.g., tile size, num_stages) and selects the best-performing configuration. During this sweep, configurations that fail are simply discarded, and the best viable configuration is kept.

    + +

    While compilation takes longer with max-autotune, the optimal configuration is cached for future kernel executions.

    + +

    Here’s an example of FlexAttention compiled with max-autotune:

    + +
    triton_flex_attention_backward_7 0.2528 ms 100.0% BLOCKS_ARE_CONTIGUOUS=False, BLOCK_M1=32, BLOCK_M2=32, BLOCK_N1=32, BLOCK_N2=32, FLOAT32_PRECISION="'ieee'", GQA_SHARED_HEADS=7, HAS_FULL_BLOCKS=False, IS_DIVISIBLE=False, OUTPUT_LOGSUMEXP=True, PRESCALE_QK=False, QK_HEAD_DIM=128, ROWS_GUARANTEED_SAFE=False, SM_SCALE=0.08838834764831843, SPARSE_KV_BLOCK_SIZE=1073741824, SPARSE_Q_BLOCK_SIZE=1073741824, V_HEAD_DIM=128, num_stages=4, num_warps=4
    +
    + +

    Why Use max-autotune for FlexAttention?

    + +

The amount of shared memory utilized in FlexAttention depends on the score_mod and mask_mod methods. This variability means that the preconfigured default kernel parameters may lead to performance cliffs or even out-of-shared-memory errors on certain hardware for some masks/mods.

    + +

    For instance, with document masks, default configurations can halve GPU occupancy, reducing performance to ~75% of its potential on some GPUs. To avoid such issues, we strongly recommend enabling max-autotune.

    + +

    Updates and Enhancements

    + +
    • Now available as a prototype feature in PyTorch 2.5.0
    • Fixed critical correctness issues, including a bug affecting multiple calls to FlexAttention within the same call to torch.compile
    + +

    Expanded Architecture Support

    + +
    • Arbitrary sequence length support - no longer requires multiples of 128
    • Added native grouped-query attention (GQA) support via is_gqa=True
    • Enhanced dimension flexibility:
      • Different QK and V head dimensions
      • Non-power-of-two head dimensions
    • Trainable attention biases (prototype)
    + +

    Under the Hood

    + +
    • New fused CPU backend
    • Improved TF32 handling for float32 inputs
    • Resolved various dynamic shape issues
    • Output layout matching query strides
    + +

    These updates make FlexAttention more robust and flexible while maintaining its core promise of combining PyTorch’s ease of use with FlashAttention’s performance benefits.

    + +
    FlexAttention: The Flexibility of PyTorch with the Performance of FlashAttention

    by Team PyTorch: Driss Guessous, Yanbo Liang, Joy Dong, Horace He

    +

    a cartoon chart flexing his muscles

    + +

    In theory, Attention is All You Need. In practice, however, we also need optimized attention implementations like FlashAttention.

    + +

    Although these fused attention implementations have substantially improved performance and enabled long contexts, this efficiency has come with a loss of flexibility. You can no longer try out a new attention variant by writing a few PyTorch operators - you often need to write a new custom kernel! This operates as a sort of “software lottery” for ML researchers - if your attention variant doesn’t fit into one of the existing optimized kernels, you’re doomed to slow runtime and CUDA OOMs.

    + +

    For some examples of attention variants, we have Causal, Relative Positional Embeddings, Alibi, Sliding Window Attention, PrefixLM, Document Masking/Sample Packing/Jagged Tensors, Tanh Soft-Capping, PagedAttention, etc. Even worse, folks often want combinations of these! Sliding Window Attention + Document Masking + Causal + Context Parallelism? Or what about PagedAttention + Sliding Window + Tanh Soft-Capping?

    + +

    The left picture below represents the state of the world today - some combinations of masking + biases + setting have existing kernels implemented. But the various options lead to an exponential number of settings, and so overall we end up with fairly spotty support. Even worse, new attention variants researchers come up with will have zero support.

    + +

    Attention variant support diagram

    + +

    To solve this hypercube problem once and for all, we introduce FlexAttention, a new PyTorch API.

    + +
    1. We provide a flexible API that allows implementing many attention variants (including all the ones mentioned in the blog post so far) in a few lines of idiomatic PyTorch code.
    2. We lower this into a fused FlashAttention kernel through torch.compile, generating a FlashAttention kernel that doesn't materialize any extra memory and has performance competitive with handwritten ones.
    3. We also automatically generate the backwards pass, leveraging PyTorch's autograd machinery.
    4. Finally, we can also take advantage of sparsity in the attention mask, resulting in significant improvements over standard attention implementations.
    + +

    With FlexAttention, we hope that trying new attention variants will only be limited by your imagination.

    + +

    You can find many FlexAttention examples at the Attention Gym: https://github.com/pytorch-labs/attention-gym. If you have any cool applications, feel free to submit an example!

    + +

    PS: We also find this API very exciting since it leverages a lot of existing PyTorch infra in a fun way - more on that in the end.

    + +

    FlexAttention

    + +

    Here is the classic attention equation:

    + +

    math equation

    + +

    In code form:

    + +
    Q, K, V: Tensor[batch_size, num_heads, sequence_length, head_dim]
    +score: Tensor[batch_size, num_heads, sequence_length, sequence_length] = (Q @ K) / sqrt(head_dim)
    +probabilities = softmax(score, dim=-1)
    +output: Tensor[batch_size, num_heads, sequence_length, head_dim] = probabilities @ V
    +
    + +

FlexAttention allows for a user-defined function score_mod:

    + +

    math equation

    + +

    In code form:

    + +
    Q, K, V: Tensor[batch_size, num_heads, sequence_length, head_dim]
    +score: Tensor[batch_size, num_heads, sequence_length, sequence_length] = (Q @ K) / sqrt(head_dim)
    +modified_scores: Tensor[batch_size, num_heads, sequence_length, sequence_length] = score_mod(score)
    +probabilities = softmax(modified_scores, dim=-1)
    +output: Tensor[batch_size, num_heads, sequence_length, head_dim] = probabilities @ V
    +
    + +

    This function allows you to modify the attention scores prior to softmax. Surprisingly, this ends up being sufficient for the vast majority of attention variants (examples below)!

    + +

    Concretely, the expected signature for score_mod is somewhat unique.

    + +
    def score_mod(score: f32[], b: i32[], h: i32[], q_idx: i32[], kv_idx: i32[]):
        return score # noop - standard attention
    + +

    In other words, score is a scalar pytorch tensor that represents the dot product of a query token and a key token. The rest of the arguments tell you which dot product you’re currently computing - b (current element in batch), h (current head), q_idx (position in query), kv_idx (position in key/value tensors).

    + +

    To apply this function, we could implement it as

    + +
    for b in range(batch_size):
    +    for h in range(num_heads):
    +        for q_idx in range(sequence_length):
    +            for kv_idx in range(sequence_length):
    +                modified_scores[b, h, q_idx, kv_idx] = score_mod(scores[b, h, q_idx, kv_idx], b, h, q_idx, kv_idx)
    +
    + +

    Of course, this is not how FlexAttention is implemented under the hood. Leveraging torch.compile, we automatically lower your function into a single fused FlexAttention kernel - guaranteed or your money back!

    + +

    This API ends up being surprisingly expressive. Let’s look at some examples.

    + +

    Score Mod Examples

    + +

    Full Attention

    + +

Let's first do "full attention", or standard bidirectional attention. In this case, score_mod is a no-op: it takes the scores as input and returns them as-is.

    + +
    def noop(score, b, h, q_idx, kv_idx):
    +    return score
    +
    + +

    And to use it end to end (including both forwards and backwards):

    + +
    from torch.nn.attention.flex_attention import flex_attention
    +
    +flex_attention(query, key, value, score_mod=noop).sum().backward()
    +
    + +

    Relative Position Encodings

    + +

    One common attention variant is the “relative position encoding”. Instead of encoding the absolute distance in the queries and keys, relative position encoding adjusts scores based on the “distance” between the queries and keys.

    + +
    def relative_positional(score, b, h, q_idx, kv_idx):
    +    return score + (q_idx - kv_idx)
    +
    + +

    Note that unlike typical implementations, this does not need to materialize a SxS tensor. Instead, FlexAttention computes the bias values “on the fly” within the kernel, leading to significant memory and performance improvements.

    + +

    relative position encoding

    + +

    ALiBi Bias

    + +

    alibi bias

    +

    Source: Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation

    + +

    ALiBi was introduced in Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation, and claims to have beneficial properties for length extrapolation at inference. Notably, MosaicML has pointed to “lack of kernel support” as the main reason why they eventually switched from ALiBi to rotary embeddings.

    + +

    Alibi is similar to relative positional encodings with one exception - it has a per-head factor that is typically precomputed.

    + +
    alibi_bias = generate_alibi_bias() # [num_heads]
    +
    +def alibi(score, b, h, q_idx, kv_idx):
    +    bias = alibi_bias[h] * (kv_idx - q_idx)
    +    return score + bias
    +
    + +

    This demonstrates one interesting piece of flexibility torch.compile provides - we can load from alibi_bias even though it wasn’t explicitly passed in as an input! The generated Triton kernel will calculate the correct loads from the alibi_bias tensor and fuse it. Note that you could regenerate alibi_bias and we still wouldn’t need to recompile.

    + +

    Soft-capping

    + +

    Soft-capping is a technique used in Gemma2 and Grok-1 that prevents logits from growing excessively large. In FlexAttention, it looks like:

    + +
    softcap = 20
    +def soft_cap(score, b, h, q_idx, kv_idx):
    +    score = score / softcap
    +    score = torch.tanh(score)
    +    score = score * softcap
    +    return score
    +
    + +

    Note that we also automatically generate the backwards pass from the forwards pass here. Also, although this implementation is semantically correct, we likely want to use a tanh approximation in this case for performance reasons. See attention-gym for more details.

    + +

    Causal Mask

    + +

    Although bidirectional attention is the simplest, the original Attention is All You Need paper and the vast majority of LLMs use attention in a decoder-only setting where each token can only attend to the tokens prior to it. Folks often think of this as a lower-triangular mask, but with the score_mod API it can be expressed as:

    + +
    def causal_mask(score, b, h, q_idx, kv_idx):
    +    return torch.where(q_idx >= kv_idx, score, -float("inf"))
    +
    + +

    Basically, if the query token is “after” the key token, we keep the score. Otherwise, we mask it out by setting it to -inf, thus ensuring it won’t participate in the softmax calculation.

    + +

    However, masking is special compared to other modifications - if something is masked out, we can completely skip its computation! In this case, a causal mask has about 50% sparsity, so not taking advantage of the sparsity would result in a 2x slowdown. Although this score_mod is sufficient to implement causal masking correctly, getting the performance benefits of sparsity requires another concept - mask_mod.

    + +

    Mask Mods

    + +

    To take advantage of sparsity from masking, we need to do some more work. Specifically, by passing a mask_mod to create_block_mask, we can create a BlockMask. FlexAttention can then use BlockMask to take advantage of the sparsity!

    + +

    The signature of mask_mod is very similar to score_mod - just without the score. In particular

    + +
    # returns True if this position should participate in the computation
    +mask_mod(b, h, q_idx, kv_idx) => bool
    +
    + +

    Note that score_mod is strictly more expressive than mask_mod. However, for masking, it’s recommended to use mask_mod and create_block_mask, as it’s more performant. See the FAQ on why score_mod and mask_mod are separate.

    + +

    Now, let’s take a look at how we might implement causal mask with mask_mod.

    + +

    Causal Mask

    + +
    from torch.nn.attention.flex_attention import create_block_mask
    +
    +def causal(b, h, q_idx, kv_idx):
    +    return q_idx >= kv_idx
    +
    +# Because the sparsity pattern is independent of batch and heads, we'll set them to None (which broadcasts them) 
    +block_mask = create_block_mask(causal, B=None, H=None, Q_LEN=1024, KV_LEN=1024)
    +# In this case, we don't need a score_mod, so we won't pass any in.
    +# However, score_mod can still be combined with block_mask if you need the additional flexibility.
    +flex_attention(query, key, value, block_mask=block_mask)
    +
    + +

    Note that create_block_mask is a relatively expensive operation! Although FlexAttention will not need to recompile when it changes, if you aren’t careful about caching it, it can lead to significant slowdowns (check out the FAQ for suggestions on best practices).

    + +

    flexattention performance charts

    + +

    While the TFlops are roughly the same, the execution time is 2x faster for the mask_mod version! This demonstrates that we can leverage the sparsity that BlockMask provides us without losing hardware efficiency.

    + +

    Sliding Window + Causal

    + +

    Sliding Window Causal diagrams

    +

    Source: Mistral 7B

    + +

    Popularized by Mistral, sliding window attention (also known as local attention) takes advantage of the intuition that the most recent tokens are the most useful. In particular, it allows the query token to only attend to, say, the 1024 most recent tokens. This is often used together with causal attention.

    + +
    SLIDING_WINDOW = 1024

    def sliding_window_causal(b, h, q_idx, kv_idx):
        causal_mask = q_idx >= kv_idx
        window_mask = q_idx - kv_idx <= SLIDING_WINDOW
        return causal_mask & window_mask

    # If you want to be cute...
    from torch.nn.attention.flex_attention import and_masks

    def sliding_window(b, h, q_idx, kv_idx):
        return q_idx - kv_idx <= SLIDING_WINDOW

    sliding_window_causal = and_masks(causal_mask, sliding_window)
    + +

    We benchmark it against F.scaled_dot_product_attention with a sliding window mask as well as FA2 with a causal mask (as a reference point for performance). Not only are we significantly faster than F.scaled_dot_product_attention, we’re also significantly faster than FA2 with a causal mask as this mask has significantly more sparsity.

    + +

    execution time charts

    + +

    PrefixLM

    + +

    PrefixLM diagram

    +

    Source: PaliGemma: A versatile 3B VLM for transfer

    + +

    The T5 architecture, proposed in Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer, describes an attention variant that performs full bidirectional attention on a “prefix”, and causal attention on the rest. We again compose two mask functions to accomplish this, one for causal masking and one that is based off of the prefix length.

    + +
    prefix_length: [B]
    def prefix_mask(b, h, q_idx, kv_idx):
        return kv_idx <= prefix_length[b]

    prefix_lm_causal = or_masks(prefix_mask, causal_mask)
    # In this case, our mask is different per sequence so we set B equal to our batch size
    block_mask = create_block_mask(prefix_lm_causal, B=B, H=None, Q_LEN=S, KV_LEN=S)
    + +

    Just like with score_mod, mask_mod allows us to refer to additional tensors that aren’t explicitly an input to the function! However, with prefixLM, the sparsity pattern changes per input. This means that for each new input batch, we’ll need to recompute the BlockMask. One common pattern is to call create_block_mask at the beginning of your model and reuse that block_mask for all attention calls in your model. See Recomputing Block Masks vs. Recompilation.

    + +

    However, in exchange for that, we’re not only able to have an efficient attention kernel for prefixLM, we’re also able to take advantage of however much sparsity exists in the input! FlexAttention will dynamically adjust its performance based off of the BlockMask data, without needing to recompile the kernel.

    + +

    Document Masking/Jagged Sequences

    + +

    Another common attention variant is document masking/jagged sequences. Imagine that you have a number of sequences of varying length. You want to train on all of them together, but unfortunately, most operators only accept rectangular tensors.

    + +

    Through BlockMask, we can support this efficiently in FlexAttention as well!

    + +
    1. First, we flatten all sequences into a single sequence with sum(sequence lengths) tokens.
    2. Then, we compute the document_id that each token belongs to.
    3. Finally, in our mask_mod, we simply check whether the query and kv tokens belong to the same document!
    + +
    # The document that each token belongs to.
    +# e.g. [0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2] corresponds to sequence lengths 3, 2, and 6.
    +document_id: [SEQ_LEN]
    +
    +def document_masking(b, h, q_idx, kv_idx):
    +    return document_id[q_idx] == document_id[kv_idx]
    +
    + +

    And that’s it! In this case, we see that we end up with a blockdiagonal mask.

    + +

    blockdiagonal mask

    + +

One interesting aspect about document masking is that it's easy to see how it might combine with an arbitrary combination of other masks. For example, we already defined prefixlm_mask in the previous section. Do we now need to define a prefixlm_document_mask function as well?

    + +

    In these cases, one pattern we’ve found quite useful is what we call a “higher level modification”. In this case, we can take an existing mask_mod and automatically transform it into one that works with jagged sequences!

    + +
    def generate_doc_mask_mod(mask_mod, document_id):
    +    # Get unique document IDs and their counts
    +    _, counts = torch.unique_consecutive(document_id, return_counts=True)
    +    # Create cumulative counts (offsets)
    +    offsets = torch.cat([torch.tensor([0], device=document_id.device), counts.cumsum(0)[:-1]])
    +    def doc_mask_wrapper(b, h, q_idx, kv_idx):
    +        same_doc = document_id[q_idx] == document_id[kv_idx]
    +        q_logical = q_idx - offsets[document_id[q_idx]]
    +        kv_logical = kv_idx - offsets[document_id[kv_idx]]
    +        inner_mask = mask_mod(b, h, q_logical, kv_logical)
    +        return same_doc & inner_mask
    +    return doc_mask_wrapper
    +
    + +

For example, given the prefix_lm_causal mask from above, we can transform it into one that works on packed documents like so:

    + +
    prefix_length = torch.tensor(2, dtype=torch.int32, device="cuda")
    +def prefix_mask(b, h, q_idx, kv_idx):
    +    return kv_idx < prefix_length
    +prefix_lm_causal = or_masks(prefix_mask, causal_mask)
    +doc_prefix_lm_causal_mask = generate_doc_mask_mod(prefix_lm_causal, document_id)
    +
    + +

    blockdiagonal mask

    + +

    Now, this mask is “block-prefixLM-diagonal” shaped. :)

    + +

    That’s all of our examples! There are far more attention variants than we have space to list, so check out Attention Gym for more examples. We hope that the community will contribute some of their favorite applications of FlexAttention as well.

    + +

    FAQ

    + +
    Q: When does FlexAttention need to recompile?
    + +

    As FlexAttention leverages torch.compile for graph capture, it can actually avoid recompilation in a broad spectrum of cases. Notably, it does not need to recompile even if captured tensors change values!

    + +
    flex_attention = torch.compile(flex_attention)
    def create_bias_mod(bias):
        def bias_mod(score, b, h, q_idx, kv_idx):
            return score + bias
        return bias_mod
    bias_mod1 = create_bias_mod(torch.tensor(0))
    flex_attention(..., score_mod=bias_mod1) # Compiles the kernel here

    bias_mod2 = create_bias_mod(torch.tensor(2))
    flex_attention(..., score_mod=bias_mod2) # Doesn't need to recompile!
    + +

    Even changing the block-sparsity doesn’t require a recompile. However, if the block-sparsity changes, we do need to recompute the BlockMask.

    + +
    Q: When should we recompute the BlockMask?
    + +

    We need to recompute the BlockMask whenever the block-sparsity changes. Although computing the BlockMask is much cheaper than recompilation (on the order of hundreds of microseconds as opposed to seconds), you should still take care to not excessively recompute the BlockMask.

    + +

    Here are some common patterns and some recommendations on how you might approach them.

    + +

    Mask never changes (e.g. causal mask)
    In this case, you can simply precompute the block mask and cache it globally, reusing it for all attention calls.

    + +
    block_mask = create_block_mask(causal_mask, 1, 1, S,S)
    +causal_attention = functools.partial(flex_attention, block_mask=block_mask)
    +
    + +

    Mask changes every batch (e.g. document masking)
    In this case, we would suggest computing the BlockMask at the beginning of the model and threading it through the model - reusing the BlockMask for all layers.

    + +
    def forward(self, x, doc_mask):
    +    # Compute block mask at beginning of forwards
    +    block_mask = create_block_mask(doc_mask, None, None, S, S)    
    +    x = self.layer1(x, block_mask)
    +    x = self.layer2(x, block_mask)
    +    ...
    +    # amortize block mask construction cost across all layers
    +    x = self.layer3(x, block_mask) 
    +    return x
    +
    + +

    Mask changes every layer (e.g. data-dependent sparsity)
    This is the hardest setting, since we're unable to amortize the block mask computation across multiple FlexAttention invocations. Although FlexAttention can certainly still benefit this case, the actual benefits from BlockMask depend on how sparse your attention mask is and how fast we can construct the BlockMask. That leads us to…

    + +
    Q: How can we compute BlockMask quicker?
    + +

    create_block_mask is unfortunately fairly expensive, both from a memory and compute perspective, as determining whether a block is completely sparse requires evaluating mask_mod at every single point in the block. There are a couple ways to address this:

    + +
    1. If your mask is the same across batch size or heads, make sure that you're broadcasting over those (i.e. set them to None in create_block_mask).
    2. Compile create_block_mask. Unfortunately, today, torch.compile does not work directly on create_block_mask due to some unfortunate limitations. However, you can set _compile=True, which will significantly reduce the peak memory and runtime (often an order of magnitude in our testing).
    3. Write a custom constructor for BlockMask. The metadata for BlockMask is quite simple (see the documentation). It's essentially two tensors:
       a. num_blocks: The number of KV blocks computed for each query block.
       b. indices: The positions of the KV blocks computed for each query block.
       For example, here's a custom BlockMask constructor for causal_mask.
    + +
    def create_causal_mask(S):
    +    BLOCK_SIZE = 128
    +    # The first query block computes one block, the second query block computes 2 blocks, etc.
    +    num_blocks = torch.arange(S // BLOCK_SIZE, device="cuda") + 1
    +    # Since we're always computing from the left to the right,
    +    # we can use the indices [0, 1, 2, ...] for every query block.
    +    indices = torch.arange(S // BLOCK_SIZE, device="cuda").expand(
    +        S // BLOCK_SIZE, S // BLOCK_SIZE
    +    )
    +    num_blocks = num_blocks[None, None, :]
    +    indices = indices[None, None, :]
    +    return BlockMask(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=causal_mask)
    +
    + +
    Q: Why are score_mod and mask_mod different? Isn’t mask_mod just a special case of score_mod?
    + +

    Very astute question, hypothetical audience member! In fact, any mask_mod can be easily converted to a score_mod (we do not recommend using this function in practice!)

    + +
    def mask_mod_as_score_mod(score, b, h, q_idx, kv_idx):
    +    return torch.where(mask_mod(b, h, q_idx, kv_idx), score, -float("inf"))
    +
    + +

    So, if score_mod can implement everything mask_mod can, what’s the point of having mask_mod?

    + +

    One immediate challenge: a score_mod requires the actual score value as an input, but when we’re precomputing the BlockMask, we don’t have the actual score value. We can perhaps fake the values by passing in all zeros, and if the score_mod returns -inf, then we consider it to be masked (in fact, we originally did this!).

    + +

    However, there are two issues. The first is that this is hacky - what if the user’s score_mod returned -inf when the input is 0? Or what if the user’s score_mod masked out with a large negative value instead of -inf? It seems we’re trying to cram a square peg into a round hole. But there’s a more important reason to separate out mask_mod from score_mod - it’s fundamentally more efficient!

    + +

    As it turns out, applying masking to every single computed element is actually quite expensive - our benchmarks see about a 15-20% degradation in performance! So, although we can get significant speedups by skipping half the computation, we lose a meaningful part of that speedup from needing to mask out every element!

    + +

    Luckily, if we visualize the causal mask, we notice that the vast majority of blocks do not require a “causal mask” at all - they’re fully computed! It is only the blocks on the diagonal, partially computed and partially masked, that require masking to be applied.

    + +

    blockdiagonal mask

    + +

    The BlockMask previously told us which blocks we need to compute and which blocks we can skip. Now, we further augment this data structure to also tell us which blocks are “fully computed” (i.e. masking can be skipped) vs. “partially computed” (i.e. a mask needs to be applied). Note, however, that although masks can be skipped on “fully computed” blocks, other score_mods like relative positional embeddings still need to be applied.

    + +

    Given just a score_mod, there’s no sound way for us to tell which parts of it are “masking”. Hence, the user must separate these out themselves into mask_mod.
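    To make the distinction concrete, here is a minimal illustration (the names causal_mask and relative_bias are just for this example): the mask_mod returns a boolean and is what BlockMask construction consumes, while the score_mod only adjusts the scores of positions that are actually computed.

    def causal_mask(b, h, q_idx, kv_idx):
        # mask_mod: purely "is this position allowed?", used to build the BlockMask
        return q_idx >= kv_idx

    def relative_bias(score, b, h, q_idx, kv_idx):
        # score_mod: modifies the score of computed positions (e.g. a relative positional bias)
        return score + (q_idx - kv_idx)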

    + +
    Q: How much additional memory does the BlockMask need?
    + +

    The BlockMask metadata is of size [BATCH_SIZE, NUM_HEADS, QUERY_LEN//BLOCK_SIZE, KV_LEN//BLOCK_SIZE]. If the mask is the same across the batch or heads dimension it can be broadcasted over that dimension to save memory.

    + +

    At the default BLOCK_SIZE of 128, we expect that the memory usage will be fairly negligible for most use cases. For example, for a sequence length of 1 million, the BlockMask would only use 60MB of additional memory. If this is a problem, you can increase the block size: create_block_mask(..., BLOCK_SIZE=1024). For example, increasing BLOCK_SIZE to 1024 would result in this metadata dropping to under a megabyte.
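    As a rough sanity check of those numbers (assuming on the order of one byte of metadata per (query block, KV block) pair; the exact per-entry size depends on the BlockMask internals):

    def approx_blockmask_bytes(seq_len, block_size, bytes_per_entry=1):
        # Metadata is roughly [Q_LEN // BLOCK_SIZE, KV_LEN // BLOCK_SIZE] per (batch, head) slice.
        num_blocks = seq_len // block_size
        return num_blocks * num_blocks * bytes_per_entry

    approx_blockmask_bytes(1_000_000, 128)   # ~61 million entries -> roughly 60MB
    approx_blockmask_bytes(1_000_000, 1024)  # ~0.95 million entries -> under a megabyte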

    + +
    Q: How do the numerics compare?
    + +

    Although the results are not bitwise identical, we are confident that FlexAttention is as numerically accurate as FlashAttention. We generate the following distribution of differences comparing FlashAttention versus FlexAttention over a large range of inputs on both causal and non-causal attention variants. The errors are nearly identical.

    + +

    distribution chart

    + +

    Performance

    + +

    Generally speaking, FlexAttention is nearly as performant as a handwritten Triton kernel, which is unsurprising, as we heavily leverage a handwritten Triton kernel. However, due to its generality, we do incur a small performance penalty. For example, we must incur some additional latency to determine which block to compute next. We also provide some kernel options that can affect the kernel’s performance (while changing its behavior); they can be found here: performance knobs.

    + +

    As a case study, let’s explore how the knobs affect the performance of causal attention. We will compare the performance of the Triton kernel against FlashAttention v2 on an A100. The script can be found here.

    + +

    FlexAttention achieves 90% of FlashAttention2’s performance in the forward pass and 85% in the backward pass. FlexAttention is currently utilizing a deterministic algorithm that recomputes more intermediates than FAv2, but we have plans to improve FlexAttention’s backward algorithm and hope to close this gap!

    + +

    flexattention speed chart

    + +

    flexattention speed chart

    + +

    Conclusion

    + +

    We hope you have as much fun using FlexAttention as we did developing it! While working on this, we ended up finding way more applications of this API than we could have expected. We’ve already seen it accelerate torchtune’s sample packing throughput by 71%, replace the need for a researcher to spend over a week writing their own custom Triton kernel, and deliver competitive performance with custom handwritten attention variants.

    + +

    One final thing that made implementing FlexAttention quite fun is that we were able to leverage a lot of existing PyTorch infra in an interesting way. For example, one of the unique aspects about TorchDynamo (torch.compile’s frontend) is that it does not require tensors used in the compiled function to be explicitly passed in as inputs. This allows us to compile mods like document masking, which need to access global variables whose values can change between calls!

    + +
    bias = torch.randn(1024, 1024)
    +def score_mod(score, b, h, q_idx, kv_idx):
    +    return score + bias[q_idx][kv_idx] # The bias tensor can change!
    +
    + +

    Furthermore, the fact that torch.compile is a generic graph-capture mechanism also allows it to support more “advanced” transformations, such as the higher order transform that transforms any mask_mod into one that works with jagged tensors.

    + +

    We also leverage TorchInductor (torch.compile’s backend) infrastructure for Triton templates. Not only did this make it easy to support codegening FlexAttention - it also automatically gave us support for dynamic shapes as well as epilogue fusion (i.e. fusing an operator onto the end of attention)! In the future, we plan on extending this support to allow for quantized versions of attention or things like RadixAttention as well.

    + +

    In addition, we leveraged higher-order ops, PyTorch’s autograd to automatically generate the backward pass, and vmap to automatically apply score_mod when creating the BlockMask.
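    For intuition only, here is a hedged sketch (not FlexAttention’s internal code) of how vmap can evaluate a mod function over a full grid of (q_idx, kv_idx) pairs, which is conceptually what materializing a mask during BlockMask construction requires; sliding_window is a made-up example mask.

    import torch

    def sliding_window(b, h, q_idx, kv_idx):
        return (q_idx - kv_idx).abs() <= 2

    S = 8
    q_idx, kv_idx = torch.arange(S), torch.arange(S)
    b, h = torch.tensor(0), torch.tensor(0)

    inner = torch.vmap(sliding_window, in_dims=(None, None, None, 0))  # map over kv_idx
    outer = torch.vmap(inner, in_dims=(None, None, 0, None))           # map over q_idx
    mask = outer(b, h, q_idx, kv_idx)                                   # [S, S] boolean mask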

    + +

    And, of course, this project wouldn’t have been possible without Triton and TorchInductor’s ability to generate Triton code.

    + +

    We look forward to leveraging the approach we used here to more applications in the future!

    + +

    Limitations and Future Work

    + +
      +
    • FlexAttention is currently available in PyTorch nightly releases; we plan to release it as a prototype feature in PyTorch 2.5.0.
    • +
    • We did not cover how to use FlexAttention for inference here (or how to implement PagedAttention) - we will cover those in a later post.
    • +
    • We are working to improve the performance of FlexAttention to match FlashAttention3 on H100 GPUs.
    • +
    • FlexAttention requires that all sequence lengths be a multiple of 128 - this will be addressed soon.
    • +
    • We plan on adding GQA support soon - for now, you can just replicate the kv heads.
    • +
    + +

    Acknowledgements

    + +

    We want to highlight some prior work (and people) that have inspired FlexAttention.

    + +
      +
    • Tri Dao’s work on FlashAttention
    • +
    • Francisco Massa and the Xformers team for BlockSparseAttention in Triton
    • +
    • The Jax team’s work on SplashAttention
    • +
    • Philippe Tillet and Keren Zhou for helping us with Triton
    • +
    • Ali Hassani for discussions on neighborhood attention
    • +
    • Everybody who’s complained about attention kernels not supporting their favorite attention variant :)
    • +
    + +
diff --git a/blog/genai-acceleration-intel-xeon/index.html b/blog/genai-acceleration-intel-xeon/index.html
new file mode 100644
index 000000000000..b40e830d3ccb
--- /dev/null
+++ b/blog/genai-acceleration-intel-xeon/index.html

GenAI Acceleration for PyTorch 2.5 on Intel® Xeon® Processors | PyTorch

    by the Intel PyTorch Team

    +

    This blog is the fifth in a series focused on accelerating generative AI models with pure, native PyTorch. We demonstrate the GenAI acceleration of GPTFast, Segment Anything Fast, and Diffusion Fast on Intel® Xeon® Processors.

    + +

    First, we revisit GPTFast, a remarkable work that speeds up text generation in under 1000 lines of native PyTorch code. Initially, GPTFast supported only the CUDA backend. We will show you how to run GPTFast on CPU and achieve additional performance speedup with weight-only quantization (WOQ).

    + +

    In Segment Anything Fast, we have incorporated support for the CPU backend and will demonstrate performance acceleration by leveraging the increased power of CPU with BFloat16, torch.compile, and scaled_dot_product_attention (SDPA) with a block-wise attention mask. The speedup ratio against FP32 can reach 2.91x in vit_b and 3.95x in vit_h.

    + +

    Finally, Diffusion Fast now supports the CPU backend and leverages the increased power of CPU with BFloat16, torch.compile, and SDPA. We also optimize the layout propagation rules for convolution, cat, and permute in Inductor CPU to improve performance. The speedup ratio against FP32 can achieve 3.91x in Stable Diffusion XL (SDXL).

    + +

    Optimization strategies to boost performance on PyTorch CPU

    + +

    GPTFast

    + +

    Over the past year, generative AI has achieved great success across various language tasks and become increasingly popular. However, generative models face high inference costs due to the memory bandwidth bottlenecks in the auto-regressive decoding process. To address these issues, the PyTorch team published GPTFast, which targets accelerating text generation with only pure, native PyTorch. This project implemented LLM text generation from scratch in under 1000 lines of native PyTorch code, running almost 10x faster than the baseline. Initially, GPTFast supported only the CUDA backend and garnered approximately 5,000 stars in about four months. Inspired by Llama.cpp, the Intel team provided CPU backend support starting with the PyTorch 2.4 release, further enhancing the project’s availability in GPU-free environments. The following are optimization strategies used to boost performance on PyTorch CPU:

    + +
      +
    • +

      Torch.compile

      + +

      torch.compile is a PyTorch function introduced in PyTorch 2.0 that aims to solve the problem of accurate graph capturing in PyTorch and ultimately enable software engineers to run their PyTorch programs faster.

      +
    • +
    • +

      Weight-only Quantization

      + +

      Weight-only quantization (WOQ) is a trade-off between performance and accuracy: the bottleneck of the auto-regressive decoding phase in text generation is the memory bandwidth of loading weights, and WOQ generally achieves better accuracy than traditional quantization approaches such as W8A8. GPTFast supports two types of WOQ: W8A16 and W4A16. Specifically, activations are stored in BFloat16 while model weights can be quantized to int8 or int4, as shown in Figure 1 (a rough sketch of group-wise int4 quantization follows the figure).

      +
    • +
    + +

    flow diagram

    + +

    Figure 1. Weight-only Quantization Pattern. Source: Mingfei Ma, Intel
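    To illustrate the W4A16 idea, here is a rough sketch of group-wise int4 weight quantization; this is not GPTFast’s actual kernel code, and the symmetric scheme and the group size of 32 are assumptions for this example.

    import torch

    def quantize_int4_groupwise(w, group_size=32):
        # w: [out_features, in_features]; one scale per group of `group_size` input elements.
        wg = w.float().reshape(w.shape[0], -1, group_size)
        scale = wg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0  # int4 symmetric range is [-8, 7]
        q = torch.clamp(torch.round(wg / scale), -8, 7).to(torch.int8)
        return q, scale

    def dequantize_to_bf16(q, scale):
        # De-quantize back so the matmul runs against BFloat16 activations (W4A16).
        return (q.float() * scale).reshape(q.shape[0], -1).to(torch.bfloat16)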

    + +
      +
    • +

      Weight Prepacking & Micro Kernel Design.

      + +

      To maximize throughput, GPTFast allows model weights to be prepacked into hardware-specific layouts for int4 using internal PyTorch ATen APIs. Inspired by Llama.cpp, we prepack the model weights from [N, K] to [N/kNTileSize, K, kNTileSize/2], with kNTileSize set to 64 on AVX512. First, the model weights are blocked along the N dimension, then the two innermost dimensions are transposed. To minimize de-quantization overhead in kernel computation, we shuffle the 64 data elements on the same row in an interleaved pattern, packing Lane2 & Lane0 together and Lane3 & Lane1 together, as illustrated in Figure 2 (a simplified sketch of the blocking and transpose steps follows the figure).

      +
    • +
    + +

    flow diagram

    + +

    Figure 2. Weight Prepacking on Int4. Source: Mingfei Ma, Intel
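    A simplified sketch of just the blocking and transpose steps described above; the int4 nibble packing and the lane interleaving from Figure 2 are omitted, and kNTileSize is 64 as in the text.

    import torch

    N, K, kNTileSize = 4096, 4096, 64
    w = torch.randn(N, K)

    # [N, K] -> [N/64, 64, K] -> [N/64, K, 64]: block along N, then swap the two innermost dims.
    w_blocked = w.view(N // kNTileSize, kNTileSize, K).transpose(1, 2).contiguous()

    # Packing two int4 values per byte would shrink the last dimension to kNTileSize/2 = 32,
    # giving the [N/kNTileSize, K, kNTileSize/2] layout mentioned above.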

    + +

    During the generation phase, the torch.nn.Linear module will be lowered to be computed with high-performance kernels inside PyTorch ATen, where the quantized weights will be de-quantized first and then accumulated with fused multiply-add (FMA) at the register level, as shown in Figure 3.

    + +

    flow diagram

    + +

    Figure 3. Micro Kernel Design. Source: Mingfei Ma, Intel

    + +

    Segment Anything Fast

    + +

    Segment Anything Fast offers a simple and efficient PyTorch native acceleration for the Segment Anything Model (SAM) , which is a zero-shot vision model for generating promptable image masks. The following are optimization strategies used to boost performance on PyTorch CPU:

    + +
      +
    • +

      BFloat16

      + +

      BFloat16 is a commonly used half-precision floating-point type. By using less precision per parameter and activation, we can save significant computation time and memory.

      +
    • +
    • +

      Torch.compile

      + +

      torch.compile is a PyTorch function introduced in PyTorch 2.0 that aims to solve the problem of accurate graph capturing in PyTorch and ultimately enable developers to run their PyTorch programs faster.

      +
    • +
    • +

      Scaled Dot Product Attention (SDPA)

      + +

      Scaled Dot-Product Attention (SDPA) is a crucial mechanism in transformer models. PyTorch offers a fused implementation that significantly outperforms a naive approach. For Segment Anything Fast, we convert the attention mask from bfloat16 to float32 in a block-wise manner. This method not only reduces peak memory usage, making it ideal for systems with limited memory resources, but also enhances performance (a minimal SDPA call is sketched after this list).

      +
    • +
    + +
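    For reference, the fused implementation is exposed through torch.nn.functional.scaled_dot_product_attention. A minimal call with an explicit additive attention mask might look like the sketch below; all shapes are placeholders, and the block-wise bfloat16-to-float32 mask handling described above happens inside Segment Anything Fast rather than in this snippet.

    import torch
    import torch.nn.functional as F

    # (batch, heads, seq_len, head_dim) in BFloat16
    q, k, v = (torch.randn(1, 8, 1024, 64, dtype=torch.bfloat16) for _ in range(3))
    attn_mask = torch.zeros(1, 1, 1024, 1024, dtype=torch.bfloat16)  # additive mask, broadcast over heads

    out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)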

    Diffusion Fast

    + +

    Diffusion Fast offers a simple and efficient PyTorch native acceleration for text-to-image diffusion models. The following are optimization strategies used to boost performance on PyTorch CPU:

    + +
      +
    • +

      BFloat16

      + +

      BFloat16 is a commonly used half-precision floating-point type. By using less precision per parameter and activation, we can save significant computation time and memory.

      +
    • +
    • +

      Torch.compile

      + +

      torch.compile is a PyTorch function introduced in PyTorch 2.0 that aims to solve the problem of accurate graph capturing in PyTorch and ultimately enable software engineers to run their PyTorch programs faster.

      +
    • +
    • +

      Scaled Dot Product Attention (SDPA)

      + +

      SDPA is a key mechanism used in transformer models; PyTorch provides a fused implementation with large performance benefits over a naive implementation.

      +
    • +
    + +

    Model Usage on Native PyTorch CPU

    + +

    GPTFast

    + +

    To launch WOQ in GPTFast, first quantize the model weights. For example, to quantize with int4 and group size of 32:

    + +
    python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4 --groupsize 32
    +
    + +

    Then run generation by passing the int4 checkpoint to generate.py

    + +
    python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile --device $DEVICE
    +
    + +

    To use the CPU backend in GPTFast, simply switch the DEVICE variable from cuda to cpu.

    + +

    Segment Anything Fast

    + +
    cd experiments
    +
    +export SEGMENT_ANYTHING_FAST_USE_FLASH_4=0
    +
    +python run_experiments.py 16 vit_b <pytorch_github> <segment-anything_github> <path_to_experiments_data> --run-experiments --num-workers 32 --device cpu
    +
    +python run_experiments.py 16 vit_h <pytorch_github> <segment-anything_github> <path_to_experiments_data> --run-experiments --num-workers 32 --device cpu
    +
    + +

    Diffusion Fast

    + +
    python run_benchmark.py --compile_unet --compile_vae --device=cpu
    +
    + +

    Performance Evaluation

    + +

    GPTFast

    + +

    We ran the llama-2-7b-chat model based on the test branch and the above hardware configuration on PyTorch. After applying the following steps, we saw a 3.8x boost compared to the baseline in eager mode:

    + +
      +
    • Use torch.compile to automatically fuse elementwise operators.
    • +
    • Reduce memory footprint with WOQ-int8.
    • +
    • Further reduce memory footprint with WOQ-int4.
    • +
    • Use AVX512 which enables faster de-quant in micro kernels.
    • +
    + +

    bar chart

    + +

    Figure 4. GPTFast Performance speedup in Llama2-7b-chat

    + +

    Segment Anything Fast

    + +

    We ran Segment Anything Fast on the above hardware configuration on PyTorch and achieved a performance speedup of BFloat16 with torch.compile and SDPA compared with FP32 as shown in Figure 5. The speedup ratio against FP32 can achieve 2.91x in vit_b, and 3.95x in vit_h.

    + +

    bar chart

    + +

    Figure 5. Segment Anything Fast Performance speedup in vit_b/vit_h

    + +

    Diffusion Fast

    + +

    We ran Diffusion Fast on the above hardware configuration on PyTorch and achieved a performance speedup of BFloat16 with torch.compile and SDPA compared with FP32 as shown in Figure 6. The speedup ratio against FP32 can achieve 3.91x in Stable Diffusion XL (SDXL).

    + +

    bar chart

    + +

    Figure 6. Diffusion Fast Performance speedup in Stable Diffusion XL

    + +

    Conclusion and Future Work

    + +

    In this blog, we introduced software optimizations for weight-only quantization, torch.compile, and SDPA, demonstrating how we can accelerate text generation with native PyTorch on CPU. Further improvements are expected with the support of the AMX-BF16 instruction set and the optimization of dynamic int8 quantization using torchao on CPU. We will continue to extend our software optimization efforts to a broader scope.

    + +

    Acknowledgments

    + +

    The results presented in this blog are a joint effort between Meta and the Intel PyTorch Team. Special thanks to Michael Gschwind from Meta who spent precious time providing substantial assistance. Together we took one more step on the path to improve the PyTorch CPU ecosystem.

    + + + +

    Part 1: How to accelerate Segment Anything over 8x with Segment Anything Fast.

    + +

    Part 2: How to accelerate Llama-7B by almost 10x with help of GPTFast.

    + +

    Part 3: How to accelerate text-to-image diffusion models up to 3x with Diffusion Fast.

    + +

    Part 4: How to speed up FAIR’s Seamless M4T-v2 model by 2.7x.

    + +

    Product and Performance Information

    + +

    Figure 4: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 1 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24.

    + +

    Figure 5: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 16 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24.

    + +

    Figure 6: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 1 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24.

    + +

    Notices and Disclaimers

    + +

    Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

    + +

    Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

    + +

    AI disclaimer:

    + +

    AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at www.intel.com/AIPC. Results may vary.

    + +
diff --git a/blog/geospatial-deep-learning-with-torchgeo/index.html b/blog/geospatial-deep-learning-with-torchgeo/index.html
new file mode 100644
index 000000000000..f96e00f2b066
--- /dev/null
+++ b/blog/geospatial-deep-learning-with-torchgeo/index.html

Geospatial deep learning with TorchGeo | PyTorch

    June 23, 2022

    Geospatial deep learning with TorchGeo

    by Adam Stewart (University of Illinois at Urbana-Champaign), Caleb Robinson (Microsoft AI for Good Research Lab), Isaac Corley (University of Texas at San Antonio)

    +

    TorchGeo is a PyTorch domain library providing datasets, samplers, transforms, and pre-trained models specific to geospatial data.

    + +

    + +

    + +

    + https://github.com/microsoft/torchgeo +

    + +

    For decades, Earth observation satellites, aircraft, and more recently UAV platforms have been collecting increasing amounts of imagery of the Earth’s surface. With information about seasonal and long-term trends, remotely sensed imagery can be invaluable for solving some of the greatest challenges to humanity, including climate change adaptation, natural disaster monitoring, water resource management, and food security for a growing global population. From a computer vision perspective, this includes applications like land cover mapping (semantic segmentation), deforestation and flood monitoring (change detection), glacial flow (pixel tracking), hurricane tracking and intensity estimation (regression), and building and road detection (object detection, instance segmentation). By leveraging recent advancements in deep learning architectures, cheaper and more powerful GPUs, and petabytes of freely available satellite imagery datasets, we can come closer to solving these important problems.

    + +

    + +

    + +

    +National Oceanic and Atmospheric Administration satellite image of Hurricane Katrina, taken on August 28, 2005 (source). Geospatial machine learning libraries like TorchGeo can be used to detect, track, and predict future trajectories of hurricanes and other natural disasters. +

    + +

    The challenges

    + +

    In traditional computer vision datasets, such as ImageNet, the image files themselves tend to be rather simple and easy to work with. Most images have 3 spectral bands (RGB), are stored in common file formats like PNG or JPEG, and can be easily loaded with popular software libraries like PIL or OpenCV. Each image in these datasets is usually small enough to pass directly into a neural network. Furthermore, most of these datasets contain a finite number of well-curated images that are assumed to be independent and identically distributed, making train-val-test splits straightforward. As a result of this relative homogeneity, the same pre-trained models (e.g., CNNs pretrained on ImageNet) have shown to be effective across a wide range of vision tasks using transfer learning methods. Existing libraries, such as torchvision, handle these simple cases well, and have been used to make large advances in vision tasks over the past decade.

    + +

    Remote sensing imagery is not so uniform. Instead of simple RGB images, satellites tend to capture images that are multispectral (Landsat 8 has 11 spectral bands) or even hyperspectral (Hyperion has 242 spectral bands). These images capture information at a wider range of wavelengths (400 nm–15 µm), far outside of the visible spectrum. Different satellites also have very different spatial resolutions—GOES has a resolution of 4 km/px, Maxar imagery is 30 cm/px, and drone imagery resolution can be as high as 7 mm/px. These datasets almost always have a temporal component, with satellite revisits that are daily, weekly, or biweekly. Images often have overlap with other images in the dataset, and need to be stitched together based on geographic metadata. These images tend to be very large (e.g., 10K x 10K pixels), so it isn’t possible to pass an entire image through a neural network. This data is distributed in hundreds of different raster and vector file formats like GeoTIFF and ESRI Shapefile, requiring specialty libraries like GDAL to load.

    + +

    + +

    + +

    +From left to right: Mercator, Albers Equal Area, and Interrupted Goode Homolosine projections (source). Geospatial data is associated with one of many different types of reference systems that project the 3D Earth onto a 2D representation. Combining data from different sources often involves re-projecting to a common reference system in order to ensure that all layers are aligned. +

    + +

    Although each image is 2D, the Earth itself is 3D. In order to stitch together images, they first need to be projected onto a 2D representation of the Earth, called a coordinate reference system (CRS). Most people are familiar with equal angle representations like Mercator that distort the size of regions (Greenland looks larger than Africa even though Africa is 15x larger), but there are many other CRSs that are commonly used. Each dataset may use a different CRS, and each image within a single dataset may also be in a unique CRS. In order to use data from multiple layers, they must all share a common CRS, otherwise the data won’t be properly aligned. For those who aren’t familiar with remote sensing data, this can be a daunting task.

    + +

    + +

    + +

    +Even if you correctly georeference images during indexing, if you don't project them to a common CRS, you'll end up with rotated images with nodata values around them, and the images won't be pixel-aligned. +

    + +

    The solution

    + +

    At the moment, it can be quite challenging to work with both deep learning models and geospatial data without having expertise in both of these very different fields. To address these challenges, we’ve built TorchGeo, a PyTorch domain library for working with geospatial data. TorchGeo is designed to make it simple:

    + +
      +
    1. for machine learning experts to work with geospatial data, and
    2. +
    3. for remote sensing experts to explore machine learning solutions.
    4. +
    + +

    TorchGeo is not just a research project, but a production-quality library that uses continuous integration to test every commit with a range of Python versions on a range of platforms (Linux, macOS, Windows). It can be easily installed with any of your favorite package managers, including pip, conda, and spack:

    + +
    $ pip install torchgeo
    +
    + +

    TorchGeo is designed to have the same API as other PyTorch domain libraries like torchvision, torchtext, and torchaudio. If you already use torchvision in your workflow for computer vision datasets, you can switch to TorchGeo by changing only a few lines of code. All TorchGeo datasets and samplers are compatible with the PyTorch DataLoader class, meaning that you can take advantage of wrapper libraries like PyTorch Lightning for distributed training. In the following sections, we’ll explore possible use cases for TorchGeo to show how simple it is to use.

    + +

    Geospatial datasets and samplers

    + +

    + +

    + +

    +Example application in which we combine A) a scene from Landsat 8 and B) Cropland Data Layer labels, even though these files are in different EPSG projections. We want to sample patches C) and D) from these datasets using a geospatial bounding box as an index. +

    + +

    Many remote sensing applications involve working with geospatial datasets —datasets with geographic metadata. In TorchGeo, we define a GeoDataset class to represent these kinds of datasets. Instead of being indexed by an integer, each GeoDataset is indexed by a spatiotemporal bounding box, meaning that two or more datasets covering a different geographic extent can be intelligently combined.

    + +

    In this example, we show how easy it is to work with geospatial data and to sample small image patches from a combination of Landsat and Cropland Data Layer (CDL) data using TorchGeo. First, we assume that the user has Landsat 7 and 8 imagery downloaded. Since Landsat 8 has more spectral bands than Landsat 7, we’ll only use the bands that both satellites have in common. We’ll create a single dataset including all images from both Landsat 7 and 8 data by taking the union between these two datasets.

    + +
    from torch.utils.data import DataLoader
    +from torchgeo.datasets import CDL, Landsat7, Landsat8, stack_samples
    +from torchgeo.samplers import RandomGeoSampler
    +
    +landsat7 = Landsat7(root="...")
    +landsat8 = Landsat8(root="...", bands=Landsat8.all_bands[1:-2])
    +landsat = landsat7 | landsat8
    +
    + +

    Next, we take the intersection between this dataset and the CDL dataset. We want to take the intersection instead of the union to ensure that we only sample from regions where we have both Landsat and CDL data. Note that we can automatically download and checksum CDL data. Also note that each of these datasets may contain files in different CRSs or resolutions, but TorchGeo automatically ensures that a matching CRS and resolution is used.

    + +
    cdl = CDL(root="...", download=True, checksum=True)
    +dataset = landsat & cdl
    +
    + +

    This dataset can now be used with a PyTorch data loader. Unlike benchmark datasets, geospatial datasets often include very large images. For example, the CDL dataset consists of a single image covering the entire contiguous United States. In order to sample from these datasets using geospatial coordinates, TorchGeo defines a number of samplers. In this example, we’ll use a random sampler that returns 256 x 256 pixel images and 10,000 samples per epoch. We’ll also use a custom collation function to combine each sample dictionary into a mini-batch of samples.

    + +
    sampler = RandomGeoSampler(dataset, size=256, length=10000)
    +dataloader = DataLoader(dataset, batch_size=128, sampler=sampler, collate_fn=stack_samples)
    +
    + +

    This data loader can now be used in your normal training/evaluation pipeline.

    + +
    for batch in dataloader:
    +    image = batch["image"]
    +    mask = batch["mask"]
    +
    +    # train a model, or make predictions using a pre-trained model
    +
    + +

    Many applications involve intelligently composing datasets based on geospatial metadata like this. For example, users may want to:

    + +
      +
    • Combine datasets for multiple image sources and treat them as equivalent (e.g., Landsat 7 and 8)
    • +
    • Combine datasets for disparate geospatial locations (e.g., Chesapeake NY and PA)
    • +
    + +

    These combinations require that all queries are present in at least one dataset, and can be created using a UnionDataset. Similarly, users may want to:

    + +
      +
    • Combine image and target labels and sample from both simultaneously (e.g., Landsat and CDL)
    • +
    • Combine datasets for multiple image sources for multimodal learning or data fusion (e.g., Landsat and Sentinel)
    • +
    + +

    These combinations require that all queries are present in both datasets, and can be created using an IntersectionDataset. TorchGeo automatically composes these datasets for you when you use the intersection (&) and union (|) operators.

    + +

    Multispectral and geospatial transforms

    + +

    In deep learning, it’s common to augment and transform the data so that models are robust to variations in the input space. Geospatial data can have variations such as seasonal changes and warping effects, as well as image processing and capture issues like cloud cover and atmospheric distortion. TorchGeo utilizes augmentations and transforms from the Kornia library, which supports GPU acceleration and supports multispectral imagery with more than 3 channels.

    + +

    Traditional geospatial analyses compute and visualize spectral indices which are combinations of multispectral bands. Spectral indices are designed to highlight areas of interest in a multispectral image relevant to some application, such as vegetation health, areas of man-made change or increasing urbanization, or snow cover. TorchGeo supports numerous transforms, which can compute common spectral indices and append them as additional bands to a multispectral image tensor.

    + +

    Below, we show a simple example where we compute the Normalized Difference Vegetation Index (NDVI) on a Sentinel-2 image. NDVI measures the presence of vegetation and vegetation health and is computed as the normalized difference between the red and near-infrared (NIR) spectral bands. Spectral index transforms operate on sample dictionaries returned from TorchGeo datasets and append the resulting spectral index to the image channel dimension.
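    For reference, the underlying formula is just the normalized difference of the two bands, where nir and red below stand for per-pixel reflectance values:

    ndvi = (nir - red) / (nir + red)  # ranges from -1 to 1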

    + +

    First, we instantiate a Sentinel-2 dataset and load a sample image. Then, we plot the true color (RGB) representation of this data to see the region we are looking at.

    + +
    import matplotlib.pyplot as plt
    +from torchgeo.datasets import Sentinel2
    +from torchgeo.transforms import AppendNDVI
    +
    +dataset = Sentinel2(root="...")
    +sample = dataset[...]
    +fig = dataset.plot(sample)
    +plt.show()
    +
    + +

    Next, we instantiate and compute an NDVI transform, appending this new channel to the end of the image. Sentinel-2 imagery uses index 0 for its red band and index 3 for its NIR band. In order to visualize the data, we also normalize the image. NDVI values can range from -1 to 1, but we want to use the range 0 to 1 for plotting.

    + +
    transform = AppendNDVI(index_red=0, index_nir=3)
    +sample = transform(sample)
    +sample["image"][-1] = (sample["image"][-1] + 1) / 2
    +plt.imshow(sample["image"][-1], cmap="RdYlGn_r")
    +plt.show()
    +
    + +

    + +

    + +

    +True color (left) and NDVI (right) of the Texas Hill Region, taken on November 16, 2018 by the Sentinel-2 satellite. In the NDVI image, red indicates water bodies, yellow indicates barren soil, light green indicates unhealthy vegetation, and dark green indicates healthy vegetation. +

    + +

    Benchmark datasets

    + +

    One of the driving factors behind progress in computer vision is the existence of standardized benchmark datasets like ImageNet and MNIST. Using these datasets, researchers can directly compare the performance of different models and training procedures to determine which perform the best. In the remote sensing domain, there are many such datasets, but due to the aforementioned difficulties of working with this data and the lack of existing libraries for loading these datasets, many researchers opt to use their own custom datasets.

    + +

    One of the goals of TorchGeo is to provide easy-to-use data loaders for these existing datasets. TorchGeo includes a number of benchmark datasets —datasets that include both input images and target labels. This includes datasets for tasks like image classification, regression, semantic segmentation, object detection, instance segmentation, change detection, and more.

    + +

    If you’ve used torchvision before, these types of datasets should be familiar. In this example, we’ll create a dataset for the Northwestern Polytechnical University (NWPU) very-high-resolution ten-class (VHR-10) geospatial object detection dataset. This dataset can be automatically downloaded, checksummed, and extracted, just like with torchvision.

    + +
    from torch.utils.data import DataLoader
    +from torchgeo.datasets import VHR10
    +
    +dataset = VHR10(root="...", download=True, checksum=True)
    +dataloader = DataLoader(dataset, batch_size=128, shuffle=True, num_workers=4)
    +
    +for batch in dataloader:
    +    image = batch["image"]
    +    label = batch["label"]
    +
    +    # train a model, or make predictions using a pre-trained model
    +
    + +

    All TorchGeo datasets are compatible with PyTorch data loaders, making them easy to integrate into existing training workflows. The only difference between a benchmark dataset in TorchGeo and a similar dataset in torchvision is that each dataset returns a dictionary with keys for each PyTorch Tensor.

    + +

    + +

    + +

    +Example predictions from a Mask R-CNN model trained on the NWPU VHR-10 dataset. The model predicts sharp bounding boxes and masks for all objects with high confidence scores. +

    + +

    Reproducibility with PyTorch Lightning

    + +

    Another key goal of TorchGeo is reproducibility. For many of these benchmark datasets, there is no predefined train-val-test split, or the predefined split has issues with class imbalance or geographic distribution. As a result, the performance metrics reported in the literature either can’t be reproduced, or aren’t indicative of how well a pre-trained model would work in a different geographic location.

    + +

    In order to facilitate direct comparisons between results published in the literature and further reduce the boilerplate code needed to run experiments with datasets in TorchGeo, we have created PyTorch Lightning datamodules with well-defined train-val-test splits and trainers for various tasks like classification, regression, and semantic segmentation. These datamodules show how to incorporate augmentations from the kornia library, include preprocessing transforms (with pre-calculated channel statistics), and let users easily experiment with hyperparameters related to the data itself (as opposed to the modeling process). Training a semantic segmentation model on the Inria Aerial Image Labeling dataset is as easy as a few imports and four lines of code.

    + +
    from pytorch_lightning import Trainer
    +from torchgeo.datamodules import InriaAerialImageLabelingDataModule
    +from torchgeo.trainers import SemanticSegmentationTask
    +
    +datamodule = InriaAerialImageLabelingDataModule(root_dir="...", batch_size=64, num_workers=6)
    +task = SemanticSegmentationTask(segmentation_model="unet", encoder_weights="imagenet", learning_rate=0.1)
    +trainer = Trainer(gpus=1, default_root_dir="...")
    +
    +trainer.fit(model=task, datamodule=datamodule)
    +
    + +

    + +

    + +

    +Building segmentations produced by a U-Net model trained on the Inria Aerial Image Labeling dataset. Reproducing these results is as simple as a few imports and four lines of code, making comparison of different models and training techniques simple and easy. +

    + +

    In our preprint we show a set of results that use the aforementioned datamodules and trainers to benchmark simple modeling approaches for several of the datasets in TorchGeo. For example, we find that a simple ResNet-50 can achieve state-of-the-art performance on the So2Sat dataset. These types of baseline results are important for evaluating the contribution of different modeling choices when tackling problems with remotely sensed data.

    + +

    Future work and contributing

    + +

    There is still a lot of remaining work to be done in order to make TorchGeo as easy to use as possible, especially for users without prior deep learning experience. One of the ways in which we plan to achieve this is by expanding our tutorials to include subjects like “writing a custom dataset” and “transfer learning”, or tasks like “land cover mapping” and “object detection”.

    + +

    Another important project we are working on is pre-training models. Most remote sensing researchers work with very small labeled datasets, and could benefit from pre-trained models and transfer learning approaches. TorchGeo is the first deep learning library to provide models pre-trained on multispectral imagery. Our goal is to provide models for different image modalities (optical, SAR, multispectral) and specific platforms (Landsat, Sentinel, MODIS) as well as benchmark results showing their performance with different amounts of training data. Self-supervised learning is a promising method for training such models. Satellite imagery datasets often contain petabytes of imagery, but accurately labeled datasets are much harder to come by. Self-supervised learning methods will allow us to train directly on the raw imagery without needing large labeled datasets.

    + +

    Aside from these larger projects, we’re always looking to add new datasets, data augmentation transforms, and sampling strategies. If you’re Python savvy and interested in contributing to TorchGeo, we would love to see contributions! TorchGeo is open source under an MIT license, so you can use it in almost any project.

    + +

    External links:

    + + + +

    If you like TorchGeo, give us a star on GitHub! And if you use TorchGeo in your work, please cite our paper.

    + +

    Acknowledgments

    + +

    We would like to thank all TorchGeo contributors for their efforts in creating the library, the Microsoft AI for Good program for support, and the PyTorch Team for their guidance. This research is part of the Blue Waters sustained-petascale computing project, which is supported by the National Science Foundation (awards OCI-0725070 and ACI-1238993), the State of Illinois, and as of December, 2019, the National Geospatial-Intelligence Agency. Blue Waters is a joint effort of the University of Illinois at Urbana-Champaign and its National Center for Supercomputing Applications. The research was supported in part by NSF grants IIS-1908104, OAC-1934634, and DBI-2021898.

    + +
diff --git a/blog/getting-started-with-pytorch-2.0/index.html b/blog/getting-started-with-pytorch-2.0/index.html
new file mode 100644
index 000000000000..3f7ddc686f9e
--- /dev/null
+++ b/blog/getting-started-with-pytorch-2.0/index.html

Get Started with PyTorch 2.0 Summary and Overview | PyTorch

    by Team PyTorch

    +

    Introducing PyTorch 2.0, our first steps toward the next generation 2-series release of PyTorch. Over the last few years we have innovated and iterated from PyTorch 1.0 to the most recent 1.13 and moved to the newly formed PyTorch Foundation, part of the Linux Foundation.

    + +

    To complement the PyTorch 2.0 announcement and conference, we have also posted a comprehensive introduction and technical overview within the Get Started menu at https://pytorch.org/get-started/pytorch-2.0.

    + +

    We also wanted to ensure you had all the information to quickly leverage PyTorch 2.0 in your models so we added the technical requirements, tutorial, user experience, Hugging Face benchmarks and FAQs to get you started today!

    + +

    Finally we are launching a new “Ask the Engineers: 2.0 Live Q&A” series that allows you to go deeper on a range of topics with PyTorch subject matter experts. We hope this content is helpful for the entire community and level of users/contributors.

    + +

    https://pytorch.org/get-started/pytorch-2.0

    + +
diff --git a/blog/graphcore-joins-pytorch/index.html b/blog/graphcore-joins-pytorch/index.html
new file mode 100644
index 000000000000..cbaf5fd4830e
--- /dev/null
+++ b/blog/graphcore-joins-pytorch/index.html

Graphcore Joins the PyTorch Foundation as a General Member | PyTorch

    by Team PyTorch

    +

    Graphcore logo

    + +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Graphcore has joined as a general member.

    + +

    Graphcore is a UK-based company that specializes in designing and manufacturing AI accelerators, hardware and software specifically tailored for artificial intelligence and machine learning workloads.

    + +

    “We’re thrilled that PyTorch is the leading framework for development on the Graphcore platform,” said Executive Director of the PyTorch Foundation Ibrahim Haddad. “Graphcore has played an important role in the hardware and open source space, and we look forward to their continued contributions to PyTorch.”

    + +

    Graphcore has contributed to the PyTorch ecosystem by developing integrations to run on their IPU hardware. These integrations enable researchers and practitioners to use their preferred frameworks while taking advantage of Graphcore’s specialized hardware.

    + +

    “At Graphcore we’re truly aligned with PyTorch’s objective of reducing the barrier of entry to AI practitioners. By supporting a native PyTorch software environment for IPUs we are giving developers access to new underlying hardware, designed from the ground up for AI, to help unlock new AI techniques to improve efficiency or performance and to drive breakthroughs in AI research and applications, with the same user-friendly PyTorch framework they know and expect. We look forward to contributing to and growing the global AI community as an active member of the PyTorch Foundation and are proud to be the first general member.” Anthony Barbier, Software Frameworks Lead at Graphcore.

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +

    About Graphcore

    + +

    Graphcore compute systems are accelerating the AI revolution. Powered by the groundbreaking Intelligence Processing Unit (IPU), Graphcore delivers leading-edge AI performance with unprecedented efficiency. IPUs are used around the world by organisations building their intelligent compute capabilities, including AI-centric startups, large multinational corporations and both public and private research institutions. Graphcore is backed by some of the world’s leading investors and has attracted more than $700m of funding. The company is based in Bristol, UK, with offices across Europe, Asia and North America.

    + +

    About PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

    + +
diff --git a/blog/hacker-cup/index.html b/blog/hacker-cup/index.html
new file mode 100644
index 000000000000..242032a1fe8a
--- /dev/null
+++ b/blog/hacker-cup/index.html

Announcing Hacker Cup AI Track at NeurIPS 2024 | PyTorch

    by Team PyTorch

    +

    The PyTorch team, in partnership with Meta Hacker Cup and Microsoft Research, is excited to announce the Hacker Cup AI Track at NeurIPS 2024. This will be the first AI track for the popular Meta Hacker Cup programming competition, designed to assess the capabilities of Generative AI in performing autonomous code generation tasks. We aim to test the limits of AI in complex coding challenges and measure the performance gap between AI systems and human programmers. We will provide access to all Hacker Cup problems since 2011 alongside their respective solutions in a multimodal (image and text) format, and utilize the existing Hacker Cup infrastructure for competitor evaluation. Featuring both “open evaluation, open model” and “open evaluation, closed model” tracks, this competition invites diverse participation from research institutions of varied interests and resource constraints, including academic labs, AI startups, large technology companies, and AI enthusiasts. Our goal is to develop and democratize meaningful advancements in code automation with the very first open evaluation process for competitive AI programmers. Registration will begin in early August, with our first qualification round on September 20th.

    + +

    For more information please visit our website at https://www.facebook.com/codingcompetitions/hacker-cup/ and join our Discord at discord.gg/wWeN9hTH32

    + +
diff --git a/blog/hadacore/index.html b/blog/hadacore/index.html

HadaCore: Tensor Core Accelerated Hadamard Transform Kernel | PyTorch

by IBM and Meta

    +

    IBM: Krish Agarwal, Rishi Astra, Adnan Hoque, Mudhakar Srivatsa, Raghu Ganti
    Meta: Less Wright, Sijia Chen

    + +

    Quantization is a method for improving model inference speeds by compressing model weights and performing (faster) computation in lower-precision data types. However, quantization can result in accuracy loss due to the presence of outliers. Recent works like QuaRot, SpinQuant, and FlashAttention-3 introduce methods to increase the numerical accuracy of INT4, INT8, and FP8 quantization in LLMs. These methods rely on Hadamard Transforms. In this blog, we present HadaCore, a Hadamard Transform CUDA kernel that achieves state-of-the-art performance on NVIDIA A100 and H100 GPUs. Our kernel achieves speedups of 1.1–1.4x on A100 and 1.0–1.3x on H100 over Dao AI Lab’s Fast Hadamard Transform Kernel, with peak gains of 3.5x and 3.6x, respectively. We leverage a hardware-aware work decomposition that benefits from Tensor Core acceleration while maintaining quantization error reduction.

    + +

    Figure 1: Speedup of HadaCore vs. the Dao AI Lab Hadamard CUDA kernel. A peak gain of 3.46x on the A100 is achieved with a Hadamard size of 128 and 8.4M elements.


    + +

    The HadaCore Kernel is publicly available.

    + +

    Background

    + +

    QuaRot and SpinQuant both propose methods to increase the numerical accuracy of INT4 and INT8 quantization in LLMs. Both methods rotate model activations, since rotations are statistically likely to reduce the magnitude of outliers: a rotation “distributes” extreme values among other (less extreme) dimensions, and it is easily invertible using the inverse of the rotation matrix. These methods can also improve FP8 inference accuracy, as in FlashAttention-3.

    + +

    Figure 2. Transformer block showing online (red) and offline rotations (blue) in QuaRot


    + +

    Applying these rotation matrices introduces model runtime overhead due to the online operations shown in Figure 2. These rotations can be applied through matrix multiplication, but the added overhead would diminish the benefits from quantization. Therefore, QuaRot and SpinQuant opt to use Walsh-Hadamard matrices, a special type of rotation matrix that can be applied faster than matrix multiplication using the Fast Walsh-Hadamard Transform algorithm. HadaCore is an optimized implementation of this algorithm for NVIDIA GPUs that support Tensor Cores.
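    For intuition, below is a minimal, unoptimized PyTorch sketch of the iterative Fast Walsh-Hadamard Transform. This is our illustration only, not the HadaCore kernel; the function name and the normalization note are ours.

    import torch

    def fwht(x: torch.Tensor) -> torch.Tensor:
        # Unnormalized Fast Walsh-Hadamard Transform along the last dimension.
        # The last dimension must be a power of two.
        n = x.shape[-1]
        assert n & (n - 1) == 0, "size must be a power of two"
        y = x.clone()
        h = 1
        while h < n:
            # Pair up blocks of size h and apply the (a + b, a - b) butterfly.
            y = y.view(*x.shape[:-1], n // (2 * h), 2, h)
            a, b = y[..., 0, :], y[..., 1, :]
            y = torch.stack((a + b, a - b), dim=-2).reshape(*x.shape[:-1], n)
            h *= 2
        return y

    # A Hadamard rotation typically also scales by 1/sqrt(n):
    # x_rot = fwht(x) / math.sqrt(x.shape[-1])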

    + +

    Tensor Core Accelerated Hadamard Transform

    + +

    HadaCore leverages NVIDIA Tensor Cores, which are specialized compute units on NVIDIA GPUs optimized for matrix multiplication. To achieve this, our kernel performs a hardware-aware work decomposition of the Fast Walsh-Hadamard algorithm. This work decomposition ensures that we can utilize the MMA PTX instructions that execute on the Tensor Cores. HadaCore applies a 16×16 Hadamard transform to chunks of the input data. The computation can then be offloaded to the FP16 Tensor Cores using the mma.m16n8k16 instruction. The warp-level parallelism for HadaCore is shown below.

    + +

    Figure 3: HadaCore Parallelization, 1x256 vectors (rows) being rotated by a size 256 Hadamard.


    + +

    We process fragments of 256 elements in parallel using warp-level Tensor Core operations to achieve up to a 256-size Hadamard transform. For larger sizes, we shuffle data between warps and repeat.
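    To illustrate why a 16×16 building block is enough, the following PyTorch check (ours, purely illustrative) verifies that a size-256 Hadamard transform factors into two 16×16 matrix multiplications, which matches the shape of work an FP16 Tensor Core MMA instruction handles.

    import torch

    def hadamard(n: int) -> torch.Tensor:
        # Sylvester construction; n must be a power of two.
        H = torch.ones(1, 1)
        while H.shape[0] < n:
            H = torch.cat([torch.cat([H, H], dim=1),
                           torch.cat([H, -H], dim=1)], dim=0)
        return H

    H16, H256 = hadamard(16), hadamard(256)
    x = torch.randn(256)

    # H256 = kron(H16, H16), so applying it to a 256-vector reshaped as a
    # 16x16 tile is just two 16x16 matmuls: H16 @ X @ H16.
    y_tile = (H16 @ x.view(16, 16) @ H16).reshape(256)
    y_full = H256 @ x
    assert torch.allclose(y_tile, y_full, atol=1e-3)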

    + +

    Microbenchmarks

    + +

    We benchmark HadaCore against the Dao AI Lab Hadamard Kernel on both NVIDIA H100 and A100 GPUs across varying Hadamard and input tensor sizes.

    + +

    Figure 4: HadaCore Kernel Speedup on NVIDIA A100 over Dao AI Lab Fast Hadamard Kernel

    + +

    Color coded Speedup Table for NVIDIA A100, Green = Speedup over Baseline


    + +

    Figure 5: HadaCore Kernel Speedup on NVIDIA H100 over Dao AI Lab Fast Hadamard Kernel

    + +

    Color coded Speedup Table for NVIDIA H100, Green = Speedup over Baseline


    + +

    We showcase our speedup as the input tensor size (labeled element count) in our charts increases. Element count is the number of elements in the target matrix we are rotating. For example, in multi-head attention:

    + +

    The queries (Q), keys (K) and values (V) tensors are 4D tensors of size:

    + +

    (batch_size, seq_len, n_heads, head_dim)

    + +

    A Hadamard matrix of size head_dim is applied to these activation tensors, so we refer to this as using a Hadamard size of head_dim with an element count of:

    + +

    batch_size*seq_len*n_heads*head_dim.

    + +

    Common element counts for query rotations in an attention block:

    Model \ Tokens | Prefill | Decoding
    Llama-2 70b | 33,554,432 elements, 128 Hadamard size (1 batch * 64 heads * 4096 tokens * 128 dimensional embeddings per head per token) | 8,192 elements, 128 Hadamard size (1 batch * 64 heads * 1 token * 128 dimensional embeddings per head per token)
    Llama-3 8b | 33,554,432 elements, 128 Hadamard size (1 batch * 32 heads * 8192 tokens * 128 dimensional embeddings per head per token) | 4,096 elements, 128 Hadamard size (1 batch * 32 heads * 1 token * 128 dimensional embeddings per head per token)
    + +

    HadaCore achieves a 1.1–1.4x speedup on A100 and a 1.0–1.3x speedup on H100 over Dao AI Lab’s Fast Hadamard kernel, with peak gains of 3.5x and 3.6x, respectively. For smaller sizes on H100, HadaCore’s gain decreases. For future work, we plan to incorporate Hopper-specific features like TMA and WGMMA for improved H100 performance.

    + +

    MMLU Benchmarks

    + +

    We evaluated MMLU scores on a Llama 3.1-8B inference workload where the FlashAttention computation was performed in FP8. Newer generation NVIDIA Hopper GPUs come equipped with FP8 Tensor Cores that deliver substantial compute gain over FP16.

    + +

    Our results show the benefit of using HadaCore for accuracy preservation when combined with optimizations such as FP8 FlashAttention.

    Format | Method | Llama3.1-8B Avg. 5-Shot MMLU Accuracy
    Q, K, V: FP16; FlashAttention: FP16 | N/A | 65.38
    Q, K, V: FP16; FlashAttention: FP8 | No Hadamard | 64.40
    Q, K, V: FP8; FlashAttention: FP8 | HadaCore | 65.09
    Q, K, V: FP8; FlashAttention: FP8 | Dao AI Fast Hadamard Kernel | 65.45
    + +

    Table 1: MMLU scores for Llama3.1 8B with FP16 baseline and FP8 attention using Hadamard transforms, comparing an implementation with explicit Hadamard matrix multiplications vs. HadaCore (higher is better)

    + +

    From the above MMLU scores, we note that for Llama3.1-8B inference with FP8 attention, HadaCore recovers most of the accuracy lost to the quantization error introduced by computing attention in lower precision.

    + +

    Conclusion

    + +

    We showcased the speedups achieved by moving the Fast Walsh-Hadamard algorithm into a CUDA kernel that leverages Tensor Core acceleration, reaching peak speedups of 3.5x and 3.6x over the Dao AI Fast Hadamard kernel on NVIDIA A100 and H100, respectively.

    + +

    Further, we showed on the MMLU benchmark that rotating with HadaCore maintains similar quantization error reduction to the Fast-Hadamard kernel, while providing computational acceleration.

    + +

    Future Work

    + +

    We plan to implement a Triton version of our kernel and experiment with more advanced techniques such as kernel fusion to support fused Hadamard transform and quantization. Further, we plan to extend our kernel to support BF16 Tensor Core compute.

diff --git a/blog/hi-po-low-bit-operators/index.html b/blog/hi-po-low-bit-operators/index.html

High-Performance Low-Bit Operators for PyTorch | PyTorch

by Scott Roy, Digant Desai, Kimish Patel

    +

    We are excited to announce the addition of embedding operators with low-bit weights (1-8 bit) and linear operators with 8-bit dynamically quantized activations and low-bit weights (1-8 bit) for Arm CPUs in TorchAO, PyTorch’s native low-precision library. These operators work seamlessly across all PyTorch surfaces, including eager, torch.compile, AOTI, and ExecuTorch, and are available to use in torchchat.

    + +

    In developing these linear operators, our focus was on code sharing between PyTorch and ExecuTorch, and establishing a clear boundary between the higher-level operator and the lower-level kernel. This design allows third-party vendors to easily swap in their own kernels. We also set out to create a place and infrastructure to experiment with new CPU quantization ideas and test those across the PyTorch ecosystem.

    + +

    Universal low-bit kernels

    + +

    There is no native hardware support for low-bit arithmetic on these CPUs. In what we call universal kernels, we explicitly separated the logic that unpacks low-bit values into int8 values from the int8 GEMV kernel logic, in a modular fashion. We started with an 8-bit kernel, for example, this 1x8 8-bit GEMV kernel that uses the Arm neondot instruction. Within the 8-bit kernel, we invoke an inlined unpacking routine to convert low-bit values into int8 values. This unpacking routine is force-inlined and templated on the bit width. Our experiments showed no performance difference between using a separate force-inlined unpacking routine and directly embedding the unpacking code inline.

    + +

    The advantage of this modular design is improved development speed and code maintainability. After writing an 8-bit kernel, we quickly achieved full low-bit coverage by writing simple bitpacking routines. In fact, the developers who worked on the bitpacking routines did not need to be experts in GEMV/GEMM kernel writing. We also reused the same bitpacking routines from the linear kernels within the embedding kernels. In the future, we could reuse the same bitpacking routines for universal GEMM kernels or kernels based on fma or i8mm instructions.
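    As a concrete toy illustration of the unpack-then-compute idea (ours, not TorchAO's actual bitpacking code), here is a small PyTorch routine that unpacks two 4-bit values per byte into int8, the kind of step that runs just before the int8 GEMV logic. The packing convention (low nibble first, offset of 8) is an assumption for the example.

    import torch

    def unpack_int4_to_int8(packed: torch.Tensor) -> torch.Tensor:
        # `packed` is uint8 with two 4-bit values per byte, stored with an
        # offset of 8 so each nibble is unsigned (assumed convention).
        low = (packed & 0x0F).to(torch.int8) - 8
        high = (packed >> 4).to(torch.int8) - 8
        # Interleave low/high nibbles back into their original order.
        return torch.stack((low, high), dim=-1).flatten(start_dim=-2)

    packed = torch.randint(0, 256, (4,), dtype=torch.uint8)
    print(unpack_int4_to_int8(packed))  # 8 int8 values in [-8, 7]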

    + +

    Shared code between PyTorch and ExecuTorch

    + +

    To achieve shared code between PyTorch and ExecuTorch, we wrote kernels using raw pointers instead of PyTorch tensors. Moreover, we implemented the linear operator in a header that is included in separate PyTorch and ExecuTorch operator registration code. By using only features common to both ATen and ExecuTorch tensors, we ensured compatibility between the two frameworks. For multi-threaded compute, we introduced torchao::parallel_1d, which compiles to either at::parallel_for or ExecuTorch’s threadpool based on compile-time flags.

    + +

    Swappable kernels

    + +

    Our design for the higher-level multi-threaded linear operator is agnostic to the lower-level single-threaded kernels, allowing third-party vendors to swap in their own implementations. The interface between the operator and kernel is defined by a ukernel config, which specifies kernel function pointers for preparing activation data, preparing weight data, and running the kernel. The operator, responsible for tiling and scheduling, interacts with kernels solely through this config.
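    The actual interface is defined in C++, but conceptually the contract looks like the following Python sketch (ours, not the TorchAO source); only the three entry points in the config are visible to the operator.

    from dataclasses import dataclass
    from typing import Any, Callable

    @dataclass
    class UKernelConfig:
        # Function pointers (callables here) supplied by a kernel vendor.
        prepare_activation_data: Callable[[Any], Any]
        prepare_weight_data: Callable[[Any], Any]
        run_kernel: Callable[[Any, Any], Any]

    def linear_operator(config: UKernelConfig, activations, weights):
        # The operator owns tiling, scheduling, and multi-threading; it only
        # talks to the kernel through the config, so kernels are swappable.
        packed_a = config.prepare_activation_data(activations)
        packed_w = config.prepare_weight_data(weights)
        return config.run_kernel(packed_a, packed_w)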

    + +

    Performance

    + +

    In the table below, we show Llama3.1 8B token generation performance using 6 CPU threads on an M1 Macbook Pro with 32GB of RAM.

    Bitwidth x | torch.compile (Decode tokens/sec) | ExecuTorch (Decode tokens/sec) | ExecuTorch PTE size (GiB)
    1 | 24.18 | 17.86 | 1.46
    2 | 27.02 | 19.65 | 2.46
    3 | 21.01 | 22.25 | 3.46
    4 | 19.51 | 19.47 | 4.47
    5 | 14.78 | 16.34 | 5.47
    6 | 12.80 | 13.61 | 6.47
    7 | 8.16 | 11.73 | 7.48
    + +

    Results were run on an M1 MacBook Pro (with 8 performance cores and 2 efficiency cores) with 32GB of RAM, using 6 threads and torchchat. In each test, a maximum sequence length of 128 tokens was generated. For each bit width x, the embedding layer was groupwise quantized to x bits with group size 32. In the linear layers, activations were dynamically quantized per token to 8 bits and weights were groupwise quantized to x bits with group size 256. Our focus here is performance, and we do not report accuracy or perplexity numbers. Depending on the model, lower bit widths may require quantization-aware training, quantizing a model with a mixture of bit widths, or adjusting the group sizes for acceptable accuracy.
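    For readers unfamiliar with the scheme described above, here is a toy sketch (ours, not TorchAO's implementation) of symmetric groupwise weight quantization, where each group of values shares one scale.

    import torch

    def groupwise_quantize(w: torch.Tensor, bits: int, group_size: int):
        # Symmetric per-group quantization: one scale per `group_size` values.
        qmax = 2 ** (bits - 1) - 1
        groups = w.reshape(-1, group_size)
        scales = groups.abs().amax(dim=1, keepdim=True) / qmax
        q = torch.clamp(torch.round(groups / scales), -qmax - 1, qmax).to(torch.int8)
        return q, scales

    w = torch.randn(8, 256)
    q, scales = groupwise_quantize(w, bits=4, group_size=256)
    w_hat = (q.float() * scales).reshape(w.shape)  # dequantized approximation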

    + +

    Llama 3.1 chart

    + +

    Try them out and contribute!

    + +

    If you want to see the new low-bit kernels in action, give them a try by setting up torchchat and quantizing and running an LLM locally using the kernels.

    + +

    If you want to help contribute, consider adding support for one of the following areas:

    + +
    • Add universal low-bit GEMM kernels for Arm CPU, reusing the same bitpacking routines from the universal GEMV kernels.
    • Improve runtime selection of ukernel configs based on ISA, packing format, and activation shape.
    • Add low-bit kernels for other CPU ISAs like x86.
    • Integrate third-party libraries like KleidiAI with the operator framework.
diff --git a/blog/high-performance-llama-2/index.html b/blog/high-performance-llama-2/index.html

High-Performance Llama 2 Training and Inference with PyTorch/XLA on Cloud TPUs | PyTorch

by Jiewen Tan, Jon Bolin, Yeounoh Chung, Liyang Lu, Siyuan Liu, Wonjoo Lee, Manfei Bai, Meghan Cowan, Jack Cao, Milad Mohammadi, Shauheen Zahirazami, Alex Spiridonov

    +

    In a landscape where AI innovation is accelerating at an unprecedented pace, Meta’s Llama family of open sourced large language models (LLMs) stands out as a notable breakthrough. Llama marked a significant step forward for LLMs, demonstrating the power of pre-trained architectures for a wide range of applications. Llama 2 further pushed the boundaries of scale and capabilities, inspiring advancements in language understanding, generation, and beyond.

    + +

    Shortly after the announcement of Llama, we published a blog post showcasing ultra-low inference latency for Llama using PyTorch/XLA on Cloud TPU v4. Building on these results, today, we are proud to share Llama 2 training and inference performance using PyTorch/XLA on Cloud TPU v4 and our newest AI supercomputer, Cloud TPU v5e.

    + +

    In this blog post, we use Llama 2 as an example model to demonstrate the power of PyTorch/XLA on Cloud TPUs for LLM training and inference. We discuss the computation techniques and optimizations used to improve inference throughput and training model FLOPs utilization (MFU). For the 70B-parameter Llama 2 model, we deliver 53% training MFU, 17 ms/token inference latency, and 42 tokens/s/chip throughput, powered by PyTorch/XLA on Google Cloud TPU. We offer a training user guide and an inference user guide for reproducing the results in this article. Additionally, you may find our Google Next 2023 presentation here.

    + +

    Model Overview

    + +

    Llama 2 comes in various sizes, ranging from 7B to 70B parameters, catering to different needs, computational resources, and training / inference budgets. Whether it’s small-scale projects or large-scale deployments, Llama models offer versatility and scalability to accommodate a wide range of applications.

    + +

    Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. The largest, 70B model, uses grouped-query attention, which speeds up inference without sacrificing quality. Llama 2 is trained on 2 trillion tokens (40% more data than Llama) and has the context length of 4,096 tokens for inference (double the context length of Llama), which enables more accuracy, fluency, and creativity for the model.

    + +

    Llama 2 is a state-of-the-art LLM that outperforms many other open source language models on many benchmarks, including reasoning, coding, proficiency, and knowledge tests. The model’s scale and complexity place many demands on AI accelerators, making it an ideal benchmark for LLM training and inference performance of PyTorch/XLA on Cloud TPUs.

    + +

    Performance Challenge of LLMs

    + +

    Large-scale distributed training for LLMs such as Llama 2 introduces technical challenges that require practical solutions to make the most efficient use of TPUs. Llama’s size can strain both memory and processing resources of TPUs. To address this, we use model sharding, which involves breaking down the model into smaller segments, each fitting within the capacity of a single TPU core. This enables parallelism across multiple TPUs, improving training speed while reducing communication overhead.

    + +

    Another challenge is managing the large datasets required for training Llama 2 efficiently, which requires effective data distribution and synchronization methods. Additionally, optimizing factors like learning rate schedules, gradient aggregation, and weight synchronization across distributed TPUs is crucial for achieving convergence.

    + +

    After pretraining or fine-tuning Llama 2, running inference on the model checkpoint creates additional technical challenges. All of the challenges discussed in our previous blog post, such as autoregressive decoding, variable input prompt lengths, and the need for model sharding and quantization still apply for Llama 2. In addition, Llama 2 introduced two new capabilities: grouped-query attention and early stopping. We discuss how PyTorch/XLA handles these challenges to enable high-performance, cost-efficient training and inference of Llama 2 on Cloud TPU v4 and v5e.

    + +

    Large-Scale Distributed Training

    + +

    PyTorch/XLA offers two major ways of doing large-scale distributed training: SPMD, which utilizes the XLA compiler to transform and partition a single-device program into a multi-device distributed program; and FSDP, which implements the widely-adopted Fully Sharded Data Parallel algorithm.

    + +

    In this blog post, we show how to use the SPMD API to annotate the HuggingFace (HF) Llama 2 implementation to maximize performance. For comparison, we also show our FSDP results with the same configurations; read about PyTorch/XLA FSDP API here.

    + +

    SPMD Overview

    + +

    Let’s briefly review the fundamentals of SPMD. For details, please refer to our blog post and user guide.

    + +

    Mesh

    + +

    A multidimensional array that describes the logical topology of the TPU devices:

    + +
    # Assuming you are running on a TPU host that has 8 devices attached
    import numpy as np
    import torch_xla.runtime as xr
    from torch_xla.experimental.xla_sharding import Mesh

    num_devices = xr.global_runtime_device_count()
    # mesh shape will be (4, 2) in this example
    mesh_shape = (num_devices // 2, 2)
    device_ids = np.array(range(num_devices))
    # axis_names 'x' and 'y' are optional
    mesh = Mesh(device_ids, mesh_shape, ('x', 'y'))
    + +

    Partition Spec

    + +

    A tuple that describes how the corresponding tensor’s dimensions are sharded across the mesh:

    + +
    partition_spec = ('x', 'y')
    + +

    Mark Sharding

    + +

    An API that takes a mesh and a partition_spec, and then generates a sharding annotation for the XLA compiler.

    + +
    import torch
    import torch_xla.experimental.xla_sharding as xs

    tensor = torch.randn(4, 4).to('xla')
    # Let's reuse the above mesh and partition_spec.
    # The tensor's 0th dim is sharded 4 ways and its 1st dim is sharded 2 ways.
    xs.mark_sharding(tensor, mesh, partition_spec)
    + +

    2D Sharding with SPMD

    + +

    In our SPMD blog post, we demonstrated using 1D FSDP style sharding. Here, we introduce a more powerful sharding strategy, called 2D sharding, where both the parameters and activations are sharded. This new sharding strategy not only allows fitting a larger model but also boosts the MFU to up to 54.3%. For more details, read the Benchmarks section.

    + +

    This section introduces a set of general rules that applies to most LLMs, and for convenience we directly reference the variable names and configuration names from HF Llama.

    + +

    First, let’s create a 2D Mesh with corresponding axis names: data and model. The data axis is usually where we distribute the input data, and the model axis is where we further distribute the model.

    + +
    mesh = Mesh(device_ids, mesh_shape, ('data', 'model'))
    + +

    The mesh_shape can be a hyper-parameter that is tuned for different model sizes and hardware configurations. The same mesh will be reused in all following sharding annotations. In the next few sections, we will cover how to use the mesh to shard parameters, activations and input data.

    + +

    Parameter Sharding

    + +

    Below is a table that summarizes all parameters of HF Llama 2 and corresponding partition specifications. Example HF code can be found here.

    Parameter Name | Explanation | Parameter Shape | Partition Spec
    embed_tokens | embedding layer | (vocab_size, hidden_size) | (model, data)
    q_proj | attention weights | (num_heads x head_dim, hidden_size) | (data, model)
    k_proj / v_proj | attention weights | (num_key_value_heads x head_dim, hidden_size) | (data, model)
    o_proj | attention weights | (hidden_size, num_heads x head_dim) | (model, data)
    gate_proj / up_proj | MLP weights | (intermediate_size, hidden_size) | (model, data)
    down_proj | MLP weights | (hidden_size, intermediate_size) | (data, model)
    lm_head | HF output embedding | (vocab_size, hidden_size) | (model, data)
    + +

    Table 1: SPMD 2D Sharding Parameter Partition Spec

    + +

    The rule is to shard the hidden_size dim of any weights except QKVO projections according to the data axis of the mesh, then shard the other dim with the remaining model axis. For QKVO, do the opposite. This model-data axis rotation methodology is similar to that of Megatron-LM to reduce communication overhead. For layernorm weights, we implicitly mark them as replicated across different devices given they are 1D tensors.
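    The sketch below (ours, not the exact code in the HF fork) shows one way to apply Table 1 programmatically by matching parameter names and calling xs.mark_sharding with the corresponding partition spec.

    import torch_xla.experimental.xla_sharding as xs

    # Partition specs from Table 1, keyed by a substring of the parameter name.
    PARTITION_RULES = {
        'embed_tokens': ('model', 'data'),
        'q_proj': ('data', 'model'),
        'k_proj': ('data', 'model'),
        'v_proj': ('data', 'model'),
        'o_proj': ('model', 'data'),
        'gate_proj': ('model', 'data'),
        'up_proj': ('model', 'data'),
        'down_proj': ('data', 'model'),
        'lm_head': ('model', 'data'),
    }

    def shard_llama_parameters(model, mesh):
        for name, param in model.named_parameters():
            for key, spec in PARTITION_RULES.items():
                if key in name and param.dim() == 2:
                    xs.mark_sharding(param, mesh, spec)
                    break
            # 1D tensors (e.g. layernorm weights) are left replicated.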

    + +

    Activation Sharding

    + +

    In order to better utilize the device memory, very often we need to annotate the output of some memory bound ops. That way the compiler is forced to only keep partial output on devices instead of the full output. In Llama 2, we explicitly annotate all torch.matmul and nn.Linear outputs. Table 2 summarizes the corresponding annotations; the example HF code can be found here.

    Output Name | Explanation | Output Shape | Partition Spec
    inputs_embeds | embedding layer output | (batch_size, sequence_length, hidden_size) | (data, None, model)
    query_states | attention nn.Linear output | (batch_size, sequence_length, num_heads x head_dim) | (data, None, model)
    key_states / value_states | attention nn.Linear output | (batch_size, sequence_length, num_key_value_heads x head_dim) | (data, None, model)
    attn_weights | attention weights | (batch_size, num_attention_heads, sequence_length, sequence_length) | (data, model, None, None)
    attn_output | attention layer output | (batch_size, sequence_length, hidden_size) | (data, None, model)
    up_proj / gate_proj / down_proj | MLP nn.Linear outputs | (batch_size, sequence_length, intermediate_size) | (data, None, model)
    logits | HF output embedding output | (batch_size, sequence_length, hidden_size) | (data, None, model)
    + +

    Table 2: SPMD 2D Sharding Activation Partition Spec

    + +

    The rule is to shard the batch_size dim of any outputs according to the data axis of the mesh, then replicate the length dims of any outputs, and finally shard the last dim along the model axis.
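    As a minimal example (ours) of what such an annotation looks like, the attn_output activation from Table 2 would be marked as follows, reusing the mesh defined earlier (attn_output and mesh are assumed to exist in the surrounding model code).

    import torch_xla.experimental.xla_sharding as xs

    # attn_output has shape (batch_size, sequence_length, hidden_size).
    xs.mark_sharding(attn_output, mesh, ('data', None, 'model'))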

    + +

    Input Sharding

    + +

    For input sharding, the rule is to shard the batch dim along the data axis of the mesh, and replicate the sequence_length dim. Below is the example code, and the corresponding HF change may be found here.

    + +
    import torch_xla.distributed.parallel_loader as pl

    partition_spec = ('data', None)
    sharding_spec = xs.ShardingSpec(mesh, partition_spec)
    # MpDeviceLoader will shard the input data before sending it to the device.
    pl.MpDeviceLoader(dataloader, self.args.device, input_sharding=sharding_spec, ...)
    + +

    Now, all the data and model tensors that require sharding are covered!

    + +

    Optimizer States & Gradients

    + +

    You may be wondering whether it is necessary to shard the optimizer states and gradients as well. Great news: the sharding propagation feature of the XLA compiler automates the sharding annotation in these two scenarios, without needing more hints to improve performance.

    + +

    It is important to note that optimizer states are typically initialized within the first iteration of the training loop. From the standpoint of the XLA compiler, the optimizer states are the outputs of the first graph, and therefore have the sharding annotation propagated. For subsequent iterations, the optimizer states become inputs to the second graph, with the sharding annotation propagated from the first one. This is also why PyTorch/XLA typically produces two graphs for the training loops. If the optimizer states are somehow initialized before the first iteration, users will have to manually annotate them, just like the model weights.

    + +

    Again, all concrete examples of the above sharding annotation can be found in our fork of HF Transformers here. The repo also contains code for our experimental feature MultiSlice, including HybridMesh and dcn axis, which follows the same principles mentioned above.

    + +

    Caveats

    + +

    While using SPMD for training, there are a few important things to pay attention to:

    + +
    • Use torch.einsum instead of torch.matmul; torch.matmul usually flattens tensors and does a torch.mm at the end, and that’s bad for SPMD when the combined axes are sharded. The XLA compiler will have a hard time determining how to propagate the sharding.
    • PyTorch/XLA provides a patched nn.Linear (https://github.com/pytorch/xla/blob/master/torch_xla/experimental/xla_sharding.py#L570) to overcome the above constraint:
    import torch_xla.experimental.xla_sharding as xs
    from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear

    model = apply_xla_patch_to_nn_linear(model, xs.xla_patched_nn_linear_forward)
    • Always reuse the same mesh across all shardings.
    • Always specify --dataloader_drop_last yes; the last, smaller batch is hard to annotate.
    • Large models which are initialized on the host can induce host-side OOM. One way to avoid this issue is to initialize parameters on the meta device, then create and shard real tensors layer by layer.

    Infrastructure Improvements

    + +

    Besides the above modeling techniques, we have developed additional features and improvements to maximize performance, including:

    + +
    • We enable asynchronous collective communication. This requires enhancements on the XLA compiler’s latency hiding scheduler to better optimize for the Llama 2 PyTorch code.
    • We now allow sharding annotations in the middle of the IR graph, just like JAX’s jax.lax.with_sharding_constraint. Previously, only graph inputs were annotated.
    • We also propagate replicated sharding specs from the compiler to the graph outputs. This allows us to shard the optimizer states automatically.

    Inference Optimizations

    + +

    All the PyTorch/XLA optimizations implemented for Llama inference are applied to Llama 2 as well. That includes Tensor Parallelism + Dynamo (torch.compile) using torch-xla collective ops, autoregressive decoding logic improvements to avoid recompilation, bucketized prompt lengths, and a KV-cache with compilation-friendly index ops. Llama 2 introduces two new changes: Grouped Query Attention, and early stopping when eos is reached for all prompts. We applied corresponding changes to promote better performance and flexibility with PyTorch/XLA.

    + +

    Grouped Query Attention

    + +

    Llama 2 enables Grouped Query Attention for the 70B models. It allows the number of Key and Value heads to be smaller than the number of Query heads, while still supporting KV-cache sharding up to the number of KV heads. For the 70B models, the n_kv_heads is 8, which limits the tensor parallelism to be less or equal to 8. In order to shard the model checkpoint to run on more devices, the K, V projection weights need to be replicated first, and then split into multiple pieces. For example, to shard the 70B model checkpoint from 8 pieces to 16 pieces, the K, V projection weights are duplicated and split into 2 pieces for each shard. We provide a reshard_checkpoints.py script to handle that, and to make sure the sharded checkpoint performs mathematically identical to the original checkpoint.
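    Conceptually, the replicate-then-split step looks like the toy sketch below (ours; shapes are illustrative and this is not the reshard_checkpoints.py script itself).

    import torch

    n_kv_heads, head_dim, hidden = 8, 128, 8192
    wk = torch.randn(n_kv_heads * head_dim, hidden)   # one K projection weight

    # Going from 8 to 16 tensor-parallel shards: duplicate each KV head, then
    # split so that every rank still owns exactly one (replicated) head.
    wk_heads = wk.view(n_kv_heads, head_dim, hidden)
    wk_16 = wk_heads.repeat_interleave(2, dim=0)      # 16 heads after duplication
    shards = wk_16.chunk(16, dim=0)                   # one slice per rank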

    + +

    EOS Early Stopping

    + +

    The Llama 2 generation code added early stopping logic. An eos_reached tensor is used to track the completion of all the prompt generations, and if the eos token is reached for all the prompts in the batch, generation stops early. A similar change is incorporated in the PyTorch/XLA optimized version as well, with some minor tweaks.

    + +

    In PyTorch/XLA, checking the value of a tensor like eos_reached as part of the control flow condition invokes a blocking device-to-host transfer. The tensor is transferred from device memory to CPU memory to evaluate its value, while all other logic waits. This introduced a delay on the scale of milliseconds after every new token generation. As a trade-off, we reduce the rate of checking the eos_reached value to once every 10 new token generations. With this change, the impact of the blocking device-to-host transfer is reduced by 10x, while the early stopping is still effective, and at most 9 unnecessary tokens are generated after each sequence reaches the eos token.
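    A simplified sketch (ours, not the actual PyTorch/XLA Llama code; the helper names are made up) of this reduced-frequency check:

    import torch

    def generate_with_periodic_eos_check(decode_one_token, tokens, eos_token_id,
                                         max_gen_len, check_every=10):
        # `decode_one_token` returns the next token ids for the whole batch and
        # keeps everything on the device; only the periodic check syncs the host.
        eos_reached = torch.zeros(tokens.shape[0], dtype=torch.bool,
                                  device=tokens.device)
        for step in range(max_gen_len):
            next_token = decode_one_token(tokens)                 # (batch,)
            tokens = torch.cat([tokens, next_token[:, None]], dim=1)
            eos_reached |= next_token == eos_token_id
            # Reading the tensor from Python forces a blocking device-to-host
            # transfer, so only do it once every `check_every` generated tokens.
            if (step + 1) % check_every == 0 and bool(eos_reached.all()):
                break
        return tokens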

    + +

    Model Serving

    + +

    PyTorch/XLA is working on a serving strategy to enable the PyTorch community to serve their deep learning applications via Torch.Export, StableHLO, and SavedModel. PyTorch/XLA Serving is an experimental feature in PyTorch/XLA 2.1 release; for details visit our serving user guide. Users can take advantage of TorchServe to run their single-host workloads.

    + +

    Benchmarks

    + +

    Metrics

    + +

    To measure training performance, we use the industry-standard metric Model FLOPs Utilization (MFU). Model FLOPs are the floating point operations required to perform a single forward and backward pass. Model FLOPs are hardware and implementation independent and only depend on the underlying model. MFU measures how effectively the model is using the actual hardware during training. Achieving 100% MFU means that the model is using the hardware perfectly.

    + +

    To measure inference performance, we use the industry-standard metric of throughput. First, we measure latency per token once the model has been compiled and loaded. Then, we calculate throughput by dividing the batch size (BS) by the latency per chip. As a result, throughput measures how the model performs in production environments regardless of how many chips are used.

    + +

    Results

    + +

    Training Evaluation

    + +

    Figure 1 shows Llama 2 SPMD 2D sharding training results on a range of Google TPU v4 hardware with PyTorch/XLA FSDP as the baseline. We increased MFU by 28% across all sizes of Llama 2 compared to FSDP running on the same hardware configuration. This performance improvement is largely due to: 1) 2D Sharding has less communication overhead than FSDP, and 2) asynchronous collective communication is enabled in SPMD which allows communication and computation overlapping. Also note that as the model size scales, we maintain the high MFU. Table 3 shows all the hardware configurations plus some hyperparameters used in the training benchmarks.

    + +

    Fig. 1: Llama 2 Training MFU on TPU v4 Hardware

    + +

    The results in Figure 1 are produced with sequence length 1,024. Figure 2 shows how the performance behaves with larger sequence lengths. It shows that our performance also scales linearly with sequence length. The MFU is expected to decrease a little, because a smaller per-device batch size is needed to accommodate the additional memory pressure introduced by the larger sequence length (the sequence length axis is not sharded in 2D sharding), and TPUs are very sensitive to batch size. For Llama 2 70B, the performance decrease is as low as 4%. At the time of preparing these results, the Hugging Face Llama 2 tokenizer limits the max model input to 2,048, preventing us from evaluating larger sequence lengths.

    + +

    Fig. 2: Llama 2 SPMD Training MFU on TPU v4 with Different Sequence Lengths

    Model Size | 7B | 13B | 70B
    TPU NumCores | V4-32 | V4-64 | V4-256
    Mesh Shape | (16, 1) | (32, 1) | (32, 4)
    Seq Len | 1,024 | 2,048 | 1,024 | 2,048 | 1,024 | 2,048
    Global Batch | 256 | 128 | 256 | 128 | 512 | 256
    Per Device Batch | 16 | 8 | 8 | 4 | 16 | 8
    + +

    Table 3: Llama 2 SPMD Training Benchmark TPU Configurations and Hyperparameters

    + +

    One last thing to call out is that we use adafactor as the optimizer for better memory utilization. And once again, here is the user guide to reproduce the benchmark results listed above.

    + +

    Inference Evaluation

    + +

    In this section, we extend our previous evaluation of Llama on Cloud TPU v4. Here, we demonstrate the performance properties of TPU v5e for inference applications.

    + +

    We define inference throughput as the number of tokens produced by a model per second per TPU chip. Figure 3 shows Llama 2 70B throughput on a v5e-16 TPU node. Given Llama is a memory-bound application, we see that applying weight-only quantization unblocks extending the model batch size to 32. Higher throughput results would be possible on larger TPU v5e hardware, up to the point where the ICI network bandwidth between chips throttles the TPU slice from delivering higher throughput. Exploring the upper-bound limits of TPU v5e on Llama 2 was outside the scope of this work. Notice that to make the Llama 2 70B model run on v5e-16, we replicated the attention heads to have one head per chip, as discussed in the Inference section above. As discussed previously, with increasing model batch size, per-token latency grows proportionally; quantization improves overall latency by reducing memory I/O demand.

    + +

    Fig. 3: Llama 2 70B Inference Per-Chip Throughput on TPU v5e vs. Batch Size

    + +

    Figure 4 shows inference throughput results across different model sizes. These results highlight the largest throughput given the hardware configuration when using bf16 precision. With weight-only quantization, this throughput reaches 42 tokens/s/chip on the 70B model. As mentioned above, increasing hardware resources may lead to performance gains.

    + +

    Fig. 4: Llama 2 Inference Per-Chip Throughput on TPU v5e

    + +

    Figure 5 shows the cost of serving Llama 2 models (from Figure 4) on Cloud TPU v5e. We report the TPU v5e per-chip cost based on the 3-year commitment (reserved) price in the us-west4 region. All model sizes use maximum sequence length of 2,048 and maximum generation length of 1,000 tokens. Note that with quantization, the cost for the 70B model drops to $0.0036 per 1,000 tokens.

    + +

    Fig. 5: Llama 2 Inference Per-Chip Cost on TPU v5e

    + +

    Figure 6 summarizes our best Llama 2 inference latency results on TPU v5e. Llama 2 7B results are obtained from our non-quantized configuration (BF16 Weight, BF16 Activation) while the 13B and 70B results are from the quantized (INT8 Weight, BF16 Activation) configuration. We attribute this observation to the inherent memory saving vs. compute overhead tradeoff of quantization; as a result, for smaller models, quantization may not lead to lower inference latency.

    + +

    Additionally, prompt length has a strong effect on the memory requirements of LLMs. For instance, we observe a latency of 1.2ms / token (i.e. 201 tokens / second / chip) when max_seq_len=256 at batch size of 1 with no quantization on v5e-4 running Llama2 7B.

    + +

    Fig. 6: Llama 2 Inference Latency on TPU v5e

    + +

    Final Thoughts

    + +

    The recent wave of AI innovation has been nothing short of transformative, with breakthroughs in LLMs at the forefront. Meta’s Llama and Llama 2 models stand as notable milestones in this wave of progress. PyTorch/XLA uniquely enables high-performance, cost-efficient training and inference for Llama 2 and other LLMs and generative AI models on Cloud TPUs, including the new Cloud TPU v5e. Looking forward, PyTorch/XLA will continue to push the performance limits on Cloud TPUs in both throughput and scalability and at the same time maintain the same PyTorch user experience.

    + +

    We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to GitHub so that we can openly collaborate. You can also try out PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs.

    + +

    We would like to extend our special thanks to Marcello Maggioni, Tongfei Guo, Andy Davis, Berkin Ilbeyi for their support and collaboration in this effort.

    + +

    Cheers,
    The PyTorch/XLA Team at Google

diff --git a/blog/high-performance-llama/index.html b/blog/high-performance-llama/index.html

High performance Llama 2 deployments with AWS Inferentia2 using TorchServe | PyTorch

by Mike Zhang, Li Ning, Sergey Ivanov, Naman Nandan, Hamid Shojanazeri, Geeta Chauhan, Abhi Shivaditya, Michael Nguyen, Pinak Panigrahi

    +

    Recently, Llama 2 was released and has attracted a lot of interest from the machine learning community. Amazon EC2 Inf2 instances, powered by AWS Inferentia2, now support training and inference of Llama 2 models. In this post, we show low-latency and cost-effective inference of Llama-2 models on Amazon EC2 Inf2 instances using the latest AWS Neuron SDK release. We first introduce how to create, compile and deploy the Llama-2 model and explain the optimization techniques introduced by the AWS Neuron SDK to achieve high performance at low cost. We then present our benchmarking results. Lastly, we show how the Llama-2 model can be deployed through Amazon SageMaker using TorchServe on an Inf2 instance.

    + +


    + +

    What is Llama 2

    + +

    Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. Llama 2 is intended for commercial and research use in English. It comes in multiple sizes—7 billion, 13 billion, and 70 billion parameters—as well as pre-trained and fine-tuned variations. According to Meta, the tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align to human preferences for helpfulness and safety. Llama 2 was pre-trained on 2 trillion tokens of data from publicly available sources. The tuned models are intended for assistant-like chat, whereas pre-trained models can be adapted for a variety of natural language generation tasks. Regardless of which version of the model a developer uses, the responsible use guide from Meta can assist in guiding additional fine-tuning that may be necessary to customize and optimize the models with appropriate safety mitigations.

    + +

    Amazon EC2 Inf2 instances Overview

    + +

    Amazon EC2 Inf2 instances, featuring Inferentia2, provide 3x higher compute and 4x more accelerator memory, resulting in up to 4x higher throughput and up to 10x lower latency compared to the first-generation Inf1 instances.

    + +

    Large language model (LLM) inference is a memory-bound workload; performance scales up with more accelerator memory bandwidth. Inf2 instances are the only inference-optimized instances in Amazon EC2 to provide a high-speed accelerator interconnect (NeuronLink), enabling high-performance, cost-effective distributed inference for large LLM deployments. You can now efficiently and cost-effectively deploy billion-scale LLMs across multiple accelerators on Inf2 instances.

    + +

    Inferentia2 supports FP32, TF32, BF16, FP16, UINT8, and the new configurable FP8 (cFP8) data type. AWS Neuron can take high-precision FP32 and FP16 models and autocast them to lower-precision data types while optimizing accuracy and performance. Autocasting reduces time to market by removing the need for lower-precision retraining and enabling higher-performance inference with smaller data types.

    + +

    To make it flexible and extendable to deploy constantly evolving deep learning models, Inf2 instances have hardware optimizations and software support for dynamic input shapes as well as custom operators written in C++ through the standard PyTorch custom operator programming interfaces.

    + +

    Transformers Neuron (transformers-neuronx)

    + +

    Transformers Neuron is a software package that enables PyTorch users to deploy performance optimized LLM inference. It has an optimized version of transformer models implemented with XLA high level operators (HLO), which enables sharding tensors across multiple NeuronCores, a.k.a. tensor parallelism, and performance optimizations such as parallel context encoding and KV caching for Neuron hardware. The Llama 2 source code in XLA HLOs can be found here.

    + +

    Llama 2 is supported in Transformers Neuron through the LlamaForSampling class. Transformers Neuron provides a seamless user experience with Hugging Face models to provide optimized inference on Inf2 instances. More details can be found in the Transformers Neuron Developer Guide. In the following section, we explain how to deploy the Llama-2 13B model using Transformers Neuron. This example also applies to other Llama-based models.

    + +

    Llama 2 model inference with Transformers Neuron

    + +

    Create model, compile and deploy

    + +

    We have three simple steps here to create, compile and deploy the model on Inf2 instances.

    + +
      +
    1. Create a CPU model, use this script or the following code snippet to serialize and save checkpoints in a local directory.
    + +
    from transformers import AutoModelForCausalLM
    from transformers_neuronx.module import save_pretrained_split

    model_cpu = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf", low_cpu_mem_usage=True)
    model_dir = "./llama-2-13b-split"
    save_pretrained_split(model_cpu, model_dir)
    + +
      +
    2. Load and compile the model from the local directory where you saved the serialized checkpoints, as follows. To load the Llama 2 model, we use LlamaForSampling from Transformers Neuron. Note that the environment variable NEURON_RT_NUM_CORES specifies the number of NeuronCores to be used at runtime, and it should match the tensor parallelism (TP) degree specified for the model. Also, NEURON_CC_FLAGS enables compiler optimization for decoder-only LLM models.
    + +
    import os

    from transformers_neuronx.llama.model import LlamaForSampling

    os.environ['NEURON_RT_NUM_CORES'] = '24'
    os.environ['NEURON_CC_FLAGS'] = '--model-type=transformer'
    model = LlamaForSampling.from_pretrained(
        model_dir,
        batch_size=1,
        tp_degree=24,
        amp='bf16',
        n_positions=16,
        context_length_estimate=[8]
    )
    + +

    Now let's compile the model and load the model weights into device memory with a one-liner API.

    +
    model.to_neuron()
    +
    + +
      +
    3. Finally, let's run inference on the compiled model. Note that both the input and output of the sample function are sequences of tokens.
    + +
    import torch

    inputs = torch.tensor([[1, 16644, 31844, 312, 31876, 31836, 260, 3067, 2228, 31844]])
    seq_len = 16
    outputs = model.sample(inputs, seq_len, top_k=1)
    + +

    Inference optimizations in Transformers Neuron

    + +

    Tensor parallelism

    + +

    Latency with different TP degrees

    + +

    Transformers Neuron implements parallel tensor operations across multiple NeuronCores. We denote the number of cores used for inference as the TP degree. A larger TP degree provides higher memory bandwidth, leading to lower latency, as LLM token generation is a memory-I/O bound workload. As the TP degree increases from 2 to 24, inference latency decreases significantly: our results show a ~4x overall speedup. For the Llama-2 7B model, latency decreases from 30.1 ms/token with 2 cores to 7.9 ms/token with 24 cores; similarly, for the Llama-2 13B model, it goes down from 57.3 ms/token to 11.1 ms/token.

    + +

    Parallel context encoding

    + +

    In the transformer architecture, tokens are produced in a sequential procedure called autoregressive sampling while input prompt tokens can be processed in parallel with parallel context encoding. This can significantly reduce the latency for input prompt context encoding before token generation through autoregressive sampling. By default, the parameter context_length_estimate would be set as a list of power-of-2 numbers which aims to cover a wide variety of context lengths. Depending on the use case, it can be set to custom numbers. This can be done when creating the Llama 2 model using LlamaForSampling.from_pretrained. We characterize the impact of input token length on end-to-end (E2E) latency. As shown in the figure, latency for text generation with the Llama-2 7B model only slightly increases with bigger input prompts, thanks to parallel context encoding.
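    For example, when creating the model one could supply custom buckets instead of the default power-of-2 list; the values below are illustrative, not the configuration used in our benchmarks.

    model = LlamaForSampling.from_pretrained(
        model_dir,
        batch_size=1,
        tp_degree=24,
        amp='bf16',
        n_positions=2048,
        context_length_estimate=[256, 512, 1024],  # custom prompt-length buckets
    )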

    + +

    E2E latency

    + +

    KV caching

    + +

    The self-attention block performs the self-attention operation with KV vectors. KV vectors are calculated from token embeddings and the KV weights, and are thus associated with tokens. In naive implementations, the entire KV cache is recalculated for each generated token, which reduces performance. Therefore, the Transformers Neuron library reuses previously calculated KV vectors, a technique known as KV caching, to avoid unnecessary computation and reduce latency in the autoregressive sampling phase.
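    The idea is captured by the following toy sketch (ours, not the Transformers Neuron implementation): at each step only the new token's K/V are computed and appended, and previous entries are read from the cache.

    import torch

    def attend_with_kv_cache(q_t, k_t, v_t, k_cache, v_cache):
        # q_t, k_t, v_t: (batch, 1, head_dim) for the newly generated position.
        k_cache = torch.cat([k_cache, k_t], dim=1)   # (batch, t, head_dim)
        v_cache = torch.cat([v_cache, v_t], dim=1)
        scores = q_t @ k_cache.transpose(1, 2) / k_cache.shape[-1] ** 0.5
        out = torch.softmax(scores, dim=-1) @ v_cache
        return out, k_cache, v_cache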

    + +

    Benchmarking results

    + +

    We benchmarked the latency and cost for both Llama-2 7B and 13B models under different conditions, i.e., number of output tokens, instance types. Unless specified, we use data type ‘bf16’ and batch size of 1 as this is a common configuration for real-time applications like chatbot and code assistant.

    + +

    Latency

    + +

    The following graph shows the per-token latency on the inf2.48xlarge instance with TP degree 24. Here, the latency per output token is calculated as the end-to-end latency divided by the number of output tokens. Our experiments show that Llama-2 7B end-to-end latency to generate 256 tokens is 2x faster compared to other comparable inference-optimized EC2 instances.

    + +

    Latency on inf2

    + +

    Throughput

    + +

    We now show the number of tokens generated per second for the Llama-2 7B and 13B models that can be delivered by the inf2.48xlarge instance. With TP degree 24, fully utilizing all the 24 NeuronCores, we can achieve 130 tokens/sec and 90 tokens/sec for the Llama-2 7B and 13B models, respectively.

    + +

    E2E throughput

    + +

    Cost

    + +

    For latency-first applications, we show the cost of hosting Llama-2 models on the inf2.48xlarge instance: $0.011 per 1,000 tokens for the 7B model and $0.016 per 1,000 tokens for the 13B model, achieving a 3x cost saving over other comparable inference-optimized EC2 instances. Note that we report the cost based on the 3-year reserved instance price, which is what customers use for large production deployments.

    + +

    Cost on inf2

    + +

    We also compare the cost of hosting the Llama-2 7B model on inf2.xlarge and inf2.48xlarge instances. We can see that inf2.xlarge is more than 4x cheaper than inf2.48xlarge but at the expense of longer latency due to smaller TP degree. For example, it takes 7.9 ms for the model to generate 256 output tokens with 256 input tokens on inf2.48xlarge but 30.1 ms on Inf2.xlarge.

    + +

    Cost on Llama

    + +

    Serving Llama2 with TorchServe on EC2 Inf2 instance

    + +

    Now, we move on to model deployment. In this section, we show you how to deploy the Llama-2 13B model through SageMaker using TorchServe, which is the recommended model server for PyTorch, preinstalled in the AWS PyTorch Deep Learning Containers (DLC).

    + +

    This section describes the preparation work needed for using TorchServe, particularly, how to configure model_config.yaml and inf2_handler.py as well as how to generate model artifacts and pre-compile the model for use in later model deployment. Preparing the model artifacts ahead-of-time avoids model compilation during model deployment and thus reduces the model loading time.

    + +

    Model configuration model-config.yaml

    + +

    The parameters defined in the handler and micro_batching sections are used in the custom handler inf2_handler.py. More details about model_config.yaml are here. TorchServe micro-batching is a mechanism to pre-process and post-process a batch of inference requests in parallel. It can achieve higher throughput by better utilizing the available accelerator when the backend is steadily fed with incoming data; see here for more details. For model inference on Inf2, micro_batch_size, amp, tp_degree and max_length specify the batch size, data type, tensor parallelism degree and max sequence length, respectively.

    + +
    # TorchServe Frontend Parameters
    minWorkers: 1
    maxWorkers: 1
    maxBatchDelay: 100
    responseTimeout: 10800
    batchSize: 16

    # TorchServe Backend Custom Handler Parameters
    handler:
        model_checkpoint_dir: "llama-2-13b-split"
        amp: "bf16"
        tp_degree: 12
        max_length: 100

    micro_batching:
        # Used by batch_size in function LlamaForSampling.from_pretrained
        micro_batch_size: 1
        parallelism:
            preprocess: 2
            inference: 1
            postprocess: 2
    + +

    Custom handler inf2_handler.py

    + +

    A custom handler in TorchServe is a simple Python script that lets you define the model initialization, preprocessing, inference and post-processing logic as functions. Here, we create our Inf2 custom handler.

    + +
      +
    1. The initialize function is used to load the model. Here, the Neuron SDK compiles the model the first time and saves the precompiled model, as enabled by NEURONX_CACHE, in the directory specified by NEURONX_DUMP_TO. Subsequent runs check whether pre-compiled model artifacts already exist and, if so, skip model compilation. Once the model is loaded, we initiate warm-up inference requests so that the compiled version is cached. When the Neuron persistent cache is utilized, it can significantly reduce the model loading latency, ensuring that subsequent inference runs swiftly.
    + +
    os.environ["NEURONX_CACHE"] = "on"
    +os.environ["NEURONX_DUMP_TO"] = f"{model_dir}/neuron_cache"
    +
    + +

    TorchServe `TextIteratorStreamerBatch` extends Hugging Face transformers `BaseStreamer` to support response streaming when `batchSize` is larger than 1. 

    + +
    self.output_streamer = TextIteratorStreamerBatch(
    +    self.tokenizer,
    +    batch_size=self.handle.micro_batch_size,
    +    skip_special_tokens=True,
    +)
    +
    + +
    2. The inference function calls send_intermediate_predict_response to send the streaming response.
    + +
    for new_text in self.output_streamer:
    +    logger.debug("send response stream")
    +    send_intermediate_predict_response(
    +        new_text[: len(micro_batch_req_id_map)],
    +        micro_batch_req_id_map,
    +        "Intermediate Prediction success",
    +        200,
    +        self.context,
    +    )
    +
    + +

    Package model artifacts

    + +

    Package all the model artifacts into a folder llama-2-13b-neuronx-b1 using torch-model-archiver:

    + +
    torch-model-archiver --model-name llama-2-13b-neuronx-b1 --version 1.0 --handler inf2_handler.py -r requirements.txt --config-file model-config.yaml --archive-format no-archive
    +
    + +

    Serve the model

    + +
    export TS_INSTALL_PY_DEP_PER_MODEL="true"
    +torchserve --ncs --start --model-store model_store --models llama-2-13b-neuronx-b1
    +
    + +
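    Once the worker has loaded the model, you can sanity-check it locally against TorchServe's standard inference API (default port 8080). A minimal sketch, assuming the model was registered as llama-2-13b-neuronx-b1:

    import requests

    # Assumes TorchServe is running locally with the default inference port (8080)
    # and the model was registered as "llama-2-13b-neuronx-b1".
    prompt = "Today the weather is really nice and I am planning on"
    resp = requests.post(
        "http://localhost:8080/predictions/llama-2-13b-neuronx-b1",
        data=prompt.encode("utf-8"),
        stream=True,  # the handler streams intermediate responses
    )
    for chunk in resp.iter_content(chunk_size=None):
        print(chunk.decode("utf-8"), end="", flush=True)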

    Once the log shows “WORKER_MODEL_LOADED”, the pre-compiled model should be saved in the folder llama-2-13b-neuronx-b1/neuron_cache, which is tightly coupled with the Neuron SDK version. Then, upload the folder llama-2-13b-neuronx-b1 to your S3 bucket for later use in the production deployment. The Llama-2 13B model artifacts used in this blog, built with Neuron SDK 2.13.2, can be found here in the TorchServe model zoo.

    + +

    Deploy Llama-2 13B model on SageMaker Inf2 instance using TorchServe 

    + +

    In this section, we deploy the Llama-2 13B model using a PyTorch Neuronx container on a SageMaker endpoint with an ml.inf2.24xlarge hosting instance, which has 6 Inferentia2 accelerators (12 NeuronCores), matching the handler setting tp_degree: 12 in model-config.yaml. Given that we have packaged all the model artifacts into a folder using torch-model-archiver and uploaded them to an S3 bucket, we now use the SageMaker Python SDK to create a SageMaker model and deploy it to a SageMaker real-time endpoint using the uncompressed model deployment method. Speed is the key benefit of deploying this way with SageMaker: you get a fully functional, production-ready, secure RESTful endpoint without spending any effort on infrastructure. There are 3 steps to deploying the model and running inference on SageMaker. The notebook example can be found here.

    + +
    1. Create a SageMaker model
    + +
    from datetime import datetime
    +
    +instance_type = "ml.inf2.24xlarge"
    +endpoint_name = sagemaker.utils.name_from_base("ts-inf2-llama2-13b-b1")
    +
    +model = Model(
    +    name="torchserve-inf2-llama2-13b" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
    +    # Enable SageMaker uncompressed model artifacts
    +    model_data={
    +        "S3DataSource": {
    +                "S3Uri": s3_uri,
    +                "S3DataType": "S3Prefix",
    +                "CompressionType": "None",
    +        }
    +    },
    +    image_uri=container,
    +    role=role,
    +    sagemaker_session=sess,
    +    env={"TS_INSTALL_PY_DEP_PER_MODEL": "true"},
    +)
    +
    + +
    2. Deploy the SageMaker model
    + +
    model.deploy(
    +    initial_instance_count=1,
    +    instance_type=instance_type,
    +    endpoint_name=endpoint_name,
    +    volume_size=512, # increase the size to store large model
    +    model_data_download_timeout=3600, # increase the timeout to download large model
    +    container_startup_health_check_timeout=600, # increase the timeout to load large model
    +)
    +
    + +
    3. Run streaming response inference on SageMaker. When the endpoint is in service, you can use the invoke_endpoint_with_response_stream API to invoke the model. This feature returns each generated token to the user as it is produced, enhancing the user experience; it is especially beneficial when generating an entire sequence is time-consuming.
    + +
    import json
    +
    +body = "Today the weather is really nice and I am planning on".encode('utf-8')
    +resp = smr.invoke_endpoint_with_response_stream(EndpointName=endpoint_name, Body=body, ContentType="application/json")
    +event_stream = resp['Body']
    +parser = Parser()
    +for event in event_stream:
    +    parser.write(event['PayloadPart']['Bytes'])
    +    for line in parser.scan_lines():
    +        print(line.decode("utf-8"), end=' ')
    +
    + +

    Sample inference:

    + +

    Input

    + +

    “Today the weather is really nice and I am planning on”

    + +

    Output

    + +

    “Today the weather is really nice and I am planning on going to the beach. I am going to take my camera and take some pictures of the beach. I am going to take pictures of the sand, the water, and the people. I am also going to take pictures of the sunset. I am really excited to go to the beach and take pictures.

    + +

    The beach is a great place to take pictures. The sand, the water, and the people are all great subjects for pictures. The sunset is also a great subject for pictures.”

    + +

    Conclusion

    + +

    In this post, we showcased how to run Llama 2 model inference using Transformers Neuron and deploy Llama 2 model serving using TorchServe through Amazon SageMaker on an EC2 Inf2 instance. We demonstrated the benefits of using Inferentia2—low latency and low cost—enabled by optimizations in AWS Neuron SDK including tensor parallelism, parallel context encoding and KV caching, particularly for LLM inference. To stay up to date, please follow AWS Neuron’s latest release for new features.

    + +

    Get started today with Llama 2 examples on EC2 and through SageMaker and stay tuned for how to optimize Llama 70B on Inf2!

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/hitchhikers-guide-speculative-decoding/index.html b/blog/hitchhikers-guide-speculative-decoding/index.html new file mode 100644 index 000000000000..ed8000b8145a --- /dev/null +++ b/blog/hitchhikers-guide-speculative-decoding/index.html @@ -0,0 +1,732 @@ + + + + + + + + + + + + + A Hitchhiker’s Guide to Speculative Decoding | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Team PyTorch at IBM + +

    +

    Speculative decoding is an optimization technique for inference that makes educated guesses about future tokens while generating the current token, all within a single forward pass. It incorporates a verification mechanism to ensure the correctness of these speculated tokens, thereby guaranteeing that the overall output of speculative decoding is identical to that of vanilla decoding. Optimizing the cost of inference of large language models (LLMs) is arguably one of the most critical factors in reducing the cost of generative AI and increasing its adoption. Towards this goal, various inference optimization techniques are available, including custom kernels, dynamic batching of input requests, and quantization of large models.

    + +
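    To make the guess-and-verify idea concrete, here is a toy sketch of one speculative step with greedy verification. The propose and target_argmax callables are stand-ins; the actual implementation described below uses trained speculator heads and batched verification inside the inference server.

    # Toy sketch of one speculative step with greedy verification. `propose` drafts
    # k tokens; `target_argmax` returns, for each position i of its input, the base
    # model's argmax prediction for the token at position i+1 (one forward pass).
    def speculative_step(tokens, propose, target_argmax, k=3):
        draft = propose(tokens, k)                  # k speculated tokens
        preds = target_argmax(tokens + draft)       # verify prompt + draft together
        accepted = []
        for i, tok in enumerate(draft):
            expected = preds[len(tokens) + i - 1]   # base model's choice for this slot
            if tok == expected:
                accepted.append(tok)                # speculation confirmed
            else:
                accepted.append(expected)           # fall back to the base model token
                break
        else:
            accepted.append(preds[len(tokens) + len(draft) - 1])  # bonus token
        return tokens + accepted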

    In this blog post, we provide a guide to speculative decoding and demonstrate how it can coexist with other optimizations. We are proud to open source the following, which includes the first speculator for Llama3 models:

    + +
    1. Speculator models for Meta Llama3 8B, IBM Granite 7B lab, Meta Llama2 13B, and Meta Code Llama2 13B.
    2. The code for inference via IBM’s fork of HF TGI.
    3. The code for training your own speculators and corresponding recipes.
    + +

    We have deployed these speculators in an internal production-grade environment with thousands of daily users and observed a 2x speedup on language models (Llama3 8B, Llama2 13B, and IBM Granite 7B) and a 3x speedup on IBM’s Granite 20B code models. We provide a detailed explanation of our approach in this technical report and are planning an in-depth analysis in an upcoming ArXiv paper.

    + +

    Speculative decoding: Inference

    + +

    We run IBM TGIS in our internal production environment that has optimizations such as continuous batching, fused kernels, and quantization kernels. To enable speculative decoding in TGIS, we modified the paged attention kernel from vLLM. In what follows, we will describe the key changes to the inference engine to enable speculative decoding.

    + +

    Speculative decoding is based on the premise that the model is powerful enough to predict multiple tokens in a single forward pass. However, current inference servers are optimized to predict only a single token at a time. In our approach, we attach multiple speculative heads (in addition to the usual one) to the LLM to predict the (N+1)-th, (N+2)-th, (N+3)-th, … tokens. For example, 3 heads will predict 3 additional tokens. Details of the speculator architecture are explained in a later part of this blog. There are two challenges to achieving efficiency and correctness during inference: one is to predict without replicating the KV-cache, and the other is to verify that the predictions match the original model’s outcomes.

    + +

    In a typical generation loop, after the prompt is processed in a single forward step, a sequence length of 1 (next token predicted) is fed into the forward pass of the model along with the kv-cache. In a naive speculative decoding implementation, each speculative head would have its own kv-cache, but instead we modify the paged attention kernel developed in the vLLM project to enable efficient kv-cache maintenance. This ensures that throughput does not reduce at larger batch sizes. Further, we modify the attention masks to enable verification of the N+1’th token and thus enable speculative decoding without deviating from the original model’s output. The details of this implementation are captured here.

    + +

    Results

    + +

    We illustrate the speedup obtained with Meta’s chat version of Llama2 13B using a simple prompt.

    + +

    Visual illustration of the non-speculative generation (left) compared to speculative generation (right)

    + +

    Figure 2: Visual illustration of the non-speculative generation (left) compared to speculative generation (right)

    + +

    We deployed the above solution in an internal production environment. The figure below reports two metrics – time to first token (TTFT) and inter-token latency (ITL) with different numbers of concurrent users (which is captured in the numbers on the graph lines). We observe that the speculative decoding version is nearly twice as fast for the Llama2 13B chat model and nearly thrice as fast for the Granite 20B code model compared to the non-speculative version for all batch sizes. We observe similar behavior for the smaller models - IBM’s Granite 7B and Meta Llama3 8B models.

    + +

    Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Llama 13B with number of concurrent users indicated on the graph

    + +

    Figure 3: Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Llama 13B with number of concurrent users indicated on the graph

    + +

    Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Granite 20B Code with number of concurrent users indicated on the graph

    + +

    Figure 4: Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Granite 20B Code with number of concurrent users indicated on the graph

    + +

    Note on efficiency

    + +

    We performed numerous experiments to determine the right configuration for speculator training. The key considerations are:

    + +
    1. Speculator architecture: The current approach allows the number of heads to be modified, which maps to the number of tokens we can look ahead. Increasing the number of heads also increases the amount of extra compute needed and the complexity of training. In practice, we find that 3-4 heads work well for language models, whereas code models can reap benefits from 6-8 heads.
    2. Compute: Increasing the number of heads increases compute in two dimensions: the latency of a single forward pass grows, as does the compute needed for the additional tokens. If the speculator is not accurate with more heads, the wasted compute increases latency and reduces throughput.
    3. Memory: The increased compute is offset by the round trips to HBM that are saved on each forward pass. Note that if we get a 3-token lookahead correct, we have saved three HBM round trip times.
    + +

    We settled on 3-4 heads for the language models and 6-8 heads for the code models. Across model sizes ranging from 7B to 20B, we observed significant latency improvements without throughput loss compared to non-speculative decoding. We begin to observe throughput reduction beyond a batch size of 64, which rarely happens in practice.

    + +

    Speculative decoding: Training

    + +

    There are two broad approaches to speculative decoding: one is to leverage a smaller model (e.g., Llama 7B as a speculator for Llama 70B), and the other is to attach speculator heads to the base model and train them. In our experiments, we find attaching speculator heads to be more effective in both model quality and latency gains.

    + +

    Speculator architecture

    + +

    Medusa made speculative decoding popular; their approach is to add a head to the existing model which is then trained to do speculation. We modify the Medusa architecture by making the “heads” hierarchical, where each head stage predicts a single token and then feeds it to the next head stage. These multi-stage heads are depicted in the below figure. We are exploring ways of minimizing the embeddings table by sharing these across the multiple stages and base model.
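    To illustrate the idea (not the exact released architecture), a hierarchical multi-stage head can be sketched in PyTorch as follows, where each stage consumes the running state plus the embedding of the previously proposed token:

    import torch
    import torch.nn as nn

    class MultiStageSpeculatorSketch(nn.Module):
        # Each stage mixes the running state with the embedding of the previously
        # proposed token and emits the next proposal. Layer shapes are illustrative.
        def __init__(self, hidden_dim, vocab_size, n_heads=3):
            super().__init__()
            self.embed = nn.Embedding(vocab_size, hidden_dim)
            self.stages = nn.ModuleList(
                nn.Linear(2 * hidden_dim, hidden_dim) for _ in range(n_heads)
            )
            self.lm_heads = nn.ModuleList(
                nn.Linear(hidden_dim, vocab_size) for _ in range(n_heads)
            )

        def forward(self, z, last_token):
            # z: [batch, hidden_dim] state from the base model; last_token: [batch]
            state, prev, proposals = z, self.embed(last_token), []
            for stage, lm_head in zip(self.stages, self.lm_heads):
                state = torch.tanh(stage(torch.cat([state, prev], dim=-1)))
                next_tok = lm_head(state).argmax(dim=-1)
                proposals.append(next_tok)
                prev = self.embed(next_tok)
            return torch.stack(proposals, dim=1)  # [batch, n_heads] speculated tokens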

    + +

    A simple architecture diagram for a 3-headed multi-stage  speculator. Z is the state from the base model.

    + +

    Figure 4: A simple architecture diagram for a 3-headed multi-stage speculator. Z is the state from the base model.

    + +

    Speculator training

    + +

    We have a two-phase approach to training a speculator for efficiency reasons. In the first phase, we train on small batches with long sequence lengths (4k tokens) and use the standard causal LM approach for training. In phase 2, we use large batches with short sequence lengths (256 tokens) generated from the base model. In this training phase, we tune the heads to match the output of the base model. Through numerous experiments, we find that a 5:2 ratio of steps for phase 1 vs phase 2 works well. We depict the progress of these phases in the below figure. We use PyTorch FSDP and IBM FMS for the training of speculators.
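    As a rough sketch of what a phase-1 objective can look like, each head can be trained with a causal LM loss against targets shifted further into the future; the exact offsets and any loss weighting used in the released recipes may differ.

    import torch.nn.functional as F

    def speculator_phase1_loss(head_logits, input_ids):
        # head_logits: list of [batch, seq, vocab] tensors, one per speculator head.
        # Assumption for this sketch: head h is trained to predict the token (h + 2)
        # positions ahead, since the base model covers the immediate next token.
        loss = 0.0
        for h, logits in enumerate(head_logits):
            shift = h + 2
            loss = loss + F.cross_entropy(
                logits[:, :-shift].flatten(0, 1),   # drop positions with no target
                input_ids[:, shift:].flatten(),
            )
        return loss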

    + +

    Per-head training loss curves for Llama2-13B speculator training, phase 1 and 2

    + +

    Figure 5: Per-head training loss curves for Llama2-13B speculator training, phase 1 and 2

    + +

    Conclusion and Future Work

    + +

    Through this blog, we are releasing a new approach for speculative decoding and the following assets:

    + +
    1. Models for improving the inter-token latencies for a range of models: Llama3 8B, Llama2 13B, Granite 7B, and CodeLlama 13B
    2. Production-quality code for inference
    3. Recipes for training speculators
    + +

    We are working on training speculators for Llama3 70B and Mistral models and invite the community to contribute and help improve our framework. We would also love to work with major open source serving frameworks such as vLLM and TGI to contribute our speculative decoding approach back to the community.

    + +

    Acknowledgements

    + +

    There are several teams that helped us get to these latency improvements for inference. We would like to thank the vLLM team for creating the paged attention kernel in a clean and reusable manner. We extend our gratitude to the Team PyTorch at Meta that helped provide feedback on this blog as well as continued efforts on optimal usage of PyTorch. Special thanks to our internal production teams at IBM Research who took this prototype to production and hardened it. A shout out to Stas Bekman for providing insightful comments on the blog resulting in an improved explanation of the tradeoffs between compute, memory, and speculator effectiveness.

    + +

    The paged attention kernel was integrated into IBM FMS by Josh Rosenkranz and Antoni Viros i Martin. The speculator architecture and training was done by Davis Wertheimer, Pavithra Ranganathan, and Sahil Suneja. The integration of the modeling code with the inference server was done by Thomas Parnell, Nick Hill, and Prashant Gupta.

    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/hopper-tma-unit/index.html b/blog/hopper-tma-unit/index.html new file mode 100644 index 000000000000..5194099231c8 --- /dev/null +++ b/blog/hopper-tma-unit/index.html @@ -0,0 +1,1049 @@ + + + + + + + + + + + + + Deep Dive on the Hopper TMA Unit for FP8 GEMMs | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Adnan Hoque, Less Wright, Chih-Chieh Yang + +

    +

    Abstract

    + +

    The Hopper (H100) GPU architecture, billed as the “first truly asynchronous GPU”, includes a new, fully asynchronous hardware copy engine for bulk data movement between global and shared memory called Tensor Memory Accelerator (TMA). While CUTLASS has built-in support for TMA via its asynchronous pipeline paradigm, Triton exposes TMA support via an experimental API.

    + +

    In this post, we provide a deeper dive into the details of how TMA works, for developers to understand the new async copy engine. We also show the importance of leveraging TMA for H100 kernels by building a TMA-enabled FP8 GEMM kernel in Triton, which delivers 1.4-2.2x performance gains over cuBLAS FP16 for small-to-medium problem sizes. Finally, we showcase key implementation differences between Triton and CUTLASS that may account for reports of performance regressions with TMA in Triton. We open source our implementation for reproducibility and review at https://github.com/pytorch-labs/applied-ai/tree/main/kernels

    + +

    The throughput in TFLOPs of various Triton and cuBLAS FP8 and FP16 kernels, for M=M, N=4096, K=4096. The red line is the Triton TMA, which showcases the advantages of leveraging TMA.

    + +

    Figure 1. The throughput in TFLOPs of various Triton and cuBLAS FP8 and FP16 kernels, for M=M, N=4096, K=4096. The red line is the Triton TMA, which showcases the advantages of leveraging TMA.

    + +

    TMA Background

    + +

    TMA is an H100 hardware addition that allows applications to asynchronously and bi-directionally transfer 1D-5D tensors between GPU global and shared memory. In addition, TMA can also transfer the same data to not just the calling SM’s shared memory, but to other SM’s shared memory if they are part of the same Thread Block Cluster. This is termed ‘multicast’.

    + +

    TMA is very lightweight, as only a single thread is needed to kick off a TMA transfer. By moving data directly from GMEM (global) to SMEM (shared), TMA avoids the earlier GPU requirement of using registers to move data between different memory spaces.

    + +

    A100-style data movement vs H100 with TMA. TMA hardware eliminates the need for a large number of threads and registers participating in bulk data transfers.

    + +

    Figure 2. A100-style data movement vs H100 with TMA. TMA hardware eliminates the need for a large number of threads and registers participating in bulk data transfers. (Image credit Nvidia)

    + +

    A single thread can issue large data movement instructions, allowing the majority of a given thread block to continue working on other instructions while data is in flight. Combined with asynchronous pipelining, this allows memory transfers to be easily hidden and ensures the majority of any given thread block cluster can focus on computational tasks.

    + +

    This lightweight invocation for data movement enables the creation of warp-group specialized kernels, where warp-groups take on different roles, namely producers and consumers. Producers elect a leader thread that fires off TMA requests, which are then asynchronously coordinated with the consumer (MMA) warp-groups via an arrival barrier. Consumers then process the data using warp-group MMA, and signal back to the producers when they have finished reading from the SMEM buffer and the cycle repeats.

    + +

    Further, within threadblock clusters, producers can lower their max register requirements since they are only issuing TMA calls, and effectively transfer additional registers to MMA consumers, which helps to alleviate register pressure for consumers.

    + +

    In addition, TMA handles the address computation for the shared memory destination where the data requested should be placed. This is why calling threads (producers) can be so lightweight.

    + +

    To ensure maximum read access speed, TMA can lay out the arriving data based on swizzling instructions, to ensure the arriving data can be read as fast as possible by consumers, as the swizzling pattern helps avoid shared memory bank conflicts.

    + +

    Finally for TMA instructions that are outgoing, or moving data from SMEM to GMEM, TMA can also include reduction operations (add/min/max) and bitwise (and/or) operations.

    + +

    TMA usage in Triton

    + +

    Pre-Hopper Load:

    + +
    offs_am = pid_m*block_m + tl.arange(0, block_m)
    +offs_bn = pid_n*block_n + tl.arange(0, block_n)
    +offs_k = tl.arange(0, block_k)
    +
    +a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k[None, :]*stride_ak)
    +b_ptrs = b_ptr + (offs_k[:, None]*stride_bk + offs_bn[None, :]*stride_bn)
    +
    +a = tl.load(a_ptrs)
    +b = tl.load(b_ptrs)
    +
    + +

    Figure 3. Traditional style bulk load from global to shared memory in Triton

    + +

    In the above Triton example showing a pre-Hopper load, we see how the data for tensors a and b are loaded by each thread block computing global offsets (a_ptrs, b_ptrs) from their relevant program_id (pid_m, pid_n, k) and then making a request to move blocks of memory into shared memory for a and b.

    + +

    Now let’s examine how to perform a load using TMA in Triton.

    + +

    The TMA instruction requires a special data structure called a tensor map, in contrast to the above where we directly pass pointers to global memory. To build the tensor map, we first create a TMA descriptor on the CPU. The descriptor handles the creation of the tensor map by using the cuTensorMapEncode API. The tensor map holds metadata such as the global and shared memory layout of the tensor and serves as a compressed representation of the structure of the multi-dimensional tensor stored in global memory.

    + +

    TMA address generation via a copy descriptor

    + +

    Figure 4. TMA address generation via a copy descriptor (Image credit: Nvidia)

    + +

    The TMA descriptor holds the tensor’s key properties:

    + +
      +
    1. Base Pointer
    2. +
    3. Shape and Block Size
    4. +
    5. Datatype
    6. +
    + +

    The TMA descriptor is created on the host before the kernel launch, and then moved to the device by copying the descriptor into a torch tensor on the GPU. Thus, in Triton, the GEMM kernel receives a global pointer to the tensor map.

    + +

    Triton Host Code

    + +
       desc_a = np.empty(TMA_SIZE, dtype=np.int8)
    +   desc_b = np.empty(TMA_SIZE, dtype=np.int8)
    +   desc_c = np.empty(TMA_SIZE, dtype=np.int8)
    +
    +   triton.runtime.driver.active.utils.fill_2d_tma_descriptor(a.data_ptr(), m, k, block_m, block_k, a.element_size(), desc_a)
    +
    +   triton.runtime.driver.active.utils.fill_2d_tma_descriptor(b.data_ptr(), n, k, block_n, block_k, b.element_size(), desc_b)
    +
    +   triton.runtime.driver.active.utils.fill_2d_tma_descriptor(c.data_ptr(), m, n, block_m, block_n, c.element_size(), desc_c)
    +  
    +   desc_a = torch.tensor(desc_a, device='cuda')
    +   desc_b = torch.tensor(desc_b, device='cuda')
    +   desc_c = torch.tensor(desc_c, device='cuda')
    +
    + +

    This is the code that is used to set up the descriptors in the kernel invoke function.

    + +

    Triton Device Code

    + +

    Offsets/Pointer Arithmetic:

    + +
       offs_am = pid_m * block_m
    +   offs_bn = pid_n * block_n
    +   offs_k = 0
    +
    + +

    Load:

    + +
      a = tl._experimental_descriptor_load(a_desc_ptr, [offs_am, offs_k], [block_m, block_k], tl.float8e4nv)
    +  b = tl._experimental_descriptor_load(b_desc_ptr, [offs_bn, offs_k], [block_n, block_k], tl.float8e4nv)
    +
    + +

    Store:

    + +
     tl._experimental_descriptor_store(c_desc_ptr, accumulator, [offs_am, offs_bn])
    +
    + +

    We no longer need to calculate a pointer array for both load and store functions in the kernel. Instead, we pass a single descriptor pointer, the offsets, block size and the input datatype. This simplifies address calculation and reduces register pressure, as we no longer have to do complex pointer arithmetic in software and dedicate CUDA cores for address computation.
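    Putting the snippets above together, here is a condensed sketch of what a Triton GEMM main loop using the experimental descriptor API can look like. The kernel name and signature are illustrative, the exact `tl._experimental_descriptor_load`/`store` signatures depend on the Triton version, and the FP16 output cast is an assumption:

    import triton
    import triton.language as tl

    @triton.jit
    def tma_gemm_sketch(a_desc_ptr, b_desc_ptr, c_desc_ptr, K,
                        BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
        pid_m = tl.program_id(0)
        pid_n = tl.program_id(1)
        offs_am = pid_m * BLOCK_M
        offs_bn = pid_n * BLOCK_N
        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
        for k in range(0, K, BLOCK_K):
            # TMA loads: only the descriptor pointer, offsets, block shape and dtype
            # are needed; no per-element pointer arithmetic.
            a = tl._experimental_descriptor_load(a_desc_ptr, [offs_am, k], [BLOCK_M, BLOCK_K], tl.float8e4nv)
            b = tl._experimental_descriptor_load(b_desc_ptr, [offs_bn, k], [BLOCK_N, BLOCK_K], tl.float8e4nv)
            acc += tl.dot(a, tl.trans(b))
        # Assumes an FP16 output tensor; cast to whatever dtype desc_c was built with.
        tl._experimental_descriptor_store(c_desc_ptr, acc.to(tl.float16), [offs_am, offs_bn])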

    + +

    TMA Performance Analysis

    + +

    Below, we discuss the PTX instructions for different load mechanisms on Hopper.

    + +

    PTX for Loading Tile (cp.async) - H100 no TMA

    + +
    add.s32 	%r27, %r100, %r8;
    +add.s32 	%r29, %r100, %r9;
    +selp.b32 	%r30, %r102, 0, %p18;
    +
    +
    +@%p1 cp.async.cg.shared.global [ %r27 + 0 ], [ %rd20 + 0 ], 0x10, %r30;
    +@%p1 cp.async.cg.shared.global [ %r29 + 0 ], [ %rd21 + 0 ], 0x10, %r30;
    +
    +
    +cp.async.commit_group ;
    +
    + +

    Here, we observe the older cp.async instruction responsible for global memory copies. From the traces below we can see that both loads bypass the L1 cache. A major difference with the newer TMA load is that previously, before tiles from A and B were ready to be consumed by the Tensor Core, we would need to execute an ldmatrix instruction operating on data held in register files. On Hopper, the data can now be reused directly from shared memory.

    + +

    H100 Memory Chart showing GMEM Throughput = 910.22 GB/s

    + +

    Figure 5. H100 Memory Chart showing GMEM Throughput = 910.22 GB/s (Triton GEMM without TMA) for M=128, N=4096, K=4096

    + +

    By leveraging TMA through the Triton API changes we mentioned above, we can investigate the PTX that Triton generates for a single 2D tile load with TMA.

    + +

    PTX for Loading Tile (cp.async.bulk.tensor) - H100 using TMA

    + +
    bar.sync 	0;
    +shr.u32 	%r5, %r4, 5;
    +shfl.sync.idx.b32	%r66, %r5, 0, 31, -1;
    +
    +elect.sync _|%p7, 0xffffffff;
    +
    +
    +add.s32 	%r24, %r65, %r67;
    +shl.b32 	%r25, %r66, 7;
    +
    +@%p8
    +cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [%r24], [%rd26, {%r25,%r152}], [%r19];
    +
    + +

    The cp.async.bulk.tensor.2d.shared TMA instruction is passed the destination address in shared memory, a pointer to the tensor map, the tensor map coordinates and a pointer to the mbarrier object, respectively.

    + +

    H100 Memory Chart GMEM Throughput =1.45 TB/s

    + +

    Figure 6. H100 Memory Chart GMEM Throughput =1.45 TB/s (Triton GEMM with TMA) for M=128, N=4096, K=4096

    + +

    For optimal performance we tuned the TMA GEMM kernel extensively. Amongst other parameters such as tile sizes, number of warps and number of pipeline stages, the biggest increase in memory throughput was observed when we increased the TMA_SIZE (descriptor size) from 128 to 512. From the above NCU profiles, we can see that the final tuned kernel has increased global memory transfer throughput from 910 GB/s to 1.45 TB/s, a 59% increase in GMEM throughput, over the non-TMA Triton GEMM kernel.

    + +

    Comparison of CUTLASS and Triton FP8 GEMM and TMA Implementation - Kernel Architecture

    + +

    Triton vs CUTLASS Ping-Pong FP8 GEMM TFLOPs, M=M, N=4096, K=4096

    + +

    Figure 7. Triton vs CUTLASS Ping-Pong FP8 GEMM TFLOPs, M=M, N=4096, K=4096

    + +

    The above chart shows the performance of a CUTLASS Ping-Pong GEMM kernel against Triton. The Ping-Pong kernel leverages TMA differently than Triton: it makes use of all of TMA's hardware and software capabilities, while Triton currently does not. Specifically, CUTLASS supports the TMA features below, which help explain the gaps in pure GEMM performance:

    + +
    1. TMA Multicast
       • Enables copying data from GMEM to multiple SMs
    2. Warp Specialization
       • Enables warp groups within a threadblock to take on different roles
    3. Tensor Map (TMA Descriptor) Prefetch
       • Enables prefetching the Tensor Map object from GMEM, which allows pipelining of TMA loads
    + +

    To put the performance numbers in perspective, below we show a ‘speed-up’ chart highlighting the latency differences on a percentage basis:

    + +

    % Speedup of CUTLASS Ping-Pong vs Triton FP8 with TMA.

    + +

    Figure 8: % Speedup of CUTLASS Ping-Pong vs Triton FP8 with TMA.

    + +

    This speedup is purely kernel throughput, not including E2E launch overhead which we will discuss below.

    + +

    TMA Descriptor movement - a key difference between Triton and CUTLASS with E2E performance implications

    + +

    As noted previously, creation of a 2D+ dimensional TMA descriptor takes place on the host and is then transferred to the device. However, this transfer process takes place very differently depending on the implementation.

    + +

    Here we showcase the differences between how Triton transfers TMA descriptors compared with CUTLASS.

    + +

    Recall that TMA transfers require a special data structure, a tensor map, to be created on the CPU through the cuTensorMap API. For an FP8 GEMM kernel, this means creating three descriptors, one each for A, B and C. We see below that for both the Triton and CUTLASS kernels the same CPU procedures are invoked.

    + +

    Calls to cuTensorMapEncodeTiled (Both Triton and CUTLASS use this path)

    + +

    Figure 7. Calls to cuTensorMapEncodeTiled (Both Triton and CUTLASS use this path)

    + +

    However, for Triton, each descriptor is transferred in its own distinct copy kernel, which adds a significant amount of overhead and serves as a barrier to using this kernel in an end-to-end inference scenario.

    + +

    Three H2D Copy Kernels are launched before the kernel execution, for A, B and C

    + +

    Figure 8. Three H2D Copy Kernels are launched before the kernel execution, for A, B and C

    + +

    These copies are not observed in the CUTLASS implementation, due to the way TMA descriptors are passed to the kernel. We can see from the PTX below that with CUTLASS, tensor maps are passed by value to the kernel.

    + +
    .entry _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_6half_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEENS7_ILi128EEES9_EEENS6_IJNS7_ILi2EEENS7_ILi1EEESC_EEENS_4gemm32KernelTmaWarpSpecializedPingpongENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE(
    +
    +.param .align 64 .b8 _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_6half_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEENS7_ILi128EEES9_EEENS6_IJNS7_ILi2EEENS7_ILi1EEESC_EEENS_4gemm32KernelTmaWarpSpecializedPingpongENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE_param_0[1024]
    +
    +
    +mov.b64 	%rd110, _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_10bfloat16_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEES8_NS7_ILi256EEEEEENS6_IJNS7_ILi1EEESB_SB_EEENS_4gemm24KernelTmaWarpSpecializedENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE_param_0;
    +
    +add.s64 	%rd70, %rd110, 704;
    +cvta.param.u64 	%rd69, %rd70;
    +
    +cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%rd69, {%r284, %r283}], [%r1880];
    +
    + +

    Figure 9. CUTLASS kernel PTX showing pass-by-value

    + +

    By directly passing the TMA Descriptor as opposed to passing a global memory pointer, the CUTLASS kernel avoids the three extra H2D copy kernels and instead these copies are included in the single device kernel launch for the GEMM.

    + +

    Because of the difference in how descriptors are moved to the device, the kernel latencies, including the time to prepare the tensors to be consumed by the TMA, are drastically different. For M=1-128, N=4096, K=4096 the CUTLASS Ping-Pong kernel has an average latency of 10us, while the Triton TMA kernels complete in an average of 4ms. This is a factor of ~3330x slower and appears to be directly linked to the 3 independent kernel launches for TMA descriptor transfer by Triton.

    + +

    CUDA graphs may be one way to reduce this, but given the overhead created by the H2D copies, the current Triton implementation is not competitive when measured end to end. A rework of how the Triton compiler manages TMA descriptors would likely resolve this gap. We thus focused on comparing actual compute kernel throughput, rather than E2E latency, in the data above.

    + +

    Results Summary

    + +

    Triton FP8 TMA GEMM TFLOPs Comparison

    + +

    Figure 10. Triton FP8 TMA GEMM TFLOPs Comparison

    M   | Triton TMA | Triton Tutorial | Triton SplitK | cuBLAS FP8 | cuBLAS FP16 | CUTLASS Ping-Pong FP8
    1   | 2.5        | 1               | 2.4           | 1.5        | 1.8         | 3.57
    2   | 5.1        | 2.5             | 4.8           | 3.1        | 3.6         | 5.9
    4   | 10.3       | 7.21            | 9.6           | 6.1        | 7.2         | 14.3
    8   | 21.0       | 16.5            | 19.2          | 12.3       | 14.4        | 28.6
    16  | 44.5       | 41.0            | 37.2          | 24.5       | 27.7        | 55.1
    32  | 89.7       | 81.2            | 72.2          | 71.6       | 56.8        | 114.4
    64  | 178.5      | 163.7           | 130.8         | 144.6      | 105.3       | 228.7
    128 | 359.7      | 225.9           | 160.1         | 244.0      | 189.2       | 377.7
    + +

    Figure 11. Triton FP8 TMA GEMM TFLOPs Comparison Table

    + +

    The above chart and table summarize the gains we have been able to achieve on a single NVIDIA H100 for FP8 GEMM by leveraging the TMA hardware unit, over non-TMA Triton kernels and high performance CUDA (cuBLAS) kernels. The key point to note is this kernel's superior scaling with batch size over the competition. The problem sizes we benchmarked are representative of the matrix shapes found in small-to-medium batch size LLM inference. Thus, TMA GEMM kernel performance in the mid-M regime (M=32 to M=128) will be critical for those interested in leveraging this kernel for FP8 LLM deployment use cases, as the FP8 compressed data type allows larger matrices to fit in GPU memory.

    + +

    To summarize our analysis, the TMA implementation in Triton and CUTLASS differ in terms of full featureset support (multicast, prefetch etc.) and how the TMA Descriptor is passed to the GPU kernel. If this descriptor is passed in a manner that more closely matches the CUTLASS kernel (pass-by-value), the extraneous H2D copies could be avoided and thus the E2E performance would be greatly improved.

    + +

    Future Work

    + +

    For future research, we plan to improve upon these results, by working with the community to incorporate the CUTLASS architecture of TMA loads into Triton as well as investigating the Cooperative Kernel for FP8 GEMM, a modified strategy to the Ping-Pong Kernel.

    + +

    In addition, once features like thread block clusters and TMA atomic operations are enabled in Triton, we may be able to get further speedups by leveraging the SplitK strategy in the TMA GEMM Kernel, as atomic operations on Hopper can be performed in Distributed Shared Memory (DSMEM) as opposed to L2 Cache. We also note the similarities of NVIDIA Hopper GPUs with other AI hardware accelerators like Google’s TPU and IBM’s AIU which are dataflow architectures. On Hopper, data can now “flow” from GMEM to a network of connected SMs due to the additions of TMA, which we discussed extensively in this blog, and DSMEM, which we plan to cover in a future post.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/how-computational-graphs-are-executed-in-pytorch/index.html b/blog/how-computational-graphs-are-executed-in-pytorch/index.html new file mode 100644 index 000000000000..121a1d69a03b --- /dev/null +++ b/blog/how-computational-graphs-are-executed-in-pytorch/index.html @@ -0,0 +1,1689 @@ + + + + + + + + + + + + + How Computational Graphs are Executed in PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Preferred Networks + +

    +

    Welcome to the last entry in our series on understanding the autograd engine of PyTorch! If you haven't read parts 1 & 2, check them out now to understand how PyTorch creates the computational graph for the backward pass!

    + +

    This post is based on PyTorch v1.11, so some highlighted parts may differ across versions.

    + +

    PyTorch autograd graph execution

    + +

    The last post showed how PyTorch constructs the graph to calculate the outputs’ derivatives w.r.t. the inputs when executing the forward pass. Now we will see how the execution of the backward pass is coordinated and done by looking at the whole process, starting from Python down to the lower C++ level internals.

    + +

    What Happens when Calling backward()/grad() from Python

    +

    Using variable.backward()

    + +

    After doing all our calculations with an input set to require the gradient, we call .backward() on the result to initiate the backward pass execution.

    + +
    >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
    +>>> y = torch.exp(x).sum()
    +>>> y.backward()
    +
    + +

    Calling .backward() on a tensor results in a call to torch.autograd.backward().

    +
    # torch/_tensor.py
    +
    +def backward(self, gradient=None, retain_graph=None, create_graph=False, inputs=None):
    +    
    +    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
    +
    +
    +

    torch.autograd.backward() checks the arguments and calls the autograd engine in the C++ layer.

    + +
    def backward(
    +    tensors: _TensorOrTensors,
    +    grad_tensors: Optional[_TensorOrTensors] = None,
    +    retain_graph: Optional[bool] = None,
    +    create_graph: bool = False,
    +    grad_variables: Optional[_TensorOrTensors] = None,
    +    inputs: Optional[_TensorOrTensors] = None,
    +) -> None:
    +    
    +
    +    if inputs is not None and len(inputs) == 0:
    +        raise RuntimeError("'inputs' argument to backward() cannot be empty.")
    +
    +    tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)
    +    inputs = (inputs,) if isinstance(inputs, torch.Tensor) else \
    +        tuple(inputs) if inputs is not None else tuple()
    +
    +    grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors))
    +    grad_tensors_ = _make_grads(tensors, grad_tensors_)
    +    if retain_graph is None:
    +        retain_graph = create_graph
    +
    +    Variable._execution_engine.run_backward(
    +        tensors, grad_tensors_, retain_graph, create_graph, inputs,
    +        allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
    +
    +
    +

    First, whether or not the grad_tensors argument was specified, there is a call to the _make_grads function. This is used to check the provided grad_tensors or to specify their default value by looking at the shapes of the tensors argument values. Check the first blog post for details on the default value for the grad_tensors of the backward pass. This function simply provides the vector for the vector-Jacobian product if it was not initially specified.
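    As a quick illustration of this default, a scalar output needs no explicit gradient, while a non-scalar output requires the caller to pass the vector for the vector-Jacobian product:

    >>> import torch
    >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
    >>> y = x.exp().sum()       # scalar output: a default gradient of 1.0 is created
    >>> y.backward()
    >>> z = x.exp()             # non-scalar output: the vector must be given explicitly
    >>> z.backward(gradient=torch.ones_like(z))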

    + +

    In the above code, Variable has an _execution_engine attribute that is defined in torch.autograd.variable to be of type ImperativeEngine; the C++ engine exported to python and declared in torch/csrc/autograd/python_engine.cpp. In the following sections, we explain in detail how this object executes the backward pass.

    + +

    Note that the torch.autograd.backward function has an inputs optional argument. This argument is used when we want to calculate the .grad field of only a subset of input tensors in the forward pass.

    + +
    >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
    +>>> y = torch.tensor([0.1, 0.90], requires_grad=True)
    +>>> z = torch.exp(x * y).sum()
    +>>> torch.autograd.backward([z], inputs=[x])
    +>>> x.grad
    +tensor([0.1051, 1.7676])
    +>>> y.grad  # None
    +>>>
    +
    +
    +

    Using torch.autograd.grad

    + +

    An alternative to backward() is to use torch.autograd.grad(). The main difference to backward() is that grad() returns a tuple of tensors with the gradients of the outputs w.r.t. the inputs kwargs instead of storing them in the .grad field of the tensors. As you can see, the grad() code shown below is very similar to backward.

    + +
    def grad(
    +    outputs: _TensorOrTensors,
    +    inputs: _TensorOrTensors,
    +    grad_outputs: Optional[_TensorOrTensors] = None,
    +    retain_graph: Optional[bool] = None,
    +    create_graph: bool = False,
    +    only_inputs: bool = True,
    +    allow_unused: bool = False,
    +   is_grads_batched: bool = False
    +) -> Tuple[torch.Tensor, ...]:
    +   
    +    outputs = (outputs,) if isinstance(outputs, torch.Tensor) else tuple(outputs)
    +    inputs = (inputs,) if isinstance(inputs, torch.Tensor) else tuple(inputs)
    +    overridable_args = outputs + inputs
    +    if has_torch_function(overridable_args):
    +        return handle_torch_function(
    +            grad,
    +            overridable_args,
    +            outputs,
    +            inputs,
    +            grad_outputs=grad_outputs,
    +            retain_graph=retain_graph,
    +            create_graph=create_graph,
    +            only_inputs=only_inputs,
    +            allow_unused=allow_unused,
    +        )
    +
    +    grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(outputs))
    +    grad_outputs_ = _make_grads(outputs, grad_outputs_)
    +
    +    if retain_graph is None:
    +        retain_graph = create_graph
    +
    +    if is_grads_batched:
    +        # …. It will not be covered here
    +    else:
    +        return Variable._execution_engine.run_backward(
    +            outputs, grad_outputs_, retain_graph, create_graph, inputs,
    +            allow_unused, accumulate_grad=False)  # Calls into the C++ engine to run the backward pass
    +
    +
    + +
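    For reference, here is a minimal usage example mirroring the earlier backward() snippet; note that grad() returns the gradients as a tuple and leaves x.grad unset:

    >>> import torch
    >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
    >>> y = torch.exp(x).sum()
    >>> torch.autograd.grad([y], inputs=[x])
    (tensor([1.6487, 2.1170]),)
    >>> x.grad  # None, since grad() does not accumulate into .grad
    >>>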

    Figure 1 shows the computational graph with the backward() and grad() arguments highlighted in red and blue, respectively:

    + +

    + +

    + +

    Figure 1: Correspondence of `backward`/`grad` arguments in the graphs.

    + +

    Going Inside the Autograd Engine

    + +

    Refreshing Concepts: Nodes and Edges

    + +

    As we saw in part 2, the computational graph comprises Node and Edge objects. Please read that post if you haven't done so yet.

    + +

    Nodes

    + +

    Node objects are defined in torch/csrc/autograd/function.h, and they provide an overload of operator() for the associated function and a list of edges to do the graph traversal. Note that Node is a base class that autograd functions inherit from and override the apply method to execute the backward function.

    +
    struct TORCH_API Node : std::enable_shared_from_this<Node> {
    + ...
    + /// Evaluates the function on the given inputs and returns the result of the
    +  /// function call.
    +  variable_list operator()(variable_list&& inputs) {
    +  ...
    +  }
    +
    +protected:
    +  /// Performs the `Node`'s actual operation.
    +  virtual variable_list apply(variable_list&& inputs) = 0;
    +  
    +  edge_list next_edges_;
    +  uint64_t topological_nr_ = 0;
    +  
    +
    +
    + +

    There is an attribute called topological_nr_ in every node object. This number is used to optimize the graph execution, as it allows discarding graph branches under certain conditions. The topological number is the longest distance between this node and any leaf node, and it is shown in Figure 2. Its main property is that for any pair of nodes x, y in a directed graph, topo_nr(x) < topo_nr(y) means that there is no path from x to y. This allows for reducing the number of paths in the graph that need to be traversed. Check the topological_nr() method comment for further details.
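    As a toy illustration of this property, the topological number can be computed as the longest distance to a leaf over an ad-hoc dict-based DAG (this is not PyTorch's actual data structure):

    from functools import lru_cache

    # Ad-hoc DAG: node -> children (edges point towards the leaves).
    graph = {"d": ["c1", "c2"], "c2": ["c1"], "c1": ["leaf"], "leaf": []}

    @lru_cache(maxsize=None)
    def topo_nr(node):
        children = graph[node]
        return 0 if not children else 1 + max(topo_nr(c) for c in children)

    assert topo_nr("leaf") == 0 and topo_nr("c1") == 1 and topo_nr("d") == 3
    # Since topo_nr("c1") < topo_nr("c2"), there can be no path from c1 to c2.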

    + +

    + +

    + +

    +Figure 2: Example of the Topological Number calculation +

    + +

    Edges

    + +

    The Edge object links Nodes together, and its implementation is straightforward.

    + +
    struct Edge {
    +  ...
    +  /// The function this `Edge` points to.
    +  std::shared_ptr<Node> function;
    +  /// The identifier of a particular input to the function.
    +  uint32_t input_nr;
    +};
    +
    +
    + +

    It only requires a function pointer to the Node and an input number, which is the index of the output from the forward function this edge points to. When preparing the set of gradients before calling “function”, we know that what is flowing from this edge should be accumulated in the “input_nr”-th argument. Note that the input/output naming is flipped here: this is the input to the backward function. Edge objects are constructed using the gradient_edge function.

    + +
     Edge gradient_edge(const Variable& self) {
    +    if (const auto& gradient = self.grad_fn()) {
    +      return Edge(gradient, self.output_nr());
    +    } else {
    +      return Edge(grad_accumulator(self), 0);
    +    }
    +  }
    +
    +
    +

    Entering the C++ Realm

    + +

    Once torch.autograd.backward() has been invoked, the THPEngine_run_backward routine starts the graph traversal. The following is a schema of the function body:

    +
    PyObject *THPEngine_run_backward(PyObject *self, PyObject *args, PyObject *kwargs)
    +{
    +  HANDLE_TH_ERRORS
    +  PyObject *tensors = nullptr;
    +  PyObject *grad_tensors = nullptr;
    +  unsigned char keep_graph = 0;
    +  unsigned char create_graph = 0;
    +  PyObject *inputs = nullptr;
    +  
    +  // Convert the python arguments to C++ objects
    +  const char *accepted_kwargs[] = { // NOLINT
    +      "tensors", "grad_tensors", "keep_graph", "create_graph", "inputs",
    +      "allow_unreachable", "accumulate_grad", nullptr
    +  };
    +  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OObb|Obb", (char**)accepted_kwargs,
    +        &tensors, &grad_tensors, &keep_graph, &create_graph, &inputs, &allow_unreachable, &accumulate_grad))
    +
    + // Prepare arguments
    + for(const auto i : c10::irange(num_tensors)) {
    +   // Check that the tensors require gradients
    +  }
    +
    +  std::vector<Edge> output_edges;
    +  if (inputs != nullptr) {
    +     // Prepare outputs
    +  }
    +
    +  {
    +      // Calls the actual autograd engine
    +    pybind11::gil_scoped_release no_gil;
    +    outputs = engine.execute(roots, grads, keep_graph, create_graph, accumulate_grad, output_edges);
    +  }
    +    // Clean up and finish
    +}
    +
    +
    + +

    First, we prepare the input arguments after converting the PyObject arguments to actual C++ objects. The tensors list contains the tensors from which we start the backward pass. These tensors are converted to edges using torch::autograd::impl::gradient_edge and added to a list called roots where the graph traversal starts.

    + +
     edge_list roots;
    +  roots.reserve(num_tensors);
    +  variable_list grads;
    +  grads.reserve(num_tensors);
    +  for(const auto i : c10::irange(num_tensors)) {
    +    PyObject *_tensor = PyTuple_GET_ITEM(tensors, i);
    +       const auto& variable = THPVariable_Unpack(_tensor);
    +       auto gradient_edge = torch::autograd::impl::gradient_edge(variable);
    +     roots.push_back(std::move(gradient_edge));
    +
    +    PyObject *grad = PyTuple_GET_ITEM(grad_tensors, i);
    +    if (THPVariable_Check(grad)) {
    +      const Variable& grad_var = THPVariable_Unpack(grad);
    +      grads.push_back(grad_var);
    +    } 
    +  }
    +
    +
    + +

    Now, if the inputs argument was specified in backward or we used the torch.autograd.grad api, the following code creates a list of edges to accumulate the gradients in the specified tensors at the end of the computation. The engine uses this later to optimize the execution as it doesn’t add the gradients in all the leaf nodes, just the specified ones.

    + +
      std::vector<Edge> output_edges;
    +  if (inputs != nullptr) {
    +    int num_inputs = PyTuple_GET_SIZE(inputs);
    +    output_edges.reserve(num_inputs);
    +    for (const auto i : c10::irange(num_inputs)) {
    +      PyObject *input = PyTuple_GET_ITEM(inputs, i);
    +      const auto& tensor = THPVariable_Unpack(input);
    +      const auto output_nr = tensor.output_nr();
    +      auto grad_fn = tensor.grad_fn();
    +      if (!grad_fn) {
    +        grad_fn = torch::autograd::impl::try_get_grad_accumulator(tensor);
    +      }
    +      if (accumulate_grad) {
    +        tensor.retain_grad();
    +      }
    +      if (!grad_fn) {
    +        output_edges.emplace_back(std::make_shared<Identity>(), 0);
    +      } else {
    +        output_edges.emplace_back(grad_fn, output_nr);
    +      }
    +    }
    +  }
    +
    +
    + +

    The next step is the actual graph traversal and node function execution, and finally, the cleanup and return.

    + +
      {
    +    // Calls the actual autograd engine
    +    pybind11::gil_scoped_release no_gil;
    +    auto& engine = python::PythonEngine::get_python_engine();
    +    outputs = engine.execute(roots, grads, keep_graph, create_graph, accumulate_grad, output_edges);
    +  }
    +  // Clean up and finish
    +}
    +
    +
    + +

    Starting the Real Execution

    + +

    engine.execute is defined in torch/csrc/autograd/engine.cpp.

    + +

    There are two differentiated steps here:

    + +

    1. Analyze the graph to find the dependencies between functions.
    2. Create worker threads that traverse the graph.

    + +

    Data Structures Used for the Execution

    + +

    GraphTask

    + +

    All the execution metadata is managed by the GraphTask class in torch/csrc/autograd/engine.h

    + +
    struct GraphTask: std::enable_shared_from_this<GraphTask> {
    +  std::atomic<uint64_t> outstanding_tasks_{0};
    +  //  … 
    +  std::unordered_map<Node*, InputBuffer> not_ready_;
    +  std::unordered_map<Node*, int> dependencies_;
    +
    +  struct ExecInfo {
    +     // …
    +  };
    +  std::unordered_map<Node*, ExecInfo> exec_info_;
    +  std::vector<Variable> captured_vars_;
    +  // …
    +  std::shared_ptr<ReadyQueue> cpu_ready_queue_;
    +};
    +
    +
    + +

    Here we see a series of variables dedicated to maintaining the execution state. outstanding_tasks_ tracks the number of tasks left to be executed for the backward pass to complete. not_ready_ holds the input arguments for the Nodes that are not ready to be executed. dependencies_ tracks the number of predecessors that a Node has. As the count reaches 0, the Node is ready for execution; it is placed in a ready queue to be retrieved and executed later.

    + +

    exec_info_ and the associated ExecInfo struct are used only when the inputs argument is specified or when it is a call to autograd.grad(). They allow filtering out paths of the graph that are not needed, since gradients are calculated only for the variables in the inputs list.

    + +

    captured_vars_ is where the results of the graph execution are temporarily stored if we used the torch.autograd.grad() api instead of torch.autograd.backward() since grad() returns the gradients as tensors instead of just filling the .grad field of the inputs.

    + +

    NodeTask

    + +

    The NodeTask struct is a basic class that holds an fn_ pointer to the node to execute, and an inputs_ buffer to store the input arguments to this function. Note that the functions executed by the backward pass are the derivatives specified in the derivatives.yaml file, or the user-provided backward function when using custom functions, as described in the second blog post.

    + +

    The inputs_ buffer is also where the output gradients of the previously executed functions are aggregated, and it is defined as a std::vector<Variable> container with facilities to accumulate values at a given position.

    + +
    struct NodeTask {
    +  std::weak_ptr<GraphTask> base_;
    +  std::shared_ptr<Node> fn_;
    +  // This buffer serves as an implicit "addition" node for all of the
    +  // gradients flowing here.  Once all the dependencies are finished, we
    +  // use the contents of this buffer to run the function.
    +  InputBuffer inputs_;
    +};
    +
    +
    +

    GraphRoot

    + +

    The GraphRoot is a special function used to hold multiple input variables in a single place. The code is pretty simple as it only acts as a container of variables.

    + +
    struct TORCH_API GraphRoot : public Node {
    +  GraphRoot(edge_list functions, variable_list inputs)
    +      : Node(std::move(functions)),
    +      outputs(std::move(inputs)) {
    +    for (const auto& t : outputs) {
    +      add_input_metadata(t);
    +    }
    +  }
    +
    +  variable_list apply(variable_list&& inputs) override {
    +    return outputs;
    +  }
    +
    +
    + +

    AccumulateGrad

    + +

This function is set during graph creation in gradient_edge when the Variable object doesn’t have a grad_fn, that is, when it is a leaf node.

    + +
        if (const auto& gradient = self.grad_fn()) {
    +      // …
    +    } else {
    +      return Edge(grad_accumulator(self), 0);
    +    }
    +
    +
    + +

    The function body is defined in torch/csrc/autograd/functions/accumulate_grad.cpp and it essentially accumulates the input grads in the object’s .grad attribute.

    + +
    auto AccumulateGrad::apply(variable_list&& grads) -> variable_list {
    +  check_input_variables("AccumulateGrad", grads, 1, 0);
    +  
    +
    +  at::Tensor new_grad = callHooks(variable, std::move(grads[0]));
    +  std::lock_guard<std::mutex> lock(mutex_);
    +
    +  at::Tensor& grad = variable.mutable_grad();
    +  accumulateGrad(
    +      variable,
    +      grad,
    +      new_grad,
    +      1 + !post_hooks().empty() /* num_expected_refs */,
    +      [&grad](at::Tensor&& grad_update) { grad = std::move(grad_update); });
    +  return variable_list();
    +}
    +}} // namespace torch::autograd
    +
    +
    +
    +
    + +

accumulateGrad does several checks on the tensor formats and eventually performs the variable_grad += new_grad; accumulation.
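Seen from Python, AccumulateGrad is what makes repeated backward passes add up in a leaf tensor's .grad field; a small illustrative example (our own, not from the original post):

import torch

w = torch.ones(3, requires_grad=True)   # leaf tensor: w.grad_fn is None
assert w.grad_fn is None

loss = (w * 2).sum()
loss.backward()
print(w.grad)    # tensor([2., 2., 2.])

# A second backward pass accumulates into the existing .grad
# (the variable_grad += new_grad step performed by AccumulateGrad).
loss = (w * 2).sum()
loss.backward()
print(w.grad)    # tensor([4., 4., 4.])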

    + +

    Preparing the graph for execution

    + +

Now, let’s walk through Engine::execute. The first thing to do, besides argument consistency checks, is to create the actual GraphTask object we described above. This object keeps all the metadata of the graph execution.

    + +
    auto Engine::execute(const edge_list& roots,
    +                     const variable_list& inputs,
    +                     bool keep_graph,
    +                     bool create_graph,
    +                     bool accumulate_grad,
    +                     const edge_list& outputs) -> variable_list {
    +
    +  validate_outputs(roots, const_cast<variable_list&>(inputs), [](const std::string& msg) {
    +    return msg;
    +  });
    +
    +  // Checks
    +
    +  auto graph_task = std::make_shared<GraphTask>(
    +      /* keep_graph */ keep_graph,
    +      /* create_graph */ create_graph,
    +      /* depth */ not_reentrant_backward_call ? 0 : total_depth + 1,
    +      /* cpu_ready_queue */ local_ready_queue);
    +
    +  // If we receive a single root, skip creating extra root node
    +  // …
    +  // Prepare graph by computing dependencies
    +  // …
    +  // Queue the root 
    +  // …
    +  // launch execution
    +  // …
    +}
    +
    +
    + +

After creating the GraphTask, we use the root node’s associated function directly if we only have one root node. If we have multiple root nodes, we create a special GraphRoot object, as described before.

    + +
      bool skip_dummy_node = roots.size() == 1;
    +  auto graph_root = skip_dummy_node ?
    +    roots.at(0).function :
    +    std::make_shared<GraphRoot>(roots, inputs);
    +
    +
    + +

The next step is to fill the dependencies_ map in the GraphTask object, since the engine must know when it can execute a task. The outputs here is the inputs argument passed to the torch.autograd.backward() call in Python. But here, we have reversed the names, since the gradients w.r.t. the inputs of the forward pass are now the outputs of the backward pass. And from now on, there is no concept of forward/backward, only graph traversal and execution.

    + +
      auto min_topo_nr = compute_min_topological_nr(outputs);
    +  // Now compute the dependencies for all executable functions
    +  compute_dependencies(graph_root.get(), *graph_task, min_topo_nr);
    +
    +  if (!outputs.empty()) {
    +    graph_task->init_to_execute(*graph_root, outputs, accumulate_grad, min_topo_nr);
    +  }
    +
    +
    + +

Here we preprocess the graph for the execution of the nodes. First, compute_min_topological_nr is called to obtain the minimum topological number of the tensors specified in outputs (0 if no inputs kwarg was supplied to .backward or input for .grad). This computation prunes paths in the graph that lead to input variables for which we don’t want or need to calculate the grads.

    + +

Second is the compute_dependencies call. This function is a very simple graph traversal that starts with the root Node, and for each of the edges in node.next_edges() it increments the counter in dependencies_. Figure 3 shows the result of the dependencies calculation for the example graph. Note that the number of dependencies of any node is just the number of edges arriving at it.

    + +

    + +

    + +

    +Figure 3: Number of dependencies for each node +
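Conceptually, the dependency computation is just counting incoming edges while traversing from the root. A rough Python sketch of the idea (not the actual C++ implementation; the graph and node names are made up):

from collections import deque

def compute_dependencies(root, next_edges):
    """Count, for every reachable node, how many edges point to it.

    next_edges maps a node to the list of nodes it feeds into during the
    backward traversal (the role of Node::next_edges() in the C++ engine).
    """
    dependencies = {}
    seen = {root}
    queue = deque([root])
    while queue:
        node = queue.popleft()
        for nxt in next_edges.get(node, []):
            dependencies[nxt] = dependencies.get(nxt, 0) + 1
            if nxt not in seen:
                seen.add(nxt)
                queue.append(nxt)
    return dependencies

# Toy graph: the root feeds MulBackward, which feeds PowBackward and a leaf accumulator.
edges = {
    "GraphRoot": ["MulBackward"],
    "MulBackward": ["PowBackward", "AccumulateGrad_b"],
    "PowBackward": ["AccumulateGrad_a"],
}
print(compute_dependencies("GraphRoot", edges))
# {'MulBackward': 1, 'PowBackward': 1, 'AccumulateGrad_b': 1, 'AccumulateGrad_a': 1}

A node becomes ready exactly when its counter in this map drops to zero during execution.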

    + +

Finally, the init_to_execute call is the one that populates the GraphTask::exec_info_ map in case inputs were specified in the Python backward call. It traverses the graph again, starting from the root, and records in the exec_info_ map the intermediate nodes needed to calculate only the gradients of the given inputs.

    + +
      // Queue the root
    +  if (skip_dummy_node) {
    +    InputBuffer input_buffer(roots.at(0).function->num_inputs());
    +    auto input = inputs.at(0);
    +
    +
    +    input_buffer.add(roots.at(0).input_nr,
    +                      std::move(input),
    +                      input_stream,
    +                      opt_next_stream);
    +
    +    execute_with_graph_task(graph_task, graph_root, std::move(input_buffer));
    +  } else {
    +    execute_with_graph_task(graph_task, graph_root, InputBuffer(variable_list()));
    +  }
    +  // Avoid a refcount bump for the Future, since we check for refcount in
    +  // DistEngine (see TORCH_INTERNAL_ASSERT(futureGrads.use_count() == 1)
    +  // in dist_engine.cpp).
    +  auto& fut = graph_task->future_result_;
    +  fut->wait();
    +  return fut->value().toTensorVector();
    +}
    +
    +
    + +

And now, we are ready to start the actual execution by creating the InputBuffer. In case we only have one root variable, we begin by copying the value of the inputs tensor (this is the gradients passed to the Python backward call) into position 0 of the input_buffer. This is a small optimization that avoids running the RootNode for no reason. Also, if the rest of the graph is not on the CPU, we directly start on that worker, while the RootNode is always placed on the CPU ready queue. Details of the workers and ready queues are explained in the section below.

    + +

    On the other hand, if we have multiple roots, the GraphRoot object also holds the inputs, so it is enough to pass it an empty InputBuffer.

    + +

    Graph Traversal and Node Execution

    +

    Devices, Threads and Queues

    + +

    Before diving into the actual execution, we need to see how the engine is structured.

    + +

First of all, the engine is multithreaded, with one thread per device. For example, the caller thread is associated with the CPU, while additional threads are created and associated with each GPU or other devices available in the system. Each thread tracks its device using thread-local storage in the worker_device variable. In addition, the threads have a queue of tasks to be executed, also located in thread-local storage: the local_ready_queue. This is where work is queued for this thread to execute in the thread_main function that is explained later.

You may wonder how the device where a task should be executed is decided. The InputBuffer class has a device() function that returns the first non-CPU device of all its tensors. This function is used together with Engine::ready_queue to select the queue where a task should be placed.

    + +
    auto Engine::ready_queue(std::shared_ptr<ReadyQueue> cpu_ready_queue, at::Device device) -> std::shared_ptr<ReadyQueue>{
    +  if (device.type() == at::kCPU || device.type() == at::DeviceType::Meta) {
    +    return cpu_ready_queue;
    +  } else {
    +    // See Note [Allocating GPUs to autograd threads]
    +    return device_ready_queues_.at(device.index());
    +  }
    +}
    +
    +
    + +

The ReadyQueue object is defined in torch/csrc/autograd/engine.h and it is a simple wrapper over std::priority_queue that allows a thread to wait for a task if the queue is empty. One interesting property of the ReadyQueue is that it increases the GraphTask::outstanding_tasks_ value, which is used to determine whether the execution has completed or not.

    + +
    auto ReadyQueue::push(NodeTask item, bool incrementOutstandingTasks) -> void {
    +  {
    +    std::lock_guard<std::mutex> lock(mutex_);
    +    if (incrementOutstandingTasks) {
    +      std::shared_ptr<GraphTask> graph_task = item.base_.lock();
    +      ++graph_task->outstanding_tasks_;
    +    }
    +    heap_.push(std::move(item));
    +  }
    +  not_empty_.notify_one();
    +}
    +
    +auto ReadyQueue::pop() -> NodeTask {
    +  std::unique_lock<std::mutex> lock(mutex_);
    +  not_empty_.wait(lock, [this]{ return !heap_.empty(); });
    +  auto task = std::move(const_cast<NodeTask&>(heap_.top())); heap_.pop();
    +  return task;
    +}
    +
    +
    + +

    Reentrant Backward

    + +

A reentrant backward happens when one of the tasks in a backward pass calls backward again. It is not a very common case, but it can be used to reduce memory utilization, as it can potentially avoid saving intermediate results. For more information, check this PyTorch forum post.

    + +
    class ReentrantBackward(torch.autograd.Function):
    +    @staticmethod
    +    def forward(ctx, input):
    +        return input.sum()
    +
    +    @staticmethod
    +    def backward(ctx, input):
    +        # Let's compute the backward by using autograd
    +        input = input.detach().requires_grad_()
    +        with torch.enable_grad():
    +            out = input.sum()
    +        out.backward()  # REENTRANT CALL!!
    +        return out.detach()
    +
    +
    + +

Here, we call backward() inside backward() for a user-defined custom autograd function. This situation can lead to deadlocks, because the first backward needs to wait for the second one to complete. But some internal implementation details can prevent the second backward from completing, as explained in the dedicated subsection.

    +

    Thread Initialization

    + +

    execute_with_graph_task is in charge of initializing the threads taking care of the computation and placing the root node in the queue of the device that produced it.

    + +
    c10::intrusive_ptr<at::ivalue::Future> Engine::execute_with_graph_task(
    +    const std::shared_ptr<GraphTask>& graph_task,
    +    std::shared_ptr<Node> graph_root,
    +    InputBuffer&& input_buffer) {
    +
    +  initialize_device_threads_pool();
    +  // Lock mutex for GraphTask.
    +  std::unique_lock<std::mutex> lock(graph_task->mutex_);
    +
    +  auto queue = ready_queue(graph_task->cpu_ready_queue_, input_buffer.device());
    +
    +  if (worker_device == NO_DEVICE) {
    +    set_device(CPU_DEVICE);
    +    graph_task->owner_ = worker_device;
    +    queue->push(NodeTask(graph_task, std::move(graph_root), std::move(input_buffer)));
    +    lock.unlock();
    +    thread_main(graph_task);
    +    worker_device = NO_DEVICE;
    +  } else {
    +     // This deals with reentrant backwards, we will see it later.
    +  }
    +  return graph_task->future_result_;
    +}
    +
    +
    + +

First, this function initializes several threads (one per device) by calling initialize_device_threads_pool(), where several things happen:

• One ReadyQueue per device is created.
• One thread per non-CPU device is created.
• A thread-local worker_device variable is set to track the current device associated with the thread.
• The thread_main function is called, and threads wait for tasks to be put in their queues.

    + +

Then it retrieves the queue where the root node should be placed, based on the device that holds the tensors present in the input_buffer, using the ready_queue function. Now, the main thread (the one also executing the Python interpreter) has its worker_device set to NO_DEVICE, and it is in charge of executing functions whose tensors all live on the CPU. If worker_device is set to any other value, the graph execution has already started, and .backward() was called inside a running Node, creating a reentrant backward call. This is explained later. For now, the main thread places the task in the queue and calls thread_main.

    +

    Where the Magic Happens

    + +

It’s been a long way, but finally, we are ready to traverse the graph and execute the nodes. Each of the spawned threads, as well as the main thread, calls thread_main.

    + +
    auto Engine::thread_main(const std::shared_ptr<GraphTask>& graph_task) -> void {
    +
    +  while (graph_task == nullptr || !graph_task->future_result_->completed()) {
    +    std::shared_ptr<GraphTask> local_graph_task;
    +    {
    +      NodeTask task = local_ready_queue->pop();
    +
    +      if (task.isShutdownTask_) {
    +        break;
    +      }
    +
    +      if (!(local_graph_task = task.base_.lock())) {
    +        // GraphTask for function is no longer valid, skipping further
    +        // execution.
    +        continue;
    +      }
    +
    +      if (task.fn_ && !local_graph_task->has_error_.load()) {
    +        at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_);
    +
    +        try {
    +          GraphTaskGuard guard(local_graph_task);
    +          NodeGuard ndguard(task.fn_);
    +          {
    +            evaluate_function(
    +                local_graph_task,
    +                task.fn_.get(),
    +                task.inputs_,
    +                local_graph_task->cpu_ready_queue_);
    +          }
    +        } catch (std::exception& e) {
    +          thread_on_exception(local_graph_task, task.fn_, e);
    +        }
    +      }
    +    }
    +
    +    // Decrement the outstanding tasks.
    +    --local_graph_task->outstanding_tasks_;
    +
    +    // Check if we've completed execution.
    +    if (local_graph_task->completed()) {
    +      local_graph_task->mark_as_completed_and_run_post_processing();
    +      auto base_owner = local_graph_task->owner_;
    +      if (worker_device != base_owner) {
    +        std::atomic_thread_fence(std::memory_order_release);
    +        ready_queue_by_index(local_graph_task->cpu_ready_queue_, base_owner)
    +            ->push(NodeTask(local_graph_task, nullptr, InputBuffer(0)));
    +      }
    +    }
    +  }
    +}
    +
    +
    + +

The code here is simple, given the local_ready_queue assigned to each thread in thread-local storage. The threads loop until there are no tasks left to execute in the graph. Note that for device-associated threads, the passed graph_task argument is nullptr, and they block in local_ready_queue->pop() until a task is pushed into their queue. After some consistency checks (is the task a shutdown task, is the graph still valid), we get to the actual function invocation in evaluate_function.

    + +
            try {
    +          GraphTaskGuard guard(local_graph_task);
    +          NodeGuard ndguard(task.fn_);
    +          {
    +            evaluate_function(
    +                local_graph_task,
    +                task.fn_.get(),
    +                task.inputs_,
    +                local_graph_task->cpu_ready_queue_);
    +          }
    +        } catch (std::exception& e) {
    +          thread_on_exception(local_graph_task, task.fn_, e);
    +        }
    +      }
    +
    +
    + +

After calling evaluate_function, we check if the graph_task execution is complete by looking at the outstanding_tasks_ number. This number increases when a task is pushed to a queue and decreases after a task is executed, just before local_graph_task->completed() is checked. When the execution is done, we return the results that are in captured_vars_ in case we called torch.autograd.grad() instead of torch.autograd.backward(), as this function returns tensors instead of storing them in the .grad attribute of the inputs. Finally, we wake up the main thread, if it is waiting, by sending a dummy task.

    + +
       // Decrement the outstanding tasks.
    +    --local_graph_task->outstanding_tasks_;
    +
    +    // Check if we've completed execution.
    +    if (local_graph_task->completed()) {
    +      local_graph_task->mark_as_completed_and_run_post_processing();
    +      auto base_owner = local_graph_task->owner_;
    +      if (worker_device != base_owner) {
    +        std::atomic_thread_fence(std::memory_order_release);
    +        ready_queue_by_index(local_graph_task->cpu_ready_queue_, base_owner)
    +            ->push(NodeTask(local_graph_task, nullptr, InputBuffer(0)));
    +      }
    +    }
    +
    +
    + +

    Calling the Function and Unlocking New Tasks

    + +

    evaluate_function serves three purposes:

    + +

1. Run the function.
2. Accumulate its results in the next nodes’ InputBuffers.
3. Decrease the dependencies counter of the next nodes and enqueue the tasks reaching 0 to be executed.

    + +
    void Engine::evaluate_function(
    +    std::shared_ptr<GraphTask>& graph_task,
    +    Node* func,
    +    InputBuffer& inputs,
    +    const std::shared_ptr<ReadyQueue>& cpu_ready_queue) {
    +
    +  // If exec_info_ is not empty, we have to instrument the execution
    +  auto& exec_info_ = graph_task->exec_info_;
    +  if (!exec_info_.empty()) {
    +    // Checks if the function needs to be executed 
    +    if (!fn_info.needed_) {
    +      // Skip execution if we don't need to execute the function.
    +      return;
    +    }
    +  }
    +
    +  auto outputs = call_function(graph_task, func, inputs);
    +
    +  auto& fn = *func;
    +  if (!graph_task->keep_graph_) {
    +    fn.release_variables();
    +  }
    +
    +
    + +

    Initially, we check the exec_info_ map of the GraphTask structure to determine if the current node needs to be executed. Remember that if this map is empty, all the nodes are executed because we are calculating the grads for all the inputs of the forward pass.

    + +

    After this check, the function is executed by running call_function. Its implementation is very straightforward and calls the actual derivative function and registered hooks if any.

    + +
      int num_outputs = outputs.size();
    +  if (num_outputs == 0) {
    +    // Records leaf stream (if applicable)
    +    return;
    +  }
    +
    +  if (AnomalyMode::is_enabled()) {
    +    // check for nan values in result
    +  }
    +
    +
    + +

Next, we check the outputs of the function after call_function is done. If the number of outputs is 0, there are no following nodes to be executed, so we can safely return. This is the case for the AccumulateGrad nodes associated with the leaf nodes.

    + +

    Also, the check for NaN values in the gradients is done here if requested.
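From the Python side, this NaN check is enabled through anomaly detection; a small usage sketch (our own example):

import torch

with torch.autograd.detect_anomaly():
    x = torch.tensor([0.0], requires_grad=True)
    y = torch.sqrt(x)        # d(sqrt)/dx is inf at 0
    z = (y * 0).sum()
    z.backward()             # the 0 * inf gradient produces NaN, which should make anomaly detection raise an error here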

    +
    
    +  std::lock_guard<std::mutex> lock(graph_task->mutex_);
    +  for (const auto i : c10::irange(num_outputs)) {
    +    auto& output = outputs[i];
    +    const auto& next = fn.next_edge(i);
    +
    +    if (!next.is_valid()) continue;
    +
    +   
    +
    +
    + +

We have now executed a grad_fn that has returned one gradient for each of the inputs of the associated forward-pass function. As we saw in the previous blog post, we have an Edge object for each of these input tensors, pointing to the grad_fn of the function that produced them in the forward pass. Essentially, output[0] of a node in the backward pass corresponds to the first argument of the associated forward-pass function. Figure 4 shows how the outputs of a backward function are related to the inputs of the forward function. See that the outputs of grad_fn C are the gradients of z w.r.t. the inputs of Function C.

    + +

    + +

    + +

    +Figure 4: Correspondence between forward and backward functions inputs and outputs +
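A minimal example of this correspondence (our own illustrative code): a custom function with two forward inputs must return two gradients from backward, in the same order as the forward arguments:

import torch

class MulAdd(torch.autograd.Function):
    @staticmethod
    def forward(ctx, a, b):           # two forward inputs...
        ctx.save_for_backward(a, b)
        return a * b + b

    @staticmethod
    def backward(ctx, grad_out):      # ...so backward returns two gradients:
        a, b = ctx.saved_tensors      # output[0] flows to a, output[1] flows to b
        return grad_out * b, grad_out * (a + 1)

a = torch.randn(3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
MulAdd.apply(a, b).sum().backward()
print(a.grad, b.grad)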

    + +

    We now iterate through these edges and check if the associated functions are ready to be executed.

    + +
     // Check if the next function is ready to be computed
    +    bool is_ready = false;
    +    auto& dependencies = graph_task->dependencies_;
    +    auto it = dependencies.find(next.function.get());
    +
    +    if (it == dependencies.end()) {
    +      auto name = next.function->name();
    +      throw std::runtime_error(std::string("dependency not found for ") + name);
    +    } else if (--it->second == 0) {
    +      dependencies.erase(it);
    +      is_ready = true;
    +    }
    +
    +    auto& not_ready = graph_task->not_ready_;
    +    auto not_ready_it = not_ready.find(next.function.get());
    +
    +
    + +

For this, we check the graph_task->dependencies_ map. We decrement the counter, and if it reaches 0, we mark the function pointed to by the edge as ready to be executed. Next, we prepare the input buffers of the tasks indicated by the next edges.

    + +
        if (not_ready_it == not_ready.end()) {
    +      if (!exec_info_.empty()) {
    +        // Skip functions that aren't supposed to be executed
    +      }
    +
    +      // Creates an InputBuffer and moves the output to the corresponding input position
    +      InputBuffer input_buffer(next.function->num_inputs());
    +      input_buffer.add(next.input_nr,
    +                       std::move(output),
    +                       opt_parent_stream,
    +                       opt_next_stream);
    +
    +      if (is_ready) {
    +        auto queue = ready_queue(cpu_ready_queue, input_buffer.device());
    +        queue->push(
    +            NodeTask(graph_task, next.function, std::move(input_buffer)));
    +      } else {
    +        not_ready.emplace(next.function.get(), std::move(input_buffer));
    +      }
    +
    +
    + +

Here, we look for the task in the graph_task->not_ready_ map. If it is not present, we create a new InputBuffer object and set the current output in the input_nr position of the buffer associated with the edge. If the task is ready to be executed, we enqueue it in the appropriate device ready_queue and complete the execution. However, if the task is not ready and we have seen it before, it is already present in the not_ready_ map.

    + +
        } else {
    +      // The function already has a buffer
    +      auto &input_buffer = not_ready_it->second;
    +      // Accumulates into buffer
    +      input_buffer.add(next.input_nr,
    +                       std::move(output),
    +                       opt_parent_stream,
    +                       opt_next_stream);
    +      if (is_ready) {
    +        auto queue = ready_queue(cpu_ready_queue, input_buffer.device());
    +        queue->push(NodeTask(graph_task, next.function, std::move(input_buffer)));
    +        not_ready.erase(not_ready_it);
    +      }
    +    }
    +  }
    +}
    +
    +
    + +

In this case, we accumulate the output in the existing input_buffer instead of creating a new one. Once all the tasks are processed, the worker thread exits the loop and completes.

All of this process is summarized in the animation in Figure 5. We see how a thread picks tasks from the ready queue and decrements the next nodes’ dependencies, unlocking them for execution.

    + +

    + +

    + +

    +Figure 5: Animation of the execution of the computational graph +

    + +

    Flow with Reentrant Backward

    + +

As we saw above, the reentrant backward problem appears when the currently executed function does a nested call to backward. When this happens, the thread running this function goes all the way down to execute_with_graph_task as in the non-reentrant case, but this is where things differ.

    + +
    c10::intrusive_ptr<at::ivalue::Future> Engine::execute_with_graph_task(
    +    const std::shared_ptr<GraphTask>& graph_task,
    +    std::shared_ptr<Node> graph_root,
    +    InputBuffer&& input_buffer) {
    +
    +  initialize_device_threads_pool();
    +  // Lock mutex for GraphTask.
    +  std::unique_lock<std::mutex> lock(graph_task->mutex_);
    +
    +  auto queue = ready_queue(graph_task->cpu_ready_queue_, input_buffer.device());
    +
    +  if (worker_device == NO_DEVICE) {
    +    //Regular case
    +  } else {
    +    // If worker_device is any devices (i.e. CPU, CUDA): this is a re-entrant
    +    //    backward call from that device.
    +    graph_task->owner_ = worker_device;
    +
    +    // Now that all the non-thread safe fields of the graph_task have been populated,
    +    // we can enqueue it.
    +    queue->push(NodeTask(graph_task, std::move(graph_root), std::move(input_buffer)));
    +
    +    if (current_depth >= max_recursion_depth_) {
    +      // If reached the max depth, switch to a different thread
    +      add_thread_pool_task(graph_task);
    +    } else {
    +      ++total_depth;
    +      ++current_depth;
    +      lock.unlock();
    +      thread_main(graph_task);
    +      --current_depth;
    +      --total_depth;
    +    }
    +  }
    +  return graph_task->future_result_;
    +}
    +
    +
    + +

Here, execute_with_graph_task detects this as a reentrant call and checks the current number of nested calls. If it exceeds the limit, we create a new thread to take care of the execution of this graph; if not, we execute this reentrant call regularly.

The limit of nested calls was originally set to avoid stack overflows caused by reentrant calls creating very large call stacks. However, the number was further reduced when sanitizer tests were added, because of the maximum number of locks a thread can hold at a given moment. This can be seen in torch/csrc/autograd/engine.h.

    + +

    When this maximum depth is exceeded, a new thread is created with the add_thread_pool_task function.

    + +
    void Engine::add_thread_pool_task(const std::weak_ptr<GraphTask>& graph_task) {
    +  std::unique_lock<std::mutex> lck(thread_pool_shared_->mutex_);
    +  // if we have pending graph_task objects to be processed, create a worker.
    +   bool create_thread = (thread_pool_shared_->num_workers_ <= thread_pool_shared_->graphtasks_queue_.size());
    +  thread_pool_shared_->graphtasks_queue_.push(graph_task);
    +
    +
    +  lck.unlock();
    +  if (create_thread) {
    +    std::thread t(&Engine::reentrant_thread_init, this);
    +    t.detach();
    +  }
    +
    +  thread_pool_shared_->work_.notify_one();
    +}
    +
    +
    +
    +
    + +

Before going in-depth, let’s look at the thread_pool_shared_ object in the Engine, which manages all the information related to the threads associated with reentrant backward calls.

    + +
      struct ThreadPoolShared {
    +    unsigned int num_workers_;
    +    std::condition_variable work_;
    +    std::mutex mutex_;
    +    std::queue<std::weak_ptr<GraphTask>> graphtasks_queue_;
    +
    +    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    +    ThreadPoolShared() : num_workers_(0) {}
    + };
    +
    +
    +
    +
    + +

    ThreadPoolShared is a simple container holding a queue of GraphTask objects with synchronization mechanisms and the number of current workers.

    + +

    Now it is easy to understand how add_thread_pool_task creates a thread when there are graph_task objects enqueued and insufficient workers to process them.

    + +

    add_thread_pool_task initializes a thread by executing reentrant_thread_init

    + +
    void Engine::reentrant_thread_init() {
    +  at::init_num_threads();
    +  auto tp_shared = thread_pool_shared_;
    +  while(true) {
    +    std::unique_lock<std::mutex> lk(tp_shared->mutex_);
    +    ++thread_pool_shared_->num_workers_;
    +    tp_shared->work_.wait(lk, [&tp_shared]{ return !tp_shared->graphtasks_queue_.empty();});
    +    --thread_pool_shared_->num_workers_;
    +    auto task = tp_shared->graphtasks_queue_.front();
    +    tp_shared->graphtasks_queue_.pop();
    +    lk.unlock();
    +    std::shared_ptr<GraphTask> graph_task;
    +    if (!(graph_task = task.lock())) {
    +      continue;
    +    }
    +    set_device(graph_task->owner_);
    +    // set the local_ready_queue to the ready queue on the graph_task->owner_ device
    +    local_ready_queue = ready_queue_by_index(graph_task->cpu_ready_queue_, graph_task->owner_);
    +    total_depth = graph_task->reentrant_depth_;
    +    thread_main(graph_task);
    +  }
    +}
    +
    +
    +
    +
    + +

The code is straightforward. The newly created thread waits on thread_pool_shared_->graphtasks_queue_ for reentrant backward graphs to become available and executes them. Notice that this thread uses the ready queue associated with the device of the thread that started this call, by accessing the graph_task->owner_ field set in the execute_with_graph_task function.

    + +

    Error Handling

    + +

Whenever an error happens in one of the worker threads, it is propagated to the thread that called backward.

    + +

To achieve this, there is a try/catch block in thread_main that catches any exception thrown in the Node function call and sets it on the associated GraphTask object.

    + +
           try {
    +          
    +          GraphTaskGuard guard(local_graph_task);
    +          NodeGuard ndguard(task.fn_);
    +          {
    +            evaluate_function(
    +               
    +          }
    +        } catch (std::exception& e) {
    +          thread_on_exception(local_graph_task, task.fn_, e);
    +        }
    +      }
    +    }
    +
    +
    + +

    thread_on_exception and the functions it calls end up setting the exception in the local_graph_task object.

    + +
    void Engine::thread_on_exception(
    +    std::shared_ptr<GraphTask> graph_task,
    +    const std::shared_ptr<Node>& fn,
    +    std::exception& e) {
    +  graph_task->set_exception(std::current_exception(), fn);
    +}
    +
    +void GraphTask::set_exception_without_signal(const std::shared_ptr<Node>& fn) {
    +  if (!has_error_.exchange(true)) {
    +    if (AnomalyMode::is_enabled() && fn) {
    +      fn->metadata()->print_stack(fn->name());
    +    }
    +  }
    +}
    +
    +void GraphTask::set_exception(
    +    std::exception_ptr eptr,
    +    const std::shared_ptr<Node>& fn) {
    +  set_exception_without_signal(fn);
    +  if (!future_completed_.exchange(true)) {
    +    // NOLINTNEXTLINE(performance-move-const-arg)
    +    future_result_->setError(std::move(eptr));
    +  }
    +}
    +
    +
    + +

In set_exception, it sets the has_error_ flag to true and calls the setError function of the future_result_ object. This causes the error to be re-thrown in the caller thread when future_result_->value() is accessed.

    + +
     IValue value() {
    +    std::unique_lock<std::mutex> lock(mutex_);
    +    AT_ASSERT(completed());
    +    if (eptr_) {
    +      std::rethrow_exception(eptr_);
    +    }
    +    return value_;
    +  }
    +
    +
    + +
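Seen from Python, the net effect is that an exception raised inside a node executed by the engine surfaces at the backward() call site. A small illustrative sketch with a custom function (our own example):

import torch

class Broken(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return x * 2

    @staticmethod
    def backward(ctx, grad_out):
        raise RuntimeError("boom")    # raised while the engine runs this node

x = torch.randn(3, requires_grad=True)
try:
    Broken.apply(x).sum().backward()
except Exception as e:
    print("caught in the calling thread:", e)   # re-thrown when the future's value is read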

    Closing Remarks

    + +

This has been the last post of this series covering how PyTorch does automatic differentiation. We hope you enjoyed reading it, and that you are now familiar enough with PyTorch internals to start contributing to PyTorch development!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch/index.html b/blog/how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch/index.html new file mode 100644 index 000000000000..da79b6e740f8 --- /dev/null +++ b/blog/how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch/index.html @@ -0,0 +1,775 @@ + + + + + + + + + + + + + How Disney Improved Activity Recognition Through Multimodal Approaches with PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Monica Alfaro, Albert Aparicio, Francesc Guitart, Marc Junyent, Pablo Pernias, Marcel Porta, and Miquel Àngel Farré (former Senior Technology Manager) + +

    +

    Introduction

    + +

Among the many things Disney Media & Entertainment Distribution (DMED) is responsible for is the management and distribution of a huge array of media assets, including news, sports, entertainment and features, episodic programs, marketing and advertising, and more.

    + +

    + +

    + +

    Our team focuses on media annotation as part of DMED Technology’s content platforms group. In our day-to-day work, we automatically analyze a variety of content that constantly challenges the efficiency of our machine learning workflow and the accuracy of our models.

    + +

    Several of our colleagues recently discussed the workflow efficiencies that we achieved by switching to an end-to-end video analysis pipeline using PyTorch, as well as how we approach animated character recognition. We invite you to read more about both in this previous post.

    + +

    While the conversion to an end-to-end PyTorch pipeline is a solution that any company might benefit from, animated character recognition was a uniquely-Disney concept and solution.

    + +

    In this article we will focus on activity recognition, which is a general challenge across industries — but with some specific opportunities when leveraged in the media production field, because we can combine audio, video, and subtitles to provide a solution.

    + +

    Experimenting with Multimodality

    + +

    Working on a multimodal problem adds more complexity to the usual training pipelines. Having multiple information modes for each example means that the multimodal pipeline has to have specific implementations to process each mode in the dataset. Usually after this processing step, the pipeline has to merge or fuse the outputs.

    + +

    Our initial experiments in multimodality were completed using the MMF framework. MMF is a modular framework for vision and language multimodal research. MMF contains reference implementations of state-of-the-art vision and language models and has also powered multiple research projects at Meta AI Research (as seen in this poster presented in PyTorch Ecosystem Day 2020). Along with the recent release of TorchMultimodal, a PyTorch library for training state-of-the-art multimodal models at scale, MMF highlights the growing interest in Multimodal understanding.

    + +

    MMF tackles this complexity with modular management of all the elements of the pipeline through a wide set of different implementations for specific modules, ranging from the processing of the modalities to the fusion of the processed information.

    + +

    In our scenario, MMF was a great entry point to experiment with multimodality. It allowed us to iterate quickly by combining audio, video and closed captioning and experiment at different levels of scale with certain multimodal models, shifting from a single GPU to TPU Pods.

    + +

    Multimodal Transformers

    + +

With a workbench based on MMF, our initial model relied on a concatenation of features from each modality, and it evolved into a pipeline that included a Transformer-based fusion module to combine the different input modes.

    + +

    Specifically, we made use of the fusion module called MMFTransformer, developed in collaboration with the Meta AI Research team. This is an implementation based on VisualBERT for which the necessary modifications were added to be able to work with text, audio and video.

    + +

Despite having decent results with the out-of-the-box MMFTransformer implementation, we were still far from our goal, and the Transformer-based models required more data than we had available.

    + +

    Searching for less data-hungry solutions

    + +

Searching for less data-hungry solutions, our team started studying MLP-Mixer. This new architecture was proposed by the Google Brain team, and it provides an alternative to well-established de facto architectures like convolutions or self-attention for computer vision tasks.

    + +

    MLP-Mixer

    + +

The core idea behind Mixer variations consists of replacing the convolutions or self-attention mechanisms used in Transformers with multilayer perceptrons. This change in architecture favors the performance of the model in high-data regimes (especially with respect to Transformers), while also opening some questions regarding the inductive biases hidden in the convolutions and the self-attention layers.

    + +

Those proposals perform great at solving image classification tasks by splitting the image into chunks, flattening those chunks into 1D vectors, and passing them through a sequence of Mixer layers.

    + +

    + +

    + +

Inspired by the advantages of Mixer-based architectures, our team searched for parallels with the type of problems we try to solve in video classification: specifically, instead of a single image, we have a set of frames that need to be classified, along with audio and closed captioning as additional modalities.

    + +

    Activity Recognition reinterpreting the MLP-Mixer

    + +

Our proposal takes the core idea of the MLP-Mixer (using multiple multi-layer perceptrons on a sequence and its transposed sequence) and extends it into a multimodal framework that allows us to process video, audio, and text with the same architecture.

    + +

    For each of the modalities, we use different extractors that will provide embeddings describing the content. Given the embeddings of each modality, the MLP-Mixer architecture solves the problem of deciding which of the modalities might be the most important, while also weighing how much each modality contributes to the final labeling.

    + +

    For example, when it comes to detecting laughs, sometimes the key information is in audio or in the frames, and in some of the cases we have a strong signal in the closed caption.

    + +

To process the video we tried two approaches: processing each frame separately with a ResNet34 to get a sequence of embeddings, and using a video-specific model called R3D; these were pre-trained on ImageNet and Kinetics400, respectively.

    + +

    + +

    + +

    To process the audio, we use the pretrained ResNet34, and we remove the final layers to be able to extract 2D embeddings from the audio spectrograms (for 224x224 images we end up with 7x7 embeddings).

    + +

    + +

    + +

    For closed captioning, we are using a pre-trained BERT-large, with all layers frozen, except for the Embeddings & LayerNorms.

    + +

    + +

    + +

Once we have extracted the embeddings from each modality, we concatenate them into a single sequence and pass it through a set of MLP-Mixer blocks; next, we use average pooling and a classification head to get predictions.
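As a rough illustration of this architecture, here is a simplified sketch we put together for this post (not Disney's production code; the embedding sizes, sequence lengths, block count, and class count are made-up placeholders):

import torch
import torch.nn as nn

class MixerBlock(nn.Module):
    """One MLP-Mixer block: a token-mixing MLP followed by a channel-mixing MLP."""
    def __init__(self, num_tokens, dim, hidden=256):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.token_mlp = nn.Sequential(
            nn.Linear(num_tokens, hidden), nn.GELU(), nn.Linear(hidden, num_tokens))
        self.norm2 = nn.LayerNorm(dim)
        self.channel_mlp = nn.Sequential(
            nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))

    def forward(self, x):                                        # x: (batch, tokens, dim)
        x = x + self.token_mlp(self.norm1(x).transpose(1, 2)).transpose(1, 2)
        x = x + self.channel_mlp(self.norm2(x))
        return x

class MultimodalMixer(nn.Module):
    def __init__(self, num_tokens, dim, num_classes, depth=4):
        super().__init__()
        self.blocks = nn.Sequential(*[MixerBlock(num_tokens, dim) for _ in range(depth)])
        self.head = nn.Linear(dim, num_classes)

    def forward(self, video_emb, audio_emb, text_emb):
        x = torch.cat([video_emb, audio_emb, text_emb], dim=1)   # concatenate along the sequence
        x = self.blocks(x)
        return self.head(x.mean(dim=1))                          # average pooling + classification head

# Hypothetical shapes: 16 frame embeddings, 49 audio-patch embeddings (7x7) and 32 text tokens,
# all projected beforehand to a common dimension of 512.
model = MultimodalMixer(num_tokens=16 + 49 + 32, dim=512, num_classes=15)
logits = model(torch.randn(2, 16, 512), torch.randn(2, 49, 512), torch.randn(2, 32, 512))
print(logits.shape)   # torch.Size([2, 15])

The token-mixing MLP operates across the concatenated multimodal sequence, which is what lets the model weigh how much each modality contributes to the final label.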

    + +

    + +

    + +

    Our experiments have been performed on a custom, manually labeled dataset for activity recognition with 15 classes, which we know from experiments are hard and cannot all be predicted accurately using a single modality.

    + +

    These experiments have shown a significant increase in performance using our approach, especially in a low/mid-data regime (75K training samples).

    + +

    When it comes to using only Text and Audio, our experiments showed a 15 percent improvement in accuracy over using a classifier on top of the features extracted by state-of-the-art backbones.

    + +

Using Text, Audio and Video, we have seen a 17 percent improvement in accuracy over using Meta AI’s MMF Framework, which uses a VisualBERT-like model to combine modalities using more powerful state-of-the-art backbones.

    + +

Currently, we have extended the initial model to cover up to 55 activity classes and 45 event classes. One of the challenges we expect to tackle in the future is to include all activities and events, even those that are less frequent.

    + +

    Interpreting the MLP-Mixer mode combinations

    + +

An MLP-Mixer is a concatenation of multilayer perceptrons. This can, very roughly, be approximated as a linear operation, in the sense that, once trained, the weights are fixed and the input directly affects the output.

    + +

Once we assume that approximation, we can also assume that for an input consisting of NxM numbers, we could find an NxM matrix that (when multiplied elementwise with the input) could approximate the predictions of the MLP-Mixer for a class.

    + +

    + +

    + +

    We will call this matrix a stencil, and if we have access to it, we can find what parts of the input embeddings are responsible for a specific prediction.

    + +

    You can think of it as a punch card with holes in specific positions. Only information in those positions will pass and contribute to a specific prediction. So we can measure the intensity of the input at those positions.

    + +

    + +

    + +

    Of course, this is an oversimplification, and there won’t exist a unique stencil that perfectly represents all of the contributions of the input to a class (otherwise that would mean that the problem could be solved linearly). So this should be used for visualization purposes only, not as an accurate predictor.

    + +

    Once we have a set of stencils for each class, we can effortlessly measure input contribution without relying on any external visualization techniques.

    + +

    To find a stencil, we can start from a “random noise” stencil and optimize it to maximize the activations for a specific class by just back-propagating through the MLP-Mixer.
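A rough sketch of what this optimization could look like (our own illustrative code; it assumes a trained mixer that maps an NxM input to class logits, and the sigmoid is just one possible way of keeping the stencil bounded):

import torch

def find_stencil(mixer, example_input, target_class, steps=200, lr=0.1):
    """Optimize an NxM stencil so that (stencil * input) maximizes one class logit."""
    stencil = torch.randn_like(example_input, requires_grad=True)   # "random noise" start
    optimizer = torch.optim.Adam([stencil], lr=lr)                  # only the stencil is updated
    for _ in range(steps):
        optimizer.zero_grad()
        logits = mixer(torch.sigmoid(stencil) * example_input)      # mask the input with the stencil
        loss = -logits[..., target_class].mean()                    # maximize the class activation
        loss.backward()                                             # back-propagate through the MLP-Mixer
        optimizer.step()
    return torch.sigmoid(stencil).detach()

Repeating this from different random initializations produces the set of stencils that is then reduced with K-means, as described next.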

    + +

    + +

    + +

    By doing this we can end up with many valid stencils, and we can reduce them to a few by using K-means to cluster them into similar stencils and averaging each cluster.

    + +

    Using the Mixer to get the best of each world

    + +

    MLP-Mixer, used as an image classification model without convolutional layers, requires a lot of data, since the lack of inductive bias – one of the model’s good points overall – is a weakness when it comes to working in low data domains.

    + +

    When used as a way to combine information previously extracted by large pretrained backbones (as opposed to being used as a full end-to-end solution), they shine. The Mixer’s strength lies in finding temporal or structural coherence between different inputs. For example, in video-related tasks we could extract embeddings from the frames using a powerful, pretrained model that understands what is going on at frame level and use the mixer to make sense of it in a sequential manner.

    + +

    This way of using the Mixer allows us to work with limited amounts of data and still get better results than what was achieved with Transformers. This is because Mixers seem to be more stable during training and seem to pay attention to all the inputs, while Transformers tend to collapse and pay attention only to some modalities/parts of the sequence.

    + +

    Acknowledgements: We would like to thank the Meta AI Research and Partner Engineering teams for this collaboration.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/how-ibm-uses-pt-terratorch/index.html b/blog/how-ibm-uses-pt-terratorch/index.html new file mode 100644 index 000000000000..96f7914ea443 --- /dev/null +++ b/blog/how-ibm-uses-pt-terratorch/index.html @@ -0,0 +1,723 @@ + + + + + + + + + + + + + How IBM Research Uses PyTorch and TerraTorch to Make Geospatial Computer Vision Accessible for Everyone | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

Earth Observation-based analytics are becoming essential for understanding our planet — from monitoring deforestation to tracking urban development and analyzing the impacts of climate change. However, the coding and deep learning skills required to apply AI models to satellite imagery and earth observation data have traditionally been a major barrier for many practitioners.

    + +

With IBM Research’s launch of TerraTorch 1.0, a PyTorch domain library for fine-tuning Geospatial Computer Vision Foundation Models, we make geospatial AI not only more accessible but also more practical for the wider PyTorch community. Our goal: simplify the process so that any data scientist, researcher, or enthusiast can build powerful geospatial models with ease and low GPU and data processing requirements.

    + +

    globes

    + +

The power of foundation models: even with 75-95% of the input data removed, the models do a fantastic job of reconstructing the input data, thereby learning the underlying physics of our planet in a deep, latent space

    + +

    The Business Challenge

    + +

Our goal was to remove the technical barriers that prevent people from working with satellite imagery, weather and climate data at scale. Together with NASA, we’ve developed the Prithvi family of foundation models. Integrating the latest innovations of AI research has been facilitated by the clean API PyTorch provides.

    + +

We wanted to create a framework that anyone can use to go from raw data to inference-ready models in just a few steps.

    + +

    globes

    + +

    How a weather and climate foundation model created and fine-tuned on PyTorch is used for weather forecasts

    + +

    How IBM Research Used PyTorch

    + +

    We’ve built TerraTorch on top of PyTorch, leveraging its dynamic ecosystem to integrate:

    + +
      +
    • PyTorch Lightning for clean, scalable training loops
    • +
    • TorchGeo for geospatial data handling and transformations (PyTorch transforms)
    • +
    • For foundation models like the leading generative multimodal foundation model ‘Terramind’, co-developed by IBM and ESA, and the ‘Prithvi’ family, co-developed by IBM and NASA, TerraTorch has been used to fine-tune all of the downstream geospatial models for satellite imagery, weather and climate data. It includes the family of fine-tuned models that IBM has released as part of Granite. In addition, other interesting foundation models and ecosystem components like Clay, SatMAE, Satlas, DeCur and DOFA are included in TerraTorch.
    • +
    • Powerful and state-of-the-art vision transformers to experiment with modern neural network architectures
    • +
    • TerraTorch-Iterate build on top of PyTorch, Optuna, MLFlow and Ray Tune for Hyperparameter Optimization (HPO), Neural Architecture Search (NAS) and Foundation Model Benchmarking (GeoBench), where TerraTorch became the reference implementation
    • +
    + +

    flow diagram

    + +

The fine-tuning and inference process is completely described in a single YAML config file. There, the architectural building blocks of the model (backbone, neck, decoder, head) are defined. The Model Factory assembles the model using the built-in and custom registries. In addition, the Optimizer and Data Modules are created as defined in the config. Finally, everything is passed to the Lightning Trainer, which executes the task.

    + +

With PyTorch’s flexibility, we were able to prototype quickly, iterate on model architectures, and deploy pipelines for a range of geospatial applications — from flood and biomass detection to increasing the resolution of climate data. Some of our work became part of the IBM Granite Geospatial Model Family.

    + +

    flow diagram

    + +

    Architecture of the Prithvi-EO-2.0-600M foundation model which IBM Research developed together with NASA

    + +

    Solving AI Challenges with PyTorch

    + +

    PyTorch helped us to tackle three major challenges:

    + +
      +
    • Ease of experimentation: Dynamic computation graphs, automatic differentiation, full abstraction of CUDA and rich visualization tools made it simple to test different models and training strategies.
    • +
    • Scalability: With DDP, FSDP, PyTorch Lightning and TorchGeo, we could train models on large-scale datasets without worrying about infrastructure.
    • +
    • Community support: PyTorch - the de-facto standard in AI research - with its active community and excellent documentation made it easy to overcome hurdles and stay up to date with the latest advancements in AI research.
    • +
    + +

    A Word from IBM Research

    + +

    “PyTorch gave me the power to turn complex linear algebra and optimization problems into accessible, shareable solutions for the community. It feels empowering that we’re building and fine-tuning models for anyone curious about understanding our planet through AI.”

    + +

    — Romeo Kienzler, AI Research Engineer at IBM Research Zurich, Rueschlikon

    + +

    quote

    + +

    The Benefits of Using PyTorch

    + +

    Using PyTorch allowed us to:

    + +
      +
    • Build a reproducible, open-source framework for fine-tuning geospatial foundation models
    • +
    • Share our work with the community through easy-to-follow notebooks, TerraTorch configuration files, tutorials and model checkpoints on HuggingFace
    • +
    • Rapidly iterate over foundation model architectures and deploy fine-tuned models for inference, from research to real-world client products
    • +
    + +

    Learn More

    + +

    For more information about this project and to explore the code, visit:

    + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus/index.html b/blog/how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus/index.html new file mode 100644 index 000000000000..ff510bab940b --- /dev/null +++ b/blog/how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus/index.html @@ -0,0 +1,683 @@ + + + + + + + + + + + + + How Intel Uses PyTorch to Empower Generative AI through Intel Arc GPUs | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads.

    + +

    The Business Challenge

    + +

    Our goal was to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel. We recognized the need to showcase the capabilities of the latest GenAI workloads on our newest line of client GPUs. To address this, we developed a starter application, AI Playground, which is open source and includes a comprehensive developer reference sample available on GitHub using PyTorch. This application seamlessly integrates image generation, image enhancement, and chatbot functionalities, using retrieval-augmented generation (RAG) features, all within a single, user-friendly installation package. This initiative not only demonstrates the functionality of these AI workloads but also serves as an educational resource for the ecosystem, guiding developers on effectively leveraging the Intel® Arc™ GPU product line for advanced AI applications. This solution leverages Intel® Arc™ Xe Cores and Xe Matrix Extensions (XMX) for accelerating inferencing.

    + +

    AI Playground

    + +

    How Intel Used PyTorch

    + +

    PyTorch is the core AI framework for AI Playground. We extensively leverage PyTorch’s eager mode, which aligns perfectly with the dynamic and iterative nature of our generative models. This approach not only enhances our development workflow but also enables us to rapidly prototype and iterate on advanced AI features. By harnessing PyTorch’s powerful capabilities, we have created a robust reference sample that showcases the potential of GenAI on Intel GPUs in one cohesive application.

    + +

    Solving AI Challenges with PyTorch

    + +

    PyTorch has been instrumental in addressing our AI challenges by providing a robust training and inference framework optimized for discrete and integrated Intel Arc GPU product lines. Choosing PyTorch over alternative frameworks or APIs was crucial. Other options would have necessitated additional custom development or one-off solutions, which could have significantly slowed our time to market and limited our feature set. With PyTorch, we leveraged its flexibility and ease of use, allowing our team to focus on innovation through experimentation, rather than infrastructure. The integration of Intel® Extension for PyTorch further enhanced performance by optimizing computational efficiency and enabling seamless scaling on Intel hardware, ensuring that our application ran faster and more efficiently.

    + +

    A Word from Intel

    + +

    With PyTorch as the backbone of our AI Playground project, we achieved rapid development cycles that significantly accelerated our time to market. This flexibility enabled us to iteratively enhance features and effectively align with the commitments of our hardware launches in 2024.

    + +

    -Bob Duffy, AI Playground Product Manager

    + +

PyTorch Case Study

    + +

    The Benefits of Using PyTorch

    + +

    The biggest benefit of using PyTorch for us is the large PyTorch ecosystem, which connects us with an active and cooperative community of developers. This collaboration has facilitated the seamless deployment of key features from existing open source projects, allowing us to integrate the latest GenAI capabilities into AI Playground. Remarkably, we accomplished this with minimal re-coding, ensuring that these advanced features are readily accessible on Intel Arc GPUs.

    + +

    Learn More

    + +

    For more information about Intel’s AI Playground and collaboration with PyTorch, visit the following links:

    + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/how-to-accelerate/index.html b/blog/how-to-accelerate/index.html new file mode 100644 index 000000000000..5bc89461a5a8 --- /dev/null +++ b/blog/how-to-accelerate/index.html @@ -0,0 +1,842 @@ + + + + + + + + + + + + + How to Accelerate PyTorch Geometric on Intel® CPUs | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Intel + +

    +

    Overview

    + +

    The Intel PyTorch team has been collaborating with the PyTorch Geometric (PyG) community to provide CPU performance optimizations for Graph Neural Network (GNN) and PyG workloads. In the PyTorch 2.0 release, several critical optimizations were introduced to improve GNN training and inference performance on CPU. Developers and researchers can now take advantage of Intel’s AI/ML framework optimizations for significantly faster model training and inference, unlocking these gains for GNN workflows built directly on PyG.

    + +

    In this blog, we take a deep dive into how to optimize PyG performance for both training and inference, and how to use the PyTorch 2.0 flagship torch.compile feature to speed up PyG models.

    + +

    Message Passing Paradigm

    + +

    Message passing refers to the process of nodes exchanging information with their respective neighbors by sending messages to one another. In PyG, the process of message passing can be generalized into three steps:

    + +
      +
    1. Gather: Collect edge-level information of adjacent nodes and edges.
    2. +
    3. Apply: Update the collected information with user-defined functions (UDFs).
    4. +
    5. Scatter: Aggregate to node-level information, e.g., via a particular reduce function such as sum, mean, or max.
    6. +
    + +

    Figure 1: The message passing paradigm

    + +

    Figure 1: The message passing paradigm (Source: Matthias Fey)

    + +

    Message passing performance is highly related to the storage format of the adjacency matrix of the graph, which records how pairs of nodes are connected. Two methods for the storage format are:

    + +
      +
    • Adjacency matrix in COO (Coordinate Format): The graph data is physically stored in a two-dimensional tensor shape of [2, num_edges], which maps each connection of source and destination nodes. The performance hotspot is scatter-reduce.
    • +
    • Adjacency matrix in CSR (Compressed Sparse Row): Similar format to COO, but compressed on the row indices. This format allows for more efficient row access and faster sparse matrix-matrix multiplication (SpMM). The performance hotspot is sparse matrix related reduction ops.
    • +
    + +

    Scatter-Reduce

    + +

    Scatter-reduce updates the values of a self tensor with values from a src tensor at the entries specified by index, and the pattern is parallel in nature. Ideally, parallelizing on the outer dimension would be most performant; however, direct parallelization leads to write conflicts, as different threads might try to update the same entry simultaneously.
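    As a concrete illustration, here is a minimal sketch of the same pattern expressed with PyTorch’s scatter_reduce_ (the shapes below are hypothetical toy sizes, not taken from the benchmarks):

    import torch

    # Aggregate 6 edge messages into 4 destination nodes.
    src = torch.randn(6, 8)                    # per-edge messages, shape [num_edges, feat_dim]
    index = torch.tensor([0, 0, 1, 2, 2, 3])   # destination node of each edge
    self_tensor = torch.zeros(4, 8)            # per-node output, shape [num_nodes, feat_dim]

    # Rows of `src` are reduced (here: summed) into the rows of `self_tensor` selected by `index`.
    self_tensor.scatter_reduce_(
        0, index.unsqueeze(-1).expand_as(src), src, reduce="sum", include_self=True
    )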

    + +

    Figure 2: Scatter-reduce and its optimization scheme

    + +

    Figure 2: Scatter-reduce and its optimization scheme (Source: Mingfei Ma)

    + +

    To optimize this kernel, we use sorting followed by a reduction:

    + +
      +
    • Sorting: Sort the index tensor in ascending order with parallel radix sort, such that indices pointing to the same entry in the self tensor are managed in the same thread.
    • +
    • Reduction: Parallelize on the outer dimension of self and perform a vectorized reduction for each indexed src entry.
    • +
    + +

    For its backward path during the training process (i.e., gather), sorting is not needed because its memory access pattern will not lead to any write conflicts.

    + +

    SpMM-Reduce

    + +

    Sparse matrix-matrix reduction is a fundamental operator in GNNs, where A is a sparse adjacency matrix in CSR format, B is a dense feature matrix, and the reduction type can be sum, mean, or max.

    + +

    Figure 3: SpMM optimization scheme

    + +

    Figure 3: SpMM optimization scheme (Source: Mingfei Ma)

    + +

    The biggest challenge when optimizing this kernel is how to balance the thread payload when parallelizing along the rows of the sparse matrix A. Each row in A corresponds to a node, and its number of connections may vary vastly from one to another; this results in thread payload imbalance. One technique to address this issue is to scan the payload before partitioning work across threads. Beyond that, other techniques such as vectorization, unrolling, and blocking are also used to further exploit CPU performance.

    + +

    These optimizations are done via torch.sparse.mm using the reduce flags of amax, amin, mean, sum.
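    For illustration, here is a minimal sketch with toy sizes (assuming a PyTorch build where torch.sparse.mm accepts the reduce argument on CPU):

    import torch

    # Toy 4-node graph with 5 edges: adjacency matrix A in CSR format, dense feature matrix B.
    crow_indices = torch.tensor([0, 2, 3, 4, 5])   # row pointers
    col_indices = torch.tensor([1, 2, 0, 3, 2])    # column indices of the non-zero entries
    values = torch.ones(5)
    A = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(4, 4))
    B = torch.randn(4, 16)                         # node features

    out = torch.sparse.mm(A, B, reduce="mean")     # reduce can be "sum", "mean", "amax", or "amin"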

    + +

    Performance Gains: Up to 4.1x Speedup

    + +

    We collected benchmark performance for both inference and training in pytorch_geometric/benchmark and in the Open Graph Benchmark (OGB) to demonstrate the performance improvement from the above-mentioned methods on Intel® Xeon® Platinum 8380 Processor.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Model – Dataset + Option + Speedup ratio +
    + GCN-Reddit (inference) + 512-2-64-dense + 1.22x +
    1024-3-128-dense + 1.25x +
    512-2-64-sparse + 1.31x +
    1024-3-128-sparse + 1.68x +
    + GraphSage-ogbn-products (inference) + 1024-3-128-dense + 1.15x +
    512-2-64-sparse + 1.20x +
    1024-3-128-sparse + 1.33x +
    full-batch-sparse + 4.07x +
    GCN-PROTEINS (training) + 3-32 + 1.67x +
    GCN-REDDIT-BINARY (training) + 3-32 + 1.67x +
    GCN-Reddit (training) + 512-2-64-dense + 1.20x +
    1024-3-128-dense + 1.12x +
    + +

    Table 1: Performance Speedup on PyG Benchmark1

    + +

    From the benchmark results, we can see that our optimizations in PyTorch and PyG achieved 1.1x-4.1x speed-up for inference and training.

    + +

    torch.compile for PyG

    + +

    The PyTorch 2.0 flagship feature torch.compile is fully compatible with the PyG 2.3 release, bringing additional speed-up in PyG model inference/training over imperative mode, thanks to the TorchInductor C++/OpenMP backend for CPUs. In particular, a 3.0x – 5.4x performance speed-up is measured for training basic GNN models on the Intel Xeon Platinum 8380 Processor2.

    + +


    + +

    Figure 4: Performance Speedup with Torch Compile

    + +

    torch.compile can fuse the multiple stages of message passing into a single kernel, which provides a significant speedup due to the saved memory bandwidth. Refer to this PyTorch Geometric tutorial for additional support.
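    As a minimal sketch (assuming torch_geometric is installed; the model and input shapes below are hypothetical, not the benchmarked configurations), compiling a PyG model only requires wrapping it with torch.compile:

    import torch
    import torch.nn.functional as F
    from torch_geometric.nn import GCNConv

    class GCN(torch.nn.Module):
        def __init__(self, in_channels, hidden_channels, out_channels):
            super().__init__()
            self.conv1 = GCNConv(in_channels, hidden_channels)
            self.conv2 = GCNConv(hidden_channels, out_channels)

        def forward(self, x, edge_index):
            x = F.relu(self.conv1(x, edge_index))
            return self.conv2(x, edge_index)

    model = GCN(in_channels=128, hidden_channels=64, out_channels=10)
    compiled_model = torch.compile(model)           # TorchInductor C++/OpenMP backend on CPU

    x = torch.randn(1000, 128)                      # hypothetical node features
    edge_index = torch.randint(0, 1000, (2, 5000))  # hypothetical connectivity in COO format
    out = compiled_model(x, edge_index)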

    + +

    Please note that torch.compile within PyG is in beta mode and under active development. Currently, some features do not yet work together seamlessly such as torch.compile(model, dynamic=True), but fixes are on the way from Intel.

    + +

    Conclusion & Future Work

    + +

    In this blog, we introduced the GNN performance optimizations included in PyTorch 2.0 on CPU. We are closely collaborating with the PyG community for future optimization work, which will focus on in-depth optimizations from torch.compile, sparse optimization, and distributed training.

    + +

    Acknowledgement

    + +

    The results presented in this blog are a joint effort of the Intel PyTorch team and Kumo. Special thanks to Matthias Fey (Kumo), Pearu Peterson (Quansight) and Christian Puhrsch (Meta) who spent precious time and gave substantial assistance! Together, we made one more step forward on the path of improving the PyTorch CPU ecosystem.

    + +

    References

    + + + +

    Footnotes

    + +

    Product and Performance Information

    + +

    1Platinum 8380: 1-node, 2x Intel Xeon Platinum 8380 processor with 256GB (16 slots/ 16GB/3200) total DDR4 memory, uCode 0xd000389, HT on, Turbo on, Ubuntu 20.04.5 LTS, 5.4.0-146-generic, INTEL SSDPE2KE016T8 1.5T; GCN + Reddit FP32 inference, GCN+Reddit FP32 training, GraphSAGE + ogbn-products FP32 inference, GCN-PROTAIN, GCN-REDDIT-BINARY FP32 training; Software: PyTorch 2.1.0.dev20230302+cpu, pytorch_geometric 2.3.0, torch-scatter 2.1.0, torch-sparse 0.6.16, test by Intel on 3/02/2023.

    + +

    2Platinum 8380: 1-node, 2x Intel Xeon Platinum 8380 processor with 256GB (16 slots/ 16GB/3200) total DDR4 memory, uCode 0xd000389, HT on, Turbo on, Ubuntu 20.04.5 LTS, 5.4.0-146-generic, INTEL SSDPE2KE016T8 1.5T; GCN, GraphSAGE, GIN and EdgeCNN, FP32; Software: PyTorch 2.1.0.dev20230411+cpu, pytorch_geometric 2.4.0, torch-scatter 2.1.1+pt20cpu, torch-sparse 0.6.17+pt20cpu, test by Intel on 4/11/2023.

    + +

    3Performance varies by use, configuration and other factors. Learn more at www.Intel.com/PerformanceIndex.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/index.html b/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/index.html new file mode 100644 index 000000000000..bb31f3074ee8 --- /dev/null +++ b/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/index.html @@ -0,0 +1,1169 @@ + + + + + + + + + + + + + How to Train State-Of-The-Art Models Using TorchVision’s Latest Primitives | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Vasilis Vryniotis + +

    + + +

    A few weeks ago, TorchVision v0.11 was released packed with numerous new primitives, models and training recipe improvements which allowed achieving state-of-the-art (SOTA) results. The project was dubbed “TorchVision with Batteries Included” and aimed to modernize our library. We wanted to enable researchers to reproduce papers and conduct research more easily by using common building blocks. Moreover, we aspired to provide the necessary tools to Applied ML practitioners to train their models on their own data using the same SOTA techniques as in research. Finally, we wanted to refresh our pre-trained weights and offer better off-the-shelf models to our users, hoping that they would build better applications.

    + +

    Though there is still much work to be done, we wanted to share with you some exciting results from the above work. We will showcase how one can use the new tools included in TorchVision to achieve state-of-the-art results on a highly competitive and well-studied architecture such as ResNet50 [1]. We will share the exact recipe used to improve our baseline by over 4.7 accuracy points to reach a final top-1 accuracy of 80.9% and share the journey for deriving the new training process. Moreover, we will show that this recipe generalizes well to other model variants and families. We hope that the above will influence future research for developing stronger generalizable training methodologies and will inspire the community to adopt and contribute to our efforts.

    + +

    The Results

    + +

    Using our new training recipe found on ResNet50, we’ve refreshed the pre-trained weights of the following models:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ModelAccuracy@1Accuracy@5
    ResNet5080.85895.434
    ResNet10181.88695.780
    ResNet15282.28496.002
    ResNeXt50-32x4d81.19895.340
    + +

    Note that the accuracy of all models except ResNet50 can be further improved by adjusting their training parameters slightly, but our focus was to have a single robust recipe which performs well for all.

    + +

    UPDATE: We have refreshed the majority of popular classification models of TorchVision, you can find the details on this blog post.

    + +

    There are currently two ways to use the latest weights of the model.

    + +

    Using the Multi-pretrained weight API

    + +

    We are currently working on a new prototype mechanism which will extend the model builder methods of TorchVision to support multiple weights. Along with the weights, we store useful meta-data (such as the labels, the accuracy, links to recipe etc) and the preprocessing transforms necessary for using the models. Example:

    + +
      from PIL import Image
    +  from torchvision import prototype as P
    +  img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
    +   
    +  # Initialize model
    +  weights = P.models.ResNet50_Weights.IMAGENET1K_V2
    +  model = P.models.resnet50(weights=weights)
    +  model.eval()
    +
    +  # Initialize inference transforms
    +  preprocess = weights.transforms()
    +   
    +  # Apply inference preprocessing transforms
    +  batch = preprocess(img).unsqueeze(0)
    +  prediction = model(batch).squeeze(0).softmax(0)
    +   
    +  # Make predictions
    +  label = prediction.argmax().item()
    +  score = prediction[label].item()
    +   
    +  # Use meta to get the labels
    +  category_name = weights.meta['categories'][label]
    +  print(f"{category_name}: {100 * score}%")
    +
    + +

    Using the legacy API

    + +

    Those who don’t want to use a prototype API have the option of accessing the new weights via the legacy API using the following approach:

    + +
      from torchvision.models import resnet
    +   
    +  # Overwrite the URL of the previous weights
    +  resnet.model_urls["resnet50"] = "https://download.pytorch.org/models/resnet50-11ad3fa6.pth"
    +   
    +  # Initialize the model using the legacy API
    +  model = resnet.resnet50(pretrained=True)
    +   
    +  # TODO: Apply preprocessing + call the model
    +  # ...
    +
    + +

    The Training Recipe

    + +

    Our goal was to use the newly introduced primitives of TorchVision to derive a new strong training recipe which achieves state-of-the-art results for the vanilla ResNet50 architecture when trained from scratch on ImageNet with no additional external data. Though by using architecture specific tricks [2] one could further improve the accuracy, we’ve decided not to include them so that the recipe can be used in other architectures. Our recipe heavily focuses on simplicity and builds upon work by FAIR [3], [4], [5], [6], [7]. Our findings align with the parallel study of Wightman et al. [7], who also report major accuracy improvements by focusing on the training recipes.

    + +

    Without further ado, here are the main parameters of our recipe:

    + +
      # Optimizer & LR scheme
    +  ngpus=8,
    +  batch_size=128,  # per GPU
    +
    +  epochs=600, 
    +  opt='sgd',  
    +  momentum=0.9,
    +
    +  lr=0.5, 
    +  lr_scheduler='cosineannealinglr', 
    +  lr_warmup_epochs=5, 
    +  lr_warmup_method='linear', 
    +  lr_warmup_decay=0.01, 
    +
    +
    +  # Regularization and Augmentation
    +  weight_decay=2e-05, 
    +  norm_weight_decay=0.0,
    +
    +  label_smoothing=0.1, 
    +  mixup_alpha=0.2, 
    +  cutmix_alpha=1.0, 
    +  auto_augment='ta_wide', 
    +  random_erase=0.1, 
    +  
    +  ra_sampler=True,
    +  ra_reps=4,
    +
    +
    +  # EMA configuration
    +  model_ema=True, 
    +  model_ema_steps=32, 
    +  model_ema_decay=0.99998, 
    +
    +
    +  # Resizing
    +  interpolation='bilinear', 
    +  val_resize_size=232, 
    +  val_crop_size=224, 
    +  train_crop_size=176,
    +
    + +

    Using our standard training reference script, we can train a ResNet50 using the following command:

    + +
    torchrun --nproc_per_node=8 train.py --model resnet50 --batch-size 128 --lr 0.5 \
    +--lr-scheduler cosineannealinglr --lr-warmup-epochs 5 --lr-warmup-method linear \
    +--auto-augment ta_wide --epochs 600 --random-erase 0.1 --weight-decay 0.00002 \
    +--norm-weight-decay 0.0 --label-smoothing 0.1 --mixup-alpha 0.2 --cutmix-alpha 1.0 \
    +--train-crop-size 176 --model-ema --val-resize-size 232 --ra-sampler --ra-reps 4
    +
    + +

    Methodology

    + +

    There are a few principles we kept in mind during our explorations:

    + +
      +
    1. Training is a stochastic process and the validation metric we try to optimize is a random variable. This is due to the random weight initialization scheme employed and the existence of random effects during the training process. This means that we can’t do a single run to assess the effect of a recipe change. The standard practice is doing multiple runs (usually 3 to 5) and studying the summary statistics (such as the mean, std, median, and max).
    2. +
    3. There is usually a significant interaction between different parameters, especially for techniques that focus on Regularization and reducing overfitting. Thus changing the value of one can have effects on the optimal configurations of others. To account for that one can either adopt a greedy search approach (which often leads to suboptimal results but tractable experiments) or apply grid search (which leads to better results but is computationally expensive). In this work, we used a mixture of both.
    4. +
    5. Techniques that are non-deterministic or introduce noise usually require longer training cycles to improve model performance. To keep things tractable, we initially used short training cycles (small number of epochs) to decide which paths can be eliminated early and which should be explored using longer training.
    6. +
    7. There is a risk of overfitting the validation dataset [8] because of the repeated experiments. To mitigate some of the risk, we apply only training optimizations that provide significant accuracy improvements and use K-fold cross validation to verify optimizations done on the validation set. Moreover, we confirm that our recipe ingredients generalize well to other models for which we didn’t optimize the hyper-parameters.
    8. +
    + +

    Break down of key accuracy improvements

    + +

    As discussed in earlier blog posts, training models is not a journey of monotonically increasing accuracies and the process involves a lot of backtracking. To quantify the effect of each optimization, below we attempt to showcase an idealized linear journey of deriving the final recipe starting from the original recipe of TorchVision. We would like to clarify that this is an oversimplification of the actual path we followed and thus it should be taken with a grain of salt.

    + +

    +Cumulative Accuracy Improvements for ResNet50 +

    + +

    In the table below, we provide a summary of the performance of stacked incremental improvements on top of Baseline. Unless denoted otherwise, we report the model with best Acc@1 out of 3 runs:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
     Accuracy@1Accuracy@5Incremental DiffAbsolute Diff
    ResNet50 Baseline76.13092.8620.0000.000
    + LR optimizations76.49493.1980.3640.364
    + TrivialAugment76.80693.2720.3120.676
    + Long Training78.60694.0521.8002.476
    + Random Erasing78.79694.0940.1902.666
    + Label Smoothing79.11494.3740.3182.984
    + Mixup79.23294.5360.1183.102
    + Cutmix79.51094.6420.2783.380
    + Weight Decay tuning80.03694.7460.5263.906
    + FixRes mitigations80.19694.6720.1604.066
    + EMA80.45094.9080.2544.320
    + Inference Resize tuning *80.67495.1660.2244.544
    + Repeated Augmentation **80.85895.4340.1844.728
    + +

    *The tuning of the inference size was done on top of the last model. See below for details.

    + +

    ** Community contribution done after the release of the article. See below for details.

    + +

    Baseline

    + +

    Our baseline is the previously released ResNet50 model of TorchVision. It was trained with the following recipe:

    + +
      # Optimizer & LR scheme
    +  ngpus=8,
    +  batch_size=32,  # per GPU
    +
    +  epochs=90, 
    +  opt='sgd',  
    +  momentum=0.9,
    +
    +  lr=0.1, 
    +  lr_scheduler='steplr', 
    +  lr_step_size=30, 
    +  lr_gamma=0.1, 
    +
    +
    +  # Regularization
    +  weight_decay=1e-4,
    +
    +
    +  # Resizing
    +  interpolation='bilinear', 
    +  val_resize_size=256, 
    +  val_crop_size=224, 
    +  train_crop_size=224,
    +
    + +

    Most of the above parameters are the defaults on our training scripts. We will start building on top of this baseline by introducing optimizations until we gradually arrive at the final recipe.

    + +

    LR optimizations

    + +

    There are a few parameter updates we can apply to improve both the accuracy and the speed of our training. This can be achieved by increasing the batch size and tuning the LR. Another common method is to apply warmup and gradually increase our learning rate. This is beneficial especially when we use very high learning rates and helps with the stability of the training in the early epochs. Finally, another optimization is to apply Cosine Schedule to adjust our LR during the epochs. A big advantage of cosine is that there are no hyper-parameters to optimize, which cuts down our search space.

    + +

    Here are the additional optimizations applied on top of the baseline recipe. Note that we’ve run multiple experiments to determine the optimal configuration of the parameters:

    + +
      batch_size=128,  # per GPU
    +
    +  lr=0.5, 
    +  lr_scheduler='cosineannealinglr', 
    +  lr_warmup_epochs=5, 
    +  lr_warmup_method='linear', 
    +  lr_warmup_decay=0.01,
    +
    + +

    The above optimizations increase our top-1 Accuracy by 0.364 points compared to the baseline. Note that in order to combine the different LR strategies we use the newly introduced SequentialLR scheduler.
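    For reference, here is a minimal sketch of how the warmup and cosine schedules can be chained with SequentialLR (the linear layer is just a stand-in for ResNet50; only the scheduler wiring is the point):

    import torch
    from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

    model = torch.nn.Linear(8, 8)  # stand-in model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.5, momentum=0.9, weight_decay=2e-5)

    epochs, warmup_epochs = 600, 5
    warmup = LinearLR(optimizer, start_factor=0.01, total_iters=warmup_epochs)  # lr_warmup_decay=0.01
    cosine = CosineAnnealingLR(optimizer, T_max=epochs - warmup_epochs)
    scheduler = SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_epochs])

    for epoch in range(epochs):
        # ... train for one epoch ...
        scheduler.step()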

    + +

    TrivialAugment

    + +

    The original model was trained using basic augmentation transforms such as Random resized crops and horizontal flips. An easy way to improve our accuracy is to apply more complex “Automatic-Augmentation” techniques. The one that performed best for us is TrivialAugment [9], which is extremely simple and can be considered “parameter free”, which means it can help us cut down our search space further.

    + +

    Here is the update applied on top of the previous step:

    + +
    auto_augment='ta_wide',
    +
    + +

    The use of TrivialAugment increased our top-1 Accuracy by 0.312 points compared to the previous step.
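    For reference, here is a minimal sketch of the corresponding training-time transform pipeline (the crop size and normalization constants mirror the final recipe and standard ImageNet values; this is not the exact reference script):

    from torchvision import transforms

    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(176),
        transforms.RandomHorizontalFlip(),
        transforms.TrivialAugmentWide(),  # corresponds to auto_augment='ta_wide'
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])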

    + +

    Long Training

    + +

    Longer training cycles are beneficial when our recipe contains ingredients that behave randomly. More specifically, as we start adding more and more techniques that introduce noise, increasing the number of epochs becomes crucial. Note that at early stages of our exploration we used relatively short cycles of roughly 200 epochs, which we later increased to 400 as we started narrowing down most of the parameters, and finally to 600 epochs for the final versions of the recipe.

    + +

    Below we see the update applied on top of the earlier steps:

    + +
    epochs=600,
    +
    + +

    This further increases our top-1 Accuracy by 1.8 points on top of the previous step. This is the biggest increase we will observe in this iterative process. It’s worth noting that the effect of this single optimization is overstated and somewhat misleading. Just increasing the number of epochs on top of the old baseline won’t yield such significant improvements. Nevertheless, the combination of the LR optimizations with strong Augmentation strategies helps the model benefit from longer cycles. It’s also worth mentioning that the reason we introduce the lengthy training cycles so early in the process is because in the next steps we will introduce techniques that require significantly more epochs to provide good results.

    + +

    Random Erasing

    + +

    Another data augmentation technique known to help the classification accuracy is Random Erasing [10], [11]. Often paired with Automatic Augmentation methods, it usually yields additional improvements in accuracy due to its regularization effect. In our experiments we tuned only the probability of applying the method via a grid search and found that it’s beneficial to keep its probability at low levels, typically around 10%. 

    + +

    Here is the extra parameter introduced on top of the previous:

    + +
    random_erase=0.1,
    +
    + +

    Applying Random Erasing increases our Acc@1 by further 0.190 points.

    + +

    Label Smoothing

    + +

    A good technique to reduce overfitting is to stop the model from becoming overconfident. This can be achieved by softening the ground truth using Label Smoothing [12]. There is a single parameter which controls the degree of smoothing (the higher the stronger) that we need to specify. Though optimizing it via grid search is possible, we found that values around 0.05-0.15 yield similar results, so to avoid overfitting it we used the same value as in the paper that introduced it.

    + +

    Below we can find the extra config added on this step:

    + +
    label_smoothing=0.1,
    +
    + +

    We use PyTorch’s newly introduced CrossEntropyLoss label_smoothing parameter, which increases our accuracy by an additional 0.318 points.
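    A minimal sketch of the corresponding loss construction (the logits and targets below are dummy values for illustration):

    import torch
    import torch.nn as nn

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    logits = torch.randn(4, 1000)           # hypothetical batch of ImageNet logits
    targets = torch.randint(0, 1000, (4,))
    loss = criterion(logits, targets)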

    + +

    Mixup and Cutmix

    + +

    Two data augmentation techniques often used to produce SOTA results are Mixup and Cutmix [13], [14]. They both provide strong regularization effects by softening not only the labels but also the images. In our setup we found it beneficial to apply one of them randomly with equal probability. Each is parameterized with a hyperparameter alpha, which controls the shape of the Beta distribution from which the smoothing probability is sampled. We did a very limited grid search, focusing primarily on the common values proposed in the papers.

    + +

    Below you will find the optimal values for the alpha parameters of the two techniques:

    + +
    mixup_alpha=0.2, 
    +cutmix_alpha=1.0,
    +
    + +

    Applying mixup increases our accuracy by 0.118 points and combining it with cutmix improves it by an additional 0.278 points.
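    For illustration only, here is a generic mixup sketch (not the exact helper used in the TorchVision reference scripts):

    import torch
    import torch.nn.functional as F

    def mixup(images, targets, num_classes, alpha=0.2):
        # Blend a batch with a shuffled copy of itself and soften the labels accordingly.
        lam = torch.distributions.Beta(alpha, alpha).sample()
        perm = torch.randperm(images.size(0))
        mixed_images = lam * images + (1.0 - lam) * images[perm]
        onehot = F.one_hot(targets, num_classes).float()
        mixed_targets = lam * onehot + (1.0 - lam) * onehot[perm]
        return mixed_images, mixed_targets

    CutMix follows the same idea but pastes a random rectangular patch from the shuffled batch instead of blending whole images; in our setup one of the two is applied at random with equal probability.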

    + +

    Weight Decay tuning

    + +

    Our standard recipe uses L2 regularization to reduce overfitting. The Weight Decay parameter controls the degree of the regularization (the larger the stronger) and is applied universally to all learned parameters of the model by default. In this recipe, we apply two optimizations to the standard approach. First, we perform a grid search to tune the weight decay value, and second, we disable weight decay for the parameters of the normalization layers.

    + +

    Below you can find the optimal configuration of weight decay for our recipe:

    + +
    weight_decay=2e-05, 
    +norm_weight_decay=0.0,
    +
    + +

    The above update improves our accuracy by a further 0.526 points, providing additional experimental evidence for a known fact that tuning weight decay has significant effects on the performance of the model. Our approach for separating the Normalization parameters from the rest was inspired by ClassyVision’s approach.
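    A minimal sketch of the parameter-group split (simplified compared to the reference training script):

    import torch
    from torchvision.models import resnet50

    model = resnet50()
    norm_classes = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d,
                    torch.nn.GroupNorm, torch.nn.LayerNorm)

    norm_params, other_params = [], []
    for module in model.modules():
        params = [p for p in module.parameters(recurse=False) if p.requires_grad]
        if isinstance(module, norm_classes):
            norm_params.extend(params)
        else:
            other_params.extend(params)

    optimizer = torch.optim.SGD(
        [
            {"params": other_params, "weight_decay": 2e-5},
            {"params": norm_params, "weight_decay": 0.0},  # norm_weight_decay=0.0
        ],
        lr=0.5,
        momentum=0.9,
    )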

    + +

    FixRes mitigations

    + +

    An important property identified early in our experiments is the fact that the models performed significantly better if the resolution used during validation was increased from the 224x224 of training. This effect is studied in detail in the FixRes paper [5] and two mitigations are proposed: a) one could try to reduce the training resolution so that the accuracy on the validation resolution is maximized or b) one could fine-tune the model in a two-phase training so that it adjusts to the target resolution. Since we didn’t want to introduce a 2-phase training, we went for option a). This means that we reduced the train crop size from 224 and used grid search to find the one that maximizes the validation accuracy at a resolution of 224x224.

    + +

    Below you can see the optimal value used on our recipe:

    + +
    val_crop_size=224, 
    +train_crop_size=176,
    +
    + +

    The above optimization improved our accuracy by an additional 0.160 points and sped up our training by 10%. 

    + +

    It’s worth noting that the FixRes effect still persists, meaning that the model continues to perform better on validation when we increase the resolution. Moreover, further reducing the training crop-size actually hurts the accuracy. This intuitively makes sense because one can only reduce the resolution so much before critical details start disappearing from the picture. Finally, we should note that the above FixRes mitigation seems to benefit models with similar depth to ResNet50. Deeper variants with larger receptive fields seem to be slightly negatively affected (typically by 0.1-0.2 points). Hence we consider this part of the recipe optional. Below we visualize the performance of the best available checkpoints (with the full recipe) for models trained with 176 and 224 resolution:

    + +
    +Best ResNet50 trained with 176 Resolution +Best ResNet50 trained with 224 Resolution +
    + +

    Exponential Moving Average (EMA)

    + +

    EMA is a technique that allows one to push the accuracy of a model without increasing its complexity or inference time. It performs an exponential moving average on the model weights and this leads to increased accuracy and more stable models. The averaging happens every few iterations and its decay parameter was tuned via grid search. 

    + +

    Below you can see the optimal values for our recipe:

    + +
    model_ema=True, 
    +model_ema_steps=32, 
    +model_ema_decay=0.99998,
    +
    + +

    The use of EMA increases our accuracy by 0.254 points compared to the previous step. Note that TorchVision’s EMA implementation is built on top of PyTorch’s AveragedModel class, with the key difference being that it averages not only the model parameters but also its buffers. Moreover, we have adopted tricks from Pycls which allow us to parameterize the decay in a way that doesn’t depend on the number of epochs.
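    A minimal sketch built directly on PyTorch’s AveragedModel (unlike TorchVision’s implementation, this simplified version does not average the buffers):

    import torch
    from torch.optim.swa_utils import AveragedModel
    from torchvision.models import resnet50

    model = resnet50()
    decay = 0.99998

    def ema_avg(averaged_param, current_param, num_averaged):
        return decay * averaged_param + (1.0 - decay) * current_param

    ema_model = AveragedModel(model, avg_fn=ema_avg)

    # inside the training loop, every model_ema_steps=32 iterations:
    ema_model.update_parameters(model)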

    + +

    Inference Resize tuning

    + +

    Unlike all other steps of the process, which involved training models with different parameters, this optimization was done on top of the final model. During inference, the image is resized to a specific resolution and then a central 224x224 crop is taken from it. The original recipe used a resize size of 256, which caused a similar discrepancy as the one described in the FixRes paper [5]. By bringing this resize value closer to the target inference resolution, one can improve the accuracy. To select the value, we ran a short grid search over the interval [224, 256] with a step of 8. To avoid overfitting, the value was selected using half of the validation set and confirmed using the other half.
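    To make this concrete, here is a minimal sketch of the evaluation transform and the searched interval (illustrative; the normalization constants are the standard ImageNet values):

    from torchvision import transforms

    def make_val_transform(resize_size, crop_size=224):
        return transforms.Compose([
            transforms.Resize(resize_size),
            transforms.CenterCrop(crop_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    # short grid search over the interval [224, 256] with a step of 8
    candidate_resize_sizes = range(224, 257, 8)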

    + +

    Below you can see the optimal value used on our recipe:

    + +
    val_resize_size=232,
    +
    + +

    The above is an optimization which improved our accuracy by 0.224 points. It’s worth noting that the optimal value for ResNet50 also works best for ResNet101, ResNet152 and ResNeXt50, which hints that it generalizes across models:

    + +
    +ResNet50 Inference Resize +ResNet101 Inference Resize +Best ResNet50 trained with 224 Resolution +
    + +

    [UPDATE] Repeated Augmentation

    + +

    Repeated Augmentation [15], [16] is another technique which can improve the overall accuracy and has been used by other strong recipes such as those at [6], [7]. Tal Ben-Nun, a community contributor, has further improved upon our original recipe by proposing training the model with 4 repetitions. His contribution came after the release of this article.

    + +

    Below you can see the optimal value used on our recipe:

    + +
    ra_sampler=True,
    +ra_reps=4,
    +
    + +

    The above is the final optimization which improved our accuracy by 0.184 points. 

    + +

    Optimizations that were tested but not adopted

    + +

    During the early stages of our research, we experimented with additional techniques, configurations and optimizations. Since our target was to keep our recipe as simple as possible, we decided not to include anything that didn’t provide a significant improvement. Here are a few approaches that we took but didn’t make it to our final recipe:

    + +
      +
    • Optimizers: Using more complex optimizers such as Adam, RMSProp or SGD with Nesterov momentum didn’t provide significantly better results than vanilla SGD with momentum.
    • +
    • LR Schedulers: We tried different LR Scheduler schemes such as StepLR and Exponential. Though the latter tends to work better with EMA, it often requires additional hyper-parameters such as defining the minimum LR to work well. Instead, we just use cosine annealing decaying the LR up to zero and choose the checkpoint with the highest accuracy.
    • +
    • Automatic Augmentations: We’ve tried different augmentation strategies such as AutoAugment and RandAugment. None of these outperformed the simpler parameter-free TrivialAugment.
    • +
    • Interpolation: Using bicubic or nearest interpolation didn’t provide significantly better results than bilinear.
    • +
    • Normalization layers: Using Sync Batch Norm didn’t yield significantly better results than using the regular Batch Norm.
    • +
    + +

    Acknowledgements

    + +

    We would like to thank Piotr Dollar, Mannat Singh and Hugo Touvron for providing their insights and feedback during the development of the recipe and for their previous research work on which our recipe is based. Their support was invaluable for achieving the above result. Moreover, we would like to thank Prabhat Roy, Kai Zhang, Yiwen Song, Joel Schlosser, Ilqar Ramazanli, Francisco Massa, Mannat Singh, Xiaoliang Dai, Samuel Gabriel, Allen Goodman and Tal Ben-Nun for their contributions to the Batteries Included project.

    + +

    References

    + +
      +
    1. Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. “Deep Residual Learning for Image Recognition”.
    2. +
    3. Tong He, Zhi Zhang, Hang Zhang, Zhongyue Zhang, Junyuan Xie, Mu Li. “Bag of Tricks for Image Classification with Convolutional Neural Networks”
    4. +
    5. Piotr Dollár, Mannat Singh, Ross Girshick. “Fast and Accurate Model Scaling”
    6. +
    7. Tete Xiao, Mannat Singh, Eric Mintun, Trevor Darrell, Piotr Dollár, Ross Girshick. “Early Convolutions Help Transformers See Better”
    8. +
    9. Hugo Touvron, Andrea Vedaldi, Matthijs Douze, Hervé Jégou. “Fixing the train-test resolution discrepancy”
    10. +
    11. Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. “Training data-efficient image transformers & distillation through attention”
    12. +
    13. Ross Wightman, Hugo Touvron, Hervé Jégou. “ResNet strikes back: An improved training procedure in timm”
    14. +
    15. Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, Vaishaal Shankar. “Do ImageNet Classifiers Generalize to ImageNet?”
    16. +
    17. Samuel G. Müller, Frank Hutter. “TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation”
    18. +
    19. Zhun Zhong, Liang Zheng, Guoliang Kang, Shaozi Li, Yi Yang. “Random Erasing Data Augmentation”
    20. +
    21. Terrance DeVries, Graham W. Taylor. “Improved Regularization of Convolutional Neural Networks with Cutout”
    22. +
    23. Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jon Shlens, Zbigniew Wojna. “Rethinking the Inception Architecture for Computer Vision”
    24. +
    25. Hongyi Zhang, Moustapha Cisse, Yann N. Dauphin, David Lopez-Paz. “mixup: Beyond Empirical Risk Minimization”
    26. +
    27. Sangdoo Yun, Dongyoon Han, Seong Joon Oh, Sanghyuk Chun, Junsuk Choe, Youngjoon Yoo. “CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features”
    28. +
    29. Elad Hoffer, Tal Ben-Nun, Itay Hubara, Niv Giladi, Torsten Hoefler, Daniel Soudry. “Augment your batch: better training with larger batches”
    30. +
    31. Maxim Berman, Hervé Jégou, Andrea Vedaldi, Iasonas Kokkinos, Matthijs Douze. “Multigrain: a unified image embedding for classes and instances”
    32. +
    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/huawei-joins-pytorch/index.html b/blog/huawei-joins-pytorch/index.html new file mode 100644 index 000000000000..ef670f2ca36a --- /dev/null +++ b/blog/huawei-joins-pytorch/index.html @@ -0,0 +1,703 @@ + + + + + + + + + + + + + Huawei Joins the PyTorch Foundation as a Premier Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, the PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, announced that Huawei has joined as a premier member.

    + +

    Huawei has been a long-standing supporter and contributor to the PyTorch Ecosystem, and, through the release of progressive diverse computing, provides easier access to the PyTorch ecosystem for more hardware vendors. By joining as a premier member, Huawei will continue to optimize PyTorch to fully unleash Ascend computing capabilities.

    + +

    “We are delighted to join the PyTorch Foundation, and hope to further collaborate with other member companies and expand the community to a wider audience,” said Zhang Dixuan, President of Huawei Ascend Computing Business. “This move benefits Huawei, PyTorch, and the wider AI ecosystem. It also aligns with our long-held beliefs in openness, innovation, collaboration, and shared success, and we are confident that it will spur new innovations in the global AI community.”

    + +

    Huawei unveiled its All Intelligence strategy to accelerate intelligence across all industries. To cater to the demand for AI computing, Huawei invests in system-level technologies and remains committed to open hardware and open source software that enable partners and foster talent. This strategy aligns with the PyTorch Foundation’s mission to develop AI as part of a sustainable open source ecosystem and produce inclusive technological feats.

    + +

    PyTorch Foundation Executive Director Ibrahim Haddad said, “We are delighted to welcome Huawei to the PyTorch Foundation. Huawei is a leader in research on computer vision, natural language processing, speech recognition, and other emerging areas, and has proven experience in the field of foundation models. We have no doubt that we will benefit from their support and guidance.”

    + +

    As a premier member, Huawei is granted one seat to the PyTorch Foundation Governing Board, and will help set policies, bylaws, and mission and vision statements that define the overarching scope of the PyTorch Foundation’s initiatives, technical vision, and direction.

    + +

    The Board welcomes Huawei representative Fred Li, Head of Computing Open Source Development Team at Huawei. Fred leads an active and creative team in R&D and operations projects under the principle of “upstream first”, which aims to make diverse computing power ubiquitous.

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +

    About Huawei

    + +

    Founded in 1987, Huawei is a leading global provider of information and communications technology (ICT) infrastructure and smart devices. We have 207,000 employees and operate in over 170 countries and regions, serving more than three billion people around the world. We are committed to bringing digital to every person, home and organization for a fully connected, intelligent world.

    + +

    About PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

    + +
    + +

    Huawei Becomes a Premier Member of the PyTorch Foundation

    + +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, today announced that Huawei has joined as a premier member.

    + +

    Huawei has long been a supporter of and contributor to the PyTorch ecosystem. By advancing support for and improvements to diverse computing power, it helps more vendor backends plug into the PyTorch ecosystem with ease, and it is actively committed to PyTorch optimization in order to fully unleash Ascend’s computing power.

    + +

    “By joining the PyTorch Foundation, we can further collaborate with other member companies and accelerate the development of the PyTorch community,” said Zhang Dixuan, President of Huawei Ascend Computing Business. “We believe this is mutually beneficial for Huawei and the PyTorch ecosystem, and it is consistent with our long-standing open source philosophy of open innovation and win-win collaboration, bringing more excitement and innovation to the global AI community.”

    + +

    Huawei has released its All Intelligence strategy to accelerate the intelligent transformation of every industry, driving continuous system-level innovation and committing to open hardware, open source software, enabling partners, and developing talent, in order to meet the diverse AI computing needs of all industries. This perfectly matches and complements the PyTorch Foundation’s mission of advancing AI by nurturing and sustaining an open source ecosystem and making these technological innovations available to everyone.

    + +

    “Huawei has conducted extensive research in computer vision, natural language processing, speech recognition, and other fields, and has also accumulated mature research experience in the field of foundation models. We believe the PyTorch Foundation will benefit greatly from their support for our members and ecosystem,” said PyTorch Foundation Executive Director Ibrahim Haddad.

    + +

    As a premier member, Huawei is granted one seat on the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

    + +

    We are pleased to welcome Li Yongle (Fred Li), General Manager of the Computing Open Source Business at Huawei, to our board. He is currently responsible for the open source business of Huawei’s computing product line, leading a highly innovative and energetic technology and operations team that follows the “upstream first” principle to make diverse computing power ubiquitous.

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +

    About Huawei

    + +

    Founded in 1987, Huawei is a leading global provider of ICT (information and communications technology) infrastructure and smart devices. Our 207,000 employees operate in more than 170 countries and regions, serving more than three billion people around the world. We are committed to bringing digital to every person, home and organization for a fully connected, intelligent world.

    + +

    About the PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure, including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page: www.linuxfoundation.org/trademark-usage. Linux is a registered trademark of Linus Torvalds.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/hugging-face-joins/index.html b/blog/hugging-face-joins/index.html new file mode 100644 index 000000000000..5a3a676ac66d --- /dev/null +++ b/blog/hugging-face-joins/index.html @@ -0,0 +1,675 @@ + + + + + + + + + + + + + Hugging Face Joins the PyTorch Foundation as a Premier Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Smiling hugging face

    + +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Hugging Face has joined as a premier member.

    + +

    Hugging Face has been a long time supporter and contributor to the PyTorch Ecosystem by providing powerful models and resources that accelerate research, development, and adoption of AI technologies, particularly in the field of natural language processing.

    + +

    “Our mission has always been to democratize AI and make it accessible to everyone. We’re truly aligned with PyTorch’s objective of reducing the barrier of entry to practitioners. By joining the PyTorch Foundation, we can further amplify that impact and support this very important framework of the ecosystem that is PyTorch,” said Lysandre Debut, Head of Open Source at Hugging Face. “We believe the two ecosystems have significant overlap, and collaborating with the foundation will allow us to bridge the gap to provide the best software, the best tools to the machine learning community at large.”

    + +

    Hugging Face’s Model Hub and open source libraries promote collaboration and knowledge sharing within the AI open source community, making Hugging Face a great match to the growing PyTorch Foundation. They continue to drive industry adoption and collaboration by creating user-friendly tools and resources and providing accessible and well-documented libraries.

    + +

    “Hugging Face’s commitment to open source development and their exceptional contributions to the PyTorch ecosystem have truly impressed us. With their help, we will drive innovation, foster collaboration, and empower the global AI community to create transformative solutions for the AI community,” said PyTorch Foundation Executive Director Ibrahim Haddad. “We welcome Hugging Face to the PyTorch Foundation and look forward to the achievements that lie ahead.”

    + +

    As a premier member, Hugging Face is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

    + +

    Lysandre Debut

    + +

    We’re happy to welcome Lysandre Debut, Head of Open Source at Hugging Face to our board. Lysandre has been at Hugging Face since the company’s pivot to open-source, and was the first engineer to focus entirely on the open-source mission. Now leading the open-source part of the organization, Lysandre remains technically involved by being a core maintainer of the Transformers library.

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +

    About Hugging Face

    + +

    Hugging Face is a community and company dedicated to lowering the barrier of entry to Machine Learning and Deep Learning. Strong advocates for open-source and open-science, their model Hub hosts more than 250,000 public models and 50,000 public datasets that are very simple to use. Transformers, Diffusers, PEFT, Accelerate, and Datasets are some of the open-source tools made available by Hugging Face.

    + +

    About PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page: www.linuxfoundation.org/trademark-usage. Linux is a registered trademark of Linus Torvalds.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/ibm-joins-pytorch/index.html b/blog/ibm-joins-pytorch/index.html new file mode 100644 index 000000000000..0f1163aa8c0c --- /dev/null +++ b/blog/ibm-joins-pytorch/index.html @@ -0,0 +1,665 @@ + + + + + + + + + + + + + IBM Joins the PyTorch Foundation as a Premier Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    The PyTorch Foundation, part of The Linux Foundation, is pleased to announce that IBM has joined as a premier member.

    + +

    IBM Logo

    + +

    The foundation serves as a neutral space for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. With its extensive industry expertise and leadership in open source and AI, IBM is committed to actively contributing to the PyTorch community.

    + +

    IBM offers a comprehensive portfolio of enterprise AI solutions and recently released watsonx, its next-generation data and AI platform. IBM’s watsonx platform leverages PyTorch to offer an enterprise-grade software stack for end-to-end training and fine-tuning of AI foundation models.

    + +

    “By joining the PyTorch Foundation, we aim to contribute our expertise and resources to further advance PyTorch’s capabilities and make AI more accessible in hybrid cloud environments with flexible hardware options,” said Priya Nagpurkar, Vice President, Hybrid Cloud Platform and Developer Productivity, IBM Research. “We intend for our collaboration with PyTorch to bring the power of foundation models and generative AI to enterprises using the watsonx platform to drive business transformation.”

    + +

    IBM and PyTorch have already collaborated on two projects. The first enables foundation models with billions of parameters to train efficiently on standard cloud networking infrastructure, such as Ethernet networking. Together, IBM and PyTorch have also worked on ways to make checkpointing for AI training considerably more cost-effective, by fixing the distributed checkpointing within PyTorch to support certain types of object storage.

    + +

    “We’re happy to welcome IBM as a premier member. IBM’s expertise and dedication to advancing the field of artificial intelligence align perfectly with the mission of the PyTorch community,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Their commitment to open collaboration and innovation will strengthen our collective efforts to empower developers and researchers worldwide.”

    + +

    As a premier member, IBM is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

    + +

    Raghu Ganti Headshot

    + +

    We’re happy to welcome Raghu Ganti, Principal Research Scientist at IBM Research, to our board. Raghu co-leads IBM Research’s foundation model training and validation platform, built on Red Hat OpenShift. His team primarily contributes to the PyTorch training components, with the mission of democratizing training and validation of foundation models.

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/improve-rag-performance/index.html b/blog/improve-rag-performance/index.html new file mode 100644 index 000000000000..062d6fe9197f --- /dev/null +++ b/blog/improve-rag-performance/index.html @@ -0,0 +1,1035 @@ + + + + + + + + + + + + + Improve RAG performance with torch.compile on AWS Graviton Processors | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Sunita Nadampalli(AWS), Ankith Gunapal(Meta), Hamid Shojanazeri(Meta) + +

    +

    Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to support tasks like answering questions, translating languages, and completing sentences. There are a few challenges when working with LLMs such as domain knowledge gaps, factuality issues, and hallucination, which affect their reliability especially for the fields that require high levels of accuracy, such as healthcare, law, or engineering. Retrieval Augmented Generation (RAG) provides a solution to mitigate some of these issues by augmenting LLMs with a specific domain or an organization’s internal knowledge base, without the need to retrain the model.

    + +

    The RAG knowledge source is generally a business-specific database, typically deployed on general-purpose CPU infrastructure. So, deploying RAG on general-purpose CPU infrastructure alongside related business services is both efficient and cost-effective. With this motivation, we evaluated RAG deployment on AWS Graviton based Amazon EC2 instances, which have been delivering up to a 40% price-performance advantage compared to comparable instances for the majority of workloads, including databases, in-memory caches, big data analytics, media codecs, gaming servers, and machine learning inference.

    + +

    In the past we published a few blog posts on how PyTorch was optimized for AWS Graviton processors to accelerate ML inference performance for both eager mode (blog) and torch.compile mode (blog). In this blog we cover how to deploy a typical RAG workload using PyTorch and torch.compile, how we improved its performance by up to 1.7x for the embedding model and 1.3x for the RAG query on an AWS Graviton3-based m7g.xlarge instance compared to the default PyTorch “eager mode”, and finally a few recommendations that you can apply to your RAG use cases.

    + +

    How to Optimize RAG?

    + +

    Without RAG, the LLM takes the user input and creates a response based on information it was trained on (what it already knows). With RAG, an information retrieval component is introduced that utilizes the user input to first pull information from a new data source. The user query and the relevant information are both given to the LLM. The LLM uses the new knowledge and its training data to create better responses. The following diagram shows the conceptual flow of using RAG with LLMs.

    + +

    Image 1: Conceptual flow of using RAG with LLMs


    + +

    Source: https://aws.amazon.com/what-is/retrieval-augmented-generation/
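    As a rough illustration of that flow, here is a hedged pseudocode-style sketch; llm is a placeholder for any text generation callable, and similarity_search / page_content follow the langchain-style interfaces used later in this post.

    # Hedged sketch of the conceptual RAG flow described above (names are illustrative).
    def rag_answer(query, vectorstore, llm, k=5):
        # 1. Retrieve: embed the query and pull the k most similar chunks
        docs = vectorstore.similarity_search(query, k=k)
        context = "\n".join(doc.page_content for doc in docs)
        # 2. Augment: combine the retrieved context with the user query
        prompt = f"Answer using the context below.\n\nContext:\n{context}\n\nQuestion: {query}"
        # 3. Generate: the LLM answers using its training data plus the retrieved knowledge
        return llm(prompt)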

    + +

    Embedding model

    + +

    At the core of RAG is an embedding model that takes text data and converts it into a vector representation. These vectors are then stored in a vector db. When a user makes a query, the query is first converted to a vector, and RAG performs a similarity search on the vector db. Hence, the first step in optimizing RAG performance is optimizing the embedding model’s inference performance. We used the AWS Graviton3-based m7g.xlarge instance and the HuggingFace sentence-transformer embedding model for the optimization work. Here is a sample script for profiling the HuggingFace sentence-transformer embedding model inference with PyTorch eager mode.

    + +
    import torch
    +from torch.profiler import profile, ProfilerActivity, record_function
    +from transformers import AutoModel, AutoTokenizer
    +
    +model_name = "sentence-transformers/all-mpnet-base-v2"
    +input_text = ["This is an example sentence", "Each sentence is converted"]
    +
    +model = AutoModel.from_pretrained(model_name)
    +tokenizer = AutoTokenizer.from_pretrained(model_name)
    +
    +encoded_input = tokenizer(
    +    input_text, padding=True, truncation=True, return_tensors="pt"
    +)
    +
    +warmup, actual = 100, 100
    +model.eval()
    +
    +with torch.no_grad():
    +    # warmup
    +    for i in range(warmup):
    +        embeddings = model(**encoded_input)
    +
    +    with profile(activities=[ProfilerActivity.CPU]) as prof:
    +        with record_function("model_inference"):
    +            for i in range(actual):
    +                embeddings = model(**encoded_input)
    +        print(prof.key_averages().table(sort_by="self_cpu_time_total"))
    +
    + +

    Eager mode

    + +

    Since PyTorch eager mode was already optimized on AWS Graviton processors with the following runtime environment settings, we included them in the baseline and measured the following performance. Please refer to Optimized PyTorch 2.0 Inference with AWS Graviton processors for more details on how we optimized the PyTorch eager mode on AWS Graviton processors.

    + +
    # Enable the fast math GEMM kernels, to accelerate fp32 inference with bfloat16 gemm
    +export DNNL_DEFAULT_FPMATH_MODE=BF16
    +
    +# Enable Linux Transparent Huge Page (THP) allocations,
    +# to reduce the tensor memory allocation latency
    +export THP_MEM_ALLOC_ENABLE=1
    +
    +# Set LRU Cache capacity to cache the primitives and avoid redundant
    +# memory allocations
    +export LRU_CACHE_CAPACITY=1024
    +
    + +
    ---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
    +---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                aten::addmm        61.01%        2.638s        62.49%        2.702s     370.197us          7300  
    +            model_inference        12.01%     519.161ms       100.00%        4.324s        4.324s             1  
    +                  aten::bmm         6.25%     270.084ms        11.96%     517.089ms     215.454us          2400  
    +               aten::select         3.98%     172.165ms         5.34%     230.863ms       1.331us        173500  
    +                aten::copy_         2.11%      91.133ms         2.11%      91.133ms       6.200us         14700   
    +---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +Self CPU time total: 4.324s
    +
    + +

    Table 1: Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with PyTorch Eager mode

    + +

    Next, we added torch.compile, weights pre-packing, and torch.inference_mode and observed around 1.7x performance improvement. The following section talks about each of these optimizations and the resulting speedup.

    + +

    torch.compile

    + +

    In contrast to eager mode, torch.compile pre-compiles the entire model into a single graph in a manner that’s optimized for running on the given hardware. Please refer to Accelerated PyTorch Inference with torch.compile on AWS Graviton processors for more details on torch.compile features and how we optimized them on AWS Graviton processors. Invoke torch.compile as shown in the following snippet to trigger PyTorch dynamo compilation for the model. This resulted in around a 1.04x performance improvement over the baseline.

    + +
    model = torch.compile(model)
    +
    +----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
    +----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                 aten::addmm        64.46%        2.675s        66.66%        2.766s     378.905us          7300  
    +       Torch-Compiled Region        19.76%     820.085ms        99.04%        4.109s      41.094ms           100  
    +                   aten::bmm         6.66%     276.216ms        12.52%     519.527ms     216.470us          2400  
    +                aten::select         3.98%     164.991ms         5.41%     224.488ms       1.299us        172800  
    +            aten::as_strided         1.66%      69.039ms         1.66%      69.039ms       0.383us        180100  
    +----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +Self CPU time total: 4.149s
    +
    + +

    Table 2: Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile mode

    + +

    Weights pre-packing

    + +

    torch.compile opens up opportunities like pre-packing the model weights into a format that is more suitable for the given hardware during the model compilation, thus improving the performance. Set the following config to trigger weights pre-packing. This resulted in around 1.69x improvement from the baseline.

    + +
    import torch._inductor.config as config
    +config.cpp.weight_prepack=True
    +config.freezing=True
    +
    + +
    -----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
    +-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +    mkldnn::_linear_pointwise        39.10%     994.821ms        41.50%        1.056s     144.628us          7300  
    +        Torch-Compiled Region        35.12%     893.675ms        98.42%        2.504s      25.043ms           100  
    +                    aten::bmm        10.96%     278.859ms        21.66%     551.073ms     229.614us          2400  
    +                 aten::select         7.34%     186.838ms         9.98%     253.840ms       1.469us        172800  
    +             aten::as_strided         2.63%      67.002ms         2.63%      67.002ms       0.388us        172800   
    +-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +Self CPU time total: 2.544s
    +
    + +

    Table 3: Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile and weights pre-packing

    + +

    torch.inference_mode

    + +

    Additionally, use torch.inference_mode() to get extra savings from disabling version counting and view tracking for tensors. Please refer to the PyTorch documentation for more details.

    + +
    with torch.inference_mode():   # instead of: with torch.no_grad():
    +
    + +
    -----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
    +-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +    mkldnn::_linear_pointwise        38.92%     987.276ms        41.17%        1.044s     143.056us          7300  
    +        Torch-Compiled Region        34.92%     885.895ms        98.45%        2.498s      24.975ms           100  
    +                    aten::bmm        11.25%     285.292ms        22.22%     563.594ms     234.831us          2400  
    +                 aten::select         7.74%     196.223ms        10.22%     259.251ms       1.500us        172800  
    +             aten::as_strided         2.48%      63.027ms         2.48%      63.027ms       0.365us        172800  
    +-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
    +Self CPU time total: 2.537s
    +
    + +

    Table 4: Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile, weights pre-packing, and inference_mode

    + +

    The following table shows the incremental performance improvements achieved for the standalone embedding model inference.

    Optimization level              Latency measured (in sec)    Improvement over the baseline
    PyTorch eager mode (Baseline)   0.04324                      NA
    torch.compile                   0.04149                      1.04x
    weights pre-packing             0.02544                      1.69x
    torch.inference_mode            0.02537                      1.70x
    + +

    The following script is an updated example of the embedding model inference with the previously discussed optimizations applied (torch.compile, weights pre-packing, and torch.inference_mode).

    + +
    +
    +import torch
    +from torch.profiler import profile, record_function, ProfilerActivity
    +from transformers import AutoTokenizer, AutoModel
    +import torch._inductor.config as config
    +
    +# trigger weights pre-packing during compilation
    +config.cpp.weight_prepack = True
    +config.freezing = True
    +
    +model_name = "sentence-transformers/all-mpnet-base-v2"
    +input_text = ["This is an example sentence", "Each sentence is converted"]
    +
    +model = AutoModel.from_pretrained(model_name)
    +tokenizer = AutoTokenizer.from_pretrained(model_name)
    +
    +encoded_input = tokenizer(
    +    input_text, padding=True, truncation=True, return_tensors="pt"
    +)
    +
    +warmup, actual = 100, 100
    +model.eval()
    +model = torch.compile(model)
    +
    +# torch.inference_mode() instead of torch.no_grad()
    +with torch.inference_mode():
    +    # warmup
    +    for i in range(warmup):
    +        embeddings = model(**encoded_input)
    +
    +    with profile(activities=[ProfilerActivity.CPU]) as prof:
    +        with record_function("model_inference"):
    +            for i in range(actual):
    +                embeddings = model(**encoded_input)
    +    print(prof.key_averages().table(sort_by="self_cpu_time_total"))
    +
    +
    + +

    End-to-End RAG scenario on CPU

    + +

    After optimizing the embedding model inference, we started with a PyTorch eager mode based RAG setup, mainly to validate the functionality on the CPU backend. We built the RAG solution with HuggingFaceEmbeddings from langchain_community.embeddings, as shown in the following code snippet.

    + +
    from langchain_community.embeddings import HuggingFaceEmbeddings
    +from langchain_community.vectorstores import FAISS
    +from langchain.text_splitter import RecursiveCharacterTextSplitter
    +from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
    +from langchain.prompts import PromptTemplate
    +from langchain_core.prompts import format_document
    +from bs4 import BeautifulSoup as Soup
    +import torch
    +
    +url =  "https://pytorch.org/blog/pytorch2-5/"
    +chunk_size = 1000
    +chunk_overlap = 0
    +embedding_model = "sentence-transformers/all-mpnet-base-v2"
    +N = 5
    +
    +question = "What's new in PyTorch 2.5?"
    +
    +from transformers import AutoTokenizer, AutoModel
    +from typing import Any, List
    +
    +loader = RecursiveUrlLoader(
    +            url=url, max_depth=3, extractor=lambda x: Soup(x, "html.parser").text
    +        )       
    +docs = loader.load()
    +
    +# Split the document into chunks with a specified chunk size
    +text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    +all_splits = text_splitter.split_documents(docs)
    +
    +# Store the document into a vector store with a specific embedding model
    +model = HuggingFaceEmbeddings(model_name=embedding_model)
    +
    +warmup , actual = 100, 100
    +
    +with torch.inference_mode():
    +    vectorstore = FAISS.from_documents(all_splits, model)
    +
    +    for i in range(warmup):
    +        searchDocs = vectorstore.similarity_search(question, k=N)
    +
    +    import time
    +
    +    start = time.time()
    +    for i in range(actual):
    +        searchDocs = vectorstore.similarity_search(question, k=N)
    +    end = time.time()
    +    print(f"Time for 1 inference is {(end-start)/actual} seconds")
    +
    +    doc_prompt = PromptTemplate.from_template("{page_content}")
    +    context = ""
    +    for i, doc in enumerate(searchDocs):
    +        context += f"\n{format_document(doc, doc_prompt)}\n"
    +
    + +

    Next, our goal was to optimize the end-to-end RAG use case with torch.compile and weights pre-packing that gave 1.7x improvement for the standalone embedding model inference. However, the optimizations didn’t work out of the box for the RAG scenario.

    + +

    What are the challenges and solutions to achieve similar gains in an end-to-end RAG scenario?

    + +

    Challenge 1: model handle

    + +

    There was no way to get a handle to the model that HuggingFaceEmbeddings instantiates, and the wrapper class doesn’t expose a compile API. So, there was no way for our application to invoke torch.compile and trigger the PyTorch dynamo compilation process.

    + +

    Solution

    + +

    We implemented a custom embedding class so that we could get a handle to the model. It instantiates the embedding model from sentence-transformers and maintains the handle for immediate compilation or compilation at a later stage. With this, we were able to trigger torch.compile and hence the dynamo compilation.

    + +
    class CustomEmbedding(HuggingFaceEmbeddings):
    +    
    +    def __init__(self, **kwargs: Any):
    +        """Initialize the sentence_transformer."""
    +        super().__init__(**kwargs)
    +
    +        # Load model from HuggingFace Hub
    +        self.client = AutoModel.from_pretrained(self.model_name)
    +    class Config:
    +        arbitrary_types_allowed = True
    +
    +
    +    
    +    def embed_documents(self, texts: List[str]) -> List[List[float]]:
    +        """Compute doc embeddings using a HuggingFace transformer model.
    +        Args:
    +            texts: The list of texts to embed.
    +        Returns:
    +            List of embeddings, one for each text.
    +        """
    +
    +        texts = list(map(lambda x: x.replace("\n", " "), texts))
    +
    +        # Tokenize sentences
    +        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    +        encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    +        
    +        embeddings = self.client(
    +           **encoded_input, output_hidden_states=True
    +        )
    +        embeddings = embeddings.pooler_output.detach().numpy()
    +
    +        return embeddings.tolist()
    +
    +# instead of model = HuggingFaceEmbeddings(model_name=embedding_model)
    +model = CustomEmbedding(model_name=embedding_model)
    +
    +# torch.compile the model
    +model.client = torch.compile(model.client)
    +
    + +

    Challenge 2: triggering the optimization

    + +

    For a typical inference scenario where the graph is frozen and gradient calculations are disabled, Torch inductor (the compiler backend we used for CPUs) invokes hardware-specific optimizations like graph rewrites into more performant operators, operator fusion, and weights pre-packing. Though Torch dynamo was able to see the model and trigger generic compilation, it failed to trigger these additional Fx passes in Torch inductor.

    + +

    There were two main reasons why Torch inductor did not trigger the optimization passes: (1) the application didn’t set no_grad() or inference_mode(), so Torch inductor couldn’t detect that the graph was frozen; and (2) we hit a limitation in the torch.compile framework: if no_grad is set only at the beginning of the compiled region, torch.compile can’t detect it while invoking the inductor Fx passes, because it has not hit the no_grad region by then. Please refer to this GitHub issue for more details.

    + +

    Solution

    + +

    We worked around this limitation by moving the no_grad() context out of the model class and into the application code. With this, the model compilation happened as expected and gave around a 1.3x performance improvement when we profiled the stable inference pass of the eager and compiled versions.
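    For illustration, here is a minimal sketch of that workaround, assuming the CustomEmbedding class and the variables (embedding_model, all_splits, question, N) from the snippets above: the inference_mode() context wraps the vector store construction and queries at the application level, rather than sitting inside embed_documents.

    # Minimal sketch (reuses CustomEmbedding, FAISS, all_splits, question, N from above):
    # keep the context manager in the application code so torch.compile/inductor
    # sees a frozen, no-grad graph inside the compiled model.
    import torch

    model = CustomEmbedding(model_name=embedding_model)
    model.client = torch.compile(model.client)   # compile the underlying transformer

    with torch.inference_mode():                 # moved out of the model class
        vectorstore = FAISS.from_documents(all_splits, model)
        searchDocs = vectorstore.similarity_search(question, k=N)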

    + +

    Challenge 3: extra compilation

    + +

    With the previous fixes, the query lookup inference performance improved, but not the total execution time of the benchmarking script. We root-caused it to redundant compilation of the model during RAG inference. Further deep diving revealed that it was caused by a batch size mismatch between the document embedding and the RAG query stages. For example, in our benchmarking script, when the database was vectorized and stored in the vector db, we used a batch size of 16, so the model was compiled with shapes of 16xNxK, whereas the RAG query lookup is usually a single request of shape 1xNxK. So, there was a batch size mismatch (dimension “0” of these tensors) that triggered recompilation for the query lookup stage. We confirmed it with the following Torch logging: TORCH_LOGS="recompiles"

    + +
    TORCH_LOGS="recompiles" python rag_compile.py 
    +V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles] Recompiling function forward in site-packages/transformers/models/mpnet/modeling_mpnet.py:502
    +V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles]     triggered by the following guard failure(s):
    +V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles]     - 0/0: tensor 'L['input_ids']' size mismatch at index 0. expected 16, actual 1
    +
    + +

    Solution

    + +

    Torch dynamo provides decorators to mark a dimension of a given tensor as dynamic and to specify an expected value for it, so that re-compilation is not triggered. For example, specifying dimension “0” of input_ids and attention_mask as dynamic, and specifying that the value “1” is allowed in that dimension (as shown in the following code snippet), should have avoided the redundant compilations.

    + +
    torch._dynamo.decorators.mark_unbacked(encoded_input['input_ids'], 0)
    +torch._dynamo.mark_dynamic(encoded_input['input_ids'], 1)
    +torch._dynamo.decorators.mark_unbacked(encoded_input['attention_mask'], 0)
    +torch._dynamo.mark_dynamic(encoded_input['attention_mask'], 1)
    +
    + +

    However, the Torch dynamo decorators and marking didn’t work in this particular case, and using them created graph breaks. So, we added some warmup iterations to hide the compilation latency and profiled the query lookup performance in the steady state. The good news is that, in practice, this re-compilation is triggered only for the first query, so it might not affect the production scenario if the database size is fixed. Moreover, PyTorch AOT Inductor (a new feature in PyTorch) addresses the re-compilation and warmup challenges of torch.compile. In a follow-up blog we will cover how AOT Inductor can be used to address these challenges in a production environment.

    + +

    With these solutions we were able to apply torch.compile, weights pre-packing, and the AWS Graviton-specific optimizations to an end-to-end RAG scenario and improve its performance by 1.3x over the baseline eager mode.

    + +

    Deployment

    + +

    A detailed guide on how to deploy the torch.compile-optimized RAG solution on AWS Graviton-based Amazon EC2 instances, and how to deploy it in conjunction with Llama using TorchServe, can be found on the PyTorch website.

    + +

    Conclusion

    + +

    In this blog, we covered how we optimized embedding model inference performance on AWS Graviton3-based EC2 instances. We also shared the challenges we faced, the solutions we implemented to bring those optimizations to a RAG use case, and the resulting speedups. We hope that you will give it a try! If you need any support with ML software on Graviton, please open an issue on the AWS Graviton Technical Guide GitHub.

    + +

    We would like to express our gratitude to Eli Uriegas for the support in making this blog post happen.

    + +

    Authors

    + +

    Sunita Nadampalli is a Principal Engineer and AI/ML expert at AWS. She leads AWS Graviton software performance optimizations for AI/ML and HPC workloads. She is passionate about open source software development and delivering high-performance and sustainable software solutions for SoCs based on the Arm ISA.

    + +

    Ankith Gunapal is an AI Partner Engineer at Meta (PyTorch). He leads customer support, evangelism, and release engineering for TorchServe. He is passionate about solving production problems in model inference and model serving. He also enjoys distilling technically complex material into a user-friendly format.

    + +

    Hamid Shojanazeri leads the AI Frameworks Partner Engineering team at Meta. He is passionate about building scalable AI solutions and specializes in working with PyTorch to tackle the challenges of large-scale distributed training, inference, model serving, and optimization.

    + +
diff --git a/blog/index.html b/blog/index.html
new file mode 100644
index 000000000000..f448c42d70c5
--- /dev/null
+++ b/blog/index.html
@@ -0,0 +1,995 @@
+ Blog | PyTorch
    Featured Post

    Recap of the PyTorch Korea User Group Meetup: A Technical Conference with a PyTorch Core Maintainer

    May 02, 2025
    PyTorch Day France Featured Sessions: A Defining Moment for Open Source AI
    PyTorch Day France offers a front-row seat to the future of open source AI. Taking place 7 May at Station F in Paris and co-located with GOSIM AI Paris, this one-day event will bring together developers, researchers, and industry leaders for a day of technical sessions, real-world insights, and community exchange.

    May 01, 2025
    Announcing the PyTorch Docathon 2025

    April 30, 2025
    FlexAttention Part II: FlexAttention for Inference

    April 30, 2025
    6x faster Async Checkpointing in PyTorch, using Cached Plans, no GIL contention
    Meta: Less Wright, Meet Vadakkanchery, Saurabh Mishra, Ela Krepska, Hamid Shojanazeri, Pradeep Fernando
    Crusoe: Ethan Petersen, Martin Cala, Chip Smith

    April 29, 2025
    PyTorch Foundation Expands to an Umbrella Foundation to Accelerate AI Innovation
    Today, I am thrilled to announce a significant milestone for the PyTorch Foundation: we are expanding our scope to become an umbrella foundation, allowing us to host additional projects. This expansion positions the PyTorch Foundation to foster a broader ecosystem of high-value, trusted, and innovative AI projects that cater to all stages of the AI lifecycle, from training and inference to industry-specific applications.

    April 28, 2025
    Accelerating Large Scale Training and Convergence with PyTorch Float8 Rowwise on Crusoe 2K H200s
    Meta: Less Wright, Hamid Shojanazeri, Vasiliy Kuznetsov, Daniel Vega-Myhre, Gokul Nadathur, Will Constable, Tianyu Liu, Tristan Rice, Driss Guessous, Josh Fromm, Luca Wehrstedt, Jiecao Yu
    Crusoe: Ethan Petersen, Martin Cala, Chip Smith

    April 25, 2025
    Accelerate PyTorch 2.7 on Intel® GPUs
    PyTorch 2.7 continues to deliver significant functionality and performance enhancements on Intel® GPU architectures to streamline AI workflows. Application developers and researchers seeking to fine-tune, inference and develop PyTorch models on Intel GPUs will now have a consistent user experience across various operating systems, including Windows, Linux and Windows Subsystem for Linux (WSL2). This is made possible through improved installation, eager mode script debugging, a performance pro...
diff --git a/blog/inside-the-matrix/index.html b/blog/inside-the-matrix/index.html
new file mode 100644
index 000000000000..87ecf6947944
--- /dev/null
+++ b/blog/inside-the-matrix/index.html
@@ -0,0 +1,1075 @@
+ Inside the Matrix: Visualizing Matrix Multiplication, Attention and Beyond | PyTorch

    + by Basil Hosmer

    +

    Use 3D to visualize matrix multiplication expressions, attention heads with real weights, and more.

    + +

    Matrix multiplications (matmuls) are the building blocks of today’s ML models. This note presents mm, a visualization tool for matmuls and compositions of matmuls.

    + +

    Matrix multiplication is inherently a three-dimensional operation. Because mm uses all three spatial dimensions, it can convey meaning more clearly and intuitively than the usual squares-on-paper idioms, especially (though not only) for visual/spatial thinkers.

    + +

    We also have room to compose matmuls in geometrically consistent ways - so we can visualize big, compound structures like attention heads and MLP layers using the same rules as simple expressions. And more advanced features, like animating different matmul algorithms, partitioning for parallelism, and loading external data to explore the behavior of actual models, all build naturally on this foundation.

    + +

    mm is fully interactive, runs in the browser and keeps its complete state in the URL, so links are shareable sessions (the screenshots and videos in this note all have links that open the corresponding visualization in the tool). This reference guide describes all of the available functionality.

    + +

    We’ll first introduce the visualization approach, build intuition by visualizing some simple matmuls and expressions, then dive into some more extended examples:

    + +
      1. Pitch - why is this way of visualizing better?
      2. Warmup - animations - watching the canonical matmul decompositions in action
      3. Warmup - expressions - a quick tour of some fundamental expression building blocks
      4. Inside an attention head - an in-depth look at the structure, values and computation behavior of a couple of attention heads from GPT2 via NanoGPT
      5. Parallelizing attention - visualizing attention head parallelization with examples from the recent Blockwise Parallel Transformer paper
      6. Sizes in an attention layer - what do the MHA and FFN halves of an attention layer look like together, when we visualize a whole layer as a single structure? How does the picture change during autoregressive decoding?
      7. LoRA - a visual explanation of this elaboration of the attention head architecture
      8. Wrapup - next steps and call for feedback
    + +

    1 Pitch

    + +

    mm’s visualization approach is based on the premise that matrix multiplication is fundamentally a three-dimensional operation.

    + +

    In other words this:

    + +

    matrix multiplication is fundamentally a three-dimensional operation

    + +

    is a sheet of paper trying to be this (open in mm):

    + +

    wrap the matmul around a cube

    + +

    When we wrap the matmul around a cube this way, the correct relationships between argument shapes, result shape and shared dimensions all fall into place.

    + +

    Now the computation makes geometric sense: each location i, j in the result matrix anchors a vector running along the depth dimension k in the cube’s interior, where the horizontal plane extending from row i in L and the vertical plane extending from column j in R intersect. Along this vector, pairs of (i, k) and (k, j) elements from the left and right arguments meet and are multiplied, the products are summed along k, and the sum is deposited in location i, j of the result.
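    In symbols, this geometric picture is just the familiar definition of the product of an i x k matrix L and a k x j matrix R (stated here in LaTeX for reference):

    $$ (L \,@\, R)_{i,j} \;=\; \sum_{k} L_{i,k} \, R_{k,j} $$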

    + +

    (Jumping ahead momentarily, here’s an animation.)

    + +

    This is the intuitive meaning of matrix multiplication:

    + +
      1. project two orthogonal matrices into the interior of a cube
      2. multiply the pair of values at each intersection, forming a grid of products
      3. sum along the third orthogonal dimension to produce a result matrix.
    + +

    For orientation, the tool displays an arrow in the cube’s interior that points towards the result matrix, with a blue vane coming from the left argument and a red vane coming from the right argument. The tool also displays white guidelines to indicate the row axis of each matrix, though they’re faint in this screenshot.

    + +

    The layout constraints are straightforward:

    + +
      +
    • left argument and result must be adjoined along their shared height (i) dimension
    • +
    • right argument and result must be adjoined along their shared width (j) dimension
    • +
    • left and right arguments must be adjoined along their shared (left width/right height) dimension, which becomes the matmul’s depth (k) dimension
    • +
    + +

    This geometry gives us a solid foundation for visualizing all the standard matmul decompositions, and an intuitive basis for exploring nontrivially complex compositions of matmuls, as we’ll see below.

    + +

    2 Warmup - animations

    + +

    Before diving into some more complex examples, we’ll run through a few intuition builders to get a feel for how things look and feel in this style of visualization.

    + +

    2a Dot product

    + +

    First, the canonical algorithm - computing each result element by taking the dot product of the corresponding left row and right column. What we see in the animation is the sweep of multiplied value vectors through the cube’s interior, each delivering a summed result at the corresponding position.

    + +

    Here, L has blocks of rows filled with 1 (blue) or -1 (red); R has column blocks filled similarly. k is 24 here, so the result matrix (L @ R) has blue values of 24 and red values of -24 (open in mm - long click or control-click to inspect values):

    + +

    + +

    + +

    2b Matrix-vector products

    + +

    A matmul decomposed into matrix-vector products looks like a vertical plane (a product of the left argument with each column of the right argument) painting columns onto the result as it sweeps horizontally through the cube’s interior (open in mm):

    + +

    + +

    + +

    Observing the intermediate values of a decomposition can be quite interesting, even in simple examples.

    + +

    For instance, note the prominent vertical patterns in the intermediate matrix-vector products when we use randomly initialized arguments - reflecting the fact that each intermediate is a column-scaled replica of the left argument (open in mm):

    + +

    + +

    + +

    2c Vector-matrix products

    + +

    A matmul decomposed into vector-matrix products looks like a horizontal plane painting rows onto the result as it descends through the cube’s interior (open in mm):

    + +

    + +

    + +

    Switching to randomly initialized arguments, we see patterns analogous to those we saw with matrix-vector products - only this time the patterns are horizontal, corresponding to the fact that each intermediate vector-matrix product is a row-scaled replica of the right argument.

    + +

    When thinking about how matmuls express the rank and structure of their arguments, it’s useful to envision both of these patterns happening simultaneously in the computation (open in mm):

    + +

    + +

    + +

    Here’s one more intuition builder using vector-matrix products, showing how the identity matrix functions exactly like a mirror set at a 45deg angle to both its counterargument and the result (open in mm):

    + +

    + +

    + +

    2d Summed outer products

    + +

    The third planar decomposition is along the k axis, computing the matmul result by a pointwise summation of vector outer products. Here we see the plane of outer products sweeping the cube “from back to front”, accumulating into the result (open in mm):

    + +

    + +

    + +

    Using randomly initialized matrices with this decomposition, we can see not just values but rank accumulate in the result, as each rank-1 outer product is added to it.
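    For readers who prefer code to animations, here is a small sketch (not part of mm; shapes are arbitrary) checking that the decompositions from sections 2a-2d - dot products, matrix-vector products, vector-matrix products and summed outer products - all produce the same result:

    import torch

    L = torch.randn(4, 6)   # i x k
    R = torch.randn(6, 5)   # k x j
    ref = L @ R

    # 2a dot products: one result element at a time (row i of L with column j of R)
    dots = torch.stack([torch.stack([L[i] @ R[:, j] for j in range(5)]) for i in range(4)])

    # 2b matrix-vector products: one result column at a time
    cols = torch.stack([L @ R[:, j] for j in range(5)], dim=1)

    # 2c vector-matrix products: one result row at a time
    rows = torch.stack([L[i] @ R for i in range(4)], dim=0)

    # 2d summed outer products: accumulate rank-1 outer products along k
    outer = sum(torch.outer(L[:, k], R[k]) for k in range(6))

    for candidate in (dots, cols, rows, outer):
        assert torch.allclose(candidate, ref, atol=1e-5)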

    + +

    Among other things this builds intuition for why “low-rank factorization” - i.e. approximating a matrix by constructing a matmul whose arguments are small in the depth dimension - works best when the matrix being approximated is low rank. We’ll return to this with LoRA in a later section (open in mm):
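    To make that concrete, here is a tiny sketch (the shapes and the use of an SVD are my own illustration, not something mm does): a matrix that is genuinely rank r is recovered exactly by a matmul whose arguments are only r deep.

    import torch

    i, j, r = 64, 48, 4
    A = torch.randn(i, r) @ torch.randn(r, j)      # a genuinely rank-r matrix

    # low-rank factorization: B (i x r) @ C (r x j), i.e. only r summed outer products
    U, S, Vh = torch.linalg.svd(A, full_matrices=False)
    B = U[:, :r] * S[:r]                           # scale the top r left singular vectors
    C = Vh[:r]
    print(torch.dist(A, B @ C))                    # ~0 for this low-rank A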

    + +

    + +

    + +

    3 Warmup - expressions

    + +

    How can we extend this visualization approach to compositions of matmuls? Our examples so far have all visualized a single matmul L @ R of some matrices L and R - what about when L and/or R are themselves matmuls, and so on transitively?

    + +

    It turns out we can extend the approach nicely to compound expressions. The key rules are simple: the subexpression (child) matmul is another cube, subject to the same layout constraints as the parent, and the result face of the child is simultaneously the corresponding argument face of the parent, like a covalently shared electron.

    + +

    Within these constraints, we’re free to arrange the faces of a child matmul however we like. Here we use the tool’s default scheme, which generates alternating convex and concave cubes - this layout works well in practice to maximize use of space and minimize occlusion. (Layouts are completely customizable, however - see the reference for details.)

    + +

    In this section we’ll visualize some of the key building blocks we find in ML models, to gain fluency in the visual idiom and to see what intuitions even simple examples can give us.

    + +

    3a Left-associative expressions

    + +

    We’ll look at two expressions of the form (A @ B) @ C, each with its own distinctive shape and character. (Note: mm adheres to the convention that matrix multiplication is left-associative and writes this simply as A @ B @ C.)

    + +

    First we’ll give A @ B @ C the characteristic FFN shape, in which the “hidden dimension” is wider than the “input” or “output” dimensions. (Concretely in the context of this example, this means that the width of B is greater than the widths of A or C.)

    + +

    As in the single matmul examples, the floating arrows point towards the result matrix, blue vane coming from the left argument and red vane from right argument (open in mm):

    + +

    As in the single matmul examples, the floating arrows point towards the result matrix, blue vane coming from the left argument and red vane from right argument

    + +

    Next we’ll visualize A @ B @ C with the width of B narrower than that of A or C, giving it a bottleneck or “autoencoder” shape (open in mm):

    + +

    visualize A @ B @ C with the width of B narrower than that of A or C

    + +

    This pattern of alternating convex and concave blocks extends to chains of arbitrary length: for example this multilayer bottleneck (open in mm):

    + +

    pattern of alternating convex and concave blocks extends to chains of arbitrary length

    + +

    3b Right associative expressions

    + +

    Next we’ll visualize a right-associative expression A @ (B @ C).

    + +

    In the same way left-associative expressions extend horizontally - sprouting from the left argument of the root expression, so to speak - right-associative chains extend vertically, sprouting from the root’s right argument.

    + +

    One sometimes sees an MLP formulated right-associatively, i.e. with columnar input on the right and weight layers running right to left. Using the matrices from the 2-layer FFN example pictured above - suitably transposed - here’s what that looks like, with C now playing the role of the input, B the first layer and A the second layer (open in mm):

    + +

    an MLP formulated right-associatively

    + +

    Aside: in addition to the color of the arrow vanes (blue for left, red for right), a second visual cue for distinguishing left and right arguments is their orientation: the rows of the left argument are coplanar with those of the result - they stack along the same axis (i). Both cues tell us for example that B is the left argument to (B @ C) above.

    + +

    3c Binary expressions

    + +

    For a visualization tool to be useful beyond simple didactic examples, visualizations need to remain legible as expressions get more complicated. A key structural component in real-world use cases is binary expressions - matmuls with subexpressions on both the left and right.

    + +

    Here we’ll visualize the simplest such expression shape, (A @ B) @ (C @ D) (open in mm):

    + +

    binary expressions - matmuls with subexpressions on both the left and right

    + +

    3d Quick aside: partitioning and parallelism

    + +

    A full presentation of this topic is out of scope for this note, though we’ll see it in action later in the context of attention heads. But as a warmup, two quick examples should give a sense of how this style of visualization makes reasoning about parallelizing compound expressions very intuitive, via the simple geometry of partitioning.

    + +

    In the first example we’ll apply the canonical “data parallel” partitioning to the left-associative multilayer bottleneck example above. We partition along i, segmenting the initial left argument (“batch”) and all intermediate results (“activations”), but none of the subsequent arguments (“weights”) - the geometry making it obvious which participants in the expression are segmented and which remain whole (open in mm):

    + +

    the canonical "data parallel" partitioning to the left-associative multilayer bottleneck example
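    In code, the claim is simply that row blocks of the batch can be pushed through the (whole) weights independently; a minimal sketch for a two-weight chain A @ B @ C, with made-up shapes:

    import torch

    A = torch.randn(32, 16)    # "batch"  (partitioned along i)
    B = torch.randn(16, 64)    # weights  (needed whole by every block)
    C = torch.randn(64, 16)    # weights  (needed whole by every block)

    blocks = A.chunk(8, dim=0)                            # partition along i
    out = torch.cat([a @ B @ C for a in blocks], dim=0)   # each block is independent
    assert torch.allclose(out, A @ B @ C, atol=1e-4)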

    + +

    The second example would (for me, anyway) be much harder to build intuition about without clear geometry to support it: it shows how a binary expression can be parallelized by partitioning the left subexpression along its j axis, the right subexpression along its i axis, and the parent expression along its k axis (open in mm):

    + +

    a binary expression can be parallelized by partitioning the left subexpression along its j axis, the right subexpression along its i axis, and the parent expression along its k axis

    + +

    4 Inside an Attention Head

    + +

    Let’s look at a GPT2 attention head - specifically layer 5, head 4 of the “gpt2” (small) configuration (layers=12, heads=12, embed=768) from NanoGPT, using OpenAI weights via HuggingFace. Input activations are taken from a forward pass on an OpenWebText training sample of 256 tokens.

    + +

    There’s nothing particularly unusual about this particular head; I chose it mainly because it computes a fairly common attention pattern and lives in the middle of the model, where activations have become structured and show some interesting texture. (Aside: in a subsequent note I’ll present an attention head explorer that lets you visualize all layers and heads of this model, along with some travel notes.)

    + +

    Open in mm (may take a few seconds to fetch model weights)

    + +

    There's nothing particularly unusual about this particular head

    + +

    4a Structure

    + +

    The entire attention head is visualized as a single compound expression, starting with input and ending with projected output. (Note: to keep things self-contained we do per-head output projection as described in Megatron-LM.)

    + +

    The computation contains six matmuls:

    + +
    Q = input @ wQ        // 1
    +K_t = wK_t @ input_t  // 2
    +V = input @ wV        // 3
    +attn = sdpa(Q @ K_t)  // 4
    +head_out = attn @ V   // 5
    +out = head_out @ wO   // 6
    +
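    As a cross-check on the recipe above, here is a hedged PyTorch sketch of the same six matmuls for a single head. The dimensions follow the GPT2-small head described above; the variable names, the random weights, and the explicit causal mask are my own, and sdpa is written out as a masked softmax.

    import math
    import torch

    T, E, d = 256, 768, 64                      # sequence length, embed dim, head dim
    x = torch.randn(T, E)                       # "input": activations entering the head
    wQ, wK, wV = (torch.randn(E, d) for _ in range(3))
    wO = torch.randn(d, E)                      # per-head out-projection

    Q = x @ wQ                                  # 1
    K_t = (x @ wK).T                            # 2  (wK_t @ input_t, via the transpose)
    V = x @ wV                                  # 3
    scores = (Q @ K_t) / math.sqrt(d)
    mask = torch.ones(T, T).tril().bool()       # causality: lower-triangular attention
    attn = torch.softmax(scores.masked_fill(~mask, float("-inf")), dim=-1)  # 4: sdpa
    head_out = attn @ V                         # 5
    out = head_out @ wO                         # 6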
    + +

    A thumbnail description of what we’re looking at:

    + +
      +
    • the blades of the windmill are matmuls 1, 2, 3 and 6: the former group are the in-projections from input to Q, K and V; the latter is the out-projection from attn @ V back to the embedding dimension.
    • +
    • at the hub is the double matmul that first calculates attention scores (convex cube in back), then uses them to produce output tokens from the values vector (concave cube in front). Causality means that the attention scores form a lower triangle.
    • +
    + +

    But I’d encourage exploring this example in the tool itself, rather than relying on the screenshot or the video below to convey just how much signal can be absorbed from it - both about its structure and the actual values flowing through the computation.

    + +

    4b Computation and Values

    + +

    Here’s an animation of the attention head computation. Specifically, we’re watching

    + +
    sdpa(input @ wQ @ K_t) @ V @ wO
    +
    + +

    (i.e., matmuls 1, 4, 5 and 6 above, with K_t and V precomputed) being computed as a fused chain of vector-matrix products: each item in the sequence goes all the way from input through attention to output in one step. More on this animation choice in the later section on parallelization, but first let’s look at what the values being computed tell us.

    + +

    Open in mm

    + +

    + +

    + +

    There’s a lot of interesting stuff going on here.

    + +
      +
    • Before we even get to the attention calculation, it’s quite striking how low-rank Q and K_t are. Zooming in on the Q @ K_t vector-matrix product animation, the situation is even more vivid: a significant number of channels (embedding positions) in both Q and K look more or less constant across the sequence, implying that the useful attention signal is potentially driven by only a smallish subset of the embedding. Understanding and exploiting this phenomenon is one of the threads we’re pulling on as part of the SysML ATOM transformer efficiency project.
    • +
    • Perhaps most familiar is the strong-but-not-perfect diagonal that emerges in the attention matrix. This is a common pattern, showing up in many of the attention heads of this model (and those of many transformers). It produces localized attention: the value tokens in the small neighborhood immediately preceding an output token’s position largely determine that output token’s content pattern.
    • +
    • However, the size of this neighborhood and the influence of individual tokens within it vary nontrivially - this can be seen both in the off-diagonal frost in the attention grid, and in the fluctuating patterns of the attn[i] @ V vector-matrix product plane as it descends the attention matrix on its way through the sequence.
    • +
    • But note that the local neighborhood isn’t the only thing that’s attracting attention: the leftmost column of the attention grid, corresponding to the first token of the sequence, is entirely filled with nonzero (but fluctuating) values, meaning every output token will be influenced to some degree by the first value token.
    • +
    • Moreover there’s an inexact but discernible oscillation in attention score dominance between the current token neighborhood and the initial token. The period of the oscillation varies, but broadly speaking starts short and then lengthens as one travels down the sequence (evocatively correlated with the quantity of candidate attention tokens for each row, given causality).
    • +
    • To get a feel for how (attn @ V) is formed, it’s important not to focus on attention in isolation - V is an equal player. Each output item is a weighted average of the entire V vector: at the limit when attention is a perfect diagonal, attn @ V is simply an exact copy of V. Here we see something more textured: visible banding where particular tokens have scored high over a contiguous subsequence of attention rows, superimposed on a matrix visibly similar to V but with some vertical smearing due to the fat diagonal. (Aside: per the mm reference guide, long-clicking or control-clicking will reveal the actual numeric values of visualized elements.)
    • +
    • Bear in mind that since we’re in a middle layer (5), the input to this attention head is an intermediate representation, not the original tokenized text. So the patterns seen in the input are themselves thought-provoking - in particular, the strong vertical threads are particular embedding positions whose values are uniformly high magnitude across long stretches of the sequence - sometimes almost the entire thing.
    • +
    • Interestingly, though, the first vector in the input sequence is distinctive, not only breaking the pattern of these high-magnitude columns but carrying atypical values at almost every position (aside: not visualized here, but this pattern is repeated over multiple sample inputs).
    • +
    + +

    Note: apropos of the last two bullet points, it’s worth reiterating that we’re visualizing computation over a single sample input. In practice I’ve found that each head has a characteristic pattern it will express consistently (though not identically) over a decent collection of samples (and the upcoming attention head browser will provide a collection of samples to play with), but when looking at any visualization that includes activations, it’s important to bear in mind that a full distribution of inputs may influence the ideas and intuitions it provokes in subtle ways.

    + +

    Finally, one more pitch to explore the animation directly!

    + +

    4c Heads are different in interesting ways

    + +

    Before we move on, here’s one more demonstration of the usefulness of simply poking around a model to see how it works in detail.

    + +

    This is another attention head from GPT2. It behaves quite differently from layer 5, head 4 above - as one might expect, given that it’s in a very different part of the model. This head is in the very first layer: layer 0, head 2 (open in mm, may take a few seconds to load model weights):

    + +

    This is another attention head from GPT2

    + +

    Things to note:

    + +
      +
    • This head spreads attention very evenly. This has the effect of delivering a relatively unweighted average of V (or rather, the appropriate causal prefix of V) to each row in attn @ V, as can be seen in this animation: as we move down the attention score triangle, the attn[i] @ V vector-matrix product is small fluctuations away from being simply a downscaled, progressively revealed copy of V.
    • +
    • attn @ V has striking vertical uniformity - in large columnar regions of the embedding, the same value patterns persist over the entire sequence. One can think of these as properties shared by every token.
    • +
    • Aside: on the one hand one might expect some uniformity in attn @ V given the effect of very evenly spread attention. But each row has been constructed from only a causal subsequence of V rather than the whole thing - why is that not causing more variation, like a progressive morphing as one moves down the sequence? By visual inspection V isn’t uniform along its length, so the answer must lie in some more subtle property of its distribution of values.
    • +
    • Finally, this head’s output is even more vertically uniform after out-projection: the strong impression is that the bulk of the information being delivered by this attention head consists of properties which are shared by every token in the sequence. The composition of its output projection weights reinforces this intuition.
    • +
    + +

    Overall, it’s hard to resist the idea that the extremely regular, highly structured information this attention head produces might be obtained by computational means that are a bit… less lavish. Of course this isn’t an unexplored area, but the specificity and richness of signal of the visualized computation has been useful in generating new ideas, and reasoning about existing ones.

    + +

    4d Revisiting the pitch: invariants for free

    + +

    Stepping back, it’s worth reiterating that the reason we can visualize nontrivially compound operations like attention heads and have them remain intuitive is that important algebraic properties - like how argument shapes are constrained, or which parallelization axes intersect which operations - don’t require additional thinking: they arise directly from the geometry of the visualized object, rather than being additional rules to keep in mind.

    + +

    For example, in these attention head visualizations it’s immediately obvious that

    + +
      +
    • Q and attn @ V are the same length, K and V are the same length, and the lengths of these pairs are independent of each other
    • +
    • Q and K are the same width, V and attn @ V are the same width, and the widths of these pairs are independent of each other.
    • +
    + +

    These properties are true by construction, as a simple consequence of which parts of the compound structure the constituents inhabit and how they are oriented.

    + +

    This “properties for free” benefit can be especially useful when exploring variations on a canonical structure - an obvious example being the one-row-high attention matrix in autoregressive token-at-a-time decoding (open in mm):

    + +

    the one-row-high attention matrix in autoregressive token-at-a-time decoding

    + +

    5 Parallelizing attention

    + +

    In the animation of layer 5, head 4 above, we visualize 4 of the 6 matmuls in the attention head

    + +

    as a fused chain of vector-matrix products, confirming the geometric intuition that the entire left-associative chain from input to output is laminar along the shared i axis, and can be parallelized.

    + +

    5a Example: partitioning along i

    + +

    To parallelize the computation in practice, we would partition the input into blocks along the i axis. We can visualize this partition in the tool, by specifying that a given axis be partitioned into a particular number of blocks - in these examples we’ll use 8, but there’s nothing special about that number.

    + +

    Among other things, this visualization makes clear that wQ (for in-projection), K_t and V (for attention) and wO (for out-projection) are needed in their entirety by each parallel computation, since they’re adjacent to the partitioned matrices along those matrices’ unpartitioned dimensions (open in mm):

    + +

    wQ (for in-projection), K_t and V (for attention) and wO (for out-projection) are needed in their entirety by each parallel computation
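    Continuing the sketch from section 4a (reusing x, wQ, K_t, V, wO, mask, d and out defined there), the partition along i looks like this in code; note that each block needs the full wQ, K_t, V and wO, but only its own rows of the input and of the causal mask:

    row_blocks = x.chunk(8, dim=0)               # partition the sequence along i
    mask_blocks = mask.chunk(8, dim=0)

    block_outs = []
    for xb, mb in zip(row_blocks, mask_blocks):
        sb = ((xb @ wQ) @ K_t) / math.sqrt(d)
        ab = torch.softmax(sb.masked_fill(~mb, float("-inf")), dim=-1)
        block_outs.append((ab @ V) @ wO)

    assert torch.allclose(torch.cat(block_outs), out, rtol=1e-4, atol=1e-3)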

    + +

    5b Example: double partitioning

    + +

    As an example of partitioning along multiple axes, we can visualize some recent work that innovates in this space (Blockwise Parallel Transformer, building on work done in e.g. Flash Attention and its antecedents).

    + +

    First, BPT partitions along i as described above - and actually extends this horizontal partitioning of the sequence into chunks all the way through the second (FFN) half of the attention layer as well. (We’ll visualize this in a later section.)

    + +

    To fully attack the context length problem, a second partitioning is then added to MHA - that of the attention calculation itself (i.e., a partition along the j axis of Q @ K_t). The two partitions together divide attention into a grid of blocks (open in mm):

    + +

    The two partitions together divide attention into a grid of blocks

    + +

    This visualization makes clear

    + +
      +
    • the effectiveness of this double partitioning as an attack on the context length problem, since we’ve now visibly partitioned every occurrence of sequence length in the attention calculation
    • +
    • the “reach” of this second partitioning: it’s clear from the geometry that the in-projection computations of K and V can be partitioned along with the core double matmul
    • +
    + +

    Note one subtlety: the visual implication here is that we can also parallelize the subsequent matmul attn @ V along k and sum the partial results split-k style, thus parallelizing the entire double matmul. But the row-wise softmax in sdpa() adds the requirement that each row have all its segments normalized before the corresponding row of attn @ V can be computed, adding an extra row-wise step between the attention calculation and the final matmul.
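    Concretely, that extra step is the usual running-max / running-sum bookkeeping used by Flash Attention-style kernels. Here's a small PyTorch sketch of the combination (block count and sizes are arbitrary):

    import torch

    n_q, n_kv, d = 16, 128, 32
    scores = torch.randn(n_q, n_kv)               # Q @ K_t / sqrt(d), precomputed for brevity
    V = torch.randn(n_kv, d)

    reference = torch.softmax(scores, dim=-1) @ V

    m = torch.full((n_q, 1), float("-inf"))       # running row max
    s = torch.zeros(n_q, 1)                       # running softmax denominator
    acc = torch.zeros(n_q, d)                     # running unnormalized attn @ V

    for scores_blk, V_blk in zip(scores.chunk(4, dim=-1), V.chunk(4, dim=0)):
        m_blk = scores_blk.max(dim=-1, keepdim=True).values
        m_new = torch.maximum(m, m_blk)
        p = torch.exp(scores_blk - m_new)
        rescale = torch.exp(m - m_new)            # correct previously accumulated state
        s = s * rescale + p.sum(dim=-1, keepdim=True)
        acc = acc * rescale + p @ V_blk
        m = m_new

    assert torch.allclose(acc / s, reference, atol=1e-5)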

    + +

    6 Sizes in an Attention Layer

    + +

    The first (MHA) half of an attention layer is famously computationally demanding because of its quadratic complexity, but the second (FFN) half is demanding in its own right due to the width of its hidden dimension, typically 4 times that of the model’s embedding dimension. Visualizing the biomass of a full attention layer can be useful in building intuition about how the two halves of the layer compare to each other.

    + +

    6a Visualizing the full layer

    + +

    Below is a full attention layer with the first half (MHA) in the background and the second (FFN) in the foreground. As usual, arrows point in the direction of computation.

    + +

    Notes:

    + +
      +
    • This visualization doesn’t depict individual attention heads, but instead shows the unsliced Q/K/V weights and projections surrounding a central double matmul. Of course this isn’t a faithful visualization of the full MHA operation - but the goal here is to give a clearer sense of the relative matrix sizes in the two halves of the layer, rather than the relative amounts of computation each half performs. (Also, randomized values are used rather than real weights.)
    • +
    • The dimensions used here are downsized to keep the browser (relatively) happy, but the proportions are preserved (from NanoGPT’s small config): model embedding dimension = 192 (from 768), FFN embedding dimension = 768 (from 3072), sequence length = 256 (from 1024), although sequence length is not fundamental to the model. (Visually, changes in sequence length would appear as changes in the width of the input blades, and consequently in the size of the attention hub and the height of the downstream vertical planes.)
    • +
    + +

    Open in mm:

    + +

    a full attention layer with the first half (MHA) in the background and the second (FFN) in the foreground

    + +

    6b Visualizing the BPT partitioned layer

    + +

    Revisiting Blockwise Parallel Transformer briefly, here we visualize BPT’s parallelization scheme in the context of an entire attention layer (with individual heads elided per above). In particular, note how the partitioning along i (of sequence blocks) extends through both MHA and FFN halves (open in mm):

    + +

    visualize BPT's parallelization scheme in the context of an entire attention layer

    + +

    6c Partitioning the FFN

    + +

    The visualization suggests an additional partitioning, orthogonal to the ones described above - in the FFN half of the attention layer, splitting the double matmul (attn_out @ FFN_1) @ FFN_2, first along j for attn_out @ FFN_1, then along k in the subsequent matmul with FFN_2. This partition slices both layers of FFN weights, reducing the capacity requirements of each participant in the computation at the cost of a final summation of the partial results.
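    Here's a minimal PyTorch sketch of that decomposition (sizes arbitrary; the FFN's elementwise nonlinearity is omitted, as in the expression above - applying it per shard before the second matmul would not change the picture, since it acts on each hidden element independently):

    import torch

    seq, d_model, d_ffn, n_shards = 64, 192, 768, 4
    attn_out = torch.randn(seq, d_model)
    FFN_1 = torch.randn(d_model, d_ffn)
    FFN_2 = torch.randn(d_ffn, d_model)

    reference = (attn_out @ FFN_1) @ FFN_2

    # slice both weight layers along the hidden dimension and sum the partial results
    partials = [
        (attn_out @ w1) @ w2
        for w1, w2 in zip(FFN_1.chunk(n_shards, dim=1), FFN_2.chunk(n_shards, dim=0))
    ]
    assert torch.allclose(sum(partials), reference, rtol=1e-4, atol=1e-4)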

    + +

    Here’s what this partition looks like applied to an otherwise unpartitioned attention layer (open in mm):

    + +

    what this partition looks like applied to an otherwise unpartitioned attention layer

    + +

    And here it is applied to a layer partitioned a la BPT (open in mm):

    + +

    applied to a layer partitioned a la BPT

    + +

    6d Visualizing token-at-a-time decoding

    + +

    During autoregressive token-at-a-time decoding, the query vector consists of a single token. It’s instructive to have a mental picture of what an attention layer looks like in that situation - a single embedding row working its way through an enormous tiled plane of weights.

    + +

    Aside from emphasizing the sheer immensity of the weights compared to the activations, this view is also evocative of the notion that K_t and V function like dynamically generated layers in a 6-layer MLP, although the mux/demux computations of MHA itself (papered over here, per above) make the correspondence inexact (open in mm):

    + +

    the mux/demux computations of MHA itself
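    As a shape-level sketch of that picture (heads elided as above, and with an arbitrary activation standing in for the FFN nonlinearity):

    import torch

    E, E_ffn, T = 192, 768, 256
    x = torch.randn(1, E)                                     # the current token's embedding
    wQ, wK, wV, wO = (torch.randn(E, E) for _ in range(4))
    W1, W2 = torch.randn(E, E_ffn), torch.randn(E_ffn, E)
    K_cache, V_cache = torch.randn(T, E), torch.randn(T, E)   # the "dynamically generated" layers

    K_cache = torch.cat([K_cache, x @ wK])                    # append this token's key and value
    V_cache = torch.cat([V_cache, x @ wV])

    q = x @ wQ                                                # 1 x E
    attn = torch.softmax(q @ K_cache.T / E ** 0.5, dim=-1)    # 1 x (T + 1)
    y = (attn @ V_cache) @ wO                                 # 1 x E
    out = torch.relu(y @ W1) @ W2                             # 1 x E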

    + +

    7 LoRA

    + +

    The recent LoRA paper (LoRA: Low-Rank Adaptation of Large Language Models) describes an efficient finetuning technique based on the idea that weight deltas introduced during finetuning are low-rank. Per the paper, this “allows us to train some dense layers in a neural network indirectly by optimizing rank decomposition matrices of the dense layers’ change during adaptation […], while keeping the pre-trained weights frozen.”

    + +

    7a The basic idea

    + +

    In a nutshell, the key move is to train the factors of a weight matrix rather than the matrix itself: replace an I x J weights tensor with a matmul of an I x K tensor and a K x J tensor, holding K to some small number.

    + +

    If K is small enough the size win can be huge, but the tradeoff is that lowering it lowers the rank of what the product can express. As a quick illustration of both the size savings and the structuring effect on the result, here’s a matmul of random 128 x 4 left and 4 x 128 right arguments - a.k.a. a rank-4 factorization of a 128 x 128 matrix. Notice the vertical and horizontal patterning in L @ R (open in mm):

    + +

    a matmul of random 128 x 4 left and 4 x 128 right arguments
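    The same demonstration in a few lines of PyTorch (sizes as in the figure):

    import torch

    d, r = 128, 4
    L = torch.randn(d, r)
    R = torch.randn(r, d)
    W = L @ R                                        # 128 x 128, but rank <= 4

    print(torch.linalg.matrix_rank(W))               # tensor(4)
    print(L.numel() + R.numel(), "vs", W.numel())    # 1024 vs 16384 parameters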

    + +

    7b Applying LoRA to an attention head

    + +

    The way LoRA applies this factoring move to the fine tuning process is to

    + +
      +
    • create a low-rank factorization for each weight tensor to be fine-tuned and train the factors, keeping the original weights frozen
    • +
    • after fine tuning, multiply each pair of low-rank factors to get a matrix in the shape of the original weights tensor, and add it to the original pretrained weights tensor (see the sketch after this list)
    • +
    + +
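    A minimal sketch of those two moves (the factor values here are placeholders, and LoRA's rank-dependent scaling factor is omitted):

    import torch

    d_model, r = 192, 8
    W = torch.randn(d_model, d_model)      # frozen pretrained weights
    A = torch.randn(d_model, r)            # trained low-rank factors (in practice one of the
    B = torch.randn(r, d_model) * 0.02     # two is initialized to zero, so training starts
                                           # from the pretrained behavior)

    # after fine tuning, fold the factors back into a single dense matrix...
    W_merged = W + A @ B

    # ...which is equivalent to keeping them separate as a low-rank bypass at runtime
    x = torch.randn(4, d_model)
    assert torch.allclose(x @ W_merged, x @ W + (x @ A) @ B, atol=1e-4)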

    The following visualization shows an attention head with the weight tensors wQ, wK_t, wV, wO replaced by low rank factorizations wQ_A @ wQ_B, etc. Visually, the factor matrices show up as low fences along the edges of the windmill blades (open in mm - spacebar stops the spin):

    + +

    + +

    + +

    8 Wrapup

    + +

    8a Call for feedback

    + +

    I’ve found this way of visualizing matmul expressions extremely helpful for building intuition and reasoning about not just matrix multiplication itself, but also many aspects of ML models and their computation, from efficiency to interpretability.

    + +

    If you try it out and have suggestions or comments, I'd definitely like to hear them, either in the comments here or in the repo.

    + +

    8b Next steps

    + +
      +
    • There’s a GPT2 attention head explorer built on top of the tool which I’m currently using to inventory and classify the attention head traits found in that model. (This was the tool I used to find and explore the attention heads in this note.) Once complete I plan to post a note with the inventory.
    • +
    • As mentioned up top, embedding these visualizations in Python notebooks is dead simple. But session URLs can get… unwieldy, so it will be useful to have Python-side utilities for constructing them from configuration objects, similar to the simple JavaScript helpers used in the reference guide.
    • +
    • If you've got a use case you think might benefit from visualizations like this but it's not obvious how to use the tool to do it, get in touch! I'm not necessarily looking to expand its core visualization capabilities much further (right tool for the job, etc.), but the API for driving it programmatically is pretty basic, and there's plenty that can be done there.
    • +
    + +
    INT4 Decoding GQA CUDA Optimizations for LLM Inference

    by Sarunya Pumma, Jongsoo Park, Jianyu Huang, Amy Yang, Jaewon Lee, Daniel Haziza, Grigory Sizov, Jeremy Reizenstein, Jeff Johnson, Ying Zhang

    +

    An efficient decoding Grouped-Query Attention with low-precision KV cache

    + +

    Introduction

    + +

    Generative AI has taken the world by storm with its ability to generate content like humans. Many of these generative AI tools are powered by large language models (LLMs), like Meta Llama models and OpenAI’s ChatGPT. One of the main challenges of LLMs is supporting large “context lengths” (also known as “sequence lengths”). The context length refers to the number of tokens that the model uses to understand the input context and generate responses. Longer context lengths generally translate into higher precision and quality in the responses. However, long context lengths are compute and memory intensive. This is mainly due to the following reasons:

    + +
      +
    • The computational complexity of attention layers increases proportionally with the context length (the growth rate depends on the attention algorithm). As a result, when using long context lengths, the attention layers can become a bottleneck, particularly during the prefill phase where attentions are compute bound.
    • +
    • The KV cache size grows linearly with the context length, thus, putting higher pressure on the memory requirement and consequently slowing down the already memory-bound attention decoding. Moreover, since the memory capacity is limited, the batch size reduces when the KV cache gets bigger, which generally results in a drop in throughput.
    • +
    + +

    The computational complexity growth is difficult to solve compared to the other problem mentioned above. One way to address the KV cache size growth problem is to use a low-precision KV cache. From our experiments, group-wise INT4 quantization provides accuracy comparable to a BF16 KV cache during the decode phase in Meta Llama 2 inference. However, we did not observe any latency improvement, despite reading 4x less data in the attention decoding layers. This means that the INT4 attention is 4x less efficient at utilizing precious HBM bandwidth than BF16 attention.

    + +

    In this note, we discuss the CUDA optimizations that we applied to INT4 GQA (grouped-query attention – the attention layer that we use in the LLM inference phase) to improve its performance by up to 1.8x on the NVIDIA A100 GPU and 1.9x on the NVIDIA H100 GPU.

    + +
      +
    • The optimized CUDA INT4 GQA outperformed INT4 Flash-Decoding GQA (the best performing INT4 GQA that we used in the experiment mentioned above) by 1.4x-1.7x on A100 and 1.09x-1.3x on H100.
    • +
    • The optimized CUDA INT4 GQA performs better than BF16 Flash-Decoding GQA by 1.5x-1.7x on A100 and 1.4x-1.7x on H100.
    • +
    + +

    Background

    + +

    GQA for LLM Inference

    + +

    Grouped-Query Attention (GQA) is a variant of multi-head attention (MHA) where each KV cache head is shared across a group of query heads. Our LLM inference adopts GQA as an attention layer in both the prefill and decode phases in order to reduce the capacity requirement for the KV cache. We use multiple GPUs in inference where the KV cache and query heads are distributed across GPUs. Each GPU runs an attention layer with a single KV head and a group of Q heads. Therefore, when viewed from a single GPU perspective, the GQA component can also be described as MQA (Multi-Query Attention).

    + +

    The simplified workflow of decoding GQA is illustrated in Figure 1. GQA takes three main inputs: input query (denoted Q), K cache (denoted K), and V cache (denoted V). Our current GQA inference uses BF16 for Q, K, and V.

    + +
      +
    • Q is a 4D BF16 tensor of shape (B, 1, HQ, D)
    • +
    • K is a 4D BF16 tensor of shape (B, Tmax, HKV, D)
    • +
    • V is a 4D BF16 tensor of shape (B, Tmax, HKV, D)
    • +
    + +

    where

    + +
      +
    • B is the batch size (the number of input prompts)
    • +
    • HQ is the number of query heads
    • +
    • HKV is the number of KV heads (HQ must be divisible by HKV)
    • +
    • Tmax is the maximum context length
    • +
    • D is the head dimension (fixed to 128)
    • +
    + +

    GQA is simply bmm(softmax(bmm(Q, KT) / sqrt(D)), V). This yields a single output tensor (denoted O), a 4D BF16 tensor with the same shape as Q. Note that the matrix multiplications are performed in BF16; however, accumulation and softmax are carried out in FP32. We call this “BF16 GQA” as the KV cache is BF16.
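    For reference, that computation can be written directly in PyTorch as follows. This is only a naive sketch with the shapes listed above - repeat_interleave expands the shared KV heads across the query heads, and the numerics only roughly mimic the production kernels:

    import torch

    B, T, HQ, HKV, D = 2, 8192, 8, 1, 128

    Q = torch.randn(B, 1, HQ, D, dtype=torch.bfloat16)
    K = torch.randn(B, T, HKV, D, dtype=torch.bfloat16)
    V = torch.randn(B, T, HKV, D, dtype=torch.bfloat16)

    def gqa_decode(Q, K, V):
        q = Q.permute(0, 2, 1, 3)                                      # (B, HQ, 1, D)
        k = K.permute(0, 2, 1, 3).repeat_interleave(HQ // HKV, dim=1)  # (B, HQ, T, D)
        v = V.permute(0, 2, 1, 3).repeat_interleave(HQ // HKV, dim=1)  # (B, HQ, T, D)
        s = (q @ k.transpose(-1, -2)).float() / D ** 0.5               # scores in FP32
        p = torch.softmax(s, dim=-1).to(torch.bfloat16)                # (B, HQ, 1, T)
        o = p @ v                                                      # (B, HQ, 1, D)
        return o.permute(0, 2, 1, 3)                                   # back to (B, 1, HQ, D)

    O = gqa_decode(Q, K, V)
    assert O.shape == Q.shape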

    + +

    Figure 1: The simplified workflow of BF16 GQA for LLM inference

    + +

    Figure 1 The simplified workflow of BF16 GQA for LLM inference

    + +

    INT4 GQA

    + +

    To further reduce the size of the KV cache, we explore the possibility of using INT4 for KV cache instead of BF16. We estimate the potential performance improvement by calculating the computational intensity (CI) of INT4 GQA and comparing it to that of BF16 GQA, as CI represents FLOPS per byte. We compute the CI for QKT and PV (as shown in Equation 1) as they take KV cache as an operand. Note that we disregard the Q load as it is negligible compared to the KV cache. We also ignore any intermediate data loads/stores that are not on global memory. Thus, the CI only takes into account the computation FLOPS and KV cache loads.

    + +

    Equation 1

    + +

    Equation (1)

    + +

    Assuming that HQ = 8 and HKV = 1, CI for BF16 KV cache is 8 while CI for INT4 KV cache is 32. The CIs indicate that both BF16 and INT4 GQAs are memory bound (the peak CIs for BF16 tensor cores for A100 and H100 are 312 TF / 2 TB/s = 141 and 990 TF / 3.35 TB/s = 269; note that these TF numbers are without sparsity). Moreover, with INT4 KV cache, we should expect up to 4x performance improvement compared to BF16 GQA.
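    The arithmetic behind these CI numbers, under the same assumptions (ignoring the Q load and the quantization scales):

    HQ, HKV, T, D = 8, 1, 8192, 128

    flops = 2 * HQ * T * D + 2 * HQ * T * D     # QK^T plus PV, per batch element
    bytes_bf16 = 2 * HKV * T * D * 2            # K and V at 2 bytes per element
    bytes_int4 = 2 * HKV * T * D // 2           # K and V at 0.5 bytes per element

    print(flops / bytes_bf16)                   # 8.0
    print(flops / bytes_int4)                   # 32.0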

    + +

    To enable INT4 KV cache support in GQA, we can dequantize the KV cache from INT4 to BF16 before passing it to the BF16 GQA operator. However, since KV cache is typically large, copying it from/to global memory can be costly. Moreover, decoding GQA is a memory bound operation (the memory unit is utilized much more heavily than the compute unit). Figure 2 shows the NCU profile of the FMHA CUTLASS BF16 GQA kernel in xFormers, which is one of the state of the art implementations of GQA. From the figure, it is obvious that memory is a bottleneck.

    + +

    Figure 2: The NCU profile of the FMHA CUTLASS BF16 kernel in xFormers

    + +

    Figure 2 The NCU profile of the FMHA CUTLASS BF16 kernel in xFormers

    + +

    A more efficient alternative is to fuse the INT4 dequantization with the GQA operation (shown in Figure 3): that is, have GQA read the INT4 KV cache directly and perform the INT4 to BF16 conversion within the kernel. This change can potentially reduce the amount of global memory reads required for the KV cache, which could lead to a decrease in latency. We call this “INT4 GQA.”

    + +

    Figure 3: The workflow of fused INT4 GQA

    + +

    Figure 3 The workflow of fused INT4 GQA

    + +

    We list the state-of-the-art implementations of GQA, along with their features, in Table 1 below.

    + +

    Table 1 State of the art GQA implementations

    Implementation + Denoted as + BF16 GQA + Fused INT4 GQA +
    Flash-Decoding (Triton implementation) + FD + Yes + Yes +
    Flash Attention (v2.3.3) + FA + Yes + No +
    CUDA baseline + CU + Yes + Yes +
    + +

    All implementations, except for CU, support both split-K and non split-K. CU only has the split-K implementation. Only FA has a heuristic in the backend to determine whether to run the split-K or non split-K kernel. For other implementations, users must explicitly choose which version to run. In this note, we focus on long context lengths (in our experiments, we use a context length of 8192) and therefore opt for the split-K version wherever possible.

    + +

    As the baseline, we measured the performance of the state of the art GQA implementations on NVIDIA A100 and H100 GPUs. The latency (time in microseconds) and achieved bandwidth (GB/s) are reported in Table 2. Note that we ran a range of split-Ks (from 2 to 128 splits) and reported the best performance for each implementation. For all experiments, we use a context length of 8192. For INT4 GQA, we used row-wise quantization (i.e., num quantized groups = 1).

    + +

    Table 2 Baseline GQA performance

    + +

    On A100

    Time (us)
    Batch size + FD (BF16) + FA (BF16) + CU (BF16) + FD (INT4) + FA (INT4) + CU (INT4) +
    32 + 139 + 133 + 183 + 137 + - + 143 +
    64 + 245 + 229 + 335 + 234 + - + 257 +
    128 + 433 + 555 + 596 + 432 + - + 455 +
    256 + 826 + 977 + 1127 + 815 + - + 866 +
    512 + 1607 + 1670 + 2194 + 1581 + - + 1659 +
    Effective Bandwidth (GB/s)
    Batch size + FD (BF16) + FA (BF16) + CU (BF16) + FD (INT4) + FA (INT4) + CU (INT4) +
    32 + 965 + 1012 + 736 + 262 + - + 250 +
    64 + 1097 + 1175 + 802 + 305 + - + 278 +
    128 + 1240 + 968 + 901 + 331 + - + 314 +
    256 + 1301 + 1100 + 954 + 351 + - + 331 +
    512 + 1338 + 1287 + 980 + 362 + - + 345 +
    + +

    On H100

    Time (us)
    Batch size + FD (BF16) + FA (BF16) + CU (BF16) + FD (INT4) + FA (INT4) + CU (INT4) +
    32 + 91 + 90 + 114 + 70 + - + 96 +
    64 + 148 + 146 + 200 + 113 + - + 162 +
    128 + 271 + 298 + 361 + 205 + - + 294 +
    256 + 515 + 499 + 658 + 389 + - + 558 +
    512 + 1000 + 1011 + 1260 + 756 + - + 1066 +
    Effective Bandwidth (GB/s)
    Batch size + FD (BF16) + FA (BF16) + CU (BF16) + FD (INT4) + FA (INT4) + CU (INT4) +
    32 + 1481 + 1496 + 1178 + 511 + - + 371 +
    64 + 1815 + 1840 + 1345 + 631 + - + 443 +
    128 + 1982 + 1802 + 1487 + 699 + - + 487 +
    256 + 2087 + 2156 + 1634 + 736 + - + 513 +
    512 + 2150 + 2127 + 1706 + 757 + - + 537 +
    + +

    First, let’s discuss the BF16 GQA performance: CU ranks last in terms of performance among all implementations. FD and FA have comparable performance. When the batch size is less than or equal to 64, FA utilizes the split-K kernel and performs slightly better than FD. However, when the batch size is greater than 64, FD performs better.

    + +

    The same trend holds true for INT4 GQAs. However, we did not measure the performance of FA as it does not support INT4 KV cache. FD outperforms CU for all cases.

    + +

    When comparing the latencies of FD between BF16 and INT4 GQAs, we find that they are almost identical. This suggests that INT4 GQA is highly inefficient, which can be further confirmed by the significantly lower achievable bandwidth for INT4 GQA compared to BF16 GQA. The same trend is also true when looking at the performance of CU.

    + +

    CUDA with Tensor Cores INT4 GQA Implementation

    + +

    In this section, we briefly describe our baseline implementation, CUDA with tensor cores INT4 GQA (CU). Each thread block processes only one KV head and a group of query heads from one input prompt. Therefore, each thread block performs mm(softmax(mm(Q, KT) / sqrt(D)), V); note that this is an mm, not a bmm. Moreover, since this is a split-K implementation, tokens in the KV cache are split among different thread blocks. Note that each thread block contains 4 warps (each warp contains 32 threads on NVIDIA A100 and H100 GPUs). Work in each thread block is split among warps. Within each warp, we use the WMMA API to compute matrix multiplication on tensor cores. Figure 4 demonstrates the work partitioning in CU.

    + +

    Figure 4: CU work partitioning

    + +

    Figure 4 CU work partitioning

    + +

    Optimizing CUDA with Tensor Cores Kernel of INT4 GQA

    + +

    In this note, we discuss the optimizations that we have applied to the CUDA with tensor cores implementation of INT4 GQA (CU). The ideal goal is to improve the INT4 GQA performance by 4 times based on the CI analysis in the previous section. Note that the query size is negligible compared to the KV cache size when the context length is long.

    + +

    In our analysis, we used NVIDIA Nsight Compute (NCU) as the main profiler. Our general bottleneck elimination approach is to minimize stall cycles. We applied 10 optimizations to INT4 GQA, three of which are specific to NVIDIA A100/H100 GPUs. These optimizations are well-known CUDA optimization techniques which can be generalized to many applications.

    + +

    It is worth noting that the reason we chose to optimize the CUDA implementation rather than the Flash-Decoding implementation (FD, which is Triton based) is that CUDA gives us better control over how the low-level instructions are generated. Many of the optimization techniques that we apply, such as operating on tensor core fragments directly (Optimizations 7-9), cannot be done through Triton since it does not expose low-level details to developers. However, these optimizations can be integrated into the compiler-based solution to make them available to a broader set of operators, which is indeed part of our future plan.

    + +

    Optimization 1: Unroll K Loads

    + +

    Problem Analysis:

    + +

    The NCU profile shows that during K loading, there are only 2 global loads followed by memory stalls at dequantize_permuted_int4. The memory stalls are long scoreboard stalls, which indicate waits for global memory access. This suggests that the kernel does not issue sufficient memory loads to hide the global load latency: it issues a load and then immediately waits to consume the data, exposing the latency. The stalls are shown in Figure 5.

    + +

    Figure 5: K loading before unrolling

    + +

    Figure 5 K loading before unrolling (the numbers that the arrows point to are stall cycles caused by global memory wait)

    + +

    Solution:

    + +

    In the baseline implementation, we use uint32_t to load 8 INT4 K values in a single load and we perform 2 uint32_t loads in each iteration, which is 16 INT4 K values. To allow for a better global load latency hiding, we issue 8 uint32_t loads instead of two before consuming the K values in dequantize_permuted_int4. This allows the compiler to unroll the loads as well as reorder the instructions to hide the global load latency better. Figure 6 shows the NCU profile of K loading after unrolling. Comparing Figure 5 and Figure 6, we effectively reduce the stall cycles by unrolling the K loads.

    + +

    Figure 6: K loading after unrolling

    + +

    Figure 6 K loading after unrolling (the numbers that the arrows point to are stall cycles caused by global memory wait)

    + +

    Results:

    + +

    Table 3 Performance of Optimization 1 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 1 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 1 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 134 + 262 + 250 + 267 + 1.02 + 1.07 +
    64 + 234 + 257 + 237 + 305 + 278 + 302 + 0.99 + 1.09 +
    128 + 432 + 455 + 422 + 331 + 314 + 339 + 1.02 + 1.08 +
    256 + 815 + 866 + 806 + 351 + 331 + 355 + 1.01 + 1.07 +
    512 + 1581 + 1659 + 1550 + 362 + 345 + 369 + 1.02 + 1.07 +
    + +

    Optimization 2: Improve P Type Casting (FP32->BF16)

    + +

    Problem Analysis:

    + +

    Since the product of softmax(bmm(Q, KT) / sqrt(D)) is FP32 (denoted as P in Figure 3), the kernel has to convert P from FP32 to BF16 before feeding it to the next bmm computation. The kernel performs the FP32 to BF16 conversion of P by copying the FP32 data from one location in shared memory to another. This causes stalls during the shared memory access (shown in Figure 7), which might be caused by (1) the shared memory indirection; and (2) shared memory bank conflicts, since each thread accesses a 16-bit element (because of this, two threads can access the same memory bank simultaneously).

    + +

    Figure 7: P type casting before Optimization 2

    + +

    Figure 7 P type casting before Optimization 2 (the number that the arrow points to is stall cycles caused by shared memory wait)

    + +

    Solution:

    + +

    We use all threads in the thread block to do in-place type conversion. Each thread operates on two consecutive elements in order to avoid the shared memory bank conflict when storing BF16. All threads work on the same head (h) at the same time to guarantee correctness of the conversion. The in-place conversion steps are as follows:

    + +
      +
    1. Each thread loads 2 FP32 token elements from the same head from the shared memory into registers
    2. +
    3. Call __syncthreads() to make sure that every thread finishes reading the data
    4. +
    5. Each thread converts its data to 2 BF16 token elements and then stores the results to the same shared memory
    6. +
    + +

    Some optimizations that we apply to the implementation:

    + +
      +
    • Use vector types (especially nv_bfloat2)
    • +
    • Unroll data loading/storing, i.e., performing multiple loads before calling __syncthreads() and performing multiple stores after __syncthreads()
    • +
    + +

    After this optimization, long stalls are not observed during P type casting as shown in Figure 8.

    + +

    Figure 8: P type casting after Optimization 2

    + +

    Figure 8 P type casting after Optimization 2 (the numbers that the arrow points to are stall cycles caused by shared memory wait)

    + +

    Culprits:

    + +

    Since we unroll data loading/storing by using registers as intermediate storage, the number of registers per thread increases, resulting in reduced occupancy.

    + +

    Results:

    + +

    Table 4 Performance of Optimization 2 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 2 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 2 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 126 + 262 + 250 + 285 + 1.09 + 1.14 +
    64 + 234 + 257 + 221 + 305 + 278 + 324 + 1.06 + 1.16 +
    128 + 432 + 455 + 395 + 331 + 314 + 362 + 1.09 + 1.15 +
    256 + 815 + 866 + 749 + 351 + 331 + 382 + 1.09 + 1.16 +
    512 + 1581 + 1659 + 1435 + 362 + 345 + 399 + 1.10 + 1.16 +
    + +

    Optimization 3: Remove Local Memory Usage for max QKT computation

    + +

    Problem Analysis:

    + +

    During the softmax computation, the kernel has to compute max QKT for each head. It uses a temporary “thread-local” storage for storing per-thread max QKT results (one float value for each head). Depending on the compiler, the thread-local storage can be allocated on registers (on chip) or the local memory (off chip == global memory). Unfortunately, in the baseline, the thread-local storage resides in the local memory which is much slower than the registers (shown in Figure 9). We suspect that this is because the compiler cannot determine the indices of thread-local storage at compile time (since the number of heads (H) in the kernel is a runtime variable). Accessing local memory as if accessing registers can hurt the performance of the kernel.

    + +

    Figure 9: Local memory access during max QKT computation

    + +

    Figure 9 Local memory access during max QKT computation

    + +

    Solution:

    + +

    We realize that we do not need H (number of heads) floats as temporary storage per thread since each thread can compute max QKT for only one head instead of all the heads. Thus, we only need one float per thread, which can be easily stored in a register. To accumulate the max results among warps, we use shared memory. This optimization eliminates the local memory usage during max QKT computation.

    + +

    Results:

    + +

    Table 5 Performance of Optimization 3 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 3 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 3 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 119 + 262 + 250 + 300 + 1.14 + 1.20 +
    64 + 234 + 257 + 206 + 305 + 278 + 348 + 1.14 + 1.25 +
    128 + 432 + 455 + 368 + 331 + 314 + 389 + 1.17 + 1.24 +
    256 + 815 + 866 + 696 + 351 + 331 + 411 + 1.17 + 1.24 +
    512 + 1581 + 1659 + 1338 + 362 + 345 + 428 + 1.18 + 1.24 +
    + +

    Optimization 4: Remove local memory usage for row sum

    + +

    Problem Analysis:

    + +

    Similar to Optimization 3, the local memory usage problem is also observed during the row sum computation in the softmax computation. Since local memory is off chip, accessing it as if accessing registers can hurt the performance of the kernel.

    + +

    Solution:

    + +

    We apply the same solution as the max QKT computation for the row sum computation. That is to have each thread compute a row sum of only one head, which requires only one float per thread. This eliminates the need for local memory.

    + +

    Results:

    + +

    Table 6 Performance of Optimization 4 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 4 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 4 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 118 + 262 + 250 + 302 + 1.15 + 1.21 +
    64 + 234 + 257 + 204 + 305 + 278 + 351 + 1.15 + 1.26 +
    128 + 432 + 455 + 364 + 331 + 314 + 393 + 1.19 + 1.25 +
    256 + 815 + 866 + 688 + 351 + 331 + 416 + 1.18 + 1.26 +
    512 + 1581 + 1659 + 1328 + 362 + 345 + 431 + 1.19 + 1.25 +
    + +

    Optimization 5: Add prefetch for V load

    + +

    Problem Analysis:

    + +

    The same issue as K loading is observed when loading V. That is, the kernel issues data loading, and then waits to consume the data immediately, causing the global load latency to be exposed. However, when using the unrolling technique mentioned above, the compiler allocates the temporary buffer in local memory instead of registers, causing a large slowdown.

    + +

    Solution:

    + +

    We adopt the data prefetching technique for V loading. We load the next iteration's V values immediately after the current iteration's values are consumed. This allows the data loading to be overlapped with the PV computation, resulting in better kernel performance.

    + +

    Results:

    + +

    Table 7 Performance of Optimization 5 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 5 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 5 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 109 + 262 + 250 + 327 + 1.25 + 1.31 +
    64 + 234 + 257 + 194 + 305 + 278 + 370 + 1.21 + 1.33 +
    128 + 432 + 455 + 345 + 331 + 314 + 414 + 1.25 + 1.32 +
    256 + 815 + 866 + 649 + 351 + 331 + 441 + 1.26 + 1.33 +
    512 + 1581 + 1659 + 1244 + 362 + 345 + 460 + 1.27 + 1.33 +
    + +

    Optimization 6: Add Group-Wise INT4 (Groups = 4) with Vector Load

    + +

    Problem Analysis:

    + +

    Prior to this optimization, CU only supported row-wise INT4 quantization. That is, all columns in each row share the same scales. The scales of each row are stored in the first 4 bytes of each row, as shown in Figure 10. In the kernel, each thread loads only one row at a time. Since each row contains 68 bytes (4 bytes for scales and 64 bytes for data), rows are not guaranteed to be aligned to the size of any vector type. Thus, vector loads cannot be used for loading the KV cache.

    + +

    Figure 10: The layout of each row of INT4 KV cache with row-wise quantization

    + +

    Figure 10 The layout of each row of INT4 KV cache with row-wise quantization

    + +

    Solution:

    + +

    We have implemented support for group-wise INT4 quantization with num groups = 4. In this case, columns in each row in the KV cache tensor are divided into 4 equal groups. Columns within the same group share the same scales for quantization/dequantization. The data layout for INT4 KV cache is shown in Figure 11. The scales for all groups are serialized and stored at the beginning of each row. The INT4 data is also serialized and laid out next to the scales.

    + +

    Because each row now contains 80 bytes, we can use a vector type, i.e., uint2 in our case, to load data. (We do not use uint4 since each thread loads only 16 INT4s at a time due to the tensor core fragment size.) Vector loads are generally better than scalar loads since they do not cause extra byte loads.
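    For intuition, here is what group-wise INT4 quantization of a single KV cache row looks like numerically. This is a sketch only: the in-kernel byte layout, which packs two INT4 values per byte and serializes the per-group scales at the start of the row, is not reproduced here.

    import torch

    D, num_groups = 128, 4
    group_size = D // num_groups                 # 32 INT4 values per group

    def quantize_row(row):
        groups = row.float().view(num_groups, group_size)
        lo = groups.min(dim=1, keepdim=True).values
        hi = groups.max(dim=1, keepdim=True).values
        scale = (hi - lo) / 15.0                 # 4-bit range: 0..15
        q = torch.clamp(torch.round((groups - lo) / scale), 0, 15).to(torch.uint8)
        return q, scale, lo

    def dequantize_row(q, scale, lo):
        return (q.float() * scale + lo).view(-1).to(torch.bfloat16)

    row = torch.randn(D, dtype=torch.bfloat16)
    q, scale, lo = quantize_row(row)
    err = (row.float() - dequantize_row(q, scale, lo).float()).abs().max()
    print(err)                                   # roughly bounded by half a quantization step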

    + +

    Figure 11: The layout of each row of INT4 KV cache with group-wise quantization (num groups = 4)

    + +

    Figure 11 The layout of each row of INT4 KV cache with group-wise quantization (num groups = 4)

    + +

    Results:

    + +

    Table 8 Performance of Optimization 6 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 6 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 6 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 111 + 262 + 250 + 322 + 1.23 + 1.29 +
    64 + 234 + 257 + 192 + 305 + 278 + 372 + 1.22 + 1.34 +
    128 + 432 + 455 + 346 + 331 + 314 + 414 + 1.25 + 1.32 +
    256 + 815 + 866 + 642 + 351 + 331 + 446 + 1.27 + 1.35 +
    512 + 1581 + 1659 + 1244 + 362 + 345 + 460 + 1.27 + 1.33 +
    + +

    Table 9 Performance of Optimization 6 for INT4 GQA (group-wise quantization with num groups = 4)

    Batch size + FD time (us) + CUDA_WMMA Opt 6 time (us) + FD bandwidth (GB/s) + CUDA_WMMA Opt 6 bandwidth (GB/s) + Speedup vs FD +
    32 + 129 + 116 + 325 + 364 + 1.31 +
    64 + 219 + 195 + 385 + 431 + 1.36 +
    128 + 392 + 347 + 429 + 484 + 1.39 +
    256 + 719 + 638 + 468 + 527 + 1.41 +
    512 + 1375 + 1225 + 489 + 550 + 1.43 +
    + +

    Optimization 7: Compute max QKT From WMMA Fragment Directly (A100/H100 specific)

    + +

    Problem Analysis:

    + +

    We observe large stalls due to shared memory access during the max QKT computation (showing up as large short scoreboard stalls), as shown in Figure 12.

    + +

    Figure 12: Stalls due to shared memory access during max QKT computation

    + +

    Figure 12 Stalls due to shared memory access during max QKT computation (the number that the arrow points to is stall cycles caused by shared memory wait)

    + +

    Solution:

    + +

    We bypass shared memory when computing max QKT by computing it from the WMMA fragment (i.e., the tensor core fragment) directly. The layout of the WMMA fragment is specific to the GPU architecture, so we enabled this optimization only for NVIDIA A100/H100 GPUs; other GPUs will still use shared memory for the max QKT computation. By bypassing shared memory, we effectively eliminate the stalls caused by shared memory access. The tensor core layout of the C fragment, which is used for storing the QKT results, is shown in Figure 13.

    + +

    Figure 13: C fragment (QKT storage) tensor core layout on A100/H100

    + +

    Figure 13 C fragment (QKT storage) tensor core layout on A100/H100

    + +

    Table 10 Performance of Optimization 7 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 7 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 7 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 107 + 262 + 250 + 333 + 1.27 + 1.33 +
    64 + 234 + 257 + 183 + 305 + 278 + 391 + 1.28 + 1.40 +
    128 + 432 + 455 + 333 + 331 + 314 + 430 + 1.30 + 1.37 +
    256 + 815 + 866 + 620 + 351 + 331 + 461 + 1.31 + 1.40 +
    512 + 1581 + 1659 + 1206 + 362 + 345 + 475 + 1.31 + 1.38 +
    + +

    Table 11 Performance of Optimization 7 for INT4 GQA (group-wise quantization with num groups = 4)

    Batch size + FD time (us) + CUDA_WMMA Opt 6 time (us) + CUDA_WMMA Opt 7 time (us) + FD bandwidth (GB/s) + CUDA_WMMA Opt 6 bandwidth (GB/s) + CUDA_WMMA Opt 7 bandwidth (GB/s) + Speedup vs FD + Speedup vs CUDA_WMMA Opt 6 +
    32 + 129 + 116 + 111 + 325 + 364 + 380 + 1.17 + 1.04 +
    64 + 219 + 195 + 187 + 385 + 431 + 449 + 1.17 + 1.04 +
    128 + 392 + 347 + 333 + 429 + 484 + 506 + 1.18 + 1.04 +
    256 + 719 + 638 + 615 + 468 + 527 + 547 + 1.17 + 1.04 +
    512 + 1375 + 1225 + 1184 + 489 + 550 + 569 + 1.16 + 1.03 +
    + +

    Optimization 8: Write FP32->BF16 Results to P Fragment Directly (A100/H100 specific)

    + +

    Problem Analysis:

    + +

    During the FP32-BF16 conversion for the P fragment, the kernel loads the FP32 data from shared memory, does the conversion and then stores the BF16 data back to shared memory. Moreover, the conversion requires many thread block synchronizations (__syncthreads()).

    + +

    Solution:

    + +

    Due to the data partitioning design of the kernel, each warp performs only one pass through the P fragment. Thus, we do not have to write the conversion results back to the shared memory for future usage. To avoid writing the BF16 data to the shared memory and thread block synchronizations, we have each warp load the FP32 data of the P WMMA fragment from the shared memory, do the conversion and then write the BF16 data directly to the P fragment.

    + +

    Note that this optimization is applied to only the NVIDIA A100 and H100 GPUs because the WMMA fragment layout is architecture dependent. For non-A100/H100 GPUs, the kernel will fall back to the original path.

    + +

    The P fragment tensor core layout is shown in Figure 14. Note that this layout is specific to the NVIDIA A100/H100 GPU.

    + +

    Figure 14: P fragment tensor core layout on A100/H100

    + +

    Figure 14 P fragment tensor core layout on A100/H100

    + +

    Table 12 Performance of Optimization 8 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 8 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 8 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 101 + 262 + 250 + 353 + 1.35 + 1.41 +
    64 + 234 + 257 + 174 + 305 + 278 + 410 + 1.34 + 1.47 +
    128 + 432 + 455 + 317 + 331 + 314 + 451 + 1.36 + 1.43 +
    256 + 815 + 866 + 590 + 351 + 331 + 485 + 1.38 + 1.47 +
    512 + 1581 + 1659 + 1143 + 362 + 345 + 501 + 1.38 + 1.45 +
    + +

    Table 13 Performance of Optimization 8 for INT4 GQA (group-wise quantization with num groups = 4)

    Batch size + FD time (us) + CUDA_WMMA Opt 6 time (us) + CUDA_WMMA Opt 8 time (us) + FD bandwidth (GB/s) + CUDA_WMMA Opt 6 bandwidth (GB/s) + CUDA_WMMA Opt 8 bandwidth (GB/s) + Speedup vs FD + Speedup vs CUDA_WMMA Opt 6 +
    32 + 129 + 116 + 106 + 325 + 364 + 396 + 1.22 + 1.09 +
    64 + 219 + 195 + 180 + 385 + 431 + 467 + 1.21 + 1.08 +
    128 + 392 + 347 + 319 + 429 + 484 + 528 + 1.23 + 1.09 +
    256 + 719 + 638 + 596 + 468 + 527 + 565 + 1.21 + 1.07 +
    512 + 1375 + 1225 + 1138 + 489 + 550 + 591 + 1.21 + 1.08 +
    + +

    Optimization 9: Swizzle P Shared Memory Layouts (A100/H100 specific)

    + +

    Problem Analysis:

    + +

    We observe large shared memory bank conflicts during P loading. The amount of bank conflict depends on the memory access stride. For instance, for split-Ks = 32 and max seq length = 8192, we observed that only 4 out of 32 banks are being accessed in parallel (memory access stride = 256). From Figure 14, when all threads access element 0, threads that have the same threadIdx.x % 4 access the same bank.

    + +

    Figure 15: P fragment in shared memory before swizzling

    + +

    Figure 15 P fragment in shared memory before swizzling

    + +

    Solution:

    + +

    We shuffle the layout of P loads/stores in shared memory in a way that avoids bank conflicts. In other words, we store the QKT results (C fragment) and load them (P fragment) using a swizzled layout. Moreover, instead of using the original memory access stride, which depends on the number of tokens per thread block, we use the fragment's column size as the stride, which is constant. Thus, the loads and stores of the P fragment are always contiguous.

    + +

    The new layouts for the C and P fragments are shown in Figure 16. With the new layout, it is guaranteed that 16 banks are being accessed in parallel as shown in Figure 17.

    + +

    Figure 16: The swizzled layouts of C and P fragments

    + +

    Figure 16 The swizzled layouts of C and P fragments

    + +

    Figure 17: P fragment in shared memory after swizzling

    + +

    Figure 17 P fragment in shared memory after swizzling

    + +

    Table 14 Performance of Optimization 9 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 9 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 9 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 98 + 262 + 250 + 365 + 1.39 + 1.46 +
    64 + 234 + 257 + 167 + 305 + 278 + 429 + 1.41 + 1.54 +
    128 + 432 + 455 + 299 + 331 + 314 + 479 + 1.45 + 1.52 +
    256 + 815 + 866 + 549 + 351 + 331 + 521 + 1.48 + 1.58 +
    512 + 1581 + 1659 + 1060 + 362 + 345 + 540 + 1.49 + 1.56 +
    + +

    Table 15 Performance of Optimization 9 for INT4 GQA (group-wise quantization with num groups = 4)

    Batch size + FD time (us) + CUDA_WMMA Opt 6 time (us) + CUDA_WMMA Opt 9 time (us) + FD bandwidth (GB/s) + CUDA_WMMA Opt 6 bandwidth (GB/s) + CUDA_WMMA Opt 9 bandwidth (GB/s) + Speedup vs FD + Speedup vs CUDA_WMMA Opt 6 +
    32 + 129 + 116 + 105 + 325 + 364 + 400 + 1.23 + 1.10 +
    64 + 219 + 195 + 174 + 385 + 431 + 484 + 1.26 + 1.12 +
    128 + 392 + 347 + 302 + 429 + 484 + 558 + 1.30 + 1.15 +
    256 + 719 + 638 + 560 + 468 + 527 + 601 + 1.28 + 1.14 +
    512 + 1375 + 1225 + 1065 + 489 + 550 + 632 + 1.29 + 1.15 +
    + +

    Optimization 10: Pad Shared Memory for INT4 Dequantization

    + +

    Problem Analysis:

    + +

    Once the kernel reads the INT4 K or V cache from global memory, it performs dequantization and stores the results (BF16) in the shared memory. Then, the BF16 data is loaded to the WMMA fragment from shared memory (via the WMMA interface). We observed a large number of bank conflicts for both K and V accesses. For instance, for K stores, only 4 out of 32 banks are being accessed in parallel. For K loads, 16 banks are being accessed in parallel. The same also occurs for V stores and loads. See the figures in the solution section.

    + +

    Solution:

    + +

    We pad the shared memory to reduce the bank conflict. Specifically, we pad each row by 2. That is, the row stride of K becomes F_K + 2 and the row stride of V becomes F_N + 2 (F_K and F_N are the fixed widths of the K and V WMMA fragments, respectively). With this optimization, we are able to reduce the bank conflict by 1.8x as shown in Figure 18.

    + +

    Figure 18: Bank conflicts before and after Optimization 10

    + +

    Figure 18 Bank conflicts before and after Optimization 10

    + +

    After Optimization 10, for K stores, 32 banks are being accessed in parallel (shown in Figure 19), while for K loads, 29 banks are accessed in parallel (shown in Figure 20).

    + +

    Figure 19: K fragment store shared memory layout without and with padding

    + +

    Figure 19 K fragment store shared memory layout without and with padding

    + +

    Figure 20: K fragment load shared memory layout without and with padding

    + +

    Figure 20 K fragment load shared memory layout without and with padding

    + +

    Table 16 Performance of Optimization 10 for INT4 GQA (row-wise quantization)

    Batch size + FD time (us) + CU baseline time (us) + CU Opt 10 time (us) + FD bandwidth (GB/s) + CU baseline bandwidth (GB/s) + CU Opt 10 bandwidth (GB/s) + Speedup vs FD + Speedup vs CU baseline +
    32 + 137 + 143 + 94 + 262 + 250 + 380 + 1.45 + 1.52 +
    64 + 234 + 257 + 151 + 305 + 278 + 475 + 1.55 + 1.71 +
    128 + 432 + 455 + 266 + 331 + 314 + 538 + 1.63 + 1.71 +
    256 + 815 + 866 + 489 + 351 + 331 + 586 + 1.67 + 1.77 +
    512 + 1581 + 1659 + 930 + 362 + 345 + 616 + 1.70 + 1.79 +
    + +

    Table 17 Performance of Optimization 10 for INT4 GQA (group-wise quantization with num groups = 4)

    Batch size + FD time (us) + CUDA_WMMA Opt 6 time (us) + CUDA_WMMA Opt 10 time (us) + FD bandwidth (GB/s) + CUDA_WMMA Opt 6 bandwidth (GB/s) + CUDA_WMMA Opt 10 bandwidth (GB/s) + Speedup vs FD + Speedup vs CUDA_WMMA Opt 6 +
    32 + 129 + 116 + 99 + 325 + 364 + 425 + 1.31 + 1.17 +
    64 + 219 + 195 + 161 + 385 + 431 + 523 + 1.36 + 1.21 +
    128 + 392 + 347 + 282 + 429 + 484 + 598 + 1.39 + 1.23 +
    256 + 719 + 638 + 509 + 468 + 527 + 662 + 1.41 + 1.25 +
    512 + 1375 + 1225 + 965 + 489 + 550 + 698 + 1.43 + 1.27 +
    + +

    Performance Evaluation

    + +

    Microbenchmark results

    + +

    We also evaluated BF16 GQA performance using our optimized kernel (as shown in Table 19). CU still performs generally worse than FD and FA for BF16. This is expected since our optimizations are INT4 focused.

    + +

    While INT4 GQA is still not as efficient as BF16 GQA (see the achieved bandwidths), it is important to note that when comparing FD BF16 GQA performance against CU INT4 GQA performance, we can see that the latency of INT4 is smaller than that of BF16.

    + +

    Table 19 Performance of BF16 GQA and INT GQA after CU optimizations

    + +

    On A100

    Time (us)
    Batch size + FD (BF16) + FA (BF16) + CU before (BF16) + CU after (BF16) + FD (INT4) + FA (INT4) + CU before (INT4) + CU after (INT4) +
    32 + 139 + 133 + 183 + 163 + 137 + - + 143 + 94 +
    64 + 245 + 229 + 335 + 276 + 234 + - + 257 + 151 +
    128 + 433 + 555 + 596 + 517 + 432 + - + 455 + 266 +
    256 + 826 + 977 + 1127 + 999 + 815 + - + 866 + 489 +
    512 + 1607 + 1670 + 2194 + 1879 + 1581 + - + 1659 + 930 +
    Effective Bandwidth (GB/s)
    Batch size + FD (BF16) + FA (BF16) + CU before (BF16) + CU after (BF16) + FD (INT4) + FA (INT4) + CU before (INT4) + CU after (INT4) +
    32 + 965 + 1012 + 736 + 824 + 262 + - + 250 + 380 +
    64 + 1097 + 1175 + 802 + 972 + 305 + - + 278 + 475 +
    128 + 1240 + 968 + 901 + 1039 + 331 + - + 314 + 538 +
    256 + 1301 + 1100 + 954 + 1075 + 351 + - + 331 + 586 +
    512 + 1338 + 1287 + 980 + 1144 + 362 + - + 345 + 616 +
    + +

    On H100

    Time (us)
    Batch size + FD (BF16) + FA (BF16) + CU before (BF16) + CU after (BF16) + FD (INT4) + FA (INT4) + CU before (INT4) + CU after (INT4) +
    32 + 91 + 90 + 114 + 100 + 70 + - + 96 + 64 +
    64 + 148 + 146 + 200 + 183 + 113 + - + 162 + 101 +
    128 + 271 + 298 + 361 + 308 + 205 + - + 294 + 170 +
    256 + 515 + 499 + 658 + 556 + 389 + - + 558 + 306 +
    512 + 1000 + 1011 + 1260 + 1066 + 756 + - + 1066 + 575 +
    Effective Bandwidth (GB/s)
    Batch size + FD (BF16) + FA (BF16) + CU before (BF16) + CU after (BF16) + FD (INT4) + FA (INT4) + CU before (INT4) + CU after (INT4) +
    32 + 1481 + 1496 + 1178 + 1341 + 511 + - + 371 + 560 +
    64 + 1815 + 1840 + 1345 + 1470 + 631 + - + 443 + 710 +
    128 + 1982 + 1802 + 1487 + 1743 + 699 + - + 487 + 844 +
    256 + 2087 + 2156 + 1634 + 1934 + 736 + - + 513 + 935 +
    512 + 2150 + 2127 + 1706 + 2015 + 757 + - + 537 + 996 +
    + +

    E2E results

    + +

    We evaluated our optimized INT4 GQA kernel in Llama 2 70B on 8 H100 GPUs. We ran the model end-to-end, but only reported the decode latency. We use FP8 FFN (feed forward network) to emphasize the attention performance in the decoding phase. We vary the batch size from 1 to 256 and the context length from 2,048 (2K) to 16,384 (16K). The E2E performance results are shown in the figure below.

    + +

    Figure 21: Meta Llama 2 decode latency (ms) comparison

    + +

    Figure 21 Meta Llama 2 decode latency (ms) comparison (BF16 GQA runs out of memory in large batch size configurations)

    + +

    Code

    + +

    If you are interested, please check out our code here. If you have any questions, please feel free to open an issue on GitHub, and we will be happy to help. Your contributions are welcome!


    August 07, 2023

    INT8 Quantization for x86 CPU in PyTorch

    by Intel

    Overview

    + +

    INT8 quantization is a powerful technique for speeding up deep learning inference on x86 CPU platforms. By reducing the precision of the model’s weights and activations from 32-bit floating-point (FP32) to 8-bit integer (INT8), INT8 quantization can significantly improve the inference speed and reduce memory requirements without sacrificing accuracy.

    + +

    In this blog, we will discuss the recent progress on INT8 quantization for x86 CPU in PyTorch, focusing on the new x86 quantization backend. We will also briefly look at the new quantization path with PyTorch 2.0 Export (PT2E) and TorchInductor.

    + +

    X86 Quantization Backend

    + +

    The current recommended way of quantization in PyTorch is FX. Before PyTorch 2.0, the default quantization backend (a.k.a. QEngine) on x86 CPUs was FBGEMM, which leveraged the FBGEMM performance library to achieve the performance speedup. In the PyTorch 2.0 release, a new quantization backend called X86 was introduced to replace FBGEMM. The x86 quantization backend offers improved INT8 inference performance when compared to the original FBGEMM backend by leveraging the strengths of both FBGEMM and the Intel® oneAPI Deep Neural Network Library (oneDNN) kernel libraries.

    + +

    Performance Benefit from X86 Backend

    + +

    To measure the performance benefits of the new X86 backend, we ran INT8 inference on 69 popular deep learning models (shown in Figures 1-3 below) using 4th Gen Intel® Xeon® Scalable processors. The results showed a 2.97X geomean performance speedup compared to FP32 inference performance, while the speedup was 1.43X with the FBGEMM backend. The charts below show the per-model performance speedup comparing the x86 backend and the FBGEMM backend.

    + +

    Figure 1: Models with less than 2x performance boost with x86 backend1

    + +

    Figure 1: Models with less than 2x performance boost with x86 backend1

    + +

    Figure 2: Models with 2x-4x performance boost with x86 backend1

    + +

    Figure 2: Models with 2x-4x performance boost with x86 backend1

    + +

Figure 3: Models with larger than 4x performance boost with x86 backend1

    + +

    Usage of x86 Backend

    + +

By default in 2.0, users on x86 platforms use the x86 quantization backend, and their PyTorch programs remain unchanged when using the default backend. Alternatively, users can specify x86 as the quantization backend explicitly.
Below is an example code snippet of PyTorch static post-training quantization with the x86 quantization backend.

    + +
    import torch
    +from torch.ao.quantization import get_default_qconfig_mapping
    +from torch.quantization.quantize_fx import prepare_fx, convert_fx
    +
    +qconfig_mapping = get_default_qconfig_mapping()
+# Or explicitly specify the qengine
    +# qengine = 'x86'
    +# torch.backends.quantized.engine = qengine
    +# qconfig_mapping = get_default_qconfig_mapping(qengine)
    +
    +model_fp32 = MyModel().eval()
    +x = torch.randn((1, 3, 224, 224), dtype=torch.float)
    +x = x.to(memory_format=torch.channels_last)
    +
    +# Insert observers according to qconfig and backend config
    +prepared_model = prepare_fx(model_fp32, qconfig_mapping, example_inputs=x)
    +
    +# Calibration code not shown
    +
    +# Convert to quantized model
    +quantized_model = convert_fx(prepared_model)
    +
    + +

    Technical Details of x86 Backend

    + +

We devised heuristic dispatching rules based on the performance numbers from the models we benchmarked to decide whether to invoke the oneDNN or the FBGEMM performance library to execute a given convolution or matrix multiplication operation. The rules combine operation kinds, shapes, CPU architecture information, and so on. Detailed logic is available here. For more design and technical discussion, please refer to the Request for Comments.

    + +

    Next Steps With a New Quantization Path PyTorch 2.0 Export

    + +

Although still far from finalized, a new quantization path, PyTorch 2.0 Export (PT2E), is in the early design and PoC stage. The new approach is slated to replace the FX quantization path in the future. It is built upon the capabilities of TorchDynamo Export, a feature introduced in the PyTorch 2.0 release for FX graph capturing. The captured graph is then quantized and lowered to different backends. TorchInductor, the new DL compiler of PyTorch, has shown promising results in terms of FP32 inference speedup on x86 CPU. We are working actively to enable it as one of the quantization backends of PT2E. We believe the new path will lead to further improvements in INT8 inference performance thanks to greater flexibility for fusion at different levels.

    + +
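For readers who want a feel for the direction, the sketch below shows roughly what PT2E quantization targeting x86/Inductor looks like with the APIs that later landed in PyTorch (capture_pre_autograd_graph, prepare_pt2e, convert_pt2e, and X86InductorQuantizer). Since PT2E was still at the PoC stage at the time of writing, treat the module paths and signatures as assumptions that may differ between releases; MyModel is a placeholder, as in the FX example above:

import torch
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch.ao.quantization.quantizer.x86_inductor_quantizer import (
    X86InductorQuantizer,
    get_default_x86_inductor_quantization_config,
)

model_fp32 = MyModel().eval()
example_inputs = (torch.randn(1, 3, 224, 224),)

# Capture the FX graph with the PT2E export path
exported_model = capture_pre_autograd_graph(model_fp32, example_inputs)

# Annotate the graph with an x86/Inductor-oriented quantizer
quantizer = X86InductorQuantizer()
quantizer.set_global(get_default_x86_inductor_quantization_config())
prepared_model = prepare_pt2e(exported_model, quantizer)

# Calibration would run here (not shown), then convert to a quantized model
quantized_model = convert_pt2e(prepared_model)

# Optionally lower through TorchInductor
optimized_model = torch.compile(quantized_model)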

    Conclusion

    + +

    The x86 backend introduced in PyTorch 2.0 release has demonstrated a remarkable improvement in INT8 inference speed on x86 CPU platforms. It offers a 1.43X speedup compared to the original FBGEMM backend while maintaining backward compatibility. This enhancement can benefit end users with minimal or no modifications to their programs. Furthermore, a new quantization path, PT2E, is currently in development and is expected to provide even more possibilities in the future.

    + +

    Acknowledgement

    + +

    Special thanks to Nikita Shulga, Vasiliy Kuznetsov, Supriya Rao, and Jongsoo Park. Together, we made one more step forward on the path of improving the PyTorch CPU ecosystem.

    + +

    Configuration

    + +

    1 AWS EC2 r7iz.metal-16xl instance (Intel(R) Xeon(R) Gold 6455B, 32-core/64-thread, Turbo Boost On, Hyper-Threading On, Memory: 8x64GB, Storage: 192GB); OS: Ubuntu 22.04.1 LTS; Kernel: 5.15.0-1028-aws; Batch Size: 1; Core per Instance: 4; PyTorch 2.0 RC3; TorchVision 0.15.0+cpu, test by Intel on 3/77/2023. May not reflect all publicly available security updates.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/intel-gpu-support-pytorch-2-5/index.html b/blog/intel-gpu-support-pytorch-2-5/index.html new file mode 100644 index 000000000000..2688626109bb --- /dev/null +++ b/blog/intel-gpu-support-pytorch-2-5/index.html @@ -0,0 +1,938 @@ + + + + + + + + + + + + + Intel GPU Support Now Available in PyTorch 2.5 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + PyTorch Team at Intel + +

    +

Support for Intel GPUs is now available in PyTorch® 2.5, providing improved functionality and performance for Intel GPUs, including Intel® Arc™ discrete graphics, Intel® Core™ Ultra processors with built-in Intel® Arc™ graphics, and the Intel® Data Center GPU Max Series. This integration brings Intel GPUs and the SYCL* software stack into the official PyTorch stack, ensuring a consistent user experience and enabling more extensive AI application scenarios, particularly in the AI PC domain.

    + +

    Developers and customers building for and using Intel GPUs will have a better user experience by directly obtaining continuous software support from native PyTorch, unified software distribution, and consistent product release time.

    + +

    Furthermore, Intel GPU support provides more choices to users. Now PyTorch provides a consistent GPU programming paradigm on both front ends and back ends. Developers can now run and deploy workloads on Intel GPUs with minimal coding efforts.

    + +

    Overview of Intel GPU support

    + +

    Intel GPU support in PyTorch provides eager mode and graph mode support in the PyTorch built-in front end. Eager mode now has an implementation of commonly used Aten operators with the SYCL programming language. Graph mode (torch.compile) now has an enabled Intel GPU back end to implement the optimization for Intel GPUs and to integrate Triton. 

    + +
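As a minimal sketch of both modes (assuming a PyTorch 2.5 build with XPU support, an available Intel GPU, and torchvision installed; the model and input shape are placeholders):

import torch
import torchvision.models as models

# Requires a PyTorch 2.5 build with Intel GPU (XPU) support and a visible device
assert torch.xpu.is_available()

model = models.resnet50(weights="DEFAULT").eval().to("xpu")
x = torch.randn(1, 3, 224, 224, device="xpu")

# Eager mode, optionally with automatic mixed precision (AMP)
with torch.no_grad(), torch.autocast(device_type="xpu", dtype=torch.bfloat16):
    y_eager = model(x)

# Graph mode: torch.compile with the Intel GPU back end (Inductor + Triton)
compiled_model = torch.compile(model)
with torch.no_grad():
    y_compiled = compiled_model(x)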

    Essential components of Intel GPU support were added to PyTorch, including runtime, Aten operators, oneDNN, TorchInductor, Triton and Intel GPU tool chains integration. Meanwhile, quantization and distributed are being actively developed in preparation for the PyTorch 2.6 release.

    + +

    Features

    + +

In addition to providing key features for Intel® Client GPUs and the Intel® Data Center GPU Max Series for inference and training, PyTorch keeps the same user experience as on other hardware that PyTorch supports. If you migrate code from CUDA*, you can run your existing application code on an Intel GPU with a minimal change to the device name (from cuda to xpu). For example:

    + +

    # CUDA Code
+tensor = torch.tensor([1.0, 2.0]).to("cuda")

    + +

    # Code for Intel GPU
+tensor = torch.tensor([1.0, 2.0]).to("xpu")

    + +

    PyTorch 2.5 features with an Intel GPU include: 

    + +
      +
    • Inference and training workflows.
    • +
• Enhanced torch.compile and eager mode functionality (more ops) and improved performance, with the three Dynamo benchmark suites (Hugging Face*, TIMM*, and TorchBench*) running fully in both eager and compile modes.
    • +
    • Data types such as FP32, BF16, FP16, and automatic mixed precision (AMP).
    • +
    • Runs on Intel® Client GPUs and Intel® Data Center GPU Max Series.
    • +
    • Supports Linux (Ubuntu, SUSE Linux and Red Hat Linux) and Windows 10/11.
    • +
    + +

    Get Started

    + +

    Get a tour of the environment setup, PIP wheels installation, and examples on Intel® Client GPUs and Intel® Data Center GPU Max Series from Getting Started Guide. Support for Intel GPUs can be experienced through PyTorch PIP wheels installation by nightly and preview binary releases.

    + +
      +
    • +

      Try Intel® Client GPUs through Intel® Arc™ Graphics family (Codename DG2), Intel® Core™ Ultra processor family with Intel® Graphics (Codename Meteor Lake), and Intel® Core™ Ultra mobile processor family with Intel® Graphics (Codename Lunar Lake).

      +
    • +
    • +

      Try Intel Data Center GPU Max Series through Intel® Tiber™ AI Cloud.

      + +
        +
      1. +

        To learn how to create a free Standard account, see Get Started. Then do the following:

        + +
          +
        • +

          Sign in to the cloud console.

          +
        • +
        • +

From the Training section, open the PyTorch on Intel® GPUs notebook and click “Launch Jupyter Notebook.”

          +
        • +
        • +

          Ensure that the PyTorch 2.5 kernel is selected for the notebook.

          +
        • +
        +
      2. +
      +
    • +
    + +

    Performance

    + +

Intel GPU performance on PyTorch has been continuously optimized to achieve solid results on the three Dynamo benchmark suites (Hugging Face, TIMM, and TorchBench) in both eager and compile modes.

    + +

The latest performance data, measured with the PyTorch Dynamo benchmarking suite on a single Intel® Data Center GPU Max Series 1100 card, shows the significant FP16/BF16 speedup over FP32 in eager mode in Figure 1, and the torch.compile speedup over eager mode in Figure 2. Both inference and training achieved similarly significant improvements.

    + +

Figure 1: FP16/BF16 Performance Gains Over FP32 Eager

    + +

Figure 2: Torch.compile Performance Gains Over Eager Mode

    + +

    Summary

    + +

Intel GPU support on PyTorch 2.5 brings Intel® Client GPUs (Intel® Core™ Ultra processors with built-in Intel® Arc™ graphics and Intel® Arc™ graphics for dGPU parts) and the Intel® Data Center GPU Max Series into the PyTorch ecosystem for AI workload acceleration. In particular, client GPUs are added to the GPU-supported list for AI PC use scenarios on Windows and Linux environments.

    + +

We warmly welcome the community to evaluate and provide feedback on these enhancements to Intel GPU support on PyTorch.

    + +

    Resources

    + + + +

    Acknowledgments

    + +

We want to thank the PyTorch open source community for their technical discussions and insights: Andrey Talman, Alban Desmaison, Nikita Shulga, Eli Uriegas, Jason Ansel, and Bin Bao.

    + +

    We also thank collaborators from PyTorch for their professional support and guidance.

    + +

    Performance Configuration

    + +

    The configurations in the table are collected with svr-info. Test by Intel on September 12, 2024.

    + +

    Table 1

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Component | Details
Name | Intel® Max Series GPU 1100 in Intel® Tiber™ Developer Cloud
Time | Thu Sep 12 08:21:27 UTC 2024
System | Supermicro SYS-521GE-TNRT
Baseboard | Supermicro X13DEG-OA
Chassis | Supermicro Other
CPU Model | Intel(R) Xeon(R) Platinum 8468V
Microarchitecture | SPR_XCC
Sockets | 2
Cores per Socket | 48
Hyperthreading | Enabled
CPUs | 192
Intel Turbo Boost | Enabled
Base Frequency | 2.4GHz
All-core Maximum Frequency | 2.4GHz
Maximum Frequency | 2.9GHz
NUMA Nodes | 2
Prefetchers | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled, AMP: Disabled, Homeless: Disabled, LLC: Disabled
PPINs | 5e3f862ef7ba9d50, 6c85812edfcc84b1
Accelerators | DLB 2, DSA 2, IAA 2, QAT (on CPU) 2, QAT (on chipset) 0
Installed Memory | 1024GB (16x64GB DDR5 4800 MT/s [4800 MT/s])
Hugepagesize | 2048 kB
Transparent Huge Pages | madvise
Automatic NUMA Balancing | Enabled
NIC | 2 x Ethernet Controller X710 for 10GBASE-T, 4 x MT2892 Family [ConnectX-6 Dx]
Disk | 1 x 894.3G Micron_7450_MTFDKBG960TFR
BIOS | 1.4a
Microcode | 0x2b0004b1
OS | Ubuntu 22.04.2 LTS
Kernel | 5.15.0-73-generic
TDP | 330W
Power & Perf Policy | Normal (6)
Frequency Governor | performance
Frequency Driver | acpi-cpufreq
Max C-State | 9
    + +

    Table 2

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Component | Details
Single Card | Intel® Max Series GPU 1100 series on 4th Gen Intel® Xeon® processors of Intel Tiber Developer Cloud
Workload & version | Timm ac34701, TorchBench 03cde49, Torchvision d23a6e1, Torchaudio b3f6f51, Transformers 243e186
Software Stack | intel-for-pytorch-gpu-dev 0.5.3, intel-pti-dev 0.9.0, Intel xpu backend for Triton cc981fe
Framework | Pytorch 4a3dabd67f8ce63f2fc45f278421cca3cc532cfe
GPU driver | agama-ci-devel-803.61
GFX FW Version | PVC2_1.23374
    + +

    Notices & Disclaimers

    + +

    Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

    + +

    Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

    + +

    AI disclaimer:
    +AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at  www.intel.com/AIPC. Results may vary.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/intel-gpus-pytorch-2-4/index.html b/blog/intel-gpus-pytorch-2-4/index.html new file mode 100644 index 000000000000..5aa8181979d7 --- /dev/null +++ b/blog/intel-gpus-pytorch-2-4/index.html @@ -0,0 +1,719 @@ + + + + + + + + + + + + + Accelerate Your AI: PyTorch 2.4 Now Supports Intel GPUs for Faster Workloads | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + the PyTorch Team at Intel + +

    +

We have exciting news! PyTorch 2.4 now supports the Intel® Data Center GPU Max Series and the SYCL software stack, making it easier to speed up your AI workflows for both training and inference. This update allows you to have a consistent programming experience with minimal coding effort and extends PyTorch’s device and runtime capabilities, including device, stream, event, generator, allocator, and guard, to seamlessly support streaming devices. This enhancement simplifies deploying PyTorch on ubiquitous hardware, making it easier for you to integrate different hardware back ends.

    + +

    Intel GPU support upstreamed into PyTorch provides support for both eager and graph modes, fully running Dynamo Hugging Face benchmarks. Eager mode now includes common Aten operators implemented with SYCL. The most performance-critical graphs and operators are highly optimized by using oneAPI Deep Neural Network Library (oneDNN) and oneAPI Math Kernel Library (oneMKL). Graph mode (torch.compile) now has an enabled Intel GPU back end to implement the optimization for Intel GPUs and to integrate Triton. Furthermore, data types such as FP32, BF16, FP16, and automatic mixed precision (AMP) are supported. The PyTorch Profiler, based on Kineto and oneMKL, is being developed for the upcoming PyTorch 2.5 release.

    + +

    Take a look at the current and planned front-end and back-end improvements for Intel GPU upstreamed into PyTorch.

    + +

    the current and planned front-end and back-end improvements for Intel GPU upstreamed into PyTorch

    + +

    PyTorch 2.4 on Linux supports Intel Data Center GPU Max Series for training and inference while maintaining the same user experience as other hardware. If you’re migrating code from CUDA, you can run your existing application on an Intel GPU with minimal changes—just update the device name from cuda to xpu. For example:

    + +
    # CUDA Code 
    +tensor = torch.tensor([1.0, 2.0]).to("cuda") 
    + 
    +# Code for Intel GPU 
    +tensor = torch.tensor([1.0, 2.0]).to("xpu")
    +
    + +
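A device-availability check can make the same code portable across machines with and without an Intel GPU. This is a small sketch and assumes the torch.xpu helper module that ships with XPU-enabled builds:

import torch

# Pick the Intel GPU when available, otherwise fall back to CPU
device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"

tensor = torch.tensor([1.0, 2.0]).to(device)
print(tensor.device)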

    Get Started

    + +

    Try PyTorch 2.4 on the Intel Data Center GPU Max Series through the Intel® Tiber™ Developer Cloud. Get a tour of the environment setup, source build, and examples. To learn how to create a free Standard account, see Get Started, then do the following:

    + +
      +
    1. +

      Sign in to the cloud console.

      +
    2. +
    3. +

      From the Training section, open the PyTorch 2.4 on Intel GPUs notebook.

      +
    4. +
    5. +

      Ensure that the PyTorch 2.4 kernel is selected for the notebook.

      +
    6. +
    + +

    Summary

    + +

    PyTorch 2.4 introduces initial support for Intel Data Center GPU Max Series to accelerate your AI workloads. With Intel GPU, you’ll get continuous software support, unified distribution, and synchronized release schedules for a smoother development experience. We’re enhancing this functionality to reach Beta quality in PyTorch 2.5. Planned features in 2.5 include:

    + +
      +
    • +

      More Aten operators and full Dynamo Torchbench and TIMM support in Eager Mode.

      +
    • +
    • +

      Full Dynamo Torchbench and TIMM benchmark support in torch.compile.

      +
    • +
    • +

      Intel GPU support in torch.profile.

      +
    • +
    • +

      PyPI wheels distribution.

      +
    • +
    • +

      Windows and Intel Client GPU Series support.

      +
    • +
    + +

    We welcome the community to evaluate these new contributions to Intel GPU support on PyTorch. 

    + +

    Resources

    + + + +

    Acknowledgments

    + +

We want to thank the PyTorch open source community for their technical discussions and insights: Nikita Shulga, Jason Ansel, Andrey Talman, Alban Desmaison, and Bin Bao.

    + +

    We also thank collaborators from PyTorch for their professional support and guidance.

    + +

1 To enable GPU support and improve performance, we suggest installing the Intel® Extension for PyTorch.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/intel-joins-pytorch/index.html b/blog/intel-joins-pytorch/index.html new file mode 100644 index 000000000000..ea973120d9e7 --- /dev/null +++ b/blog/intel-joins-pytorch/index.html @@ -0,0 +1,679 @@ + + + + + + + + + + + + + Intel Joins the PyTorch Foundation as a Premier Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Intel logo

    + +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Intel has joined as a premier member.

    + +

    “The PyTorch Foundation is thrilled to welcome Intel as a premier member, marking a significant milestone in our mission to empower the global AI community. Intel’s extensive expertise and commitment to advancing cutting-edge technologies align perfectly with our vision of fostering open-source innovation,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Together, we will accelerate the development and democratization of PyTorch, and use the collaboration to shape a vibrant future of AI for all.”

    + +

    Intel has developed and released several PyTorch-based tools and libraries to enable developers to accelerate their AI workflows, and is actively working on optimizing PyTorch to leverage Intel hardware capabilities.

    + +

    “At Intel, we believe in the power of collaboration and open-source innovation to propel the ecosystem towards an AI Everywhere future. Joining the Governing Board of the PyTorch Foundation is a testament to Intel’s commitment to advancing and democratizing AI,” said Wei Li, Vice President and General Manager of Artificial Intelligence and Analytics (AIA) at Intel. “By harnessing the collective expertise and resources within the deep learning community, we aim to accelerate the development of PyTorch and continue to drive breakthroughs in AI research and applications.”

    + +

    Intel fosters industry collaboration, co-engineering, and open source contributions to accelerate software innovation and develop new technologies that bring benefits to the open source community. By working together with other member companies and under the guidance of the PyTorch Foundation, Intel remains committed to actively contributing to and advocating for the community.

    + +

    As a premier member, Intel is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

    + +

    Wei Li

    + +

We’re happy to welcome Wei Li, Vice President and General Manager of Artificial Intelligence and Analytics (AIA) at Intel, to our board. Dr. Li leads a world-wide team of engineering “magicians” who make AI Everywhere a reality by supercharging machine performance and developer productivity. He and his team have been instrumental in Intel’s recent multi-billion-dollar AI revenue growth by delivering 10-100X software acceleration, across deep learning, statistical machine learning and big data analytics, to complement Intel’s AI-optimized hardware portfolio.

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +

    Read more about Intel’s commitment to the PyTorch Community here.

    + +

    About Intel

    + +

    Intel (Nasdaq: INTC) is an industry leader, creating world-changing technology that enables global progress and enriches lives. Inspired by Moore’s Law, we continuously work to advance the design and manufacturing of semiconductors to help address our customers’ greatest challenges. By embedding intelligence in the cloud, network, edge and every kind of computing device, we unleash the potential of data to transform business and society for the better. To learn more about Intel’s innovations, go to newsroom.intel.com and intel.com.

    + +

    © Intel Corporation. Intel, the Intel logo and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

    + +

    About PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/interactive-chat-gen-model/index.html b/blog/interactive-chat-gen-model/index.html new file mode 100644 index 000000000000..f8ca15bd3d31 --- /dev/null +++ b/blog/interactive-chat-gen-model/index.html @@ -0,0 +1,805 @@ + + + + + + + + + + + + + How to Build an Interactive Chat-Generation Model using DialoGPT and PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Intel + +

    +

    The focus on interactive chat-generation (or conversational response-generation) models has greatly increased in the past several months. Conversational response-generation models such as ChatGPT and Google Bard have taken the AI world by storm. The purpose of interactive chat generation is to answer various questions posed by humans, and these AI based models use natural language processing (NLP) to generate conversations almost indistinguishable from those generated by humans.

    + +

    This article showcases a code sample on how to create interactive chats based on a pre-trained DialoGPT model from Hugging Face with the addition of the Intel® Extension for PyTorch to perform dynamic quantization on the model.

    + +

    Get Started

    + +

    Why DialoGPT?

    + +

    DialoGPT (Dialogue Generative Pre-trained Transformer) is a large-scale, pre-trained dialogue-response-generation model trained on 147M conversation-like exchanges pulled out from Reddit comment chains and discussion threads. DialoGPT was proposed by Microsoft in 2019. The main goal was to create open-domain chatbots capable of producing natural responses to a variety of conversational topics. The conversational response-generation systems that leverage DialoGPT generate more applicable, resourceful, diverse, and context-specific replies.

    + +

    DialoGPT Architecture

    + +

    DialoGPT architecture is based on the GPT-2 model. It is formulated as an autoregressive language model and uses a multi-layer transformer as the model architecture. GPT-2 was proposed by OpenAI. GPT-2 models are trained on general text data whereas DialoGPT is trained on Reddit discussion threads.

    + +

    Let’s look at the GPT-2 architecture. There are two types of blocks in general transformer architecture:

    + +
      +
    • Encoder - contains self-attention layer and feed-forward neural network
    • +
    • Decoder - similar to encoder, but the self-attention layer is masked
    • +
    + +

The self-attention layer allows a position to peek at tokens to the right of the current word (the subsequent words in the text), whereas the masked self-attention layer prevents that from happening.

    + +

    self-attention layer vs masked self-attention layer

    + +

    GPT-2 is built using transformer decoder blocks. This means that the following layers are used in the architecture:

    + +
      +
    1. Embedding Layer – responsible for converting input text into embeddings (each word is converted to a fixed-length vector representation)
    2. +
    3. Transformer Decoder – includes multiple decoder blocks with masked self-attention and feed forward neural network layers
    4. +
    5. Output Layer – responsible for converting embeddings obtained from the decoder into words
    6. +
    + +

    GPT-2 architecture (and DialoGPT architecture) is shown below.

    + +

    GPT-2 architecture

    + +

Because the model is based on the transformer architecture, it is prone to repetition and to copying its inputs. To avoid repetition, we can use Top-K sampling and Top-p sampling.

    + +
      +
    • Top-K sampling - filters the K most likely next words and redistributes the probability mass among only those K next words.
    • +
    • Top-p sampling - rather than selecting only the most likely K words, selects the smallest possible set of words whose cumulative probability exceeds the probability p.
    • +
    + +

    The probability mass is then redistributed among the words in the set. As a result, the size of the set of words can be dynamically increased and decreased based on the probability distribution of the next word.

    + +

    Quantization using Intel® Extension for PyTorch

    + +

    What is Quantization?

    + +

    Quantization is a systematic reduction of the precision of all or several layers within the model. This means a higher-precision type, such as the single-precision floating-point (FP32) mostly used in deep learning, is converted into a lower-precision type such as FP16 (16 bits) or INT8 (8 bits).

    + +

    This helps in achieving,

    + +
      +
    • lower memory bandwidth
    • +
    • lower storage
    • +
    • higher performance with minimum-to-zero accuracy loss
    • +
    + +

    Quantization is especially important with large models such as those based on the Transformer architecture like BERT or GPT.

    + +

    There are two types of quantization:

    + +
      +
    • Static – Static quantization quantizes the weights and activations of the model. This quantization is used when both memory bandwidth and compute savings are important.
    • +
• Dynamic – In dynamic quantization, the weights are quantized ahead of time, but the activations are dynamically quantized during inference (a minimal sketch in core PyTorch follows this list).
    • +
    + +
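For reference, dynamic quantization is also available directly in core PyTorch, independent of the Intel extension used later in this article. A minimal sketch on a toy model:

import torch

# A toy FP32 model; in this article the real model is DialoGPT
model_fp32 = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())

# Dynamically quantize Linear layers: weights become INT8 ahead of time,
# activations are quantized on the fly during inference
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32, {torch.nn.Linear}, dtype=torch.qint8
)

out = model_int8(torch.randn(1, 16))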

Intel Extension for PyTorch: The Intel Extension extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel® hardware. Learn how to install it standalone or get it as part of the Intel® AI Analytics Toolkit.

    + +

    The extension can be loaded as a Python* module or linked as a C++ library. Python users can enable it dynamically by importing intel_extension_for_pytorch.

    + +
      +
    • This CPU tutorial gives detailed information about Intel Extension for PyTorch for Intel CPUs. Source code is available at the master branch.
    • +
    • This GPU tutorial gives detailed information about Intel Extension for PyTorch for Intel GPUs. Source code is available at the xpu-master branch.
    • +
    + +

    How to perform dynamic quantization using Intel Extension for PyTorch?

    + +

Here are the steps to quantize an existing FP32 model to an INT8 model using dynamic quantization (a code sketch putting the steps together follows this list):

    + +
      +
    1. Prepare quantization configuration - We can use default dynamic quantization configuration with ipex.quantization.default_dynamic_qconfig.
    2. +
3. Prepare the FP32 model by using the ipex.quantization.prepare method (provide the input parameters such as the FP32 model to quantize, the prepared configuration, example inputs, and whether the quantization should be done in place).
    4. +
    5. Convert the model from FP32 to INT8 - Use ipex.quantization.convert method for conversion. The input model will be the model prepared in step 2.
    6. +
    + +
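Putting the three steps above together, a minimal sketch with Intel Extension for PyTorch might look like the following. The module paths follow the ipex.quantization API named in the steps above, but exact signatures can vary between extension versions, so treat this as an outline rather than a definitive recipe:

import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large").eval()

# Step 1: default dynamic quantization configuration
qconfig = ipex.quantization.default_dynamic_qconfig

# Step 2: prepare the FP32 model, providing the config and example inputs
example_inputs = tokenizer.encode("Hello" + tokenizer.eos_token, return_tensors="pt")
prepared_model = ipex.quantization.prepare(
    model, qconfig, example_inputs=example_inputs, inplace=False
)

# Step 3: convert the prepared model from FP32 to INT8
quantized_model = ipex.quantization.convert(prepared_model)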

    We also encourage you to check out the Intel® Neural Compressor tool that automates popular model-compression technologies such as quantization, pruning, and knowledge distillation across multiple deep learning frameworks.

    + +

    Code Sample

    + +

    The following steps are implemented in the code sample:

    + +
      +
1. Load model and tokenizer: The Transformers library (check out Intel® Extension for Transformers) and the Auto Classes available in the Hugging Face Main Classes are used in this step. These allow us to automatically find the relevant model by the given name, and make it easy to change the model without major code changes on the developer’s side, as shown below: +
      tokenizer = AutoTokenizer.from_pretrained(model)
      +model = AutoModelForCausalLM.from_pretrained(model)
      +
      +

      The model parameter is specified as an input for the tokenizer, and model initialization is just the path to the pre-trained DialoGPT model. In this sample, we are using ‘microsoft/DialoGPT-large.’ If you have limited resources, you can use ‘microsoft/DialoGPT-medium’ or ‘microsoft/DialoGPT-small’ models and receive comparable results.

      +
    2. +
    3. Perform dynamic quantization of the model: +
        +
      1. Create the configuration using the default dynamic quantization configuration from Intel Extension for PyTorch library.
      2. +
      3. Prepare the model.
      4. +
      5. Convert the model from FP32 to INT8.
        +The steps are explained in detail in the above section.
      6. +
      +
    4. +
    5. Response generation: The first step in response generation is to encode the input sentence as shown in the code below: +
      new_input_ids = tokenizer.encode(input(">> You:") + tokenizer.eos_token, return_tensors='pt')
      +
      +

      In this sample, we want our model to save history, so we are adding input sentences in the form of tokens to the chat history:

      +
      bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_round > 0 else new_input_ids
      +
      +

      The text generation can be done by the model.generate function, where we can specify all important parameters like saved chat history, length of the response in tokens, and usage of both Top-K and Top-p sampling.

      +
      chat_history_ids = model.generate(bot_input_ids, do_sample=True, max_length=2000, top_k=50, top_p=0.95, pad_token_id=tokenizer.eos_token_id) 
      +
      +

The last step is to decode and print the response (a minimal sketch of this step is shown after this list).

      +
    6. +
    7. Preparation for interactive conversation: After response generation, the last step is to add interaction. This can be done by using a simple for loop. Based on the initialized tokenizer, model, and empty chat history, responses are generated for a number of rounds: +
      for chat_round in range(n):
      +chat_history_ids = generate_response(
      +tokenizer,
      +model,
      +chat_round,
      +chat_history_ids
      +)
      +
      +

      An example of interactive chat generation will look like the one shown in the picture below.

      +
    8. +
    + +
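For reference, step 5 above mentions decoding and printing the response. A minimal sketch of that step, assuming the standard Hugging Face tokenizer API and the variable names used in the snippets above (tokenizer, chat_history_ids, bot_input_ids), is:

# Decode only the newly generated tokens (everything after the user's input);
# skip_special_tokens drops the end-of-sequence marker
response = tokenizer.decode(
    chat_history_ids[:, bot_input_ids.shape[-1]:][0],
    skip_special_tokens=True,
)
print("DialoGPT:", response)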

    An example of interactive chat generation

    + +

    What’s Next?

    + +

    Get started with interactive chat-generation models using Intel Extension for PyTorch and DialoGPT. Download and try the Intel AI Analytics Toolkit and Intel Extension for PyTorch for yourself to build various end-to-end AI applications.

    + +

    We encourage you to also check out and incorporate Intel’s other AI/ML Framework optimizations and end-to-end portfolio of tools into your AI workflow and learn about the unified, open, standards-based oneAPI programming model that forms the foundation of Intel’s AI Software Portfolio to help you prepare, build, deploy, and scale your AI solutions.

    + +

    For more details about the new 4th Gen Intel® Xeon® Scalable processors, visit Intel’s AI Solution Platform portal where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs.

    + +

    Useful resources

    + + + +

    Explore more AI code samples

    + + + +

    See all code samples

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-accelerated-pytorch-training-on-mac/index.html b/blog/introducing-accelerated-pytorch-training-on-mac/index.html new file mode 100644 index 000000000000..50b2957e03f6 --- /dev/null +++ b/blog/introducing-accelerated-pytorch-training-on-mac/index.html @@ -0,0 +1,675 @@ + + + + + + + + + + + + + Introducing Accelerated PyTorch Training on Mac | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + PyTorch + +

    +

    In collaboration with the Metal engineering team at Apple, we are excited to announce support for GPU-accelerated PyTorch training on Mac. Until now, PyTorch training on Mac only leveraged the CPU, but with the upcoming PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac.

    + +

    + +

    + +

    Metal Acceleration

    + +

    Accelerated GPU training is enabled using Apple’s Metal Performance Shaders (MPS) as a backend for PyTorch. The MPS backend extends the PyTorch framework, providing scripts and capabilities to set up and run operations on Mac. MPS optimizes compute performance with kernels that are fine-tuned for the unique characteristics of each Metal GPU family. The new device maps machine learning computational graphs and primitives on the MPS Graph framework and tuned kernels provided by MPS.

    + +

    Training Benefits on Apple Silicon

    + +

    Every Apple silicon Mac has a unified memory architecture, providing the GPU with direct access to the full memory store. This makes Mac a great platform for machine learning, enabling users to train larger networks or batch sizes locally. This reduces costs associated with cloud-based development or the need for additional local GPUs. The Unified Memory architecture also reduces data retrieval latency, improving end-to-end performance.

    + +

    In the graphs below, you can see the performance speedup from accelerated GPU training and evaluation compared to the CPU baseline:

    + +

    + +

    + +

    +Accelerated GPU training and evaluation speedups over CPU-only (times faster) +

    + +

    Getting Started

    + +

    To get started, just install the latest Preview (Nightly) build on your Apple silicon Mac running macOS 12.3 or later with a native version (arm64) of Python.

    + +
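Once installed, you can confirm that the MPS backend is available and move work onto the Apple silicon GPU. This is a minimal sketch; the model and sizes are placeholders:

import torch

# Verify that PyTorch was built with MPS and that this macOS/hardware supports it
assert torch.backends.mps.is_built() and torch.backends.mps.is_available()

device = torch.device("mps")

model = torch.nn.Linear(128, 10).to(device)
x = torch.randn(64, 128, device=device)

loss = model(x).sum()
loss.backward()  # forward and backward ops run on the GPU via the MPS backend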

    You can also learn more about Metal and MPS on Apple’s Metal page.

    + +

    * Testing conducted by Apple in April 2022 using production Mac Studio systems with Apple M1 Ultra, 20-core CPU, 64-core GPU 128GB of RAM, and 2TB SSD. Tested with macOS Monterey 12.3, prerelease PyTorch 1.12, ResNet50 (batch size=128), HuggingFace BERT (batch size=64), and VGG16 (batch size=64). Performance tests are conducted using specific computer systems and reflect the approximate performance of Mac Studio.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-depyf/index.html b/blog/introducing-depyf/index.html new file mode 100644 index 000000000000..3b8370882fb7 --- /dev/null +++ b/blog/introducing-depyf/index.html @@ -0,0 +1,829 @@ + + + + + + + + + + + + + Introducing depyf: mastering torch.compile with ease | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Kaichao You + +

    +

    depyf logo

    + +

    We are thrilled to introduce depyf, a new project to the PyTorch ecosystem designed to help users understand, learn, and adapt to torch.compile!

    + +

    Motivation

    + +

    torch.compile is a cornerstone of PyTorch 2.x, offering a straightforward path to accelerate machine learning workflows with just a single line of code for both training and inference. The mere inclusion of @torch.compile can dramatically enhance the performance of your code. However, identifying the optimal insertion point for torch.compile is not easy, not to mention the complexity of adjusting various knobs for maximum efficiency.

    + +

    The intricacies of the torch.compile stack, encompassing Dynamo, AOTAutograd, Inductor, and more, present a steep learning curve. These components, essential for deep learning performance optimization, can be daunting without a solid foundation in the subject.

    + +

    Note: For an introductory example of how torch.compile works, please refer to this walk-through explanation.

    + +

    A common tool: TORCH_COMPILE_DEBUG

    + +

    To demystify torch.compile, the common approach involves leveraging the TORCH_COMPILE_DEBUG environment variable. While it provides more information, deciphering the output remains a formidable task.

    + +

    For example, when we have the following code:

    + +
    # test.py
    +import torch
    +from torch import _dynamo as torchdynamo
    +from typing import List
    +
    +@torch.compile
    +def toy_example(a, b):
    +   x = a / (torch.abs(a) + 1)
    +   if b.sum() < 0:
    +       b = b * -1
    +   return x * b
    +
    +def main():
    +   for _ in range(100):
    +       toy_example(torch.randn(10), torch.randn(10))
    +
    +if __name__ == "__main__":
    +   main()
    +
    + +

Running it with TORCH_COMPILE_DEBUG=1 python test.py produces a directory named torch_compile_debug/run_2024_02_05_23_02_45_552124-pid_9520, under which there are these files:

    + +
    .
    +├── torchdynamo
    +│   └── debug.log
    +└── torchinductor
    +   ├── aot_model___0_debug.log
    +   ├── aot_model___10_debug.log
    +   ├── aot_model___11_debug.log
    +   ├── model__4_inference_10.1
    +   │   ├── fx_graph_readable.py
    +   │   ├── fx_graph_runnable.py
    +   │   ├── fx_graph_transformed.py
    +   │   ├── ir_post_fusion.txt
    +   │   ├── ir_pre_fusion.txt
    +   │   └── output_code.py
    +   ├── model__5_inference_11.2
    +   │   ├── fx_graph_readable.py
    +   │   ├── fx_graph_runnable.py
    +   │   ├── fx_graph_transformed.py
    +   │   ├── ir_post_fusion.txt
    +   │   ├── ir_pre_fusion.txt
    +   │   └── output_code.py
    +   └── model___9.0
    +       ├── fx_graph_readable.py
    +       ├── fx_graph_runnable.py
    +       ├── fx_graph_transformed.py
    +       ├── ir_post_fusion.txt
    +       ├── ir_pre_fusion.txt
    +       └── output_code.py
    +
    + +

    The generated files and logs often raise more questions than they answer, leaving developers puzzled over the meaning and relationships within the data. Common puzzles for TORCH_COMPILE_DEBUG include:

    + +
      +
    • What does model__4_inference_10.1 mean?
    • +
• I have one function but three model__xxx.py files in the directory; what is the correspondence between them?
    • +
• What is all that LOAD_GLOBAL stuff in debug.log?
    • +
    + +

    A better tool: depyf comes to rescue

    + +

Let’s see how depyf can help developers resolve the above challenges. To use depyf, simply execute pip install depyf or follow the project page https://github.com/thuml/depyf to install the latest version, and then wrap the main code in the with depyf.prepare_debug context manager.

    + +
    # test.py
    +import torch
    +from torch import _dynamo as torchdynamo
    +from typing import List
    +
    +@torch.compile
    +def toy_example(a, b):
    +   x = a / (torch.abs(a) + 1)
    +   if b.sum() < 0:
    +       b = b * -1
    +   return x * b
    +
    +def main():
    +   for _ in range(100):
    +       toy_example(torch.randn(10), torch.randn(10))
    +
    +if __name__ == "__main__":
    +   import depyf
    +   with depyf.prepare_debug("depyf_debug_dir"):
    +       main()
    +
    + +

    After executing python test.py , depyf will produce a directory named depyf_debug_dir (the argument of the prepare_debug function). Under the directory, there would be these files:

    + +
    .
    +├── __compiled_fn_0 AFTER POST GRAD 0.py
    +├── __compiled_fn_0 Captured Graph 0.py
    +├── __compiled_fn_0 Forward graph 0.py
    +├── __compiled_fn_0 kernel 0.py
    +├── __compiled_fn_3 AFTER POST GRAD 0.py
    +├── __compiled_fn_3 Captured Graph 0.py
    +├── __compiled_fn_3 Forward graph 0.py
    +├── __compiled_fn_3 kernel 0.py
    +├── __compiled_fn_4 AFTER POST GRAD 0.py
    +├── __compiled_fn_4 Captured Graph 0.py
    +├── __compiled_fn_4 Forward graph 0.py
    +├── __compiled_fn_4 kernel 0.py
    +├── __transformed_code_0_for_torch_dynamo_resume_in_toy_example_at_8.py
    +├── __transformed_code_0_for_toy_example.py
    +├── __transformed_code_1_for_torch_dynamo_resume_in_toy_example_at_8.py
    +└── full_code_for_toy_example_0.py
    +
    + +

    And there are two obvious benefits:

    + +
      +
1. The long and difficult-to-understand torchdynamo/debug.log is gone. Its content is cleaned up and shown as human-readable source code, in full_code_for_xxx.py and __transformed_code_{n}_for_xxx.py . It is worth noting that the most tedious and difficult job of depyf is to decompile the bytecode inside torchdynamo/debug.log into Python source code, freeing developers from the intimidating internals of Python.
    2. +
3. The correspondence between function names and computation graphs is preserved. For example, in __transformed_code_0_for_toy_example.py , we can see a function named __compiled_fn_0 , and we immediately know its corresponding computation graphs are in __compiled_fn_0_xxx.py , because they share the same __compiled_fn_0 prefix.
    4. +
    + +

    Starting with full_code_for_xxx.py , and following the functions involved, users will have a clear view of what torch.compile does to their code.

    + +

    One more thing: step-through debuggability

    + +

Stepping through code line by line using debuggers is a great way to understand how code works. However, under TORCH_COMPILE_DEBUG , those files are only for users’ information and cannot be executed with the data users care about.

    + +

    Note: By “debug”, we mean the process of inspecting and improving a program, rather than correcting buggy code.

    + +

A standout feature of depyf is its capability to facilitate step-through debugging for torch.compile: all of the files it generates are linked with runtime code objects inside the Python interpreter, and we can set breakpoints in these files. The usage is simple: just add one more context manager, with depyf.debug() , and it should do the trick:

    + +
    # test.py
    +import torch
    +from torch import _dynamo as torchdynamo
    +from typing import List
    +
    +@torch.compile
    +def toy_example(a, b):
    +   x = a / (torch.abs(a) + 1)
    +   if b.sum() < 0:
    +       b = b * -1
    +   return x * b
    +
    +def main():
    +   for _ in range(100):
    +       toy_example(torch.randn(10), torch.randn(10))
    +
    +if __name__ == "__main__":
    +   import depyf
    +   with depyf.prepare_debug("depyf_debug_dir"):
    +       main()
    +   with depyf.debug():
    +       main()
    +
    + +

Just one caveat: the workflow of debugging torch.compile deviates from the standard debugging workflow. With torch.compile, much of the code is dynamically generated. Therefore, we need to:

    + +
      +
    1. launch the program
    2. +
    3. when the program exits with depyf.prepare_debug("depyf_debug_dir") , code will be available in depyf_debug_dir.
    4. +
    5. when the program enters with depyf.debug() , it will automatically set a breakpoint internally, so that the program is paused.
    6. +
    7. navigate to depyf_debug_dir to set breakpoints.
    8. +
    9. continue to run the code, and debuggers will hit these breakpoints!
    10. +
    + +

    depyf screenshot

    + +

Here is a screenshot of what it looks like. All code and tensor variables are live, and we can inspect any variable and step through the code, just as in our daily debugging workflow. The only difference is that we are debugging torch.compile-generated code rather than human-written code.

    + +

    Conclusion

    + +

torch.compile serves as an invaluable tool for accelerating PyTorch code effortlessly. However, for those looking to delve deeper into torch.compile, whether to leverage its full potential or to integrate custom operations, the learning curve can be very steep. depyf is designed to lower this barrier, offering a user-friendly experience to understand, learn, and adapt to torch.compile.

    + +

    Do explore depyf and experience its benefits firsthand! The project is open-source and readily available at https://github.com/thuml/depyf. Installation is straightforward via pip install depyf. We hope depyf can enhance everyone’s development workflow with torch.compile.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-hidet/index.html b/blog/introducing-hidet/index.html new file mode 100644 index 000000000000..2810acbe54cb --- /dev/null +++ b/blog/introducing-hidet/index.html @@ -0,0 +1,762 @@ + + + + + + + + + + + + + Introducing Hidet: A Deep Learning Compiler for Efficient Model Serving | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team Hidet + +

    +

    Hidet is a powerful deep learning compiler that simplifies the process of implementing high-performing deep learning operators on modern accelerators (e.g., NVIDIA GPUs). With the new feature of torch.compile(...) in PyTorch 2.0, integrating a novel compiler into PyTorch is easier than ever - Hidet now can be used as a torch.compile(...) backend to accelerate PyTorch models, making it an attractive option for PyTorch users who want to improve the inference performance of their models, especially for those who also need to implement extremely optimized custom operators.

    + +

    Using Hidet to Compile A PyTorch Model

    + +

    To use Hidet in PyTorch, you need to first install the hidet package via pip:

    + +
    pip install hidet
    +
    + +

    Hidet is integrated with PyTorch as a torch.compile(...) backend following the Custom Backends tutorial. You can specify hidet as the backend when you compile a model. (Note: requires PyTorch version 2.0+):

    + +
    torch.compile(..., backend='hidet')
    +
    + +

    Hidet converts the given PyTorch model in the torch.fx.Graph format into its internal graph representation, and conducts a series of optimizations. Hidet provides a few options to configure the optimizations. For example, we can use hidet.torch.dynamo_config.use_tensor_core(True) to allow Hidet to generate CUDA kernels that leverage the Tensor Cores on NVIDIA GPUs, and use hidet.torch.dynamo_config.search_space(2) to allow Hidet to search for the best operator schedule specific for your hardware and input sizes. More configurations can be found in Hidet’s documentation.

    + +

    Here’s a complete example of how to use Hidet to compile and optimize a pre-trained ResNet50 model from torchvision:

    + +
    import hidet
    +import torch
    +
    +# Load a pre-trained ResNet50 model
    +x = torch.randn(1, 3, 224, 224, device='cuda').half()
    +model = torch.hub.load(
    +    'pytorch/vision:v0.6.0', 'resnet50', pretrained=True
    +).cuda().half().eval()
    +
    +# Configure hidet to use tensor core and enable tuning
    +hidet.torch.dynamo_config.use_tensor_core(True)
    +hidet.torch.dynamo_config.search_space(2) 
    +
    +# Compile the model using Hidet
    +model_opt = torch.compile(model, backend='hidet')
    +
    +# Check correctness
    +torch.testing.assert_close(actual=model_opt(x), expected=model(x), rtol=1e-2, atol=1e-2)
    +
    +# Benchmark
    +from hidet.utils import benchmark_func
    +print('eager: {:2f}'.format(benchmark_func(lambda: model(x))))
    +print('hidet: {:2f}'.format(benchmark_func(lambda: model_opt(x))))
    +
    + +

We encourage you to try out the above script on your own NVIDIA GPU(s)! If you run this script on an AWS g5.2xlarge instance, you should get the result shown in the following figure. Hidet achieves the speedup because it can automatically fuse multiple operators, tune operator schedules, and use CUDA Graphs to reduce framework-level overhead. More results can be found in the ASPLOS’23 publication of Hidet and our performance tracking.

    + +

    Eager vs Hidet latency

    + +

    Using Hidet Script to Write Custom Operators

    + +

    Hidet Script is one approach to implement tensor operators in Python. The following example shows how to implement a naive matrix multiplication using Hidet Script and integrate it as a PyTorch operator.

    + +
    import torch
    +import hidet
    +
    +
    +def matmul(m_size, n_size, k_size):
    +    from hidet.lang import f32, attr
    +    from hidet.lang.cuda import threadIdx, blockIdx, blockDim
    +
    +    with hidet.script_module() as script_module:
    +        @hidet.script
    +        def matmul(
    +            a: f32[m_size, k_size],
    +            b: f32[k_size, n_size],
    +            c: f32[m_size, n_size]
    +        ):
    +            attr.cuda_grid_dim = ((m_size + 31) // 32, (n_size + 31) // 32)
    +            attr.cuda_block_dim = (32, 32)
    +            i = threadIdx.x + blockIdx.x * blockDim.x
    +            j = threadIdx.y + blockIdx.y * blockDim.y
    +            if i < m_size and j < n_size:
    +                c[i, j] = 0.0
    +                for k in range(k_size):
    +                    c[i, j] += a[i, k] * b[k, j]
    +
    +    ir_module = script_module.ir_module()
    +    func = hidet.driver.build_ir_module(ir_module)
    +    return func
    +
    +
    +class NaiveMatmul(torch.autograd.Function):
    +    @staticmethod
    +    def forward(ctx, a, b):
    +        m, k = a.shape
    +        k, n = b.shape
    +        c = torch.empty([m, n], dtype=a.dtype, device=a.device)
    +        func = matmul(m, n, k)
    +        func(a, b, c)
    +        return c
    +
    +
    +a = torch.randn([3, 4], device='cuda')
    +b = torch.randn([4, 5], device='cuda')
    +c = NaiveMatmul.apply(a, b)
    +cc = torch.matmul(a, b)
    +torch.testing.assert_close(c, cc)
    +
    + +

More optimizations can be applied; see the example in our documentation to learn more.

    + +

Hidet Script vs. Triton: Triton greatly simplifies CUDA programming by introducing a tile-based programming model in which the unit of parallel execution is a thread block rather than an individual thread. However, this simplification also prevents tensor program developers from manipulating fine-grained computation and memory resources (e.g., warps, shared memory) in their preferred ways. It would be challenging to implement an optimization that requires fine-grained control of these resources using Triton if it has not already been implemented by the Triton compiler itself. Hidet Script, on the other hand, simplifies tensor programming while still enabling users to implement their own optimizations with extensive flexibility. It’s worth noting that the more granular control of Hidet Script also brings added complexity compared to Triton.
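To make the contrast concrete, here is a minimal, illustrative Triton kernel (a simple vector add, not taken from the Hidet post): each program instance works on a whole tile of elements, and the Triton compiler decides how that tile is mapped onto threads and warps.

import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance (roughly, a thread block) handles one tile of BLOCK_SIZE
    # elements; the mapping of the tile onto warps/threads is left to the compiler.
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)

x = torch.randn(4096, device='cuda')
y = torch.randn(4096, device='cuda')
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 1024),)
add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=1024)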

    + +

    More about Hidet

    + +

Hidet originates from a research project led by the EcoSystem lab at the University of Toronto (UofT) and AWS. The authors propose a new way, named the task-mapping programming paradigm, to construct tensor programs. It aims to simplify tensor programming without sacrificing any optimization opportunities. Now, Hidet is an open-source project, jointly supported by CentML and the EcoSystem lab, that aims to provide an efficient solution for end-to-end inference on modern accelerators (e.g., NVIDIA GPUs).

    + +

    Additional Resources

    + + + +

    Acknowledgement

    + +

We would like to thank Jerry Park, Mark Saroufim, Jason Liang and Helen Suk for their valuable help in preparing the blog post and feedback on the text. We would also like to thank Nikita Shulga, Jason Ansel, and Dmytro Dzhulgakov for reviewing and improving our PR https://github.com/pytorch/pytorch/pull/93873 on the 3rd-party dynamo backend registration.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-nvfuser-a-deep-learning-compiler-for-pytorch/index.html b/blog/introducing-nvfuser-a-deep-learning-compiler-for-pytorch/index.html new file mode 100644 index 000000000000..a8c0cdeb8a1d --- /dev/null +++ b/blog/introducing-nvfuser-a-deep-learning-compiler-for-pytorch/index.html @@ -0,0 +1,747 @@ + + + + + + + + + + + + + Introducing nvFuser, a deep learning compiler for PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Christian Sarofeen, Piotr Bialecki, Jie Jiang, Kevin Stephano, Masaki Kozuki, Neal Vaidya, Stas Bekman + +

    +

nvFuser is a Deep Learning Compiler for NVIDIA GPUs that automatically just-in-time compiles fast and flexible kernels to reliably accelerate users’ networks. It provides significant speedups for deep learning networks running on Volta and later CUDA accelerators by generating fast custom “fusion” kernels at runtime. nvFuser is specifically designed to meet the unique requirements of the PyTorch community, and it supports diverse network architectures and programs with dynamic inputs of varying shapes and strides.

In this blog post we’ll describe nvFuser and how it’s used today, show the significant performance improvements it can obtain on models from HuggingFace and TIMM, and look ahead to nvFuser in PyTorch 1.13 and beyond. If you would like to know more about how and why fusion improves the speed of training for deep learning networks, please see our previous talks on nvFuser from GTC 2022 and GTC 2021.

nvFuser relies on a graph representation of PyTorch operations to optimize and accelerate. Since PyTorch has an eager execution model, the PyTorch operations users are running are not directly accessible as a whole program that can be optimized by a system like nvFuser. Therefore, users must utilize systems built on top of nvFuser that are capable of capturing user programs and translating them into a form that nvFuser can optimize. These higher-level systems then pass the captured operations to nvFuser, so that nvFuser can optimize the execution of the user’s script for NVIDIA GPUs. There are three systems that capture, translate, and pass user programs to nvFuser for optimization:

    + +
      +
• TorchScript (jit.script)
  • This system directly parses sections of an annotated Python script to translate into its own representation what the user is doing. It then applies its own version of auto differentiation to the graph, and passes sections of the subsequent forward and backward graphs to nvFuser for optimization.
• FuncTorch
  • This system doesn’t directly look at the user’s Python script; instead, it inserts a mechanism that captures PyTorch operations as they’re being run. We refer to this type of capture system as “trace program acquisition”, since we’re tracing what has been performed. FuncTorch doesn’t perform its own auto differentiation – it simply traces PyTorch’s autograd directly to get backward graphs.
• TorchDynamo
  • TorchDynamo is another program acquisition mechanism built on top of FuncTorch. TorchDynamo parses the Python bytecode produced from the user script in order to select portions to trace with FuncTorch. The benefit of TorchDynamo is that it’s able to apply decorators to a user’s script, effectively isolating what should be sent to FuncTorch and making it easier for FuncTorch to successfully trace complex Python scripts.
    + +

    These systems are available for users to interact with directly while nvFuser automatically and seamlessly optimizes performance critical regions of the user’s code. These systems automatically send parsed user programs to nvFuser so nvFuser can:

    + +
      +
1. Analyze the operations being run on GPUs
2. Plan parallelization and optimization strategies for those operations
3. Apply those strategies in generated GPU code
4. Runtime-compile the generated optimized GPU functions
5. Execute those CUDA kernels on subsequent iterations
    + +

It is important to note that nvFuser does not yet support all PyTorch operations, and there are still some scenarios that are actively being improved in nvFuser, as discussed herein. However, nvFuser does support many performance-critical DL operations today, and the number of supported operations will grow in subsequent PyTorch releases. nvFuser is capable of generating highly specialized and optimized GPU functions for the operations it does support. This means nvFuser is able to power new PyTorch systems like TorchDynamo and FuncTorch to combine the flexibility PyTorch is known for with unbeatable performance.

    + +

    nvFuser Performance

    + +

Before getting into how to use nvFuser, in this section we’ll show the improvements in training speed nvFuser provides for a variety of models from the HuggingFace Transformers and PyTorch Image Models (TIMM) repositories, and we will discuss current gaps in nvFuser performance that are under development today. All performance numbers in this section were taken using an NVIDIA A100 40GB GPU, and used either FuncTorch alone or FuncTorch with TorchDynamo.

    + +

    HuggingFace Transformer Benchmarks

    + +

    nvFuser can dramatically accelerate training of HuggingFace Transformers when combined with another important optimization (more on that in a moment). Performance improvements can be seen in Figure 1 to range between 1.12x and 1.50x across a subset of popular HuggingFace Transformer networks.

    + +

    + +

    + +

    +Figure 1: Performance gains of 8 training scenarios from HuggingFace’s Transformer repository. First performance boost in the dark green is due to replacing the optimizer with an NVIDIA Apex fused AdamW optimizer. The light green is due to adding nvFuser. Models were run with batch size and sequence lengths of [64, 128], [8, 512], [2, 1024], [64, 128], [8, 512], [8, src_seql=512, tgt_seql=128], [8, src_seql=1024, tgt_seql=128], and [8, 512] respectively. All networks were run with Automatic Mixed Precision (AMP) enabled with dtype=float16. +

    + +

While these speedups are significant, it’s important to understand that nvFuser doesn’t (yet) automate everything about running networks quickly. For HuggingFace Transformers, for example, it was important to use the AdamW fused optimizer from NVIDIA’s Apex repository, as the optimizer otherwise consumed a large portion of runtime. Using the fused AdamW optimizer to make the network faster exposes the next major performance bottleneck: memory-bound operations. These operations are optimized by nvFuser, providing another large performance boost. With the fused optimizer and nvFuser enabled, the training speed of these networks improved between 1.12x and 1.5x.

HuggingFace Transformer models were run with the torch.amp module. (“amp” stands for Automated Mixed Precision; see the “What Every User Should Know about Mixed Precision in PyTorch” blog post for details.) An option to use nvFuser was added to HuggingFace’s Trainer. If you have TorchDynamo installed, you can activate it to enable nvFuser in HuggingFace by passing torchdynamo = 'nvfuser' to the Trainer class.

nvFuser has great support for normalization kernels and related fusions frequently found in Natural Language Processing (NLP) models, and we recommend users try nvFuser in their NLP workloads.
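As a purely illustrative sketch (not from the original post), enabling this in a Trainer-based script might look roughly like the following; it assumes a transformers release of that era in which the torchdynamo option is exposed through TrainingArguments (which the Trainer consumes), and my_model / my_dataset are placeholders for your own model and dataset.

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    fp16=True,                 # AMP, as in the benchmarks above
    torchdynamo="nvfuser",     # capture with TorchDynamo and fuse with nvFuser
)
trainer = Trainer(model=my_model, args=training_args, train_dataset=my_dataset)
trainer.train()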

    + +

    PyTorch Image Models (TIMM) Benchmarks

    +

nvFuser can also significantly reduce the training time of TIMM networks, up to over 1.3x vs. eager PyTorch, and up to 1.44x vs. eager PyTorch when combined with the torch.amp module. Figure 1 shows nvFuser’s speedup without torch.amp, and when torch.amp is used with the NHWC (“channels last”) and NCHW (“channels first”) formats. nvFuser is integrated in TIMM through FuncTorch tracing directly (without TorchDynamo) and can be used by adding the --aot-autograd command line argument when running the TIMM benchmark or training script.

    + +

    + +

    + +

    +Figure 1: The Y-axis is the performance gain nvFuser provides over not using nvFuser. A value of 1.0 means no change in perf, 2.0 would mean nvFuser is twice as fast, 0.5 would mean nvFuser takes twice the time to run. Square markers are with float16 Automatic Mixed Precision (AMP) and channels first contiguous inputs, circle markers are float32 inputs, and triangles are with float16 AMP and channels last contiguous inputs. Missing data points are due to an error being encountered when tracing. +

    + +

When running with float32 precision nvFuser provides a 1.12x geometric mean (“geomean”) speedup on TIMM networks, and when running with torch.amp and “channels first” it provides a 1.14x geomean speedup. However, nvFuser currently doesn’t speed up torch.amp and “channels last” training (a 0.9x geomean regression), so we recommend not using it in those cases. We are actively working on improving “channels last” performance now, and soon we will have two additional optimization strategies (grid persistent optimizations for channels-last normalizations and fast transposes) which we expect will provide speedups comparable to “channels first” in PyTorch version 1.13 and later.

Many of nvFuser’s optimizations can also help in inference cases. However, in PyTorch, when running inference on small batch sizes, performance is typically limited by CPU overhead, which nvFuser can’t completely remove or fix. Therefore, typically the most important optimization for inference is to enable CUDA Graphs when possible. Once CUDA Graphs is enabled, it can also be beneficial to enable fusion through nvFuser. Performance of inference is shown in Figure 2 and Figure 3. Inference is only run with float16 AMP as it is uncommon to run inference workloads in full float32 precision.
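For readers who have not used CUDA Graphs before, the following is a minimal, illustrative inference-capture sketch using PyTorch’s torch.cuda.CUDAGraph API (not taken from the original post); the small Sequential network is just a stand-in for your own model, and the static input/output buffers are reused across replays.

import torch

# A stand-in model; substitute your own TIMM/torchvision network.
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 64, 3, padding=1),
    torch.nn.ReLU(),
    torch.nn.AdaptiveAvgPool2d(1),
    torch.nn.Flatten(),
    torch.nn.Linear(64, 10),
).cuda().half().eval()

static_input = torch.randn(8, 3, 224, 224, device="cuda", dtype=torch.float16)

# Warm up on a side stream so one-time setup work is not baked into the graph.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(s)

# Capture one inference iteration into a CUDA Graph.
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g), torch.no_grad():
    static_output = model(static_input)

# Replay: copy new inputs into the static buffer, then relaunch the captured kernels.
static_input.copy_(torch.randn_like(static_input))
g.replay()
result = static_output.clone()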

    + +

    + +

    + +

    + +

    + +

    +Figure 2: Performance gains of enabling CUDA Graphs, and CUDA Graphs with nvFuser compared to the performance of native PyTorch without CUDA Graphs and nvFuser across TIMM models with float16 AMP, channels first inputs, and a batch size of 1 and 8 respectively. There is a geomean speedup of 2.74x with CUDA Graphs and 2.71x with CUDA Graphs + nvFuser respectively. nvFuser provides a maximum regression of 0.68x and a maximum performance gain of 2.74x (relative to CUDA Graphs without nvFuser). Performance gain is measured relative to the average time per iteration PyTorch takes without CUDA Graphs and without nvFuser. Models are sorted by how much additional performance nvFuser is providing. +

    + +

    + +

    + +

    + +

    + +

    +Figure 3: Performance gains of enabling CUDA Graphs, and CUDA Graphs with nvFuser compared to the performance of native PyTorch without CUDA Graphs and nvFuser across TIMM models with AMP, channels last inputs, and a batch size of 1 and 8 respectively. There is a geomean speedup of 2.29x with CUDA Graphs and 2.95x with CUDA Graphs + nvFuser respectively. nvFuser provides a maximum regression of 0.86x and a maximum performance gain of 3.82x (relative to CUDA Graphs without nvFuser). Performance gain is measured relative to the average time per iteration PyTorch takes without CUDA Graphs and without nvFuser. Models are sorted by how much additional performance nvFuser is providing. +

    + +

So far, nvFuser performance has not been tuned for inference workloads, so its performance benefit is not consistent across all cases. However, there are still many models that benefit significantly from nvFuser during inference, and we encourage users to try nvFuser in their inference workloads to see whether they would benefit today. Performance of nvFuser in inference workloads will improve in the future, and if you’re interested in using nvFuser in inference workloads, please reach out to us on the PyTorch forums.

    + +

    Getting Started - Accelerate Your Scripts with nvFuser

    + +

We’ve created a tutorial demonstrating how to take advantage of nvFuser to accelerate part of a standard transformer block, and how nvFuser can be used to define fast and novel operations. There are still some rough edges in nvFuser that we’re working hard on improving, as we’ve outlined in this blog post. However, we’ve also demonstrated some great improvements in training speed on multiple networks in HuggingFace and TIMM, and we expect there are opportunities in your networks where nvFuser can help today, and many more where it will help in the future.

If you would like to learn more about nvFuser, we recommend watching our presentations from NVIDIA’s GTC conference: GTC 2022 and GTC 2021.
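As a small, hedged sketch of what trying nvFuser can look like through the TorchScript path described earlier (this example is not from the original post; the "fuser2" identifier is assumed to select nvFuser in the PyTorch releases contemporary with this post):

import torch
import torch.nn.functional as F

@torch.jit.script
def bias_gelu(x, bias):
    # A chain of pointwise ops: the memory-bound pattern that fusion targets.
    return F.gelu(x + bias)

x = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
bias = torch.randn(1024, device="cuda", dtype=torch.float16)

# Select the nvFuser TorchScript fuser for this region.
with torch.jit.fuser("fuser2"):
    for _ in range(5):  # early iterations profile the graph and compile fused kernels
        out = bias_gelu(x, bias)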

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-pytorch-fully-sharded-data-parallel-api/index.html b/blog/introducing-pytorch-fully-sharded-data-parallel-api/index.html new file mode 100644 index 000000000000..89a38f358e95 --- /dev/null +++ b/blog/introducing-pytorch-fully-sharded-data-parallel-api/index.html @@ -0,0 +1,809 @@ + + + + + + + + + + + + + Introducing PyTorch Fully Sharded Data Parallel (FSDP) API | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Yanli Zhao, Rohan Varma, Chien-Chin Huang, Shen Li, Min Xu, Alban Desmaison + +

    +

Recent studies have shown that large model training is beneficial for improving model quality. During the last three years, model size grew 10,000 times, from BERT with 110M parameters to Megatron-2 with one trillion. However, training large AI models is not easy: aside from the need for large amounts of computing resources, software engineering complexity is also challenging. PyTorch has been working on building tools and infrastructure to make it easier.

    + +

    PyTorch Distributed data parallelism is a staple of scalable deep learning because of its robustness and simplicity. It however requires the model to fit on one GPU. Recent approaches like DeepSpeed ZeRO and FairScale’s Fully Sharded Data Parallel allow us to break this barrier by sharding a model’s parameters, gradients and optimizer states across data parallel workers while still maintaining the simplicity of data parallelism.

    + +

    With PyTorch 1.11 we’re adding native support for Fully Sharded Data Parallel (FSDP), currently available as a prototype feature. Its implementation heavily borrows from FairScale’s version while bringing more streamlined APIs and additional performance improvements.

    + +

    Scaling tests of PyTorch FSDP on AWS show it can scale up to train dense models with 1T parameters. Realized performance in our experiments reached 84 TFLOPS per A100 GPU for GPT 1T model and 159 TFLOPS per A100 GPU for GPT 175B model on AWS cluster. Native FSDP implementation also dramatically improved model initialization time compared to FairScale’s original when CPU offloading was enabled.

    + +

    In future PyTorch versions, we’re going to enable users to seamlessly switch between DDP, ZeRO-1, ZeRO-2 and FSDP flavors of data parallelism, so that users can train different scales of models with simple configurations in the unified API.

    + +

    How FSDP Works

    + +

    FSDP is a type of data-parallel training, but unlike traditional data-parallel, which maintains a per-GPU copy of a model’s parameters, gradients and optimizer states, it shards all of these states across data-parallel workers and can optionally offload the sharded model parameters to CPUs.

    + +

    The figure below shows how FSDP works for 2 data-parallel processes:

    + +

    + +

    + +

    +Figure 1. FSDP workflow +

    + +

    Usually, model layers are wrapped with FSDP in a nested way, so that only layers in a single FSDP instance need to gather the full parameters to a single device during forward or backward computations. The gathered full parameters will be freed immediately after computation, and the freed memory can be used for the next layer’s computation. In this way, peak GPU memory could be saved and thus training can be scaled to use a larger model size or larger batch size. To further maximize memory efficiency, FSDP can offload the parameters, gradients and optimizer states to CPUs when the instance is not active in the computation.

    + +

    Using FSDP in PyTorch

    + +

    There are two ways to wrap a model with PyTorch FSDP. Auto wrapping is a drop-in replacement for DDP; manual wrapping needs minimal changes of model definition code with the ability to explore complex sharding strategies.

    + +

    Auto Wrapping

    + +

    Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code.

    + +

The fsdp_auto_wrap_policy argument allows specifying a callable function to recursively wrap layers with FSDP. The default_auto_wrap_policy function provided by PyTorch FSDP recursively wraps layers whose number of parameters is larger than 100M. You can supply your own wrapping policy as needed. An example of writing a customized wrapping policy is shown in the FSDP API doc.

    + +

In addition, cpu_offload can optionally be configured to offload wrapped parameters to CPUs when these parameters are not used in computation. This can further improve memory efficiency at the cost of data transfer overhead between host and device.

    + +

    The example below shows how FSDP is wrapped using auto wrapping.

    + +
    from torch.distributed.fsdp import (
    +   FullyShardedDataParallel,
    +   CPUOffload,
    +)
    +from torch.distributed.fsdp.wrap import (
    +   default_auto_wrap_policy,
    +)
    +import torch.nn as nn
    + 
    +class model(nn.Module):
    +   def __init__(self):
    +       super().__init__()
    +       self.layer1 = nn.Linear(8, 4)
    +       self.layer2 = nn.Linear(4, 16)
    +       self.layer3 = nn.Linear(16, 4)
    + 
+# Wrap the model directly with FSDP (no DDP wrapper is needed)
    +fsdp_model = FullyShardedDataParallel(
    +   model(),
    +   fsdp_auto_wrap_policy=default_auto_wrap_policy,
    +   cpu_offload=CPUOffload(offload_params=True),
    +)
    +
    + +

    Manual Wrapping

    + +

    Manual wrapping can be useful to explore complex sharding strategies by applying wrap selectively to some parts of the model. Overall settings can be passed to the enable_wrap() context manager.

    + +
    from torch.distributed.fsdp import (
    +   FullyShardedDataParallel,
    +   CPUOffload,
    +)
    +from torch.distributed.fsdp.wrap import (
    +   enable_wrap,
    +   wrap,
    +)
    +import torch.nn as nn
    + 
    + 
    +class model(nn.Module):
    +   def __init__(self):
    +       super().__init__()
    +       self.layer1 = wrap(nn.Linear(8, 4))
    +       self.layer2 = nn.Linear(4, 16)
    +       self.layer3 = wrap(nn.Linear(16, 4))
    + 
+wrapper_kwargs = dict(cpu_offload=CPUOffload(offload_params=True))
    +with enable_wrap(wrapper_cls=FullyShardedDataParallel, **wrapper_kwargs):
    +   fsdp_model = wrap(model())
    +
    + +

    After wrapping the model with FSDP using one of the two above approaches, the model can be trained in a similar way as local training, like this:

    + +
    optim = torch.optim.Adam(fsdp_model.parameters(), lr=0.0001)
+for sample, label in next_batch():
+  optim.zero_grad()
+  out = fsdp_model(sample)
+  loss = criterion(out, label)
+  loss.backward()
+  optim.step()
    +
    + +

    Benchmark Results

    + +

We ran extensive scaling tests for 175B and 1T GPT models on AWS clusters using PyTorch FSDP. Each cluster node is an instance with 8 NVIDIA A100-SXM4-40GB GPUs, and nodes are interconnected via AWS Elastic Fabric Adapter (EFA) with 400 Gbps network bandwidth.

    + +

    GPT models are implemented using minGPT. A randomly generated input dataset is used for benchmarking purposes. All experiments ran with 50K vocabulary size, fp16 precision and SGD optimizer.

Model    | Number of layers | Hidden size | Attention heads | Model size, billions of parameters
GPT 175B | 96               | 12288       | 96              | 175
GPT 1T   | 128              | 25600       | 160             | 1008
    + +

    In addition to using FSDP with parameters CPU offloading in the experiments, the activation checkpointing feature in PyTorch is also applied in the tests.

    + +

    The maximum per-GPU throughput of 159 teraFLOP/s (51% of NVIDIA A100 peak theoretical performance 312 teraFLOP/s/GPU) is achieved with batch size 20 and sequence length 512 on 128 GPUs for the GPT 175B model; further increase of the number of GPUs leads to per-GPU throughput degradation because of growing communication between the nodes.

    + +

    For the GPT 1T model, the maximum per-GPU throughput of 84 teraFLOP/s (27% of the peak teraFLOP/s) is achieved with batch size 4 and sequence length 2048 on 128 GPUs. However, further increase of the number of GPUs doesn’t affect the per-GPU throughput too much because we observed that the largest bottleneck in the 1T model training is not from communication but from the slow CUDA cache allocator when peak GPU memory is reaching the limit. The use of A100 80G GPUs with larger memory capacity will mostly resolve this issue and also help scale the batch size to achieve much larger throughput.

    + +

    + +

    + +

    + +

    + +

    Future Work

    + +

    In the next beta release, we are planning to add efficient distributed model/states checkpointing APIs, meta device support for large model materialization, and mixed-precision support inside FSDP computation and communication. We’re also going to make it easier to switch between DDP, ZeRO1, ZeRO2 and FSDP flavors of data parallelism in the new API. To further improve FSDP performance, memory fragmentation reduction and communication efficiency improvements are also planned.

    + +

    A Bit of History of 2 Versions of FSDP

    + +

    FairScale FSDP was released in early 2021 as part of the FairScale library. And then we started the effort to upstream FairScale FSDP to PyTorch in PT 1.11, making it production-ready. We have selectively upstreamed and refactored key features from FairScale FSDP, redesigned user interfaces and made performance improvements.

    + +

    In the near future, FairScale FSDP will stay in the FairScale repository for research projects, while generic and widely adopted features will be upstreamed to PyTorch incrementally and hardened accordingly.

    + +

    Meanwhile, PyTorch FSDP will focus more on production readiness and long-term support. This includes better integration with ecosystems and improvements on performance, usability, reliability, debuggability and composability.

    + +

    Acknowledgments

    + +

    We would like to thank the authors of FairScale FSDP: Myle Ott, Sam Shleifer, Min Xu, Priya Goyal, Quentin Duval, Vittorio Caggiano, Tingting Markstrum, Anjali Sridhar. Thanks to the Microsoft DeepSpeed ZeRO team for developing and popularizing sharded data parallel techniques. Thanks to Pavel Belevich, Jessica Choi, Sisil Mehta for running experiments using PyTorch FSDP on different clusters. Thanks to Geeta Chauhan, Mahesh Yadav, Pritam Damania, Dmytro Dzhulgakov for supporting this effort and insightful discussions.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/index.html b/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/index.html new file mode 100644 index 000000000000..6d5eab4c5706 --- /dev/null +++ b/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/index.html @@ -0,0 +1,704 @@ + + + + + + + + + + + + + Introducing PyTorch Profiler - the new and improved performance tool | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Maxim Lukiyanov - Principal PM at Microsoft, Guoliang Hua - Principal Engineering Manager at Microsoft, Geeta Chauhan - Partner Engineering Lead at Facebook, Gisle Dankel - Tech Lead at Facebook + +

    +

    Along with PyTorch 1.8.1 release, we are excited to announce PyTorch Profiler – the new and improved performance debugging profiler for PyTorch. Developed as part of a collaboration between Microsoft and Facebook, the PyTorch Profiler is an open-source tool that enables accurate and efficient performance analysis and troubleshooting for large-scale deep learning models.

    + +

    Analyzing and improving large-scale deep learning model performance is an ongoing challenge that grows in importance as the model sizes increase. For a long time, PyTorch users had a hard time solving this challenge due to the lack of available tools. There were standard performance debugging tools that provide GPU hardware level information but missed PyTorch-specific context of operations. In order to recover missed information, users needed to combine multiple tools together or manually add minimum correlation information to make sense of the data. There was also the autograd profiler (torch.autograd.profiler) which can capture information about PyTorch operations but does not capture detailed GPU hardware-level information and cannot provide support for visualization.

    + +

The new PyTorch Profiler (torch.profiler) is a tool that brings both types of information together and then builds an experience that realizes the full potential of that information. This new profiler collects both GPU hardware and PyTorch-related information, correlates them, performs automatic detection of bottlenecks in the model, and generates recommendations on how to resolve these bottlenecks. All of this information from the profiler is visualized for the user in TensorBoard. The new Profiler API is natively supported in PyTorch and delivers the simplest experience available to date, where users can profile their models without installing any additional packages and see results immediately in TensorBoard with the new PyTorch Profiler plugin. Below is a screenshot of PyTorch Profiler’s automatic bottleneck detection.

    + +
    + +
    + +

    Getting started

    + +

PyTorch Profiler is the next version of the PyTorch autograd profiler. It has a new module namespace torch.profiler but maintains compatibility with autograd profiler APIs. The Profiler uses a new GPU profiling engine, built using NVIDIA CUPTI APIs, and is able to capture GPU kernel events with high fidelity. To profile your model training loop, wrap the code in the profiler context manager as shown below.

    + +
     with torch.profiler.profile(
    +    schedule=torch.profiler.schedule(
    +        wait=2,
    +        warmup=2,
    +        active=6,
    +        repeat=1),
+    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),  # './log' is an example output directory
    +    with_stack=True
    +) as profiler:
    +    for step, data in enumerate(trainloader, 0):
    +        print("step:{}".format(step))
    +        inputs, labels = data[0].to(device=device), data[1].to(device=device)
    +
    +        outputs = model(inputs)
    +        loss = criterion(outputs, labels)
    +
    +        optimizer.zero_grad()
    +        loss.backward()
    +        optimizer.step()
    +        profiler.step()
    +
    +

    The schedule parameter allows you to limit the number of training steps included in the profile to reduce the amount of data collected and simplify visual analysis by focusing on what’s important. The tensorboard_trace_handler automatically saves profiling results to disk for analysis in TensorBoard.
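If you just want a quick text summary without opening TensorBoard, the same profiler object can also print an aggregated table once the with block above has exited (a minimal sketch; profiler is the object created by the context manager, and the sort key is one of the profiler’s built-in column names):

# Print the top 10 ops aggregated over the profiled steps, sorted by CUDA time.
print(profiler.key_averages().table(
    sort_by="self_cuda_time_total", row_limit=10))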

    + +

    To view results of the profiling session in TensorBoard, install PyTorch Profiler TensorBoard Plugin package.

    + +
    pip install torch_tb_profiler
    +
    +

    Visual Studio Code Integration

    +

    Microsoft Visual Studio Code is one of the most popular code editors for Python developers and data scientists. The Python extension for VS Code recently added the integration of TensorBoard into the code editor, including support for the PyTorch Profiler. Once you have VS Code and the Python extension installed, you can quickly open the TensorBoard Profiler plugin by launching the Command Palette using the keyboard shortcut CTRL + SHIFT + P (CMD + SHIFT + P on a Mac) and typing the “Launch TensorBoard” command.

    + +
    + +
    + +

This integration comes with a built-in lifecycle management feature. VS Code will install the TensorBoard package and the PyTorch Profiler plugin package (coming in mid-April) automatically if you don’t have them on your system. VS Code will also launch the TensorBoard process for you and automatically look for any TensorBoard log files within your current directory. When you’re done, just close the tab and VS Code will automatically close the process. No more Terminal windows running on your system to provide a backend for the TensorBoard UI! Below is the PyTorch Profiler Trace View running in TensorBoard.

    + +
    + +
    + +

    Learn more about TensorBoard support in VS Code in this blog.

    + +

    Feedback

    + +

    Review PyTorch Profiler documentation, give Profiler a try and let us know about your experience. Provide your feedback on PyTorch Discussion Forum or file issues on PyTorch GitHub.

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-the-playtorch-app/index.html b/blog/introducing-the-playtorch-app/index.html new file mode 100644 index 000000000000..6caf68b3e8d3 --- /dev/null +++ b/blog/introducing-the-playtorch-app/index.html @@ -0,0 +1,745 @@ + + + + + + + + + + + + + Introducing the PlayTorch app: Rapidly Create Mobile AI Experiences | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + PlayTorch Team + +

    +

    + +

    + +

    In December, we announced PyTorch Live, a toolkit for building AI-powered mobile prototypes in minutes. The initial release included a command-line interface to set up a development environment and an SDK for building AI-powered experiences in React Native. Today, we’re excited to share that PyTorch Live will now be known as PlayTorch. This new release provides an improved and simplified developer experience. PlayTorch development is independent from the PyTorch project and the PlayTorch code repository is moving into the Meta Research GitHub organization.

    + +

    A New Workflow: The PlayTorch App

    + +

    The PlayTorch team is excited to announce that we have partnered with Expo to change the way AI powered mobile experiences are built. Our new release simplifies the process of building mobile AI experiences by eliminating the need for a complicated development environment. You will now be able to build cross platform AI powered prototypes from the very browser you are using to read this blog.

    + +

    In order to make this happen, we are releasing the PlayTorch app which is able to run AI-powered experiences built in the Expo Snack web based code editor.

    + +

    + +

    + +

    The PlayTorch app can be downloaded from the Apple App Store and Google Play Store. With the app installed, you can head over to playtorch.dev/snack and write the code for your AI-powered PlayTorch Snack. When you want to try what you’ve built, you can use the PlayTorch app’s QR code scanner to scan the QR code on the Snack page and load the code to your device.

    + +

    NOTE: PlayTorch Snacks will not work in the Expo Go app.

    + +

    More to Explore in the PlayTorch App

    + +

    AI Demos

    + +

    The PlayTorch app comes with several examples of how you can build AI powered experiences with a variety of different machine learning models from object detection to natural language processing. See what can be built with the PlayTorch SDK and be inspired to make something of your own as you play with the examples.

    + +

    + +

    + +

    Sharing Your Creations

    + +

    Any PlayTorch Snack that you run in the PlayTorch app can be shared with others in an instant. When they open the link on their device, the PlayTorch app will instantly load what you’ve built from the cloud so they can experience it first hand.

    + +

    + +

    + +

    When you have something you want to share, let us know on Discord or Twitter or embed the PlayTorch Snack on your own webpage.

    + +

    SDK Overhaul

    + +

    We learned a lot from the community after our initial launch in December and have been hard at work over the past several months to make the PlayTorch SDK (formerly known as PyTorch Live) simple, performant, and robust. In our initial version, the SDK relied on config files to define how a model ingested and output data.

    + +

    Today, we are happy to announce the next version of our SDK can handle data processing in JavaScript for your prototypes with the new PlayTorch API that leverages the JavaScript Interface (JSI) to directly call C++ code. Not only have we completely redone the way you can interact with models, but we have also greatly expanded the variety of supported model architectures.

    + +

    A New Data Processing API for Prototyping

    + +

    With this JSI API, we now allow users direct access to tensors (data format for machine learning). Instead of only having access to predefined transformations, you can now manipulate tensors however you would like for your prototypes.

    + +

    + +

    + +

    No more switching back and forth between code and config. You will now be able to write everything in JavaScript and have access to all of the type annotations and autocomplete features available to you in those languages.

    + +

    Check out our tutorials to see the new Data Processing API in action, take a deeper dive in the API docs, or inspect the code yourself on GitHub.

    + +

    Expanded Use Cases

    + +

    With the new version of the SDK, we have added support for several cutting edge models.

    + +

    + +

    + +

    Image-to-image transformations are now supported thanks to our robust JSI API, so you can see what your world would look like if it were an anime.

    + +

    + +

    + +

    Translate French to English with an AI powered translator using the Seq2Seq model.

    + +

    + +

    + +

    Use DeepLab V3 to segment images!

    + +

    Start Playing

    + +

    If you want to start creating AI experiences yourself, head over to playtorch.dev and try out our tutorials. Each tutorial will guide you through building a simple AI powered experience that you can instantly run on your phone and share with others.

    + +

    How to Get Support

    + +

    Join us on Discord, collaborate with us on GitHub, or follow us on Twitter. Got questions or feedback? We’d love to hear from you!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-torchmultimodal/index.html b/blog/introducing-torchmultimodal/index.html new file mode 100644 index 000000000000..fa7aed65f1fe --- /dev/null +++ b/blog/introducing-torchmultimodal/index.html @@ -0,0 +1,752 @@ + + + + + + + + + + + + + Introducing TorchMultimodal - a library for accelerating exploration in Multimodal AI | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Kartikay Khandelwal, Ankita De + +

    +

    We are announcing TorchMultimodal Beta, a PyTorch domain library for training SoTA multi-task multimodal models at scale. The library provides composable building blocks (modules, transforms, loss functions) to accelerate model development, SoTA model architectures (FLAVA, MDETR, Omnivore) from published research, training and evaluation scripts, as well as notebooks for exploring these models. The library is under active development, and we’d love to hear your feedback! You can find more details on how to get started here.

    + +

    Why TorchMultimodal?

    + +

Interest is rising around AI models that understand multiple input types (text, images, videos and audio signals), and optionally use this understanding to generate different forms of outputs (sentences, pictures, videos). Recent work from FAIR such as FLAVA, Omnivore and data2vec has shown that multimodal models for understanding are competitive with unimodal counterparts, and in some cases are establishing the new state-of-the-art. Generative models such as Make-a-video and Make-a-scene are redefining what modern AI systems can do.

    + +

As interest in multimodal AI has grown, researchers are looking for tools and libraries to quickly experiment with ideas and build on top of the latest research in the field. While the PyTorch ecosystem has a rich repository of libraries and frameworks, it’s not always obvious how components from these libraries interoperate with each other, or how they can be stitched together to build SoTA multimodal models.

    + +

    TorchMultimodal solves this problem by providing:

    + +
      +
• Composable and easy-to-use building blocks which researchers can use to accelerate model development and experimentation in their own workflows. These are designed to be modular, and can be easily extended to handle new modalities.
• End-to-end examples for training and evaluating the latest models from research. These should serve as starting points for ongoing/future research, as well as examples for using advanced features such as integrating with FSDP and activation checkpointing for scaling up model and batch sizes.
    + +

    Introducing TorchMultimodal

    + +

    TorchMultimodal is a PyTorch domain library for training multi-task multimodal models at scale. In the repository, we provide:

    + +
      +
• Building Blocks. A collection of modular and composable building blocks like models, fusion layers, loss functions, datasets and utilities. Some examples include:
  • Contrastive Loss with Temperature. Commonly used function for training models like CLIP and FLAVA. We also include variants such as ImageTextContrastiveLoss used in models like ALBEF.
  • Codebook layers which compress high dimensional data by nearest neighbor lookup in an embedding space and are a vital component of VQVAEs (provided as a model in the repository).
  • Shifted-window Attention. Window-based multi-head self attention which is a vital component of encoders like Swin 3D Transformers.
  • Components for CLIP. A popular model published by OpenAI which has proven to be extremely effective at learning text and image representations.
  • Multimodal GPT. An abstraction that extends OpenAI’s GPT architecture for multimodal generation when combined with the generation utility.
  • MultiHeadAttention. A critical component for attention-based models with support for fast auto-regressive decoding.
• Examples. A collection of examples that show how to combine these building blocks with components and common infrastructure (Lightning, TorchMetrics) from across the PyTorch ecosystem to replicate state-of-the-art models published in the literature. We currently provide five examples, which include:
  • FLAVA [paper]. Official code for the paper accepted at CVPR, including a tutorial on finetuning FLAVA.
  • MDETR [paper]. Collaboration with authors from NYU to provide an example which alleviates interoperability pain points in the PyTorch ecosystem, including a notebook on using MDETR for phrase grounding and visual question answering.
  • Omnivore [paper]. First example in TorchMultimodal of a model which deals with Video and 3D data, including a notebook for exploring the model.
  • MUGEN [paper]. Foundational work for auto-regressive generation and retrieval, including demos for text-video generation and retrieval with a large-scale synthetic dataset enriched from OpenAI coinrun.
  • ALBEF [paper]. Code for the model, including a notebook for using this model for Visual Question Answering.
    + +

    The following code snippet showcases an example usage of several TorchMultimodal components related to CLIP:

    + +
    
    +# instantiate clip transform
    +clip_transform = CLIPTransform()
    +
    +# pass the transform to your dataset. Here we use coco captions
    +dataset = CocoCaptions(root= ..., annFile=..., transforms=clip_transform)
    +dataloader = DataLoader(dataset, batch_size=16)
    +
    +# instantiate model. Here we use clip with vit-L as the image encoder
    +model= clip_vit_l14()
    +
    +# define loss and other things needed for training
    +clip_loss = ContrastiveLossWithTemperature()
    +optim = torch.optim.AdamW(model.parameters(), lr = 1e-5)
    +epochs = 1
    +
    +# write your train loop
+for _ in range(epochs):
+	for batch_idx, batch in enumerate(dataloader):
+		image, text = batch
+		image_embeddings, text_embeddings = model(image, text)
+		loss = clip_loss(image_embeddings, text_embeddings)
+		optim.zero_grad()
+		loss.backward()
+		optim.step()
    +
    + +

    Apart from the code, we are also releasing a tutorial for fine-tuning multimodal foundation models, and a blog post (with code pointers) on how to scale up such models using techniques from PyTorch Distributed (FSDP and activation checkpointing). We hope such examples and tutorials will serve to demystify a number of advanced features available in the PyTorch ecosystem.

    + +

    What’s Next?

    + +

    While this is an exciting launch, there’s a lot more to come. The library is under development and we are working on adding some of the exciting developments in the space of diffusion models, and examples to showcase common trends from research. As you explore and use the library, we’d love to hear any feedback you might have! You can find more details on how to get started here.

    + +

    Team

    + +

    The primary contributors and developers of TorchMultimodal include Ankita De, Evan Smothers, Kartikay Khandelwal, Lan Gong, Laurence Rouesnel, Nahiyan Malik, Rafi Ayub and Yosua Michael Maranatha.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/introducing-torchrec/index.html b/blog/introducing-torchrec/index.html new file mode 100644 index 000000000000..dfe82d13eafb --- /dev/null +++ b/blog/introducing-torchrec/index.html @@ -0,0 +1,717 @@ + + + + + + + + + + + + + Introducing TorchRec, a library for modern production recommendation systems | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Meta AI - Donny Greenberg, Colin Taylor, Dmytro Ivchenko, Xing Liu, Anirudh Sudarshan + +

    +

    We are excited to announce TorchRec, a PyTorch domain library for Recommendation Systems. This new library provides common sparsity and parallelism primitives, enabling researchers to build state-of-the-art personalization models and deploy them in production.

    + +

    + +

    + +

    How did we get here?

    +

Recommendation Systems (RecSys) comprise a large footprint of production-deployed AI today, but you might not know it from looking at GitHub. Unlike areas like Vision and NLP, much of the ongoing innovation and development in RecSys happens behind closed company doors. For academic researchers studying these techniques or companies building personalized user experiences, the field is far from democratized. Further, RecSys as an area is largely defined by learning models over sparse and/or sequential events, which has large overlaps with other areas of AI. Many of the techniques are transferable, particularly for scaling and distributed execution. A large portion of the global investment in AI is in developing these RecSys techniques, so cordoning them off blocks this investment from flowing into the broader AI field.

    + +

    By mid-2020, the PyTorch team received a lot of feedback that there hasn’t been a large-scale production-quality recommender systems package in the open-source PyTorch ecosystem. While we were trying to find a good answer, a group of engineers at Meta wanted to contribute Meta’s production RecSys stack as a PyTorch domain library, with a strong commitment to growing an ecosystem around it. This seemed like a good idea that benefits researchers and companies across the RecSys domain. So, starting from Meta’s stack, we began modularizing and designing a fully-scalable codebase that is adaptable for diverse recommendation use-cases. Our goal was to extract the key building blocks from across Meta’s software stack to simultaneously enable creative exploration and scale. After nearly two years, a battery of benchmarks, migrations, and testing across Meta, we’re excited to finally embark on this journey together with the RecSys community. We want this package to open a dialogue and collaboration across the RecSys industry, starting with Meta as the first sizable contributor.

    + +

    Introducing TorchRec

    +

    TorchRec includes a scalable low-level modeling foundation alongside rich batteries-included modules. We initially target “two-tower” ([1], [2]) architectures that have separate submodules to learn representations of candidate items and the query or context. Input signals can be a mix of floating point “dense” features or high-cardinality categorical “sparse” features that require large embedding tables to be trained. Efficient training of such architectures involves combining data parallelism that replicates the “dense” part of computation and model parallelism that partitions large embedding tables across many nodes.

    + +

    In particular, the library includes:

    +
      +
• Modeling primitives, such as embedding bags and jagged tensors, that enable easy authoring of large, performant multi-device/multi-node models using hybrid data-parallelism and model-parallelism (a brief sketch of these primitives appears just after this list).
• Optimized RecSys kernels powered by FBGEMM, including support for sparse and quantized operations.
• A sharder which can partition embedding tables with a variety of different strategies including data-parallel, table-wise, row-wise, table-wise-row-wise, and column-wise sharding.
• A planner which can automatically generate optimized sharding plans for models.
• Pipelining to overlap dataloading device transfer (copy to GPU), inter-device communications (input_dist), and computation (forward, backward) for increased performance.
• GPU inference support.
• Common modules for RecSys, such as models and public datasets (Criteo & Movielens).
    + +
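As a brief, illustrative sketch of those sparse-input primitives (this is not from the original post; exact import paths may differ slightly across torchrec versions, and the table/feature names and ids below are made up):

import torch
import torchrec
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

# One embedding table for a categorical "movie" feature.
ebc = torchrec.EmbeddingBagCollection(
    tables=[
        torchrec.EmbeddingBagConfig(
            name="t_movie",
            embedding_dim=16,
            num_embeddings=1000,
            feature_names=["movie"],
        ),
    ],
    device=torch.device("cpu"),
)

# A jagged batch of two samples: the first has 2 movie ids, the second has 1.
features = KeyedJaggedTensor.from_lengths_sync(
    keys=["movie"],
    values=torch.tensor([42, 7, 512]),
    lengths=torch.tensor([2, 1]),
)

pooled = ebc(features)                    # pooled embeddings per feature
print(pooled.to_dict()["movie"].shape)    # expected: torch.Size([2, 16])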

    To showcase the flexibility of this tooling, let’s look at the following code snippet, pulled from our DLRM Event Prediction example:

    +
    # Specify the sparse embedding layers
    +eb_configs = [
    +   EmbeddingBagConfig(
    +       name=f"t_{feature_name}",
    +       embedding_dim=64,
    +       num_embeddings=100_000,
    +       feature_names=[feature_name],
    +   )
    +   for feature_idx, feature_name in enumerate(DEFAULT_CAT_NAMES)
    +]
    +
    +# Import and instantiate the model with the embedding configuration
    +# The "meta" device indicates lazy instantiation, with no memory allocated
    +train_model = DLRM(
    +   embedding_bag_collection=EmbeddingBagCollection(
    +       tables=eb_configs, device=torch.device("meta")
    +   ),
    +   dense_in_features=len(DEFAULT_INT_NAMES),
    +   dense_arch_layer_sizes=[512, 256, 64],
    +   over_arch_layer_sizes=[512, 512, 256, 1],
    +   dense_device=device,
    +)
    +
    +# Distribute the model over many devices, just as one would with DDP.
    +model = DistributedModelParallel(
    +   module=train_model,
    +   device=device,
    +)
    +
+optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
    +# Optimize the model in a standard loop just as you would any other model!
    +# Or, you can use the pipeliner to synchronize communication and compute
    +for epoch in range(epochs):
    +   # Train
    +
    + +

    Scaling Performance

    +

    TorchRec has state-of-the-art infrastructure for scaled Recommendations AI, powering some of the largest models at Meta. It was used to train a 1.25 trillion parameter model, pushed to production in January, and a 3 trillion parameter model which will be in production soon. This should be a good indication that PyTorch is fully capable of the largest scale RecSys problems in industry. We’ve heard from many in the community that sharded embeddings are a pain point. TorchRec cleanly addresses that. Unfortunately it is challenging to provide large-scale benchmarks with public datasets, as most open-source benchmarks are too small to show performance at scale.

    + +

    Looking ahead

    +

Open source and open technology have universal benefits. Meta is seeding the PyTorch community with a state-of-the-art RecSys package, with the hope that many will join in on building it forward, enabling new research and helping many companies. The team behind TorchRec plans to continue this program indefinitely, building up TorchRec to meet the needs of the RecSys community, to welcome new contributors, and to continue to power personalization at Meta. We’re excited to begin this journey and look forward to contributions, ideas, and feedback!

    + +

    References

    +

    [1] Sampling-Bias-Corrected Neural Modeling for Large Corpus Item Recommendations

    + +

    [2] DLRM: An advanced, open source deep learning recommendation model

Introducing TorchVision’s New Multi-Weight Support API
by Vasilis Vryniotis

    +

TorchVision has a new backwards compatible API for building models with multi-weight support. The new API allows loading different pre-trained weights on the same model variant, keeps track of vital meta-data such as the classification labels, and includes the preprocessing transforms necessary for using the models. In this blog post, we plan to review the prototype API, showcase its features, and highlight key differences from the existing one.

    + +
    + +
    + +

We are hoping to get your thoughts about the API prior to finalizing it. To collect your feedback, we have created a GitHub issue where you can post your thoughts, questions, and comments.

    + +

    Limitations of the current API

    + +

TorchVision currently provides pre-trained models that can serve as a starting point for transfer learning or be used as-is in computer vision applications. The typical way to instantiate a pre-trained model and make a prediction is:

    + +
    import torch
    +
    +from PIL import Image
    +from torchvision import models as M
    +from torchvision.transforms import transforms as T
    +
    +
    +img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
    +
    +# Step 1: Initialize model
    +model = M.resnet50(pretrained=True)
    +model.eval()
    +
    +# Step 2: Define and initialize the inference transforms
    +preprocess = T.Compose([
    +    T.Resize([256, ]),
    +    T.CenterCrop(224),
    +    T.PILToTensor(),
    +    T.ConvertImageDtype(torch.float),
    +    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    +])
    +
    +# Step 3: Apply inference preprocessing transforms
    +batch = preprocess(img).unsqueeze(0)
    +prediction = model(batch).squeeze(0).softmax(0)
    +
    +# Step 4: Use the model and print the predicted category
    +class_id = prediction.argmax().item()
    +score = prediction[class_id].item()
    +with open("imagenet_classes.txt", "r") as f:
    +    categories = [s.strip() for s in f.readlines()]
    +    category_name = categories[class_id]
    +print(f"{category_name}: {100 * score}%")
    +
    +
    + +

    There are a few limitations with the above approach:

    + +
1. Inability to support multiple pre-trained weights: Since the pretrained variable is boolean, we can only offer one set of weights. This poses a severe limitation when we significantly improve the accuracy of existing models and we want to make those improvements available to the community. It also stops us from offering pre-trained weights of the same model variant on different datasets.
2. Missing inference/preprocessing transforms: The user is forced to define the necessary transforms prior to using the model. The inference transforms are usually linked to the training process and dataset used to estimate the weights. Any minor discrepancies in these transforms (such as the interpolation mode, resize/crop sizes, etc.) can lead to major reductions in accuracy or unusable models.
3. Lack of meta-data: Critical pieces of information in relation to the weights are unavailable to the users. For example, one needs to look into external sources and the documentation to find things like the category labels, the training recipe, the accuracy metrics, etc.
    + +

    The new API addresses the above limitations and reduces the amount of boilerplate code needed for standard tasks.

    + +

    Overview of the prototype API

    + +

    Let’s see how we can achieve exactly the same results as above using the new API:

    + +
from PIL import Image
from torchvision.prototype import models as PM


img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg")

# Step 1: Initialize model
weights = PM.ResNet50_Weights.IMAGENET1K_V1
model = PM.resnet50(weights=weights)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
batch = preprocess(img).unsqueeze(0)
prediction = model(batch).squeeze(0).softmax(0)

# Step 4: Use the model and print the predicted category
class_id = prediction.argmax().item()
score = prediction[class_id].item()
category_name = weights.meta["categories"][class_id]
print(f"{category_name}: {100 * score}%")
    +
    + +

    As we can see the new API eliminates the aforementioned limitations. Let’s explore the new features in detail.

    + +

    Multi-weight support

    + +

At the heart of the new API, we have the ability to define multiple different weights for the same model variant. Each model building method (e.g. resnet50) has an associated Enum class (e.g. ResNet50_Weights) which has as many entries as the number of pre-trained weights available. Additionally, each Enum class has a DEFAULT alias which points to the best available weights for the specific model. This allows users who want to always use the best available weights to do so without modifying their code.

    + +

    Here is an example of initializing models with different weights:

    + +
    from torchvision.prototype.models import resnet50, ResNet50_Weights
    +
    +# Legacy weights with accuracy 76.130%
    +model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
    +
    +# New weights with accuracy 80.858%
    +model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
    +
    +# Best available weights (currently alias for IMAGENET1K_V2)
    +model = resnet50(weights=ResNet50_Weights.DEFAULT)
    +
    +# No weights - random initialization
    +model = resnet50(weights=None)
    +
    + +

    Associated meta-data & preprocessing transforms

    + +

The weights of each model are associated with meta-data. The type of information we store depends on the task of the model (classification, detection, segmentation, etc.). Typical information includes a link to the training recipe, the interpolation mode, the categories, and validation metrics. These values are programmatically accessible via the meta attribute:

    + +
    from torchvision.prototype.models import ResNet50_Weights
    +
    +# Accessing a single record
    +size = ResNet50_Weights.IMAGENET1K_V2.meta["size"]
    +
    +# Iterating the items of the meta-data dictionary
    +for k, v in ResNet50_Weights.IMAGENET1K_V2.meta.items():
    +    print(k, v)
    +
    + +

Additionally, each weights entry is associated with the necessary preprocessing transforms. All current preprocessing transforms are JIT-scriptable and can be accessed via the transforms attribute. Prior to using them with the data, the transforms need to be initialized/constructed. This lazy initialization scheme is done to ensure the solution is memory efficient. The input of the transforms can be either a PIL.Image or a Tensor read using torchvision.io.

    + +
    from torchvision.prototype.models import ResNet50_Weights
    +
    +# Initializing preprocessing at standard 224x224 resolution
    +preprocess = ResNet50_Weights.IMAGENET1K_V2.transforms()
    +
    +# Initializing preprocessing at 400x400 resolution
    +preprocess = ResNet50_Weights.IMAGENET1K_V2.transforms(crop_size=400, resize_size=400)
    +
    +# Once initialized the callable can accept the image data:
    +# img_preprocessed = preprocess(img)
    +
    + +

    Associating the weights with their meta-data and preprocessing will boost transparency, improve reproducibility and make it easier to document how a set of weights was produced.

    + +

    Get weights by name

    + +

The ability to link the weights directly with their properties (meta-data, preprocessing callables, etc.) is the reason why our implementation uses Enums instead of strings. Nevertheless, for cases where only the name of the weights is available, we offer a method capable of linking weight names to their Enums:

    + +
from torchvision.prototype.models import get_weight, ResNet50_Weights

# Weights can be retrieved by name:
assert get_weight("ResNet50_Weights.IMAGENET1K_V1") == ResNet50_Weights.IMAGENET1K_V1
assert get_weight("ResNet50_Weights.IMAGENET1K_V2") == ResNet50_Weights.IMAGENET1K_V2

# Including using the DEFAULT alias:
assert get_weight("ResNet50_Weights.DEFAULT") == ResNet50_Weights.IMAGENET1K_V2
    +
    + +

    Deprecations

    + +

In the new API, the boolean pretrained and pretrained_backbone parameters, which were previously used to load weights to the full model or to its backbone, are deprecated. The current implementation is fully backwards compatible as it seamlessly maps the old parameters to the new ones. Passing the old parameters to the new builders emits the following deprecation warnings:

    + +
    >>> model = torchvision.prototype.models.resnet50(pretrained=True)
    + UserWarning: The parameter 'pretrained' is deprecated, please use 'weights' instead.
    +UserWarning:
    +Arguments other than a weight enum or `None` for 'weights' are deprecated.
    +The current behavior is equivalent to passing `weights=ResNet50_Weights.IMAGENET1K_V1`.
    +You can also use `weights=ResNet50_Weights.DEFAULT` to get the most up-to-date weights.
    +
    + +

Additionally, the builder methods require keyword parameters. The use of positional parameters is deprecated, and using them emits the following warning:

    + +
    >>> model = torchvision.prototype.models.resnet50(None)
    +UserWarning:
    +Using 'weights' as positional parameter(s) is deprecated.
    +Please use keyword parameter(s) instead.
    +
    + +

    Testing the new API

    + +

    Migrating to the new API is very straightforward. The following method calls between the 2 APIs are all equivalent:

    + +
    # Using pretrained weights:
    +torchvision.prototype.models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
    +torchvision.models.resnet50(pretrained=True)
    +torchvision.models.resnet50(True)
    +
    +# Using no weights:
    +torchvision.prototype.models.resnet50(weights=None)
    +torchvision.models.resnet50(pretrained=False)
    +torchvision.models.resnet50(False)
    +
    + +

Note that the prototype features are available only in the nightly versions of TorchVision, so to use them you need to install the nightly build as follows:

    + +
    conda install torchvision -c pytorch-nightly
    +
    + +

For alternative ways to install the nightly, have a look at the PyTorch download page. You can also install TorchVision from source from the latest main; for more information have a look at our repo.

    + +

    Accessing state-of-the-art model weights with the new API

    + +

If you are still unconvinced about giving the new API a try, here is one more reason to do so. We’ve recently refreshed our training recipe and achieved SOTA accuracy for many of our models. The improved weights can easily be accessed via the new API. Here is a quick overview of the model improvements:

    + +
    + +
| Model | Old Acc@1 | New Acc@1 |
| --- | --- | --- |
| EfficientNet B1 | 78.642 | 79.838 |
| MobileNetV3 Large | 74.042 | 75.274 |
| Quantized ResNet50 | 75.92 | 80.282 |
| Quantized ResNeXt101 32x8d | 78.986 | 82.574 |
| RegNet X 400mf | 72.834 | 74.864 |
| RegNet X 800mf | 75.212 | 77.522 |
| RegNet X 1 6gf | 77.04 | 79.668 |
| RegNet X 3 2gf | 78.364 | 81.198 |
| RegNet X 8gf | 79.344 | 81.682 |
| RegNet X 16gf | 80.058 | 82.72 |
| RegNet X 32gf | 80.622 | 83.018 |
| RegNet Y 400mf | 74.046 | 75.806 |
| RegNet Y 800mf | 76.42 | 78.838 |
| RegNet Y 1 6gf | 77.95 | 80.882 |
| RegNet Y 3 2gf | 78.948 | 81.984 |
| RegNet Y 8gf | 80.032 | 82.828 |
| RegNet Y 16gf | 80.424 | 82.89 |
| RegNet Y 32gf | 80.878 | 83.366 |
| ResNet50 | 76.13 | 80.858 |
| ResNet101 | 77.374 | 81.886 |
| ResNet152 | 78.312 | 82.284 |
| ResNeXt50 32x4d | 77.618 | 81.198 |
| ResNeXt101 32x8d | 79.312 | 82.834 |
| Wide ResNet50 2 | 78.468 | 81.602 |
| Wide ResNet101 2 | 78.848 | 82.51 |
    + +

    Please spare a few minutes to provide your feedback on the new API, as this is crucial for graduating it from prototype and including it in the next release. You can do this on the dedicated Github Issue. We are looking forward to reading your comments!

March 26, 2020

Introduction to Quantization on PyTorch

by Raghuraman Krishnamoorthi, James Reed, Min Ni, Chris Gottbrath, and Seth Weidman

It’s important to make efficient use of both server-side and on-device compute resources when developing machine learning applications. To support more efficient deployment on servers and edge devices, PyTorch added support for model quantization using the familiar eager mode Python API.

    + +

Quantization leverages 8-bit integer (int8) instructions to reduce the model size and run inference faster (reduced latency), and can make the difference between a model meeting its quality-of-service goals, or even fitting into the resources available on a mobile device, and one that cannot. Even when resources aren’t quite so constrained, it may enable you to deploy a larger and more accurate model. Quantization is available in PyTorch starting in version 1.3, and with the release of PyTorch 1.4 we published quantized models for ResNet, ResNext, MobileNetV2, GoogleNet, InceptionV3, and ShuffleNetV2 in the PyTorch torchvision 0.5 library.

    + +

    This blog post provides an overview of the quantization support on PyTorch and its incorporation with the TorchVision domain library.

    + +

    What is Quantization?

    + +

    Quantization refers to techniques for doing both computations and memory accesses with lower precision data, usually int8 compared to floating point implementations. This enables performance gains in several important areas:

    +
      +
    • 4x reduction in model size;
    • +
    • 2-4x reduction in memory bandwidth;
    • +
    • 2-4x faster inference due to savings in memory bandwidth and faster compute with int8 arithmetic (the exact speed up varies depending on the hardware, the runtime, and the model).
    • +
    + +

    Quantization does not however come without additional cost. Fundamentally quantization means introducing approximations and the resulting networks have slightly less accuracy. These techniques attempt to minimize the gap between the full floating point accuracy and the quantized accuracy.

    + +

We designed quantization to fit into the PyTorch framework. This means that:

    +
      +
1. PyTorch has data types corresponding to quantized tensors, which share many of the features of tensors (see the sketch after this list).
    2. +
    3. One can write kernels with quantized tensors, much like kernels for floating point tensors to customize their implementation. PyTorch supports quantized modules for common operations as part of the torch.nn.quantized and torch.nn.quantized.dynamic name-space.
    4. +
    5. Quantization is compatible with the rest of PyTorch: quantized models are traceable and scriptable. The quantization method is virtually identical for both server and mobile backends. One can easily mix quantized and floating point operations in a model.
    6. +
    7. Mapping of floating point tensors to quantized tensors is customizable with user defined observer/fake-quantization blocks. PyTorch provides default implementations that should work for most use cases.
    8. +
    + +
    + +
    + +
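For instance, the quantized tensor type mentioned in item 1 can be created directly from Python. The following is a minimal sketch; the scale and zero point are arbitrary values chosen for illustration rather than calibrated statistics.

# A minimal sketch: quantize a float tensor to int8 and back, with an
# arbitrary, hand-picked scale and zero point (real workflows calibrate these).
import torch

x = torch.randn(4, 4)  # regular float32 tensor

xq = torch.quantize_per_tensor(x, scale=0.05, zero_point=0, dtype=torch.qint8)
print(xq.dtype)          # torch.qint8
print(xq.int_repr())     # the underlying int8 values

x_hat = xq.dequantize()                # back to float32
print((x - x_hat).abs().max())         # quantization error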

    We developed three techniques for quantizing neural networks in PyTorch as part of quantization tooling in the torch.quantization name-space.

    + +

    The Three Modes of Quantization Supported in PyTorch starting version 1.3

    + +
      +
    1. +

      Dynamic Quantization

      +

      The easiest method of quantization PyTorch supports is called dynamic quantization. This involves not just converting the weights to int8 - as happens in all quantization variants - but also converting the activations to int8 on the fly, just before doing the computation (hence “dynamic”). The computations will thus be performed using efficient int8 matrix multiplication and convolution implementations, resulting in faster compute. However, the activations are read and written to memory in floating point format.

      +
        +
• PyTorch API: we have a simple API for dynamic quantization in PyTorch. torch.quantization.quantize_dynamic takes in a model, as well as a couple of other arguments, and produces a quantized model! Our end-to-end tutorial illustrates this for a BERT model; while the tutorial is long and contains sections on loading pre-trained models and other concepts unrelated to quantization, the part that quantizes the BERT model is simply:
      • +
      + +
      import torch.quantization
      +quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
      +
      +
        +
• See the documentation for the function here and an end-to-end example in our tutorials here and here.
      • +
      +
    2. +
    3. +

      Post-Training Static Quantization

      + +

      One can further improve the performance (latency) by converting networks to use both integer arithmetic and int8 memory accesses. Static quantization performs the additional step of first feeding batches of data through the network and computing the resulting distributions of the different activations (specifically, this is done by inserting “observer” modules at different points that record these distributions). This information is used to determine how specifically the different activations should be quantized at inference time (a simple technique would be to simply divide the entire range of activations into 256 levels, but we support more sophisticated methods as well). Importantly, this additional step allows us to pass quantized values between operations instead of converting these values to floats - and then back to ints - between every operation, resulting in a significant speed-up.

      + +

      With this release, we’re supporting several features that allow users to optimize their static quantization:

      +
        +
      1. Observers: you can customize observer modules which specify how statistics are collected prior to quantization to try out more advanced methods to quantize your data.
      2. +
      3. Operator fusion: you can fuse multiple operations into a single operation, saving on memory access while also improving the operation’s numerical accuracy.
      4. +
      5. Per-channel quantization: we can independently quantize weights for each output channel in a convolution/linear layer, which can lead to higher accuracy with almost the same speed.
      6. +
      + +
        +
      • +

        PyTorch API:

        +
          +
        • To fuse modules, we have torch.quantization.fuse_modules
        • +
        • Observers are inserted using torch.quantization.prepare
        • +
        • Finally, quantization itself is done using torch.quantization.convert
        • +
        +
      • +
      + +

      We have a tutorial with an end-to-end example of quantization (this same tutorial also covers our third quantization method, quantization-aware training), but because of our simple API, the three lines that perform post-training static quantization on the pre-trained model myModel are:

      +
# set quantization config for server (x86) deployment
myModel.qconfig = torch.quantization.get_default_qconfig('fbgemm')

# insert observers
torch.quantization.prepare(myModel, inplace=True)
# Calibrate the model and collect statistics

# convert to quantized version
torch.quantization.convert(myModel, inplace=True)
      +
      +
    4. +
    5. +

      Quantization Aware Training

      +

Quantization-aware training (QAT) is the third method, and the one that typically results in the highest accuracy of the three. With QAT, all weights and activations are “fake quantized” during both the forward and backward passes of training: that is, float values are rounded to mimic int8 values, but all computations are still done with floating point numbers. Thus, all the weight adjustments during training are made while “aware” of the fact that the model will ultimately be quantized; after quantizing, therefore, this method usually yields higher accuracy than the other two methods.

      +
        +
      • +

        PyTorch API:

        +
          +
• torch.quantization.prepare_qat inserts fake quantization modules to model the effects of quantization during training.
        • +
        • Mimicking the static quantization API, torch.quantization.convert actually quantizes the model once training is complete.
        • +
        +
      • +
      + +

      For example, in the end-to-end example, we load in a pre-trained model as qat_model, then we simply perform quantization-aware training using:

      + +
# specify quantization config for QAT
qat_model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')

# prepare QAT
torch.quantization.prepare_qat(qat_model, inplace=True)

# convert to quantized version, removing dropout, to check for accuracy on each epoch
quantized_model = torch.quantization.convert(qat_model.eval(), inplace=False)
      +
      +
    6. +
    + +

    Device and Operator Support

    +

Quantization support is restricted to a subset of available operators, depending on the method being used. For a list of supported operators, please see the documentation at https://pytorch.org/docs/stable/quantization.html.

    + +

The set of available operators and the quantization numerics also depend on the backend being used to run quantized models. Currently quantized operators are supported only for CPU inference in the following backends: x86 and ARM. Both the quantization configuration (how tensors should be quantized) and the quantized kernels (arithmetic with quantized tensors) are backend dependent. One can specify the backend by doing:

    + +
import torch

# 'fbgemm' for server, 'qnnpack' for mobile
backend = 'fbgemm'
my_model.qconfig = torch.quantization.get_default_qconfig(backend)
# prepare and convert model
# Set the backend on which the quantized kernels need to be run
torch.backends.quantized.engine = backend
    +
    + +

    However, quantization aware training occurs in full floating point and can run on either GPU or CPU. Quantization aware training is typically only used in CNN models when post training static or dynamic quantization doesn’t yield sufficient accuracy. This can occur with models that are highly optimized to achieve small size (such as Mobilenet).

    + +

    Integration in torchvision

    +

    We’ve also enabled quantization for some of the most popular models in torchvision: Googlenet, Inception, Resnet, ResNeXt, Mobilenet and Shufflenet. We have upstreamed these changes to torchvision in three forms:

    +
      +
1. Pre-trained quantized weights so that you can use them right away (see the sketch after this list).
2. Quantization-ready model definitions so that you can do post-training quantization or quantization aware training.
3. A script for doing quantization aware training, which is available for any of these models; though, as you will learn below, we only found it necessary for achieving accuracy with Mobilenet.
4. We also have a tutorial showing how you can do transfer learning with quantization using one of the torchvision models.
    + +
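As a quick illustration of item 1, the sketch below loads one of the pre-trained quantized classification models for CPU inference. It assumes the torchvision.models.quantization namespace and uses resnet18 purely as an example.

# A minimal sketch: load a pre-trained, already-quantized torchvision model
# (assuming the torchvision.models.quantization namespace) and run it on CPU.
import torch
from torchvision.models import quantization as qmodels

model = qmodels.resnet18(pretrained=True, quantize=True)
model.eval()

x = torch.rand(1, 3, 224, 224)          # dummy image batch
with torch.no_grad():
    out = model(x)
print(out.shape)                         # torch.Size([1, 1000])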

    Choosing an approach

    +

    The choice of which scheme to use depends on multiple factors:

    +
      +
    1. Model/Target requirements: Some models might be sensitive to quantization, requiring quantization aware training.
    2. +
    3. Operator/Backend support: Some backends require fully quantized operators.
    4. +
    + +

Currently, operator coverage is limited and may restrict the choices listed in the table below. The table provides a guideline.

| Model Type | Preferred scheme | Why |
| --- | --- | --- |
| LSTM/RNN | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights |
| BERT/Transformer | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights |
| CNN | Static Quantization | Throughput limited by memory bandwidth for activations |
| CNN | Quantization Aware Training | In the case where accuracy can't be achieved with static quantization |
    + +

    Performance Results

    +

    Quantization provides a 4x reduction in the model size and a speedup of 2x to 3x compared to floating point implementations depending on the hardware platform and the model being benchmarked. Some sample results are:

    + +
| Model | Float Latency (ms) | Quantized Latency (ms) | Inference Performance Gain | Device | Notes |
| --- | --- | --- | --- | --- | --- |
| BERT | 581 | 313 | 1.8x | Xeon-D2191 (1.6GHz) | Batch size = 1, maximum sequence length = 128, single thread, x86-64, dynamic quantization |
| Resnet-50 | 214 | 103 | 2x | Xeon-D2191 (1.6GHz) | Single thread, x86-64, static quantization |
| Mobilenet-v2 | 97 | 17 | 5.7x | Samsung S9 | Static quantization; floating point numbers are based on Caffe2 run-time and are not optimized |
    + +

    Accuracy results

    +

    We also compared the accuracy of static quantized models with the floating point models on Imagenet. For dynamic quantization, we compared the F1 score of BERT on the GLUE benchmark for MRPC.

    + +

    Computer Vision Model accuracy

| Model | Top-1 Accuracy (Float) | Top-1 Accuracy (Quantized) | Quantization scheme |
| --- | --- | --- | --- |
| Googlenet | 69.8 | 69.7 | Static post training quantization |
| Inception-v3 | 77.5 | 77.1 | Static post training quantization |
| ResNet-18 | 69.8 | 69.4 | Static post training quantization |
| Resnet-50 | 76.1 | 75.9 | Static post training quantization |
| ResNext-101 32x8d | 79.3 | 79 | Static post training quantization |
| Mobilenet-v2 | 71.9 | 71.6 | Quantization Aware Training |
| Shufflenet-v2 | 69.4 | 68.4 | Static post training quantization |
    + +

    Speech and NLP Model accuracy

    + +
| Model | F1 (GLUE MRPC) Float | F1 (GLUE MRPC) Quantized | Quantization scheme |
| --- | --- | --- | --- |
| BERT | 0.902 | 0.895 | Dynamic quantization |
    + +

    Conclusion

    +

To get started on quantizing your models in PyTorch, start with the tutorials on the PyTorch website. If you are working with sequence data, start with dynamic quantization for LSTM or BERT. If you are working with image data, then we recommend starting with the transfer learning with quantization tutorial. Then you can explore static post-training quantization. If you find that the accuracy drop with post-training quantization is too high, then try quantization aware training.

    + +

If you run into issues you can get community help by posting at discuss.pytorch.org; use the quantization category for quantization-related issues.

    + +

    This post is authored by Raghuraman Krishnamoorthi, James Reed, Min Ni, Chris Gottbrath and Seth Weidman. Special thanks to Jianyu Huang, Lingyi Liu and Haixin Liu for producing quantization metrics included in this post.

    + +

    Further reading:

    +
1. PyTorch quantization presentation at NeurIPS: https://research.fb.com/wp-content/uploads/2019/12/2.-Quantization.pptx
2. Quantized Tensors: https://github.com/pytorch/pytorch/wiki/Introducing-Quantized-Tensor
3. Quantization RFC on GitHub: https://github.com/pytorch/pytorch/issues/18318
    + +
Join the PyTorch Foundation: Membership Now Open

by Team PyTorch

    In September 2022, we welcomed PyTorch to the Linux Foundation from Meta, which formed the PyTorch Foundation with founding members AMD, Amazon Web Services (AWS), Google, Meta, Microsoft, and NVIDIA.

    + +

Since then, we’ve seen significant growth, including a 39% increase in commits across all repositories, a 27% increase in unique contributors, and a 12% increase in community contributions – all in the last 90 days! We’re grateful to our founding members for their support to move the foundation forward.

    + +

Today, we’re announcing that membership in the PyTorch Foundation is now open.

    + +

    As a member of the PyTorch Foundation, you’ll have access to resources that allow you to be stewards of stable, secure, and long-lasting codebases. You can collaborate on training and certification programs, local and regional events, open source developer tooling, academic research, and guides to help new users and contributors have a productive experience.

    + +

The PyTorch Foundation’s goal is to help end users navigate the PyTorch ecosystem, recruit talent, and successfully adopt PyTorch and other open source AI technologies.

    + +

    Why join as a member

    + +

    Being a part of the PyTorch Foundation grants opportunities to help build the future of end-to-end machine learning frameworks alongside your industry peers.

    + +

    Membership benefits include:

    + +
      +
    • Gain technical traction and insight for your organization’s products by immersing your teams with other industry leaders.
    • +
    • Influence technical priorities, approaches, and code.
    • +
    • Support the PyTorch project community by helping fund programs and services that the project and its community rely on.
    • +
    • Engage with the PyTorch project ecosystem, network with fellow members, and contribute to building and maintaining an engaging and strong PyTorch ecosystem.
    • +
    • Provide thought leadership and participate in unique, wide-reaching networking and marketing programs expanding industry awareness as PyTorch amplifies member progress.
    • +
    • Retain, attract, and increase engineering skills and employees and build your innovation partner network, supply chain, and customer pipeline.
    • +
    • As an active member of the PyTorch community, you can deepen your engagement and leadership in local and industry developer networks and conferences.
    • +
    + +

    How to join

    + +

    Commercial organizations are invited to apply for General membership, while non-profits and academic institutions are encouraged to apply for Associate membership.

    + +

    Premier Members

    + +

    Organizations are welcome to submit an application to be considered as a Premier member. Premier members are the highest tier. They will appoint one voting representative in any subcommittees or activities of the PTF Governing Board, and receive prominent placement in displays of membership including website, landscape and marketing materials, exclusive live webinars with PyTorch online programs and everything included within a “general” membership. The annual fee is $150,000 + an LF Silver Membership.

    + +

    General Members

    + +

    General members will participate in all marketing, community and thought leadership opportunities, as well as discounts on event sponsorships and training courses. General members also have the opportunity to be considered for a PTF board position. The annual fee is dependent on the size of your organization. More details can be found here.

    + +

    Associate Members

    + +

    Associate members are free to join and will receive support and participation opportunities with the PyTorch Foundation team. More information can be found here.

    + +

    Hear from our founding members

    + +

    AMD

    + +

    “AMD strongly believes in and supports an open software ecosystem. We are very proud to be a founding member of the PyTorch Foundation, helping to develop an open and collaborative community for AI and ML. AI and ML have the opportunity to impact everything we do, and the work done through the PyTorch Foundation is critical in developing an open framework that is vendor neutral and helps democratize AI for all.”

    + +

    AWS

    + +

    “AWS is a firm believer in the PyTorch Foundation mission to develop AI and deep learning tools through open collaboration. Our customers use PyTorch every day to build, train, and deploy machine learning models on AWS. Through our involvement, AWS is supporting innovation and helping to make open source tooling more accessible to our customers and the broader community.”

    + +

    Google

    + +

    “The AI revolution is upon us and it’s being built on PyTorch. With new applications like ChatGPT and Stable Diffusion built on PyTorch, the wave of generative AI continues to be felt across every facet of society. We at Google are excited to be a founding member of the PyTorch Foundation and we’re excited for the opportunity to work closely with other leaders in AI to help grow this amazing and innovative community.”

    + +

    Meta

    + +

    “Meta has a long history of putting open science at the core of our work in AI and PyTorch is no exception. PyTorch was built from the ground up with an open source, community-first philosophy. We transitioned PyTorch to the PyTorch Foundation because we believe this approach enables the fastest progress in building and deploying new systems that will address real-world needs and answer fundamental questions about the nature of intelligence. With the PyTorch Foundation, the entire AI community is positioned to push the field forward in countless exciting new ways.”

    + +

    Microsoft

    + +

    “Microsoft believes strongly in PyTorch and it’s been an honor to be a founding member of the PyTorch Foundation. Internally, we use PyTorch extensively, and an outgrowth of that is the Azure Container for PyTorch, which provides deep optimization for PyTorch development, including ONNX Runtime, DeepSpeed, and Nebula to greatly reduce training cost and accelerate training times on Azure Machine Learning. As part of our ongoing commitment to open source machine learning platforms, we look forward to partnering with industry leaders to continue contributing to the advancement of PyTorch.”

    + +

    NVIDIA

    + +

    “As a leading Python-based AI framework, PyTorch has been fundamental to the development of LLMs and GenAI. NVIDIA’s goal is to deepen our collaboration with the open-source AI community as part of the PyTorch Foundation, and help build the next wave of advanced, energy efficient, and cost-effective applications with accelerated computing.”

    + +

    Join today

    + +

    We are excited to see the PyTorch Foundation continue to grow alongside the community through neutral governance and support. We hope you’ll join us as a member!

Language Identification: Building an End-to-End AI Solution using PyTorch

by Intel

    +

Language Identification is the process of identifying the primary language from multiple audio input samples. In natural language processing (NLP), language identification is an important and challenging problem. There are many language-related tasks, such as entering text on your phone, finding news articles you enjoy, or discovering answers to questions that you may have. All these tasks are powered by NLP models. To decide which model to invoke at a particular point in time, we must perform language identification.

    + +

    This article presents an in-depth solution and code sample for language identification using Intel® Extension for PyTorch, which is a version of the popular PyTorch AI framework optimized for use on Intel® processors, and Intel® Neural Compressor, which is a tool to accelerate AI inference without sacrificing accuracy.

    + +

    The code sample demonstrates how to train a model to perform language identification using the Hugging Face SpeechBrain* toolkit and optimize it using the Intel® AI Analytics Toolkit (AI Kit). The user can modify the code sample and identify up to 93 languages using the Common Voice dataset.

    + +

    Proposed Methodology for Language Identification

    + +

    In the proposed solution, the user will use an Intel AI Analytics Toolkit container environment to train a model and perform inference leveraging Intel-optimized libraries for PyTorch. There is also an option to quantize the trained model with Intel Neural Compressor to speed up inference.

    + +

    Dataset

    + +

The Common Voice dataset is used; this code sample specifically uses Common Voice Corpus 11.0 for Japanese and Swedish. This dataset is used to train an Emphasized Channel Attention, Propagation and Aggregation Time Delay Neural Network (ECAPA-TDNN), which is implemented using the Hugging Face SpeechBrain library. Time Delay Neural Networks (TDNNs), aka one-dimensional Convolutional Neural Networks (1D CNNs), are multilayer artificial neural network architectures used to classify patterns with shift-invariance and to model context at each layer of the network. ECAPA-TDNN is a new TDNN-based speaker-embedding extractor for speaker verification; it is built upon the original x-vector architecture and puts more emphasis on channel attention, propagation, and aggregation.

    + +

    Implementation

    + +

    After downloading the Common Voice dataset, the data is preprocessed by converting the MP3 files into WAV format to avoid information loss and separated into training, validation, and testing sets.
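One way the MP3-to-WAV conversion could look is sketched below using torchaudio; this is a hedged illustration rather than the code sample's actual preprocessing script, and the file names are placeholders.

# A minimal sketch of converting an MP3 clip to WAV with torchaudio
# (placeholder file names; the real preprocessing is done by the sample's
# own script, and MP3 decoding depends on the installed audio backend).
import torchaudio

waveform, sample_rate = torchaudio.load("clip.mp3")
torchaudio.save("clip.wav", waveform, sample_rate)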

    + +

    A pretrained VoxLingua107 model is retrained with the Common Voice dataset using the Hugging Face SpeechBrain library to focus on the languages of interest. VoxLingua107 is a speech dataset used for training spoken language recognition models that work well with real-world and varying speech data. This dataset contains data for 107 languages. By default, Japanese and Swedish are used, and more languages can be included. This model is then used for inference on the testing dataset or a user-specified dataset. Also, there is an option to utilize SpeechBrain’s Voice Activity Detection (VAD) where only the speech segments from the audio files are extracted and combined before samples are randomly selected as input into the model. This link provides all the necessary tools to perform VAD. To improve performance, the user may quantize the trained model to integer-8 (INT8) using Intel Neural Compressor to decrease latency.
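To give a feel for the starting point of this transfer learning step, the sketch below loads the pretrained VoxLingua107 language-id model with SpeechBrain; the model id and API are assumed from SpeechBrain's published VoxLingua107 recipe, and the audio path is a placeholder.

# A minimal sketch: load the pretrained VoxLingua107 language-id model and
# classify a single audio file (model id and directory are assumptions).
from speechbrain.pretrained import EncoderClassifier

language_id = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa",
    savedir="tmp_voxlingua107",
)
prediction = language_id.classify_file("clip.wav")
print(prediction)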

    + +

    Training

    + +

Copies of the training scripts are added to the current working directory: create_wds_shards.py to create the WebDataset shards, train.py to perform the actual training procedure, and train_ecapa.yaml to configure the training options. The WebDataset shard creation script and the YAML file are patched to work with the two languages chosen for this code sample.

    + +

In the data preprocessing phase, the prepareAllCommonVoice.py script is executed to randomly select a specified number of samples and convert the input from MP3 to WAV format. Here, 80% of these samples will be used for training, 10% for validation, and 10% for testing. At least 2,000 input samples are recommended; this is also the default value.

    + +

In the next step, WebDataset shards are created from the training and validation datasets. This stores the audio files as tar files, which allows writing purely sequential I/O pipelines for large-scale deep learning and achieves high I/O rates from local storage (about 3x-10x faster than random access).
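For orientation, shard creation conceptually looks like the sketch below, which assumes the webdataset package's ShardWriter; the real work is done by the create_wds_shards.py script mentioned above, and the samples iterable and field names here are placeholders.

# A minimal sketch of writing WebDataset shards (assumptions: webdataset's
# ShardWriter API; `samples` is a placeholder iterable of (wav_bytes, label)).
import webdataset as wds

with wds.ShardWriter("commonvoice-train-%06d.tar", maxcount=1000) as sink:
    for idx, (wav_bytes, label) in enumerate(samples):
        sink.write({
            "__key__": f"sample{idx:08d}",
            "wav": wav_bytes,   # raw WAV bytes
            "cls": label,       # language label
        })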

    + +

The user then modifies the YAML file. This includes setting the largest WebDataset shard number, setting the number of output neurons to the number of languages of interest, setting the number of epochs to train over the entire dataset, and setting the batch size. The batch size should be decreased if the CPU or GPU runs out of memory while running the training script.

    + +

In this code sample, the training script is executed on the CPU. When running the script, “cpu” is passed as an input parameter. The configurations defined in train_ecapa.yaml are also passed as parameters.

    + +

    The command to run the script to train the model is:

    + +
    python train.py train_ecapa.yaml --device "cpu"
    +
    + +

    In the future, the training script train.py will be designed to work for Intel® GPUs such as the Intel® Data Center GPU Flex Series, Intel® Data Center GPU Max Series, and Intel® Arc™ A-Series with updates from Intel Extension for PyTorch.

    + +

Run the training script to train the model. The 4th Generation Intel® Xeon® Scalable Processor is recommended for this transfer learning application because of its performance improvements through its Intel® Advanced Matrix Extensions (Intel® AMX) instruction set.

    + +

    After training, checkpoint files are available. These files are used to load the model for inference.

    + +

    Inference

    + +

    Inference Pipeline

    + +

    The crucial step before running inference is to patch the SpeechBrain library’s pretrained interfaces.py file so that PyTorch TorchScript* can be run to improve the runtime. TorchScript requires the output of the model to be only tensors.

    + +

    Users can choose to run inference using the testing set from Common Voice or their own custom data in WAV format. The following are the options the inference scripts (inference_custom.py and inference_commonVoice.py) can be run with:

| Input Option | Description |
| --- | --- |
| -p | Specify the data path. |
| -d | Specify the duration of wave sample. The default value is 3. |
| -s | Specify size of sample waves, default is 100. |
| --vad | (inference_custom.py only) Enable VAD model to detect active speech. The VAD option will identify speech segments in the audio file and construct a new .wav file containing only the speech segments. This improves the quality of speech data used as input into the language identification model. |
| --ipex | Run inference with optimizations from Intel Extension for PyTorch. This option will apply optimizations to the pretrained model. Using this option should result in performance improvements related to latency. |
| --ground_truth_compare | (inference_custom.py only) Enable comparison of prediction labels to ground truth values. |
| --verbose | Print additional debug information, like latency. |
    + +

The path to the data must be specified. By default, 100 random 3-second audio samples will be selected from the original audio file and used as input to the language identification model.
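The sampling described above could be done roughly as in the sketch below; the variable names and file path are placeholders, not the code sample's actual implementation.

# A minimal sketch: pick 100 random 3-second windows from a loaded waveform.
import torch
import torchaudio

waveform, sr = torchaudio.load("speech.wav")   # shape: [channels, num_frames]
window = 3 * sr                                # 3 seconds worth of frames
assert waveform.shape[1] > window              # clip must be longer than the window

samples = []
for _ in range(100):
    start = torch.randint(0, waveform.shape[1] - window, (1,)).item()
    samples.append(waveform[:, start:start + window])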

    + +

    A small Convolutional Recurrent Deep Neural Network (CRDNN) pretrained on the LibriParty dataset is used to process audio samples and output the segments where speech activity is detected. This can be used in inference with the --vad option.

    + +

From the figure below, the timestamps where speech is detected are delivered by the CRDNN model, and these are used to construct a new, shorter audio file containing only speech. Sampling from this new audio file will give a better prediction of the primary language spoken.

    + +

    Audio wave file visualization

    + +

    Run the inference script yourself. An example command of running inference:

    + +
    python inference_custom.py -p data_custom -d 3 -s 50 --vad
    +
    + +

    This will run inference on data you provide located inside the data_custom folder. This command performs inference on 50 randomly selected 3-second audio samples with voice activity detection.

    + +

    If you want to run the code sample for other languages, download Common Voice Corpus 11.0 datasets for other languages.

    + +

    Optimizations with Intel Extension for PyTorch and Intel Neural Compressor

    + +

    PyTorch

    + +

    The Intel extension expands PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware. Check out how to install Intel Extension for PyTorch. The extension can be loaded as a Python module or linked as a C++ library. Python users can enable it dynamically by importing intel_extension_for_pytorch.
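Enabling the extension from Python can look like the sketch below; the model and example input are placeholders, and this is an illustrative outline rather than the code sample's exact inference path.

# A minimal sketch of applying Intel Extension for PyTorch optimizations and
# running through TorchScript (placeholders: `model`, `example_input`).
import torch
import intel_extension_for_pytorch as ipex

model = model.eval()
model = ipex.optimize(model)            # apply IPEX optimizations for inference

with torch.no_grad():
    traced = torch.jit.trace(model, example_input)  # run PyTorch in graph mode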

    + +
      +
    • The CPU tutorial gives detailed information about Intel Extension for PyTorch for Intel CPUs. Source code is available at the master branch.
    • +
    • The GPU tutorial gives detailed information about Intel Extension for PyTorch for Intel GPUs. Source code is available at the xpu-master branch.
    • +
    + +

To optimize the model for inference using Intel Extension for PyTorch, the --ipex option can be passed in. The model is optimized using the plug-in. TorchScript speeds up inference because PyTorch is run in graph mode. The command to run with this optimization is:

    + +
    python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --verbose
    +
    + +

    Note: The --verbose option is required to view the latency measurements.

    + +

    Auto-mixed precision such as bfloat16 (BF16) support will be added in a future release of the code sample.

    + +

    Intel Neural Compressor

    + +

    This is an open-source Python library that runs on CPUs or GPUs, which:

    + +
      +
    • Performs model quantization to reduce the model size and increase the speed of deep learning inference for deployment.
    • +
    • Automates popular methods such as quantization, compression, pruning, and knowledge distillation across multiple deep-learning frameworks.
    • +
    • Is part of the AI Kit
    • +
    + +

    The model can be quantized from float32 (FP32) precision to integer-8 (INT8) by running the quantize_model.py script while passing in the path to the model and a validation dataset. The following code can be used to load this INT8 model for inference:

    + +
    from neural_compressor.utils.pytorch import load
    +model_int8 = load("./lang_id_commonvoice_model_INT8", self.language_id)
    +signal = self.language_id.load_audio(data_path)
    +prediction = self.model_int8(signal)
    +
    + +

    Note that the original model is required when loading the quantized model. The command to quantize the trained model from FP32 to INT8 by using quantize_model.py is:

    + +
    python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/commonVoiceData/commonVoice/dev
    +
    + +

    What’s Next?

    + +

    Try out the above code sample by upgrading the hardware to a 4th Generation Intel Xeon Scalable Processor with Intel AMX and identify up to 93 different languages from Common Voice datasets.

    + +

We encourage you to learn more about and incorporate Intel’s other AI/ML framework optimizations and end-to-end portfolio of tools into your AI workflow. Also, visit the AI & ML page covering Intel’s AI software development resources for preparing, building, deploying, and scaling your AI solutions.

    + +

    For more details about the new 4th Gen Intel Xeon Scalable processors, visit Intel’s AI Solution Platform portal where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs.

    + +

    Useful resources

    + + + +

    Explore more AI code samples

    + + + +

    See all code samples

Large Scale Training of Hugging Face Transformers on TPUs With PyTorch/XLA FSDP

by Alex Wertheim, Milad Mohammadi, Jack Cao, Alex Spiridonov, Joe Spisak, Lysandre Debut, Sylvain Gugger, Sourab Mangrulkar

    +

    AI is transforming many industries through advanced capabilities such as understanding and generating language, answering questions, and delivering accurate recommendations. These capabilities are fueled by ever-increasing size and complexity of AI models, which require vast amounts of computing power to train.

    + +

    To meet the growing demands of AI training at scale, last year we introduced Fully Sharded Data Parallel (FSDP) in PyTorch/XLA. FSDP is a model parallelism architecture that unlocks the ability to easily and efficiently scale AI models into hundreds of billions of parameters. With PyTorch/XLA FSDP, during distributed training, each device can store a specific model shard, and all-gather the full model weights when it is time to perform the forward pass. Nested FSDP further optimizes performance by only using a given layer’s full parameters during its forward pass.
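At its core, the wrapping that makes this possible looks roughly like the sketch below, assuming the torch_xla.distributed.fsdp API; the model class and batch here are placeholders rather than code from this post.

# A minimal sketch of wrapping a module with PyTorch/XLA FSDP
# (placeholders: MyTransformerBlock, batch).
import torch_xla.core.xla_model as xm
from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP

device = xm.xla_device()
model = FSDP(MyTransformerBlock().to(device))  # parameters are sharded across devices

loss = model(batch).sum()   # full weights are all-gathered for the forward pass
loss.backward()             # gradients are reduced back to the owning shards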

    + +

    We are excited to announce that PyTorch/XLA FSDP has landed in Hugging Face Transformers. Now, Hugging Face users can train PyTorch models with up to 20 times more parameters using the same amount of computing power as before.

    + +

    We built PyTorch/XLA FSDP support directly into the Hugging Face Trainer class, so that any model using Trainer can leverage FSDP. And with the addition of automatic wrapping to PyTorch/XLA FSDP, nested FSDP wrapping is both flexible and simple to apply. These new features make it easy to train a wide range of Hugging Face models at large scales. In this guide, we demonstrate training GPT-2 models with up to 128B parameters on Google Cloud TPUs. PyTorch/XLA FSDP training on TPUs is highly efficient, achieving up to 45.1% model FLOPS utilization (MFU) for GPT-2:

    + +

    Figure 1: Model FLOPS utilization for Hugging Face GPT-2 on Google Cloud TPU v4

    + +


    + +

    Configuring PyTorch/XLA FSDP in the Hugging Face Trainer

    + +

    First, follow your preferred method to create your TPU(s) and install PyTorch and PyTorch/XLA. You need versions >= 2.0 for PyTorch and PyTorch/XLA.

    + +
pip3 install https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch-2.0-cp38-cp38-linux_x86_64.whl --user

pip3 install https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-2.0-cp38-cp38-linux_x86_64.whl
    + +

    Next, clone and install the Hugging Face Transformers repo. Install all necessary dependencies (e.g., datasets, evaluate, scikit-learn, accelerate).

    + +
        cd $HOME
    +    git clone https://github.com/huggingface/transformers.git
    +    cd transformers
    +    git checkout v4.31-release
    +    pip3 install -e .
    +    pip3 install datasets evaluate scikit-learn
    +    pip3 install accelerate==0.21.0
    +
    + +

    In $HOME/transformers, create any model-specific configuration files you might need. Here is an example of a configuration file for a GPT-2 model with 2B parameters, which we later refer to as gpt2_config.json:

    + +
    {
    +    "activation_function": "gelu_new", 
    +    "architectures": [
    +        "GPT2LMHeadModel"
    +    ],
    +    "attn_pdrop": 0.1,
    +    "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2",
    +    "n_embd": 3072,
    +    "n_head": 24,
    +    "n_layer": 18,
    +    "n_positions": 1024,
    +    "resid_pdrop": 0.1,
    +    "summary_activation": null,
    +    "summary_first_dropout": 0.1,
    +    "summary_proj_to_labels": true,
    +    "summary_type": "cls_index",
    +    "summary_use_proj": true,
    +    "task_specific_params": {
    +        "text-generation": {
    +            "do_sample": true,
    +            "max_length": 50
    +        }
    +    },
    +    "vocab_size": 50257
    +}
    +
    + +

    With PyTorch/XLA FSDP, it is possible to train model sizes much bigger than this on large accelerator slices. We have trained GPT-2 models as large as 128B parameters with these techniques; for expert tips on how to replicate this scale, see the appendix.

    + +

    In $HOME/transformers, create your FSDP configuration file, a JSON file containing all of the configurable aspects of your XLA FSDP wrapping stored as a dictionary. Following the official Hugging Face Transformers XLA FSDP documentation, the following arguments are available to set:

    +
      +
    • xla (bool, *optional*, defaults to False): This is a boolean which determines whether or not you use XLA FSDP. Make sure to set this to true.
    • xla_fsdp_settings (dict, *optional*): This is a dictionary which stores all of the XLA FSDP wrapping parameters you want to set; note that you do not have to specify settings for parameters where you are using the default value. For a complete list of settings, see here.
    + +

    For compute_dtype and buffer_dtype, enter these as strings which contain the corresponding torch data type, e.g. bfloat16.

    + +
      +
    • fsdp_min_num_params (int, *optional*, defaults to 0): An integer which sets the minimum number of parameters for size-based auto wrapping. Every module with at least as many parameters as fsdp_min_num_params will be XLA FSDP wrapped.
    • fsdp_transformer_layer_cls_to_wrap (List[str], *optional*): A list of (case-sensitive) transformer layer class names to wrap. Note that this is mutually exclusive with fsdp_min_num_params. Example: ["GPT2Block", "GPT2MLP"].
    • xla_fsdp_grad_ckpt (bool, *optional*, defaults to False): This is a boolean which determines whether to use gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be used when the xla flag is set to true, and an auto wrapping policy is specified through fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap.
    + +

    Note: For transformer-based models, use fsdp_transformer_layer_cls_to_wrap instead of fsdp_min_num_params when performing automatic nested FSDP wrapping. Layers which share weights should not belong to separate FSDP wrapped units, and the input and output embedding layers in transformer-based models share weights.

    + +

    For this GPT-2 example, here is what the corresponding fsdp_config.json file looks like:

    + +
        {
    +        "fsdp_transformer_layer_cls_to_wrap": [
    +            "GPT2Block"
    +        ],
    +        "xla": true,
    +        "xla_fsdp_settings": {
    +            "compute_dtype": "bfloat16",
    +            "shard_param_on_dim_0": true,
    +            "pin_layout_in_collective_ops": true
    +        },
    +       "xla_fsdp_grad_ckpt": true
    +    }
    +
    + + + + + + + +
    Now, it’s time to train your model! First, ensure that you have your PyTorch/XLA runtime set up appropriately by setting
    + +
        export PJRT_DEVICE=TPU
    +
    + +

    When running training, the key flags to pass are:

    + +

    a) --fsdp "full_shard"
    b) --fsdp_config fsdp_config.json

    + +

    where you should replace fsdp_config.json with whatever you named your FSDP configuration file. Here is a sample command to train our example 2B GPT-2 model, where training is started by xla_spawn.py, a launcher script for distributed TPU training.

    + +
        python3 -u examples/pytorch/xla_spawn.py --num_cores 4 examples/pytorch/language-modeling/run_clm.py \
    +    --num_train_epochs 1 \
    +    --dataset_name wikitext \
    +    --dataset_config_name wikitext-2-raw-v1 \
    +    --per_device_train_batch_size 32 \
    +    --per_device_eval_batch_size 32 \
    +    --do_train \
    +    --do_eval \
    +    --output_dir /tmp/test-clm \
    +    --overwrite_output_dir \
    +    --config_name gpt2_config.json \
    +    --cache_dir /tmp \
    +    --tokenizer_name gpt2 \
    +    --block_size 1024 \
    +    --optim adafactor \
    +    --adafactor true \
    +    --save_strategy no \
    +    --logging_strategy no \
    +    --fsdp "full_shard" \
    +    --fsdp_config fsdp_config.json
    +
    + +

    Measuring Model FLOPS Utilization (MFU) for GPT-2

    + +

    Model FLOPS are the floating point operations required to perform a single forward and backward pass. Model FLOPS are hardware- and implementation-independent, and depend only on the underlying model. In each step, the number of FLOPS is computed via the following formulas:

    + +
    tokens_per_batch = global_batch_size * seq_len
    +
    +FLOPS_per_step = 6 * tokens_per_batch * num_params
    +
    + +

    where seq_len is the sequence length and num_params is the number of parameters in the model. We note that this estimation assumes that the input dimensionality is much larger than the input sequence length (d_model >> seq_len). If this assumption is violated, the self-attention FLOPS become significant and this expression underestimates the true MFU.

    + +

    Based on the step time and the hardware details (numbers of chips and the peak FLOPS per chip), we can compute Model FLOPS Utilization (MFU), which measures how effectively our implementation is using the underlying hardware. Achieving 100% MFU means that the hardware is being used perfectly by that model. We calculate MFU using the following formula:

    + +
    model_FLOPS_utilization = FLOPS_per_step / step_time(s) / chip_count / FLOPS_per_chip
    +
    + +

    When training a GPT-2 model with 2B parameters with the XLA FSDP configuration file above on a Cloud TPU v4-8, we measure a step time of 4.191s. Using the above formula, we calculate 35.7% MFU on a v4-8. For further details on calculating MFU, refer to the PaLM paper.
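
    As a sanity check, the same calculation can be written out in a few lines of Python. This is only our sketch of the formula above; the 275 TFLOPS bf16 peak per v4 chip, the 4-chip count of a v4-8 slice, and the roughly 2.1B actual parameter count are our assumptions for reproducing the reported number:

    def mfu(num_params, global_batch_size, seq_len, step_time_s, chip_count, peak_flops_per_chip):
        tokens_per_batch = global_batch_size * seq_len
        flops_per_step = 6 * tokens_per_batch * num_params
        return flops_per_step / step_time_s / chip_count / peak_flops_per_chip

    # 2B-class GPT-2 on a v4-8: 128 sequences of length 1024 per step, 4.191 s per step
    print(mfu(num_params=2.1e9, global_batch_size=128, seq_len=1024,
              step_time_s=4.191, chip_count=4, peak_flops_per_chip=275e12))
    # ~0.36, in line with the 35.7% reported above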

    + +

    The table below presents MFU for GPT-2 models with sizes between 2B and 128B, with a sequence length of 1024.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    TPU NumCores + v4-8 + v4-64 + v4-128 + v4-128 + v4-256 + v4-512
    # of Tokens / Batch + 131,072 + 524,288 + 524,288 + 524,288 + 1,048,576 + 1,048,576
    # of Parameters + 2B + 16B + 20B + 32B + 64B + 128B
    Step Time (ms) + 4,191 + 14,592 + 7,824 + 12,970 + 25,653 + 30,460
    PFLOPS / Step + 1.65 + 50 + 62 + 101 + 404 + 809
    MFU + 35.7% + 38.8% + 45.1% + 44.4% + 44.7% + 37.7%
    + +

    Table 1: GPT-2 model FLOPS utilization calculation details

    + +

    Among these configurations, MFU peaks at 45.1% for the 20B parameter model on v4-128. This result compares favorably to, for example, 41.5% MFU for a 22B Megatron-like model.

    + +

    There are two actionable insights from these experiments:

    + +

    First, simply increasing the number of chips without increasing the batch size generally means lower FLOPS utilization, because more time is spent on sharing the model shards. FSDP uses all-reduce communication collectives which are not asynchronous, which means that chip-to-chip communication cannot be overlapped with computation. As the number of chips increases, the number of model shards that must be communicated increases, and so we should expect the portion of the step time spent on communication to increase with the number of chips.

    + +

    Second, increasing the batch size generally means better FLOPS utilization. As the number of chips increases, the memory footprint of the model decreases, which often frees up high bandwidth memory (HBM) to scale up the global batch size. With a larger global batch size, the number of tokens processed in each step increases, and thus, so does the FLOPS per step. As long as the step time does not increase proportionally, we expect a larger global batch size to improve MFU.

    + +

    Therefore, to maximize the MFU, we recommend training with the largest global batch size possible that can fit in the HBM of the TPU slice, using FSDP to reduce memory required for the model parameters.

    + +

    Training Very Large Models (tested to 128B parameters)

    + +

    When using PyTorch/XLA, tensors must be initialized on the CPU before being moved to the XLA device. This means one may encounter host-side out-of-memory errors if the model is sufficiently large, even though the model can fit in the device HBM after sharding. To avoid this, we must defer each submodule’s initialization until it is FSDP wrapped, which ensures that submodules are sharded as soon as their values are populated, avoiding host-side limitations.

    + +

    Below, we explain how to modify a local copy of the Hugging Face transformers repository to train a GPT-2 model with up to 128B parameters using this technique.

    + +

    First, using the commands below, install torchdistX, which is a library containing experimental PyTorch Distributed features. This is the engine behind deferred initialization, and allows you to create tensors that don’t require immediate storage and can be materialized later. You also need to install a specific PyTorch/XLA 2.0 version that takes advantage of this package; note that you must uninstall PyTorch and PyTorch/XLA first, if you installed them earlier.

    + +
    pip3 install torch==2.0 --index-url https://download.pytorch.org/whl/test/cpu --user
    +pip3 install torch_xla[torchdistx] -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/experimental/torch_xla-2.0-cp38-cp38-linux_x86_64.whl
    +
    + +

    Next, apply the following changes to your local copy of Hugging Face Transformers:

    + +

    In src/transformers/trainer.py, add the following function in _wrap_model on the line immediately prior to PyTorch/XLA FSDP wrapping:

    + +
    from torchdistx import deferred_init
    +
    +def _init_with_torchdistX(module):
    +    def check_fn(k):
    +        return not isinstance(k, FSDP)
    +    deferred_init.materialize_module(module, check_fn=check_fn)
    +
    + +

    The function materialize_module will initialize the model tensors if check_fn returns True. In this case, check_fn returns True for modules that have not yet been FSDP wrapped, so only those submodules are materialized.

    + +

    Within _wrap_model, modify your FSDP wrapping to accept the additional argument param_init_fn=_init_with_torchdistX:

    + +
    self.model = model = FSDP(
    +        model,
    +        auto_wrap_policy=auto_wrap_policy,
    +        auto_wrapper_callable=auto_wrapper_callable,
    +        param_init_fn=_init_with_torchdistX,
    +        **fsdp_kwargs,
    +    )
    +
    + +

    In examples/pytorch/language-modeling/run_clm.py, add the following import statement at the beginning of the file:

    + +
    from torchdistx import deferred_init
    +
    + +

    Edit the model initialization so that the model is wrapped with deferred_init.deferred_init by replacing the line

    + +
    model = AutoModelForCausalLM.from_config(config)
    +
    + +

    with

    + +
    model = deferred_init.deferred_init(AutoModelForCausalLM.from_config, config)
    +
    + +

    Note that this assumes you are supplying your own model configuration file. Otherwise, you should modify your model initialization statement accordingly.

    + +

    You should also comment out these two lines which immediately follow the line above:

    + +
    n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
    logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
    +
    + +

    They will cause an error if left unmodified, since the model tensors do not actually have storage when these lines are executed.

    + +

    With these changes, you can now run GPT-2 models with as many as 128B parameters, provided the accelerator size is suitably large.

    + +

    Next Steps & Acknowledgements

    + +

    To learn more, see the docs here. We’d love to hear from you if you run into any issues with FSDP in PyTorch/XLA, or just want to tell us about how you are using it.

    + +

    We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to GitHub so that we can openly collaborate.

    + +

    We’d like to thank Ronghang Hu and Ross Girshick at Meta AI and Lysandre Debut, Sourab Mangrulkar, Sylvain Gugger and Arthur Zucker for all the support and collaboration. We’d also like to thank Jiewen Tan, Liyang Lu, Will Cromar, Vaibhav Singh, and Chandra Devarakonda for their assistance in preparing this post.

    + +

    Cheers!

    + +

    The PyTorch/XLA Team at Google

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/lightning-ai-joins-pytorch/index.html b/blog/lightning-ai-joins-pytorch/index.html new file mode 100644 index 000000000000..e7848dac6981 --- /dev/null +++ b/blog/lightning-ai-joins-pytorch/index.html @@ -0,0 +1,671 @@ + + + + + + + + + + + + + Lightning AI Joins the PyTorch Foundation as a Premier Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Lightning AI has joined as a premier member.

    + +

    Lightning AI is the company behind PyTorch Lightning, the platform and open-source framework for companies to build and deploy AI products leveraging the latest generative AI models.

    + +

    “This is a very important milestone for Lightning AI and the PyTorch Lightning community,” remarks Luca Antiga, Chief Technology Officer of Lightning AI. “By joining the PyTorch Foundation, we are strengthening our commitment to boost the adoption of PyTorch across industries. We look forward to partnering with the Foundation to push the vision of PyTorch forward.”

    + +

    PyTorch Lightning is one of the leading projects in the PyTorch ecosystem, allowing developers to build, train, fine-tune and deploy AI models at scale. PyTorch Lightning is helping drive the rapid adoption of PyTorch by both the research community and the enterprise.

    + +

    “Lightning AI has been a great steward of the AI community, and notably a key contributor to PyTorch over the years,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Their goal of making AI research scalable directly aligns with our mission at the foundation.”

    + +

    As a premier member, Lightning AI is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

    + +

    We’re happy to welcome Luca Antiga, Chief Technology Officer at Lightning AI, to our board. Luca joined the Lightning AI team in April 2021 when the Tensorwerk team joined Grid AI. Prior to joining Lightning AI, Luca co-founded Orobix, an applied AI company, and Tensorwerk. He was an early core contributor to PyTorch and co-authored Deep Learning with PyTorch (Manning).

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +

    About Lightning AI

    + +

    Lightning AI is the creator of PyTorch Lightning, the deep learning platform and open-source framework of choice for developers and companies seeking to build and deploy AI products.

    + +

    About PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/llama-into-torchtune/index.html b/blog/llama-into-torchtune/index.html new file mode 100644 index 000000000000..bc239575e481 --- /dev/null +++ b/blog/llama-into-torchtune/index.html @@ -0,0 +1,1418 @@ + + + + + + + + + + + + + Distilling Llama3.1 8B into 1B in torchtune | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    November 18, 2024

    +

    + Distilling Llama3.1 8B into 1B in torchtune +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Linda Wang, Evan Smothers, Kartikay Khandelwal + +

    +

    In this blog, we present a case study on distilling a Llama 3.1 8B model into Llama 3.2 1B using torchtune’s knowledge distillation recipe. We demonstrate how knowledge distillation (KD) can be used in post-training to improve instruction-following task performance and showcase how users can leverage the recipe.

    + +

    What is Knowledge Distillation?

    + +

    Knowledge Distillation is a widely used compression technique that transfers knowledge from a larger (teacher) model to a smaller (student) model. Larger models have more parameters and capacity for knowledge; however, this larger capacity is also more computationally expensive to deploy. Knowledge distillation can be used to compress the knowledge of a larger model into a smaller model. The idea is that the performance of the smaller model can be improved by learning from the larger model’s outputs.

    + +

    How does Knowledge Distillation work?

    + +

    Knowledge is transferred from the teacher to student model by training on a transfer set where the student is trained to imitate the token-level probability distributions of the teacher. The assumption is that the teacher model distribution is similar to the transfer dataset. The diagram below is a simplified representation of how KD works.

    + +

    Figure 1: Simplified representation of knowledge transfer from teacher to student model


    + +

    As knowledge distillation for LLMs is an active area of research, there are papers, such as MiniLLM, DistiLLM, AKL, and Generalized KD, investigating different loss approaches. In this case study, we focus on the standard cross-entropy (CE) loss with the forward Kullback-Leibler (KL) divergence loss as the baseline. Forward KL divergence aims to minimize the difference by forcing the student’s distribution to align with all of the teacher’s distributions.
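
    As a rough sketch of what this baseline objective looks like (our own illustration, not necessarily identical to torchtune’s ForwardKLLoss), the forward KL term compares the student’s log-probabilities against the teacher’s full token-level distribution, masking out padded positions:

    import torch
    import torch.nn.functional as F

    def forward_kl(student_logits, teacher_logits, labels, ignore_index=-100):
        # KL(teacher || student) per token, averaged over non-padded positions
        teacher_prob = F.softmax(teacher_logits, dim=-1)
        teacher_logprob = F.log_softmax(teacher_logits, dim=-1)
        student_logprob = F.log_softmax(student_logits, dim=-1)
        kl = (teacher_prob * (teacher_logprob - student_logprob)).sum(-1)
        mask = (labels != ignore_index).float()
        return (kl * mask).sum() / mask.sum().clamp(min=1)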

    + +

    Why is Knowledge Distillation useful?

    + +

    The idea of knowledge distillation is that a smaller model can achieve better performance using a teacher model’s outputs as an additional signal than it could training from scratch or with supervised fine-tuning. For instance, Llama 3.2 lightweight 1B and 3B text models incorporated logits from Llama 3.1 8B and 70B to recover performance after pruning. In addition, for fine-tuning on instruction-following tasks, research in LLM distillation demonstrates that knowledge distillation methods can outperform supervised fine-tuning (SFT) alone.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Model + Method + DollyEval + Self-Inst + S-NI +
    GPT-4 Eval + GPT-4 Eval + Rouge-L +
    Llama 7B + SFT + 73.0 + 69.2 + 32.4 +
    KD + 73.7 + 70.5 + 33.7 +
    MiniLLM + 76.4 + 73.1 + 35.5 +
    Llama 1.1B + SFT + 22.1 + - + 27.8 +
    KD + 22.2 + - + 28.1 +
    AKL + 24.4 + - + 31.4 +
    OpenLlama 3B + SFT + 47.3 + 41.7 + 29.3 +
    KD + 44.9 + 42.1 + 27.9 +
    SeqKD + 48.1 + 46.0 + 29.1 +
    DistiLLM + 59.9 + 53.3 + 37.6 +
    + +

    Table 1: Comparison of knowledge distillation approaches to supervised fine-tuning

    + +

    Below is a simplified example of how knowledge distillation differs from supervised fine-tuning.

    + + + + + + + + + + +
    Supervised fine-tuning + Knowledge distillation +
    +
    +   
    +model = llama3_2_1b()
    +ce_loss = CrossEntropyLoss()
    +kd_loss = ForwardKLLoss()
    +
    +tokens, labels = batch["tokens"], batch["labels"]
    +logits = model(tokens, ...)
    +
    +loss = ce_loss(logits, labels)
    +loss.backward()
    +
    +   
    +   
    +
    +
    +   
    +model = llama3_2_1b()
    +teacher_model = llama3_1_8b()
    +ce_loss = CrossEntropyLoss()
    +kd_loss = ForwardKLLoss()
    +
    +tokens, labels = batch["tokens"], batch["labels"]
    +logits = model(tokens, ...)
    +teacher_logits = teacher_model(tokens, ...)
    +loss = ce_loss(logits, labels) + kd_loss(logits, teacher_logits, labels)
    +loss.backward()
    +   
    +   
    +
    + +

    KD recipe in torchtune

    + +

    With torchtune, we can easily apply knowledge distillation to Llama3, as well as other LLM model families, using torchtune’s KD recipe. The objective for this recipe is to fine-tune Llama3.2-1B on the Alpaca instruction-following dataset by distilling from Llama3.1-8B. This recipe focuses on post-training and assumes the teacher and student models have already been pre-trained.

    + +

    First, we have to download the model weights. To be consistent with other torchtune fine-tuning configs, we will use the instruction tuned models of Llama3.1-8B as teacher and Llama3.2-1B as student.

    + +
    tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf_token <HF_TOKEN>
    +
    +tune download meta-llama/Llama-3.2-1B-Instruct --output-dir /tmp/Llama-3.2-1B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf_token <HF_TOKEN>
    +
    + +

    In order for the teacher model distribution to be similar to the Alpaca dataset, we will fine-tune the teacher model using LoRA. Based on our experiments, shown in the next section, we’ve found that KD performs better when the teacher model is already fine-tuned on the target dataset.

    + +
    tune run lora_finetune_single_device --config llama3_1/8B_lora_single_device
    +
    + +

    Finally, we can run the following command to distill the fine-tuned 8B model into the 1B model on a single GPU. For this case study, we used a single A100 80GB GPU. We also have a distributed recipe for running on multiple devices.

    + +
    tune run knowledge_distillation_single_device --config llama3_2/knowledge_distillation_single_device
    +
    + +

    Ablation studies

    + +

    In this section, we demonstrate how changing configurations and hyperparameters can affect performance. By default, our configuration uses the LoRA fine-tuned 8B teacher model, downloaded 1B student model, learning rate of 3e-4 and KD loss ratio of 0.5. For this case study, we fine-tuned on the alpaca_cleaned_dataset and evaluated the models on truthfulqa_mc2, hellaswag and commonsense_qa tasks through the EleutherAI LM evaluation harness. Let’s take a look at the effects of:

    + +
      +
    1. Using a fine-tuned teacher model
    2. Using a fine-tuned student model
    3. Hyperparameter tuning of KD loss ratio and learning rate
    + +

    Using a fine-tuned teacher model

    + +

    The default settings in the config use the fine-tuned teacher model. Now, let’s take a look at the effects of not fine-tuning the teacher model first.

    + +

    Taking a look at the losses, using the baseline 8B as teacher results in a higher loss than using the fine-tuned teacher model. The KD loss also remains relatively constant, suggesting that the teacher model should have the same distribution as the transfer dataset.

    + +

    Figure 2: (left to right) KD loss from forward KL divergence, class loss from cross entropy, total loss: even combination of KD and class loss.


    + +

    In our benchmarks, we can see that supervised fine-tuning of the 1B model achieves better accuracy than the baseline 1B model. By using the fine-tuned 8B teacher model, we see comparable results for truthfulqa and improvement for hellaswag and commonsense. When using the baseline 8B as a teacher, we see improvement across all metrics, but lower than the other configurations.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Model + TruthfulQA + hellaswag + commonsense +
    mc2 + acc + acc_norm + acc +
    Baseline Llama 3.1 8B + 0.5401 + 0.5911 + 0.7915 + 0.7707 +
    Fine-tuned Llama 3.1 8B using LoRA + 0.5475 + 0.6031 + 0.7951 + 0.7789 +
    Baseline Llama 3.2 1B + 0.4384 + 0.4517 + 0.6064 + 0.5536 +
    Fine-tuned Llama 3.2 1B using LoRA + 0.4492 + 0.4595 + 0.6132 + 0.5528 +
    KD using baseline 8B as teacher + 0.444 + 0.4576 + 0.6123 + 0.5561 +
    KD using fine-tuned 8B as teacher + 0.4481 + 0.4603 + 0.6157 + 0.5569 +
    + +

    Table 2: Comparison between using baseline and fine-tuned 8B as teacher model

    + +

    Using a fine-tuned student model

    + +

    For these experiments, we look at the effects of KD when the student model is already fine-tuned. We analyze the effects using different combinations of baseline and fine-tuned 8B and 1B models.

    + +

    Based on the loss graphs, using a fine-tuned teacher model results in a lower loss irrespective of whether the student model is fine-tuned or not. It’s also interesting to note that the class loss starts to increase when using a fine-tuned student model.

    + +

    Figure 3: Comparing losses of different teacher and student model initializations


    + +

    Using the fine-tuned student model boosts accuracy even further for truthfulqa, but the accuracy drops for hellaswag and commonsense. Using a fine-tuned teacher model and a baseline student model achieved the best results on the hellaswag and commonsense datasets. Based on these findings, the best configuration will change depending on which evaluation dataset and metric you are optimizing for.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Model + TruthfulQA + hellaswag + commonsense +
    mc2 + acc + acc_norm + acc +
    Baseline Llama 3.1 8B + 0.5401 + 0.5911 + 0.7915 + 0.7707 +
    Fine-tuned Llama 3.1 8B using LoRA + 0.5475 + 0.6031 + 0.7951 + 0.7789 +
    Baseline Llama 3.2 1B + 0.4384 + 0.4517 + 0.6064 + 0.5536 +
    Fine-tuned Llama 3.2 1B using LoRA + 0.4492 + 0.4595 + 0.6132 + 0.5528 +
    KD using baseline 8B and baseline 1B + 0.444 + 0.4576 + 0.6123 + 0.5561 +
    KD using baseline 8B and fine-tuned 1B + 0.4508 + 0.448 + 0.6004 + 0.5274 +
    KD using fine-tuned 8B and baseline 1B + 0.4481 + 0.4603 + 0.6157 + 0.5569 +
    KD using fine-tuned 8B and fine-tuned 1B + 0.4713 + 0.4512 + 0.599 + 0.5233 +
    + +

    Table 3: Comparison using baseline and fine-tuned teacher and student models

    + +

    Hyperparameter tuning: learning rate

    + +

    By default, the recipe has a learning rate of 3e-4. For these experiments, we changed the learning rate from as high as 1e-3 to as low as 1e-5.

    + +

    Based on the loss graphs, all learning rates result in similar losses except for 1e-5, which has a higher KD and class loss.

    + +

    Figure 4: Comparing losses of different learning rates


    + +

    Based on our benchmarks, the optimal learning rate changes depending on which metric and tasks you are optimizing for.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Model + learning rate + TruthfulQA + hellaswag + commonsense +
    mc2 + acc + acc_norm + acc +
    Baseline Llama 3.1 8B + - + 0.5401 + 0.5911 + 0.7915 + 0.7707 +
    Fine-tuned Llama 3.1 8B using LoRA + - + 0.5475 + 0.6031 + 0.7951 + 0.7789 +
    Baseline Llama 3.2 1B + - + 0.4384 + 0.4517 + 0.6064 + 0.5536 +
    Fine-tuned Llama 3.2 1B using LoRA + - + 0.4492 + 0.4595 + 0.6132 + 0.5528 +
    KD using fine-tuned 8B and baseline 1B + 3e-4 + 0.4481 + 0.4603 + 0.6157 + 0.5569 +
    KD using fine-tuned 8B and baseline 1B + 1e-3 + 0.4453 + 0.4535 + 0.6071 + 0.5258 +
    KD using fine-tuned 8B and baseline 1B + 1e-4 + 0.4489 + 0.4606 + 0.6156 + 0.5586 +
    KD using fine-tuned 8B and baseline 1B + 1e-5 + 0.4547 + 0.4548 + 0.6114 + 0.5487 +
    + +

    Table 4: Effects of tuning learning rate

    + +

    Hyperparameter tuning: KD ratio

    + +

    By default, the KD ratio is set to 0.5, which gives even weighting to both the class and KD loss. In these experiments, we look at the effects of different KD ratios, where 0 only uses the class loss and 1 only uses the KD loss.
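
    In other words, the KD ratio interpolates between the two loss terms. A minimal sketch of this weighting (ours; the exact combination used inside the recipe may differ slightly):

    def total_loss(class_loss, kd_loss, kd_ratio=0.5):
        # kd_ratio = 0 -> only the cross-entropy (class) loss
        # kd_ratio = 1 -> only the KD (forward KL) loss
        return (1.0 - kd_ratio) * class_loss + kd_ratio * kd_loss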

    + +

    Overall, the benchmark results show that for these tasks and metrics, higher KD ratios perform slightly better.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Model + kd_ratio (lr=3e-4) + TruthfulQA + hellaswag + commonsense +
    mc2 + acc + acc_norm + acc +
    Baseline Llama 3.1 8B + - + 0.5401 + 0.5911 + 0.7915 + 0.7707 +
    Fine-tuned Llama 3.1 8B using LoRA + - + 0.5475 + 0.6031 + 0.7951 + 0.7789 +
    Baseline Llama 3.2 1B + - + 0.4384 + 0.4517 + 0.6064 + 0.5536 +
    Fine-tuned Llama 3.2 1B using LoRA + - + 0.4492 + 0.4595 + 0.6132 + 0.5528 +
    KD using fine-tuned 8B and baseline 1B + 0.25 + 0.4485 + 0.4595 + 0.6155 + 0.5602 +
    KD using fine-tuned 8B and baseline 1B + 0.5 + 0.4481 + 0.4603 + 0.6157 + 0.5569 +
    KD using fine-tuned 8B and baseline 1B + 0.75 + 0.4543 + 0.463 + 0.6189 + 0.5643 +
    KD using fine-tuned 8B and baseline 1B + 1.0 + 0.4537 + 0.4641 + 0.6177 + 0.5717 +
    + +

    Table 5: Effects of tuning KD ratio

    + +

    Looking Ahead

    + +

    In this blog, we presented a study on how to distill LLMs through torchtune using the forward KL divergence loss on Llama 3.1 8B and Llama 3.2 1B logits. There are many directions for future exploration to further improve performance and offer more flexibility in distillation methods.

    + +
      +
    • Expand KD loss offerings. The KD recipe uses the forward KL divergence loss. However, aligning the student distribution to the whole teacher distribution may not be effective, as mentioned above. There are multiple papers, such as MiniLLM, DistiLLM, and Generalized KD, that introduce new KD losses and policies to address this limitation and have been shown to outperform the standard use of cross entropy with forward KL divergence loss. For instance, MiniLLM uses reverse KL divergence to prevent the student from over-estimating low-probability regions of the teacher. DistiLLM introduces a skewed KL loss and an adaptive training policy.
    • Enable cross-tokenizer distillation. The current recipe requires the teacher and student model to use the same tokenizer, which limits the ability to distill across different LLM families. There has been research on cross-tokenizer approaches (e.g. Universal Logit Distillation) that we could explore.
    • Expand distillation to multimodal LLMs and encoder models. A natural extension of the KD recipe is to expand to multimodal LLMs. Similar to deploying more efficient LLMs, there’s also a need to deploy smaller and more efficient multimodal LLMs. In addition, there has been work demonstrating LLMs as encoder models (e.g. LLM2Vec). Distillation from LLMs as encoders to smaller encoder models may also be a promising direction to explore.
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/mapillary-research/index.html b/blog/mapillary-research/index.html new file mode 100644 index 000000000000..3e917576ec61 --- /dev/null +++ b/blog/mapillary-research/index.html @@ -0,0 +1,749 @@ + + + + + + + + + + + + + Mapillary Research: Seamless Scene Segmentation and In-Place Activated BatchNorm | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Lorenzo Porzi, Mapillary + +

    +

    With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry.

    + +

    Today, people and organizations all over the world have contributed more than 600 million images toward Mapillary’s mission of helping people understand the world’s places through images and making this data available, with clients and partners including the World Bank, HERE, and Toyota Research Institute.

    + +

    Mapillary’s computer vision technology brings intelligence to maps in an unprecedented way, increasing our overall understanding of the world. Mapillary runs state-of-the-art semantic image analysis and image-based 3d modeling at scale and on all its images. In this post we discuss two recent works from Mapillary Research and their implementations in PyTorch - Seamless Scene Segmentation [1] and In-Place Activated BatchNorm [2] - generating Panoptic segmentation results and saving up to 50% of GPU memory during training, respectively.

    + +

    Seamless Scene Segmentation

    + +

    Github project page: https://github.com/mapillary/seamseg/

    + +
    + +
    + +

    The objective of Seamless Scene Segmentation is to predict a “panoptic” segmentation [3] from an image, that is a complete labeling where each pixel is assigned with a class id and, where possible, an instance id. Like many modern CNNs dealing with instance detection and segmentation, we adopt the Mask R-CNN framework [4], using ResNet50 + FPN [5] as a backbone. This architecture works in two stages: first, the “Proposal Head” selects a set of candidate bounding boxes on the image (i.e. the proposals) that could contain an object; then, the “Mask Head” focuses on each proposal, predicting its class and segmentation mask. The output of this process is a “sparse” instance segmentation, covering only the parts of the image that contain countable objects (e.g. cars and pedestrians).

    + +

    To complete our panoptic approach coined Seamless Scene Segmentation, we add a third stage to Mask R-CNN. Stemming from the same backbone, the “Semantic Head” predicts a dense semantic segmentation over the whole image, also accounting for the uncountable or amorphous classes (e.g. road and sky). The outputs of the Mask and Semantic heads are finally fused using a simple non-maximum suppression algorithm to generate the final panoptic prediction. All details about the actual network architecture, used losses and underlying math can be found at the project website for our CVPR 2019 paper [1].

    + +

    While several versions of Mask R-CNN are publicly available, including an official implementation written in Caffe2, at Mapillary we decided to build Seamless Scene Segmentation from scratch using PyTorch, in order to have full control and understanding of the whole pipeline. While doing so we encountered a couple of main stumbling blocks, and had to come up with some creative workarounds we are going to describe next.

    + +

    Dealing with variable-sized tensors

    + +

    Something that sets aside panoptic segmentation networks from traditional CNNs is the prevalence of variable-sized data. In fact, many of the quantities we are dealing with cannot be easily represented with fixed sized tensors: each image contains a different number of objects, the Proposal head can produce a different number of proposals for each image, and the images themselves can have different sizes. While this is not a problem per-se – one could just process images one at a time – we would still like to exploit batch-level parallelism as much as possible. Furthermore, when performing distributed training with multiple GPUs, DistributedDataParallel expects its inputs to be batched, uniformly-sized tensors.

    + +
    + +
    + +

    Our solution to these issues is to wrap each batch of variable-sized tensors in a PackedSequence. PackedSequence is little more than a glorified list class for tensors, tagging its contents as “related”, ensuring that they all share the same type, and providing useful methods like moving all the tensors to a particular device, etc. When performing light-weight operations that wouldn’t be much faster with batch-level parallelism, we simply iterate over the contents of the PackedSequence in a for loop. When performance is crucial, e.g. in the body of the network, we simply concatenate the contents of the PackedSequence, adding zero padding as required (like in RNNs with variable-length inputs), and keeping track of the original dimensions of each tensor.
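
    A toy version of such a container (our sketch, much simpler than the seamseg implementation; it assumes the wrapped tensors share their trailing dimensions so they can be padded into one batch) looks roughly like this:

    from typing import List
    import torch

    class PackedSequence:
        """A thin wrapper around a list of 'related' variable-sized tensors."""

        def __init__(self, tensors: List[torch.Tensor]):
            assert all(t.dtype == tensors[0].dtype for t in tensors), "contents must share a dtype"
            self.tensors = tensors

        def to(self, device):
            return PackedSequence([t.to(device) for t in self.tensors])

        def __iter__(self):
            # Light-weight operations can simply loop over the contents.
            return iter(self.tensors)

        def padded(self, pad_value=0.0):
            # Concatenate into a single zero-padded batch, keeping the original sizes
            # so the padding can be stripped again after the batched computation.
            sizes = [t.shape for t in self.tensors]
            max_len = max(s[0] for s in sizes)
            batch = torch.full((len(self.tensors), max_len, *sizes[0][1:]), pad_value,
                               dtype=self.tensors[0].dtype)
            for i, t in enumerate(self.tensors):
                batch[i, : t.shape[0]] = t
            return batch, sizes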

    + +

    PackedSequences also help us deal with the second problem highlighted above. We slightly modify DistributedDataParallel to recognize PackedSequence inputs, splitting them in equally sized chunks and distributing their contents across the GPUs.

    + +

    Asymmetric computational graphs with Distributed Data Parallel

    + +

    Another, perhaps more subtle, peculiarity of our network is that it can generate asymmetric computational graphs across GPUs. In fact, some of the modules that compose the network are “optional”, in the sense that they are not always computed for all images. As an example, when the Proposal head doesn’t output any proposal, the Mask head is not traversed at all. If we are training on multiple GPUs with DistributedDataParallel, this results in one of the replicas not computing gradients for the Mask head parameters.

    + +

    Prior to PyTorch 1.1, this resulted in a crash, so we had to develop a workaround. Our simple but effective solution was to compute a “fake forward pass” when no actual forward is required, i.e. something like this:

    + +
    def fake_forward():
    +    fake_input = get_correctly_shaped_fake_input()
    +    fake_output = mask_head(fake_input)
    +    fake_loss = fake_output.sum() * 0
    +    return fake_loss
    +
    + +

    Here, we generate a batch of bogus data, pass it through the Mask head, and return a loss that always back-propagates zeros to all parameters.

    + +

    Starting from PyTorch 1.1 this workaround is no longer required: by setting find_unused_parameters=True in the constructor, DistributedDataParallel is told to identify parameters whose gradients have not been computed by all replicas and correctly handle them. This leads to some substantial simplifications in our code base!
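
    A minimal runnable sketch of this setting (ours, to be launched with torchrun; the toy model and the rank-dependent branch are only there to mimic an “optional” head):

    import os
    import torch
    import torch.distributed as dist
    import torch.nn as nn

    class Net(nn.Module):
        def __init__(self):
            super().__init__()
            self.backbone = nn.Linear(16, 16)
            self.optional_head = nn.Linear(16, 4)  # not traversed on every replica

        def forward(self, x, use_head: bool):
            x = self.backbone(x)
            return self.optional_head(x) if use_head else x.sum(dim=-1, keepdim=True)

    def main():
        dist.init_process_group("nccl")
        local_rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(local_rank)

        model = nn.parallel.DistributedDataParallel(
            Net().cuda(local_rank),
            device_ids=[local_rank],
            find_unused_parameters=True,  # replicas that skip the head are handled
        )
        x = torch.randn(8, 16, device=f"cuda:{local_rank}")
        out = model(x, use_head=(dist.get_rank() % 2 == 0))
        out.sum().backward()

    if __name__ == "__main__":
        main()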

    + +

    In-place Activated BatchNorm

    + +

    Github project page: https://github.com/mapillary/inplace_abn/

    + +

    Most researchers would probably agree that there are always constraints in terms of available GPU resources, regardless of whether their research lab has access to only a few or to many thousands of GPUs. At a time when we at Mapillary still worked with rather few, mostly 12GB Titan X-style prosumer GPUs, we were searching for a solution that virtually enhances the usable memory during training, so we would be able to obtain and push state-of-the-art results on dense labeling tasks like semantic segmentation. In-place activated BatchNorm enables us to use up to 50% more memory (at little computational overhead) and is therefore deeply integrated in all our current projects (including Seamless Scene Segmentation described above).

    + +
    + +
    + +

    When processing a BN-Activation-Convolution sequence in the forward pass, most deep learning frameworks (including PyTorch) need to store two big buffers, i.e. the input x of BN and the input z of Conv. This is necessary because the standard implementations of the backward passes of BN and Conv depend on their inputs to calculate the gradients. Using InPlace-ABN to replace the BN-Activation sequence, we can safely discard x, thus saving up to 50% GPU memory at training time. To achieve this, we rewrite the backward pass of BN in terms of its output y, which is in turn reconstructed from z by inverting the activation function.

    + +

    The only limitation of InPlace-ABN is that it requires using an invertible activation function, such as leaky relu or elu. Except for this, it can be used as a direct, drop-in replacement for BN+activation modules in any network. Our native CUDA implementation offers minimal computational overhead compared to PyTorch’s standard BN, and is available for anyone to use from here: https://github.com/mapillary/inplace_abn/.
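
    Conceptually, the replacement looks like the following sketch (ours); the exact constructor arguments of InPlaceABN are as we recall them from the inplace_abn package and may differ across versions:

    import torch.nn as nn
    from inplace_abn import InPlaceABN

    # Standard BN + activation in front of a convolution ...
    standard = nn.Sequential(
        nn.BatchNorm2d(256),
        nn.LeakyReLU(0.01),
        nn.Conv2d(256, 256, kernel_size=3, padding=1),
    )

    # ... and the memory-saving equivalent, with BN and the (invertible)
    # activation fused into a single in-place module.
    inplace = nn.Sequential(
        InPlaceABN(256, activation="leaky_relu", activation_param=0.01),
        nn.Conv2d(256, 256, kernel_size=3, padding=1),
    )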

    + +

    Synchronized BN with asymmetric graphs and unbalanced batches

    + +

    When training networks with synchronized SGD over multiple GPUs and/or multiple nodes, it’s common practice to compute BatchNorm statistics separately on each device. However, in our experience working with semantic and panoptic segmentation networks, we found that accumulating mean and variance across all workers can bring a substantial boost in accuracy. This is particularly true when dealing with small batches, like in Seamless Scene Segmentation where we train with a single, super-high resolution image per GPU.

    + +

    InPlace-ABN supports synchronized operation over multiple GPUs and multiple nodes, and, since version 1.1, this can also be achieved in the standard PyTorch library using SyncBatchNorm. Compared to SyncBatchNorm, however, we support some additional functionality which is particularly important for Seamless Scene Segmentation: unbalanced batches and asymmetric graphs.

    + +

    As mentioned before, Mask R-CNN-like networks naturally give rise to variable-sized tensors. Thus, in InPlace-ABN we calculate synchronized statistics using a variant of the parallel algorithm described here, which properly takes into account the fact that each GPU can hold a different number of samples. PyTorch’s SyncBatchNorm is currently being revised to support this, and the improved functionality will be available in a future release.

    + +

    Asymmetric graphs (in the sense mentioned above) are another complicating factor one has to deal with when creating a synchronized BatchNorm implementation. Luckily, PyTorch’s distributed group functionality allows us to restrict distributed communication to a subset of workers, easily excluding those that are currently inactive. The only missing piece is that, in order to create a distributed group, each process needs to know the ids of all processes that will participate in the group, and even processes that are not part of the group need to call the new_group() function. In InPlace-ABN we handle it with a function like this:

    + +
    import torch
    +import torch.distributed as distributed
    +
    +def active_group(active):
    +    """Initialize a distributed group where each process can independently decide whether to participate or not"""
    +    world_size = distributed.get_world_size()
    +    rank = distributed.get_rank()
    +
    +    # Gather active status from all workers
    +    active = torch.tensor(rank if active else -1, dtype=torch.long, device=torch.cuda.current_device())
    +    active_workers = torch.empty(world_size, dtype=torch.long, device=torch.cuda.current_device())
    +    distributed.all_gather(list(active_workers.unbind(0)), active)
    +
    +    # Create group
    +    active_workers = [int(i) for i in active_workers.tolist() if i != -1]
    +    group = distributed.new_group(active_workers)
    +    return group
    +
    + +

    First each process, including inactive ones, communicates its status to all others through an all_gather call, then it creates the distributed group with the shared information. In the actual implementation we also include a caching mechanism for groups, since new_group() is usually too expensive to call at each batch.

    + +

    References

    + +

    [1] Seamless Scene Segmentation; Lorenzo Porzi, Samuel Rota Bulò, Aleksander Colovic, Peter Kontschieder; Computer Vision and Pattern Recognition (CVPR), 2019

    + +

    [2] In-place Activated BatchNorm for Memory-Optimized Training of DNNs; Samuel Rota Bulò, Lorenzo Porzi, Peter Kontschieder; Computer Vision and Pattern Recognition (CVPR), 2018

    + +

    [3] Panoptic Segmentation; Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, Piotr Dollar; Computer Vision and Pattern Recognition (CVPR), 2019

    + +

    [4] Mask R-CNN; Kaiming He, Georgia Gkioxari, Piotr Dollar, Ross Girshick; International Conference on Computer Vision (ICCV), 2017

    + +

    [5] Feature Pyramid Networks for Object Detection; Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan, Serge Belongie; Computer Vision and Pattern Recognition (CVPR), 2017

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/maximizing-training-throughput/index.html b/blog/maximizing-training-throughput/index.html new file mode 100644 index 000000000000..85bb770c027d --- /dev/null +++ b/blog/maximizing-training-throughput/index.html @@ -0,0 +1,843 @@ + + + + + + + + + + + + + Maximizing Training Throughput Using PyTorch FSDP and Torch.compile | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch at IBM and Team PyTorch at Meta + +

    +

    Recently, we demonstrated how FSDP and selective activation checkpointing can be used to achieve 57% MFU (Model Flops Utilization) for training a 7B model on A100 GPUs. We also demonstrated how it can train a high quality model, which we open sourced as Granite 7B base model on Hugging Face Hub under the Apache v2.0 license.

    + +

    We continued our quest to improve the utilization of GPUs by leveraging torch.compile. Using torch.compile and the selective activation checkpointing from our previous work, we achieve a MFU of 68% for the 7B model on A100 GPUs! torch.compile improves training MFU between 10% and 23% for various model sizes.

    + +

    This blog is organized into three parts: (1) Challenges addressed in order to train using torch.compile, (2) Numerical parity of compile with no-compile, and (3) MFU report.

    + +

    We open sourced all the code and updated it in the fms-fsdp repository. We are also working with Team PyTorch at Meta to contribute these to the newly released torch titan repository for pre-training.

    + +

    Challenges of using torch.compile

    + +

    torch.compile is a graph compilation technique that improves GPU utilization. For details on how torch compile works, we refer the readers to the recent PyTorch paper and associated tutorials. A key challenge in getting torch.compile to perform well is to minimize (or eliminate) graph breaks. We initially started with the Llama implementation provided by Meta, but compiling it caused too many graph breaks resulting in reduced training throughput.

    + +

    Several portions of the model architecture had to be fixed, with the most important one being the positional embedding layer (RoPE). The typical RoPE implementation uses complex numbers, which was not supported in torch.compile at the time of testing. We implemented RoPE using einops while maintaining parity with the original model architecture implementation. We had to properly cache the frequencies so that we did not run into graph breaks within the RoPE implementation.
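
    For illustration, a real-valued RoPE with cached tables can look like the following sketch (ours, not the fms-fsdp implementation, which uses einops and applies the rotation per attention head):

    import torch

    _rope_cache = {}

    def rope_tables(seq_len, dim, device, base=10000.0):
        # Compute the cos/sin tables once per (seq_len, dim, device) and reuse them.
        key = (seq_len, dim, device)
        if key not in _rope_cache:
            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device) / dim))
            t = torch.arange(seq_len, device=device, dtype=torch.float32)
            freqs = torch.outer(t, inv_freq)  # (seq_len, dim // 2)
            _rope_cache[key] = (freqs.cos(), freqs.sin())
        return _rope_cache[key]

    def apply_rope(x):  # x: (batch, seq, dim), dim even
        cos, sin = rope_tables(x.shape[1], x.shape[2], x.device)
        x1, x2 = x[..., 0::2], x[..., 1::2]
        out = torch.empty_like(x)
        out[..., 0::2] = x1 * cos - x2 * sin
        out[..., 1::2] = x1 * sin + x2 * cos
        return out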

    + +

    Compiling an FSDP model does result in graph breaks, which the PyTorch team at Meta is working to remove. However, these graph breaks as of PyTorch 2.3 are at FSDP unit boundaries and do not affect throughput significantly.

    + +

    When using custom kernels, we need to wrap each kernel by exposing its API to torch.compile. This involves indicating what parameters are modified in-place, how they are modified, and what shapes and strides their return values will have based on the inputs. In our case, SDPA Flash attention is already integrated appropriately and we were able to get that kernel to work with torch.compile with no graph breaks.

    + +

    We also noticed that when increasing the amount of data from 2T to 6T tokens, the data loader became a bottleneck. A key reason for this is the fact that previously, we implemented document shuffling in our dataloader naively, by having each worker maintain a list of shuffled document pointers.

    + +

    With the larger dataset, these pointer lists were growing to hundreds of thousands of entries per worker. Maintaining pointer lists at this scale became expensive enough that cpu contention throttled our training throughput. We re-implemented document shuffling without any pointer lists using a Linear Congruential Generator. LCG is a pseudorandom number generator algorithm that implements a random walk over a population, providing sampling without replacement.

    + +

    We leveraged the same idea to produce implicit bijective mappings from ordered to shuffled document indices. This enables us to shrink those annoying lists of hundreds of thousands of pointers down to a single integer state for the LCG. This eliminated 80% of the bottleneck and provided a significant boost to our performance. We will devote a separate blog to go into all the details of our performant pre-training data loader.
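
    To make the idea concrete, here is a toy sketch (ours, not the fms-fsdp data loader) of an LCG-based bijection over document indices; a full cycle over all n_docs indices is only guaranteed when the multiplier and increment satisfy the Hull-Dobell conditions for the chosen modulus:

    def lcg_shuffled_indices(n_docs, a, c, start=0):
        # x -> (a * x + c) % n_docs visits every index exactly once when
        # c is coprime to n_docs and a - 1 is divisible by all prime factors
        # of n_docs (and also by 4 if n_docs is divisible by 4).
        state = start % n_docs
        for _ in range(n_docs):
            state = (a * state + c) % n_docs
            yield state

    print(list(lcg_shuffled_indices(8, a=5, c=3)))  # a permutation of 0..7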

    + +

    Numerical Parity of torch.compile and torch.no-compile

    + +

    We had previously observed parity issues when training with compile and no-compile options, with one of these being related to the use of SDPA. After a few days of intense debugging sessions between the PyTorch teams at Meta and IBM, we were able to achieve parity between PyTorch compile and no-compile modes. To document and verify this parity, we take a mini-Llama model architecture of 1.4B size and train it to 100B tokens in four variations – no-compile, compile with no activation checkpointing, compile with selective activation checkpointing, and compile with full activation checkpointing.

    + +

    We plot the loss curves and gradient norm for these options below:

    + +

    Figure 1: Loss curve and gradient norm for various compile options


    + +

    Further, we run the lm-evaluation-harness and compare the various model scores on different benchmarks and observe no major differences between compile and no-compile, which is shown below.

    + +

    Figure 2: lm-evaluation-harness comparison of various benchmarks between compile and no-compile


    + +

    We observe from all these results that compile, in all its variants, matches the no-compile option, demonstrating parity between compile and no-compile.

    + +

    MFU report

    + +

    Finally, like our previous blog, we compute the MFU for four different model sizes on two clusters. One cluster is 128 A100 GPUs with 400 Gbps inter-node connectivity, and the other is 464 H100 GPUs with 3.2 Tbps inter-node connectivity. We use the selective activation checkpointing that we covered in the prior blog in addition to compile. We capture the results in the table below.

| Model size | Batch size | MFU no-compile | MFU compile | Percentage gain (%) |
|------------|------------|----------------|-------------|---------------------|
| 7B         | 2          | 0.57           | 0.68        | 20                  |
| 13B        | 2          | 0.51           | 0.60        | 17                  |
| 34B        | 2          | 0.47           | 0.54        | 15                  |
| 70B        | 2          | 0.50           | 0.55        | 10                  |
    + +

    Table 1: MFU results with compile and no compile for Llama2 model architectures on 128 A100 80GB GPUs with 400Gbps internode interconnect

| Model size | Batch size | MFU no-compile | MFU compile | Percentage gain |
|------------|------------|----------------|-------------|-----------------|
| 7B         | 2          | 0.37           | 0.45        | 21              |
| 13B        | 2          | 0.35           | 0.43        | 23              |
| 34B        | 2          | 0.32           | 0.38        | 19              |
| 70B        | 2          | 0.32           | 0.38        | 19              |
    + +

    Table 2: MFU results with compile and no compile for Llama2 model architectures on 464 H100 80GB GPUs with 3.2Tbps internode interconnect

    + +

    We also had an internal production run on 448 GPUs using a Llama2 7B architecture. Using compile and selective activation checkpointing, with a global batch size of 3.7M, we trained for 4T tokens in 13 days 10 hours!

    + +

    During training, the data center cooling had to kick in with extra air conditioning and our training team was alerted to this, since we were using the GPUs quite effectively ☺

    + +

One key observation from Tables 1 and 2 is that the MFU numbers do not scale linearly with model size. There are two possible explanations that we are actively investigating: one is the scalability of FSDP as model size increases, and whether tensor parallelism needs to be enabled to use the GPU more effectively; the other is the batch size, which can be increased further to get better MFU. We plan to explore FSDP v2 and selective operator checkpointing, along with the tensor parallel feature, to study the scaling laws of FSDP with model size.

    + +

    Future Work

    + +

We plan to start testing FSDP v2, which will be released as part of PyTorch 2.4. FSDP2 provides per-parameter sharding and a selective operator checkpointing feature that can potentially provide even better memory-compute tradeoffs.

    + +

We have also been engaged with the PyTorch team at Meta to evaluate the new asynchronous checkpointing feature, which can further improve GPU utilization by reducing the time spent writing checkpoints.

    + +

We are exploring extending the various Triton kernels currently used in inference to also perform backward operations, gaining speedups beyond inference.

    + +

Finally, as recent work on the use of fp8 emerges, we plan to explore how we can further accelerate model training using this new data type, which promises a 2x acceleration.

    + +

    Acknowledgements

    + +

    There are several teams that have been involved in reaching this proof point and we would like to thank the teams across Meta and IBM. Specifically, we extend our gratitude to the Meta PyTorch distributed and compiler teams and IBM Research.

    + +

Multiple people were extensively involved in the effort of achieving torch.compile numerical parity with our models, and we wish to acknowledge the key folks involved in this effort: Animesh Jain and Less Wright at Meta, and Linsong Chu, Davis Wertheimer, Brian Vaughan, Antoni i Viros Martin, Mudhakar Srivatsa, and Raghu Ganti at IBM Research.

    + +

    Special thanks to Stas Bekman, who provided extensive feedback and helped improve this blog. Their insights have been invaluable in highlighting key aspects of optimizing the training and exploring further enhancements.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/maximizing-training/index.html b/blog/maximizing-training/index.html new file mode 100644 index 000000000000..852b54d575c1 --- /dev/null +++ b/blog/maximizing-training/index.html @@ -0,0 +1,1095 @@ + + + + + + + + + + + + + Maximizing training throughput using PyTorch FSDP | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch at IBM and Team PyTorch at Meta + +

    +

    In this blog, we demonstrate the scalability of FSDP with a pre-training exemplar, a 7B model trained for 2T tokens, and share various techniques we used to achieve a rapid training speed of 3,700 tokens/sec/GPU, or 40B tokens/day on 128 A100 GPUs. This translates to a model FLOPS utilization (MFU) and hardware FLOPS utilization (HFU) of 57%. Additionally, we have observed near linear scaling of FSDP to 512 GPUs, implying that training a 7B model on 512 GPUs to 2T tokens using this method would take just under two weeks.

    + +

IBM researchers trained a Meta Llama 2 7B architecture to 2T tokens, which we will refer to as LlamaT(est). This model demonstrates model quality comparable to Llama 2 on various academic benchmarks. All of the training code, along with our methodology to achieve this throughput, can be found in this blog. We also share the configuration knobs that work well for the Llama 2 models – 7B, 13B, 34B, and 70B – on A100s and H100s.

    + +

In this process, we also propose a new selective activation checkpointing mechanism that applies to FSDP and gives us a 10% boost beyond out-of-the-box FSDP. We have open sourced the training code base and an associated scalable data loader, along with the methodology used to achieve this throughput.

    + +

    One key benefit of a PyTorch native pathway for training is the ability to seamlessly train on multiple hardware backends. For example, the recent end-to-end stack for training that was released by AllenAI through OLMo also leverages PyTorch FSDP for training on AMD and NVIDIA GPUs. There are three main components that we leverage from FSDP to achieve our throughput:

    + +
      +
1. SDPA Flash attention, which enables fused attention kernels and efficient attention computation (a usage sketch follows this list)
2. Overlap in computation and communication, which allows for better utilization of the GPU
3. Selective activation checkpointing, which enables us to trade off between GPU memory and compute
    + +
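For the first item, a minimal usage sketch of SDPA with the flash attention backend (the torch.nn.attention module is available in recent PyTorch releases; the tensor shapes below are illustrative, not our model's):

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# (batch, heads, seq_len, head_dim); bf16 on GPU so the flash kernel is eligible
q = torch.randn(2, 32, 4096, 128, dtype=torch.bfloat16, device="cuda")
k, v = torch.randn_like(q), torch.randn_like(q)

with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)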

    IBM has been working closely with Team PyTorch at Meta on PyTorch FSDP for nearly two years: introducing the rate limiter for achieving better throughput on Ethernet interconnects, distributed checkpointing to improve the checkpoint times by an order of magnitude, and implementing the early version of checkpointing for the hybrid sharding mode of FSDP. Late last year, we used FSDP to train a model end-to-end.

    + +

    Training Details

    + +

The 7B model is trained on 128 A100 GPUs with 400Gbps network connectivity and GPU direct RDMA. We use SDPA FlashAttention v2 for attention computation, and for this model we turned off activation checkpointing, which limits the batch size but provides the highest throughput; the batch size is 1 million tokens per batch for 128 GPUs, and this improves throughput by about 10% compared to training with activation checkpointing. With these parameters, we have an almost full overlap of computation and communication. We use the AdamW optimizer in 32-bit with beta1 of 0.9, beta2 of 0.95, and weight decay of 0.1, with a warmup to a maximum learning rate of 3e-4 and a cosine schedule that reduces the learning rate to 3e-5 over 2T tokens. The training was performed using bf16 mixed precision on an internal dataset. The training stack uses IBM’s Foundation Model Stack for the model architecture and PyTorch nightlies post-2.2 release for FSDP and SDPA. We tried a few different nightlies during the period of Nov 2023 through Feb 2024 and observed an improvement in throughput.
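A hedged sketch of the optimizer and learning-rate schedule described above (warmup to 3e-4, cosine decay to 3e-5); the step counts and the stand-in model are illustrative placeholders, not our training configuration:

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

model = torch.nn.Linear(8, 8)  # stand-in for the 7B transformer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1)

warmup_steps, total_steps = 2_000, 500_000  # illustrative values
warmup = LinearLR(optimizer, start_factor=0.01, total_iters=warmup_steps)
cosine = CosineAnnealingLR(optimizer, T_max=total_steps - warmup_steps, eta_min=3e-5)
scheduler = SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_steps])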

    + +

    Selective activation checkpointing

    + +

We jointly implemented a simple and effective mechanism of selective activation checkpointing (AC). In FSDP, the common practice is to checkpoint each transformer block. A simple extension is to checkpoint every n blocks, reducing the amount of recomputation while increasing the memory needed. This is quite effective for the 13B model size, increasing the throughput by 10%. For the 7B model size, we did not need activation checkpointing at all. Future versions of FSDP will provide selective activation checkpointing at an operator level, enabling an optimal compute-memory tradeoff. The code for the above is implemented here, and a minimal sketch is shown below.
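A minimal sketch of checkpointing every n-th transformer block on top of PyTorch's checkpoint wrapper utilities; the linked repository may structure this differently:

import functools
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper)

def apply_selective_ac(model, block_cls, every_n: int = 2):
    counter = {"i": 0}

    def check_fn(submodule):
        # Only transformer blocks are candidates; checkpoint 1 out of every n of them.
        if isinstance(submodule, block_cls):
            counter["i"] += 1
            return counter["i"] % every_n == 0
        return False

    wrapper = functools.partial(checkpoint_wrapper, checkpoint_impl=CheckpointImpl.NO_REENTRANT)
    apply_activation_checkpointing(model, checkpoint_wrapper_fn=wrapper, check_fn=check_fn)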

    + +

    Throughput and MFU, HFU computation

    + +

    While we only trained the 7B model to 2T tokens, we performed numerous experiments on the other model sizes to provide the best configuration options. This is summarized in the table below for two types of infrastructure — an A100 cluster with 128 GPUs and 400Gbps inter-node interconnect, and an H100 cluster with 96 GPUs and 800Gbps inter-node interconnect.

| Model size | Batch size | Activation checkpoint | Throughput tokens/sec/GPU (A100 80GB and 400Gbps interconnect) | MFU % (A100 80GB) | HFU % (A100 80GB) | Throughput tokens/sec/GPU (H100 80GB and 800Gbps interconnect) | MFU % (H100 80GB) | HFU % (H100 80GB) |
|---|---|---|---|---|---|---|---|---|
| 7B | 2 | No | 3700 | 0.57 | 0.57 | 7500 | 0.37 | 0.37 |
| 13B | 2 | Selective | 1800 | 0.51 | 0.59 | 3800 | 0.35 | 0.40 |
| 34B | 2 | Yes | 700 | 0.47 | 0.64 | 1550 | 0.32 | 0.44 |
| 70B | 2 | Yes | 370 | 0.50 | 0.67 | 800 | 0.34 | 0.45 |
    + +

    Table 1: Model and Hardware FLOPS utilization of various model sizes on A100 and H100 GPUs

    + +

HFU numbers are computed using the PyTorch FLOP counter and the theoretical bf16 performance of A100 and H100 GPUs, whereas MFU numbers are computed using the methodology outlined in NanoGPT and the PaLM paper. We also note that the batch sizes for the larger models are intentionally kept at 2 per GPU to mimic the choices made when training models with a 4k sequence length, and to allow scaling up to 512 GPUs without exceeding the popular 4M-token global batch size. Beyond that, we would need tensor parallelism or sequence parallelism.
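As a back-of-the-envelope check of the MFU methodology (PaLM-style FLOPs per token of 6N + 12·L·H·Q·T), using the standard Llama 2 7B hyperparameters and the 7B A100 row from the table above:

# Illustrative values: standard Llama 2 7B hyperparameters and the A100 bf16 peak.
n_params, n_layers, n_heads, head_dim, seq_len = 7e9, 32, 32, 128, 4096
tokens_per_sec_per_gpu = 3700          # 7B row, A100 column in the table above
peak_bf16_tflops = 312                 # A100 dense bf16 peak

flops_per_token = 6 * n_params + 12 * n_layers * n_heads * head_dim * seq_len
mfu = tokens_per_sec_per_gpu * flops_per_token / (peak_bf16_tflops * 1e12)
print(f"MFU ~ {mfu:.2f}")              # ~0.57, consistent with the reported number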

    + +

We note in the table above that for A100s, activation recomputation causes the MFU to decrease while the HFU increases! With the introduction of better activation checkpointing schemes, we expect MFU to increase and catch up with HFU. However, we observe that for H100s, both MFU and HFU are relatively low. We analyzed the PyTorch profile traces on H100 and observed a 10% gap due to the network “peeking” out. In addition, we hypothesize that the HBM bandwidth of H100s is the cause of the reduced HFU/MFU on H100s and of not obtaining the full 3x improvement (H100s are theoretically 3x faster than A100s, 312 vs 989 TFLOPS, but have less than 2x the HBM bandwidth of A100s, 2.0 vs 3.35 TBps). We plan to try other configuration options like tensor parallel to improve the knobs for the 70B model on H100s.

    + +

    Model details

    + +

    The loss curve for training is shown in the below figure.

    + +

    loss curve for training

    + +

    Figure 1: LlamaT training loss curve

    + +

The 2T checkpoint is converted to Hugging Face format by a script provided in the repository, and we then use lm-evaluation-harness to compute key academic benchmarks and compare them against results from running the same harness on Llama2-7B. These results are captured in the table below.

| Evaluation metric | Llama2-7B (baseline) | LlamaT-7B |
|---|---|---|
| MMLU (zero shot) | 0.41 | 0.43 |
| MMLU (5-shot weighted avg) | 0.47 | 0.50 |
| Arc challenge | 0.46 | 0.44 |
| Arc easy | 0.74 | 0.71 |
| Boolq | 0.78 | 0.76 |
| Copa | 0.87 | 0.83 |
| Hellaswag | 0.76 | 0.74 |
| Openbookqa | 0.44 | 0.42 |
| Piqa | 0.79 | 0.79 |
| Sciq | 0.91 | 0.91 |
| Winogrande | 0.69 | 0.67 |
| Truthfulqa | 0.39 | 0.39 |
| GSM8k (8-shot) | 0.13 | 0.11 |
    + +

Table 2: LM eval harness scores

    + +

    We observe that the model performs competitively with Llama2 (bolder is better).

    + +

    Training chronicles

    + +

    Training was stable with no crashes, though we did observe a few hiccups:

    + +

    0-200B tokens: We observed a slowdown in the iteration time (time taken to execute one training step). We stopped the job to ensure that the data loader was not causing any slowdowns and the checkpointing was performant and accurate. We did not find any issues. By this time, HSDP checkpointing code was available in PyTorch, and we took this opportunity to make the switch to PyTorch checkpointing code.

    + +

200B tokens-1.9T: We did not do any manual intervention on the job in late December. When we came back in early January, disk space had been exhausted and checkpoints were failing to be written, although the training job continued. The last known checkpoint was at 1.5T tokens.

    + +

1.5T-1.7T: We evaluated the 1.5T checkpoint with lm-evaluation-harness and discovered that the model had been trained with an extra special token between documents, because the Hugging Face tokenizer introduced a separator token while our dataloader also appended its own document separator. We modified the dataloader to eliminate the extra special token, and continued training with the modified dataloader from the 1.7T-token mark onwards.

    + +

1.7T-2T: The loss initially spiked due to the change in the special tokens, but recovered within a few billion tokens. The training finished without any other manual intervention!

    + +

    Key takeaways and even more speed

    + +

We demonstrated how one can use FSDP to train a model to 2T tokens with an excellent performance of 3700 tokens/sec/GPU while producing a good-quality model. As part of this exercise, we open sourced all our training code and the knobs to achieve this throughput. These knobs can be leveraged not only by large-scale runs, but also by smaller-scale tuning runs. You can find the code here.

    + +

FSDP APIs implement the ZeRO algorithms in a PyTorch-native manner and allow for tuning and training of large models. In the past, we have seen FSDP proof points (Stanford Alpaca, Hugging Face, Llama 2 recipes) on tuning a variety of LLMs (such as Meta Llama 2, from 7B to 70B) using simple training loops and achieving good throughputs and training times.

    + +

    Finally, we note that there are several levers for speeding up training:

    + +
      +
1. Node optimizations that can speed up specific operations (e.g., attention computation using Flash Attention V2)
2. Graph optimizations (e.g., fusing kernels, torch.compile)
3. Overlap in compute-communications
4. Activation recomputation
    + +

We have leveraged 1, 3, and a variation of 4 in this blog, and are working closely with Team PyTorch at Meta to get torch.compile (2) as well as a more advanced version of 4 with per-operator selective activation recomputation. We plan to share simple formatting code and example data to ingest into our data loader, to enable others to use the code base for training models.

    + +

    Acknowledgements

    + +

    There are several teams that have been involved in reaching this proof point and we would like to thank the teams across Meta and IBM. Specifically, we extend our gratitude to the PyTorch distributed team, Facebook Research and Applied AI teams that built the FSDP APIs and made enhancements based on our feedback. We also wish to thank the data team at IBM Research that curated the data corpus used in this exercise and the infrastructure team at IBM Research (especially, Claudia Misale, Shweta Salaria, and Seetharami Seelam) that optimized NCCL and network configurations. By building and leveraging all of these components, we have successfully demonstrated the LlamaT proof point.

    + +

    The selective activation checkpointing was conceptualized at IBM by Linsong Chu, Davis Wertheimer, Mudhakar Srivatsa, and Raghu Ganti and implemented by Less Wright at Meta.

    + +

    Special thanks to Stas Bekman and Minjia Zhang, who provided extensive feedback and helped improve the blog. Their insights have been invaluable in highlighting key aspects of optimizing the training and exploring further enhancements.

    + +

    Appendix

    + +

    Communication computation overlap

    + +

Another key aspect of training in a multi-node setting is the ability to overlap communication and computation. In FSDP, there are multiple opportunities for overlap: during the FSDP unit-gathering phase of the forward pass, as well as during the backward pass computation. Overlapping the gathering of a unit during the forward pass with the computation of the previous unit, and overlapping the backward computation of a unit with the gathering of the next unit and the scattering of gradients, helps improve GPU utilization by nearly 2x. We illustrate this on the 400Gbps network interconnect with A100 80GB GPUs. In the case of HSDP, there is no inter-node traffic during the pre-fetch stage of the forward pass, and the overlap applies only to the backward gradient computation phase. Of course, HSDP is feasible only when the model can be sharded within a single node, limiting the size of models to around 30B parameters.
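A hedged sketch of the FSDP knobs most relevant to this overlap (prefetching and the rate limiter); this is not our exact production configuration, and the block class is a placeholder:

import functools
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import BackwardPrefetch, MixedPrecision, ShardingStrategy
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy

def wrap_with_fsdp(model, block_cls):
    return FSDP(
        model,
        auto_wrap_policy=functools.partial(transformer_auto_wrap_policy,
                                           transformer_layer_cls={block_cls}),
        sharding_strategy=ShardingStrategy.FULL_SHARD,    # HYBRID_SHARD for HSDP
        backward_prefetch=BackwardPrefetch.BACKWARD_PRE,  # overlap next all-gather with backward compute
        forward_prefetch=True,                            # issue the next forward all-gather early
        limit_all_gathers=True,                           # the "rate limiter" for slower interconnects
        mixed_precision=MixedPrecision(param_dtype=torch.bfloat16,
                                       reduce_dtype=torch.bfloat16),
        use_orig_params=True,
        device_id=torch.cuda.current_device(),
    )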

    + +

    The below figure shows three steps in FSDP with the communication between nodes at the bottom and the compute stream at the top of the second half of the image. For the 7B model with no activation recomputation, we observe the overlap to be complete. In practice, the overlap percentage possible is 90% since the first block during forward pass and the last block during backward pass are not able to overlap.

    + +

    three steps in FSDP with the communication between nodes at the bottom and the compute stream at the top of the second half

    + +

    A zoomed in view of the above three-step process is shown below for a single step. We can clearly see the granularity of the computation and communication and how they overlap in an interleaved manner.

    + +

    zoomed in view of the above three-step process

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/microsoft-becomes-maintainer-of-the-windows-version-of-pytorch/index.html b/blog/microsoft-becomes-maintainer-of-the-windows-version-of-pytorch/index.html new file mode 100644 index 000000000000..c114b93457db --- /dev/null +++ b/blog/microsoft-becomes-maintainer-of-the-windows-version-of-pytorch/index.html @@ -0,0 +1,672 @@ + + + + + + + + + + + + + Microsoft becomes maintainer of the Windows version of PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Maxim Lukiyanov - Principal PM at Microsoft, Emad Barsoum - Group EM at Microsoft, Guoliang Hua - Principal EM at Microsoft, Nikita Shulga - Tech Lead at Facebook, Geeta Chauhan - PE Lead at Facebook, Chris Gottbrath - Technical PM at Facebook, Jiachen Pu - Engineer at Facebook + +

    +

    Along with the PyTorch 1.6 release, we are excited to announce that Microsoft has expanded its participation in the PyTorch community and will be responsible for the development and maintenance of the PyTorch build for Windows.

    + +

    According to the latest Stack Overflow developer survey, Windows remains the primary operating system for the developer community (46% Windows vs 28% MacOS). Jiachen Pu initially made a heroic effort to add support for PyTorch on Windows, but due to limited resources, Windows support for PyTorch has lagged behind other platforms. Lack of test coverage resulted in unexpected issues popping up every now and then. Some of the core tutorials, meant for new users to learn and adopt PyTorch, would fail to run. The installation experience was also not as smooth, with the lack of official PyPI support for PyTorch on Windows. Lastly, some of the PyTorch functionality was simply not available on the Windows platform, such as the TorchAudio domain library and distributed training support. To help alleviate this pain, Microsoft is happy to bring its Windows expertise to the table and bring PyTorch on Windows to its best possible self.

    + +

    In the PyTorch 1.6 release, we have improved the core quality of the Windows build by bringing test coverage up to par with Linux for core PyTorch and its domain libraries and by automating tutorial testing. Thanks to the broader PyTorch community, which contributed TorchAudio support to Windows, we were able to add test coverage to all three domain libraries: TorchVision, TorchText and TorchAudio. In subsequent releases of PyTorch, we will continue improving the Windows experience based on community feedback and requests. So far, the feedback we received from the community points to distributed training support and a better installation experience using pip as the next areas of improvement.

    + +

    In addition to the native Windows experience, Microsoft released a preview adding GPU compute support to Windows Subsystem for Linux (WSL) 2 distros, with a focus on enabling AI and ML developer workflows. WSL is designed for developers that want to run any Linux based tools directly on Windows. This preview enables valuable scenarios for a variety of frameworks and Python packages that utilize NVIDIA CUDA for acceleration and only support Linux. This means WSL customers using the preview can run native Linux based PyTorch applications on Windows unmodified without the need for a traditional virtual machine or a dual boot setup.

    + +

    Getting started with PyTorch on Windows

    +

    It’s easy to get started with PyTorch on Windows. To install PyTorch using Anaconda with the latest GPU support, run the command below. To install different supported configurations of PyTorch, refer to the installation instructions on pytorch.org.

    + +

    conda install pytorch torchvision cudatoolkit=10.2 -c pytorch

    + +

    Once you install PyTorch, learn more by visiting the PyTorch Tutorials and documentation.
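As a quick sanity check after installation (a generic snippet, not from the original post), you can confirm that the GPU build is active:

import torch

print(torch.__version__)
print(torch.cuda.is_available())  # True if the CUDA build and driver are set up correctly
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))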

    + +
    + +
    + +

    Getting started with PyTorch on Windows Subsystem for Linux

    +

    The preview of NVIDIA CUDA support in WSL is now available to Windows Insiders running Build 20150 or higher. In WSL, the command to install PyTorch using Anaconda is the same as the above command for native Windows. If you prefer pip, use the command below.

    + +

    pip install torch torchvision

    + +

You can use the same tutorials and documentation inside your WSL environment as on native Windows. This functionality is still in preview, so if you run into issues with WSL please share feedback via the WSL GitHub repo; for NVIDIA CUDA support, share feedback via NVIDIA’s Community Forum for CUDA on WSL.

    + +

    Feedback

    +

    If you find gaps in the PyTorch experience on Windows, please let us know on the PyTorch discussion forum or file an issue on GitHub using the #module: windows label.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/ml-model-server-resource-saving/index.html b/blog/ml-model-server-resource-saving/index.html new file mode 100644 index 000000000000..33dd72866b00 --- /dev/null +++ b/blog/ml-model-server-resource-saving/index.html @@ -0,0 +1,882 @@ + + + + + + + + + + + + + ML Model Server Resource Saving - Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Sangjune Park(Naver GplaceAI MLOps), Jooyoung Lee(Naver GplaceAI MLE), Junho Min(Naver GplaceAI MLE) + +

    +

    Reviewers: Yunsang Ju(Naver GplaceAI Leader), Min Jean Cho(Intel), Jing Xu(Intel), Mark Saroufim(Meta)

    + +

    Intro

    + +

Here, we will share our experience in moving AI workloads from our GPU servers to our Intel CPU servers without any performance or quality degradation, saving annual costs of approximately 340 thousand U.S. dollars (refer to the Conclusion) in the process.

    + +

We aim to provide value to our consumers by serving various AI models that enhance the Online to Offline (O2O) experience. With the ongoing growth in demand for new models and the limited availability of high-cost GPU resources, we needed to transition relatively lightweight AI models from GPU servers to Intel CPU servers to reduce resource consumption. In the same setting, however, the CPU servers initially had issues: RPS, inference time, and other performance metrics degraded by tens of times. We applied various engineering techniques and lightweighted the models to solve this problem, and we were able to successfully transition to the Intel CPU servers with the same or better performance than the GPU servers with just a three-fold scale out.

    + +

    For a more detailed introduction about our team, please refer to the Introduction to NAVER Place AI Development Team.

    + +

As I will mention again later, the guide Grokking PyTorch Intel CPU Performance From First Principles, written by Intel and PyTorch, was a great help throughout this work.

    + +

    Problem Definition

    + +

    1: Service Architecture

    + +

    Simplified service architecture

    + +

    Simplified service architecture (Image Source: NAVER GplaceAI)

    + +

To facilitate understanding, a brief introduction to our service architecture is provided. CPU-intensive tasks, such as preprocessing the input into tensor format (which is then forwarded to the model) and post-processing inference results into human-readable output (e.g., natural language and image formats), are performed on the App Server (FastAPI). The Model Server (TorchServe) exclusively handles inference operations. For stable operation of the service, the following actions need to be performed with sufficient throughput and low latency.

    + +

    The specific processing sequence is as follows:

    + +
      +
• The client submits a request to the app server via the Traefik gateway.
• The app server pre-processes the input by performing actions such as resizing, transforming, and converting it into a torch tensor, and then calls the model server.
• The model server performs inference and returns the features to the app server.
• The app server converts the features into a human-readable format through post-processing and returns the result to the client.
    + +

    2:  Throughput and Latency Measurement

    + +

    Comparison of Image Scoring Models


    + +

With all other conditions remaining the same, we deployed on three times as many CPU server pods, yet the RPS (requests per second) and response time still deteriorated by more than tenfold. While it was not surprising that CPU inference performance is inferior to GPUs, the situation was challenging. Given the goal of maintaining performance within limited resources, an approximate 10 to 20 times performance improvement was necessary, barring any additional scaling.

    + +

    3: Challenges From a Throughput Perspective

    + +
    Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +POST     /predictions/image-scoring                                                        37     0(0.00%) |   9031    4043   28985   8200 |    1.00        0.00
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +         Aggregated                                                                        37     0(0.00%) |   9031    4043   28985   8200 |    1.00        0.00
    +
    + +

One of the first steps TorchServe users might take to improve throughput is to increase the number of workers in TorchServe. This approach is effective on GPU servers because of parallel workload processing (setting aside the linear increase in memory usage as workers scale). However, we were experiencing worse performance when increasing the number of workers. Identifying the cause of this performance degradation on CPU servers required further investigation.

    + +

    4: Challenges From a Latency Perspective

    + +

Our primary concern was latency. Throughput improvement is normally achievable when a system’s implementation is faithful to scale-out principles, except perhaps in very rare worst-case scenarios. However, in the case of the Image Scoring model, even a single inference took more than 1 second, and as the request volume increased, latency grew to as much as 4 seconds. The client’s timeout criteria could not be met even with a single inference.

    + +

    Proposed Solutions

    + +

Improvements were needed from both an ML and an engineering perspective. It was essential to fundamentally reduce the inference time on the CPU, and to identify the causes of performance degradation when applying configurations that generally enhance performance, in order to find the optimal configuration values. To accomplish this, we collaborated with MLE professionals to concurrently pursue two tasks: model lightweighting without compromising performance, and identifying the optimal configurations for peak performance. Using these approaches, we were able to effectively transition workload handling to our CPU servers.

    + +

    1: Resolving Low RPS from an Engineering Perspective

    + +

First, the reason for the performance degradation even after increasing the number of workers was a front-end bound caused by logical threads in GEMM operations. Generally, when increasing the number of workers, the expected effect is increased parallelism; conversely, if performance decreases, one can infer a corresponding trade-off effect.

    + +

    CPU + GPU

    + +

    Image Source: Nvidia

    + +

    As many are aware, the reason model inference performance on CPUs is inferior to GPUs lies in the difference in hardware design, particularly in terms of multi-threading capabilities. Diving deeper, model inference is fundamentally a repetition of GEMM (General Matrix Multiply) operations, and these GEMM operations are executed independently in “fused-multiply-add” (FMA) or “dot-product” (DP) execution units. If the GEMM operation becomes a bottleneck on the CPU, increasing parallelism might actually result in decreased performance. While researching the problem we found relevant information within the PyTorch documentation.

    + +

    While two logical threads run GEMM at the same time, they will be sharing the same core resources causing front-end bound

    + +

This information highlighted that logical threads could cause a bottleneck in CPU GEMM operations, which helped us intuitively understand why performance decreased when increasing the number of workers: the default number of torch threads corresponds to the number of physical CPU cores.

    + +
    root@test-pod:/# lscpu
    +  …
    +Thread(s) per core: 2
    +Core(s) per socket: 12
    +  …
    +root@test-pod:/# python
    +>>> import torch
    +>>> print(torch.get_num_threads())
    +24
    +
    + +

When the number of workers increases, the total thread count increases by the product of the physical core count and the worker number, and consequently logical threads are utilized. To improve performance, the number of threads per worker was adjusted so that the total aligned with the physical core count (a sketch of this adjustment follows the results below). As shown below, RPS increased approximately threefold to 6.3 (from the previous value of 2.1) when the number of workers was increased to 4 and the total thread count was aligned with the number of physical cores.

    + +
    Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +POST     /predictions/image-scoring                                                       265     0(0.00%) |   3154    1885    4008   3200 |    6.30        0.00
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +         Aggregated                                                                       265     0(0.00%) |   3154    1885    4008   3200 |    6.30        0.00
    +
    + +
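A hedged sketch of how the per-worker thread count can be derived from the pod's CPU limit inside a TorchServe handler; the handler class name and environment variable names here are illustrative assumptions, not our production code:

import os
import torch
from ts.torch_handler.base_handler import BaseHandler

class ImageScoringHandler(BaseHandler):
    def initialize(self, context):
        # Illustrative: pod CPU limit and worker count injected via env vars by the deployment.
        cpu_limit = int(os.getenv("POD_CPU_LIMIT", "8"))
        workers = int(os.getenv("WORKERS_PER_MODEL", "4"))
        # Keep the total thread count within the physical-core budget of the pod.
        torch.set_num_threads(max(1, cpu_limit // workers))
        super().initialize(context)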

Cautionary Note 1: Our team uses Kubernetes to maintain our deployments, which required us to adjust thread counts according to the CPU resource limit of the pod, rather than the physical core count of the node that can be checked using the lscpu command. (Setting the torch threads of each worker to 8/4 = 2, or 24/4 = 6, resulted in performance degradation.)

    + +

Cautionary Note 2: Since the torch thread setting for each worker can only be an integer, it is advisable to set the CPU limit to a value divisible by the number of workers in order to fully utilize the CPU.

    + +

    example

    + +

    ex) core=8, In the case of worker_num=3: int(8/worker_num) = 2, 2*worker_num/8 = 75%

    + +

    example

    + +

    ex) core=8, In the case of worker_num=4: int(8/worker_num) = 2, 2*worker_num/8 = 100%

    + +

    We also analyzed the model containers to see why we got a mere threefold improvement in performance despite a four times increase in the number of workers. Various resources were monitored, and among them, the core utilization rate was identified as the underlying cause.

    + +

    threads

    + +

Even when the total thread count was adjusted to match the CPU limit (8 cores, on a 2nd Generation Intel(R) Xeon(R) Silver 4214), there were instances where computations were executed on logical cores. Due to the presence of 24 physical cores, the cores numbered 25 to 48 are classified as logical cores. Confining thread execution solely to physical cores seemed to offer the potential for further performance enhancement. The reference to this solution could be found in the source document mentioned in the PyTorch Geometric article that warned about CPU GEMM bottlenecks.

    + + + +

As per the instructions in that document, Intel provides Intel® Extension for PyTorch, with which we can simply pin cores to specific sockets. Applying it is also very simple: add the following settings to the TorchServe config.properties file (we used intel_extension_for_pytorch==1.13.0).

    + +
    ipex_enable=true
+cpu_launcher_enable=true
    +
    + +

    two-socket configuration

    + +

    Image Source: PyTorch

    + +

Beyond the removal of logical threads through socket pinning, there is an additional effect of eliminating UPI cache-access overhead. Since the CPU comprises more than one socket, a thread scheduled on socket 1 and then rescheduled on socket 2 must access the cache of socket 1 via the Intel Ultra Path Interconnect (UPI). Such UPI access is more than twice as slow as local cache access, creating another bottleneck. With threads pinned to sockets by the oneAPI-powered Intel® Extension for PyTorch, we observed RPS handling increase by up to four times compared to when the bottleneck existed.

    + +
    Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +POST     /predictions/image-scoring                                                       131     0(0.00%) |   3456    1412    6813   3100 |    7.90        0.00
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +         Aggregated                                                                       131     0(0.00%) |   3456    1412    6813   3100 |    7.90        0.00
    +
    + +

Cautionary Note 1: Intel® Extension for PyTorch is specialized for neural network (referred to as “nn” hereafter) inference optimization, so the performance improvement from additional techniques outside nn might be minimal. Indeed, for the image scoring system highlighted as an example, where SVR (support vector regression) is applied post-inference, the performance enhancement was confined to a 4-fold increase. However, for a purely nn inference model such as the food recognition model, a 7-fold performance boost (2.5 rps -> 17.5 rps) was observed.

    + +
    Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +POST     /predictions/food-classification                                                 446     0(0.00%) |   1113     249    1804   1200 |   17.50        0.00
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +         Aggregated                                                                       446     0(0.00%) |   1113     249    1804   1200 |   17.50        0.00
    +
    + +

    Cautionary Note 2: Applying Intel® Extension for PyTorch requires torchserve version 0.6.1 or higher. Since our team was using version 0.6.0, there was an issue where socket pinning was not functioning correctly. Currently, we have made modifications to the guide document, specifying the required version.

    + +

    Within WorkerLifeCycle.java, multi-worker pinning is not supported in 0.6.0 and below (ninstance is hardcoded to 1)

    + +
    // 0.6.0 version
    +
    +public ArrayList<String> launcherArgsToList() {
    +   ArrayList<String> arrlist = new ArrayList<String>();
    +   arrlist.add("-m");
    +   arrlist.add("intel_extension_for_pytorch.cpu.launch");
+   arrlist.add("--ninstance");
    +   arrlist.add("1");
    +   if (launcherArgs != null && launcherArgs.length() > 1) {
    +     String[] argarray = launcherArgs.split(" ");
    +     for (int i = 0; i < argarray.length; i++) {
    +       arrlist.add(argarray[i]);
    +     }
    +   }
    +   return arrlist;
    + }
    +// master version
    +
    +if (this.numWorker > 1) {
    +   argl.add(" — ninstances");
    +   argl.add(String.valueOf(this.numWorker));
    +   argl.add(" — instance_idx");
    +   argl.add(String.valueOf(this.currNumRunningWorkers));
    + }
    +
    + +

    2: Addressing Slow Latency Through Model Lightweighting

    + +

We also streamlined our model using knowledge distillation (commonly abbreviated as KD) to further reduce latency. As is widely known, KD is a technique where knowledge from a larger network (the teacher network) is conveyed to a smaller, lightweight network (the student network) that is less resource intensive and can be more readily deployed. For more detailed information, please refer to the paper where this concept was initially introduced, titled Distilling the Knowledge in a Neural Network.

    + +

    neural networks

    + +

There is a variety of KD techniques available, and because we were primarily focused on minimizing accuracy loss, we adopted the approach from the paper Knowledge Distillation from A Stronger Teacher, published in 2022. The concept is straightforward: unlike the conventional method of distillation that utilizes only the model’s predicted probability values, the chosen approach has the student network learn the correlations between classes in the teacher network. When put into actual application, we observed an effective reduction in the model’s weight while maintaining high accuracy. The following are the outcomes of our experimentation with this knowledge distillation technique on several candidate student models, where selections were made based on the level of accuracy maintained.
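A minimal sketch, in the spirit of that paper, of a correlation-based distillation loss (the exact formulation, temperature, and loss weights used in our training code may differ):

import torch
import torch.nn.functional as F

def one_minus_pearson(a: torch.Tensor, b: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    # Row-wise (1 - Pearson correlation) between two sets of probability vectors.
    a = a - a.mean(dim=-1, keepdim=True)
    b = b - b.mean(dim=-1, keepdim=True)
    a = a / (a.norm(dim=-1, keepdim=True) + eps)
    b = b / (b.norm(dim=-1, keepdim=True) + eps)
    return (1.0 - (a * b).sum(dim=-1)).mean()

def correlation_kd_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor, tau: float = 1.0):
    s = F.softmax(student_logits / tau, dim=-1)
    t = F.softmax(teacher_logits / tau, dim=-1)
    inter = one_minus_pearson(s, t)          # match class relations within each sample
    intra = one_minus_pearson(s.t(), t.t())  # match sample relations within each class
    return inter + intra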

    + +

    table of services

    + +

For the image scoring system, additional measures were taken to reduce the input size. Because the prior approach used the CPU-based ML technique SVR (Support Vector Regression) in a 2-stage pipeline (CNN + SVR), significant speed advantages were not observed in CPU inference even when this was streamlined into a 1-stage model. For the streamlining to matter, the input size of the student model during inference needed further reduction. Consequently, experiments were conducted with the size reduced from 384x384 to 224x224.

    + +

Further simplifying the transformations, the 2-stage (CNN + SVR) approach was unified into a 1-stage model with a larger ConvNeXt, and then KD was applied using a lightweight EfficientNet to resolve the accuracy trade-off. During the experiments, we encountered a problem where changing the image resize to 224 led to a performance drop from an MAE of 0.4007 to 0.4296. Due to the reduction in input size, various preprocessing techniques applied to the original training images (such as Affine, RandomRotate90, Blur, OneOf [GridDistortion, OpticalDistortion, ElasticTransform], and VerticalFlip) had a counterproductive effect. By adjusting these measures, effective training of the student was achieved, and the MAE improved by 25% compared to the previous model (0.518 to 0.3876).

    + +

    Validation

    + +

    1: Final Performance Measurement

    + +

    The following shows the final performance improvements using CPU servers, on the three models mentioned throughout this article.

    + +
    # Food photo classifier (pod 3): 2.5rps -> 84 rps
    +
    + Type Name                                                                           # reqs # fails | Avg Min Max Med | req/s failures/s
    + --------|----------------------------------------------------------------------------|------|------------|-------|------|-------|-------|--------|--------- 
    +POST /predictions/food-classification 2341 0(0.00%) | 208 130 508 200 | 84.50 0.00 
    +--------|----------------------------------------------------------------------------|--------|-------------|------|-------|--------|------|--------|----------
    +         Aggregated                                                                      2341     0(0.00%) |    208     130     508    200 |   84.50        0.00
    +
    +# Image scoring (pod 3): 2.1rps -> 62rps
    + Type Name                                                                               #reqs #fails | Avg Min Max Median | req/s failures/s
    + --------|---------------------------------------------------------------------------------|--------|-------------|--------|-------|--------|---------|--------|--------- 
    +  POST /predictions/image-scoring 1298 0 (0.00%) | 323 99 607 370 | 61.90 0.00 
    +--------|---------------------------------------------------------------------------------|--------|-------------|--------|------|--------|---------|--------|----------
    +          Aggregated                                                                          1298     0(0.00%)  |     323      99     607     370  |   61.90        0.00
    +
    +# receipt classifier(pod 3) : 20rps -> 111.8rps
    +Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +POST     /predictions/receipt-classification                                             4024     0(0.00%) |    266     133    2211    200 |   111.8        0.00
    +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
    +         Aggregated                                                                      4020     0(0.00%) |    266     133    2211    200 |   111.8        0.00
    +
    + +

    2:  Traffic Mirroring

    + +

As briefly introduced at the beginning of the article, our team’s service architecture employs the tool “traefik” as a gateway in front of the app server. For final validation, the mirroring feature of this traefik gateway was used to mirror traffic from production to staging for a month of validation before applying the change to production, which is now operational.

    + +

    Details regarding mirroring are beyond the scope of this topic and hence omitted. For those interested, kindly refer to the document at https://doc.traefik.io/traefik/routing/services/#mirroring-service.

    + +

    In Conclusion

    + +

This concludes the discussion about transitioning from GPU model servers to CPU servers while maintaining service quality. Through this effort, our team was able to save 15 GPUs each in South Korea and Japan, resulting in annual cost savings of approximately 340 thousand U.S. dollars. Although we directly purchase and use GPUs within NAVER, we calculated a rough cost reduction based on AWS EC2 instances that stably support T4 GPUs.

    + +

    instance sizes

    + +

Calculation: $1.306 (1-year reserved instance effective hourly cost) × 24 (hours) × 365 (days) × 15 (number of GPUs) × 2 (KR + JP) ≈ $343,000 per year

    + +

    These secured GPUs will be harnessed to further advance and enhance our team’s AI services, delivering exceptional service experiences. We sincerely appreciate your encouragement and anticipation.:)

    + +

    Explore More

    + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/ml-models-torchvision-v0.9/index.html b/blog/ml-models-torchvision-v0.9/index.html new file mode 100644 index 000000000000..a1e9de343f93 --- /dev/null +++ b/blog/ml-models-torchvision-v0.9/index.html @@ -0,0 +1,688 @@ + + + + + + + + + + + + + An overview of the ML models introduced in TorchVision v0.9 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    TorchVision v0.9 has been released and it is packed with numerous new Machine Learning models and features, speed improvements and bug fixes. In this blog post, we provide a quick overview of the newly introduced ML models and discuss their key features and characteristics.

    + +

    Classification

    +
      +
    • +

MobileNetV3 Large & Small: These two classification models are optimized for mobile use-cases and are used as backbones on other Computer Vision tasks. The implementation of the new MobileNetV3 architecture supports the Large & Small variants and the depth multiplier parameter as described in the original paper. We offer pre-trained weights on ImageNet for both Large and Small networks with depth multiplier 1.0 and resolution 224x224. Our previous training recipes have been updated and can be used to easily train the models from scratch (shoutout to Ross Wightman for inspiring some of our training configuration). The Large variant offers competitive accuracy compared to ResNet50 while being over 6x faster on CPU, making it a good candidate for applications where speed is important. For applications where speed is critical, one can sacrifice further accuracy for speed and use the Small variant, which is 15x faster than ResNet50.

      +
    • +
    • +

Quantized MobileNetV3 Large: The quantized version of MobileNetV3 Large reduces the number of parameters by 45% and is roughly 2.5x faster than the non-quantized version while remaining competitive in terms of accuracy. It was fitted on ImageNet using Quantization Aware Training by iterating on the non-quantized version, and it can be trained from scratch using the existing reference scripts.

      +
    • +
    + +

    Usage:

    +
    model = torchvision.models.mobilenet_v3_large(pretrained=True)
    +# model = torchvision.models.mobilenet_v3_small(pretrained=True)
    +# model = torchvision.models.quantization.mobilenet_v3_large(pretrained=True)
    +model.eval()
    +predictions = model(img)
    +
    +

    Object Detection

    +
      +
    • Faster R-CNN MobileNetV3-Large FPN: Combining the MobileNetV3 Large backbone with a Faster R-CNN detector and a Feature Pyramid Network leads to a highly accurate and fast object detector. The pre-trained weights are fitted on COCO 2017 using the provided reference scripts and the model is 5x faster on CPU than the equivalent ResNet50 detector while remaining competitive in terms of accuracy.
    • +
    • Faster R-CNN MobileNetV3-Large 320 FPN: This is an iteration of the previous model that uses reduced resolution (min_size=320 pixel) and sacrifices accuracy for speed. It is 25x faster on CPU than the equivalent ResNet50 detector and thus it is good for real mobile use-cases.
    • +
    + +

    Usage:

    +
    model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
    +# model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=True)
    +model.eval()
    +predictions = model(img)
    +
    +

    Semantic Segmentation

    +
      +
    • DeepLabV3 with Dilated MobileNetV3 Large Backbone: A dilated version of the MobileNetV3 Large backbone combined with DeepLabV3 helps us build a highly accurate and fast semantic segmentation model. The pre-trained weights are fitted on COCO 2017 using our standard training recipes. The final model has the same accuracy as the FCN ResNet50 but it is 8.5x faster on CPU and thus making it an excellent replacement for the majority of applications.
    • +
• Lite R-ASPP with Dilated MobileNetV3 Large Backbone: We introduce the implementation of a new segmentation head called Lite R-ASPP and combine it with the dilated MobileNetV3 Large backbone to build a very fast segmentation model. The new model sacrifices some accuracy to achieve a 15x speed improvement compared to the previously most lightweight segmentation model, the FCN ResNet50.
    • +
    + +

    Usage:

    +
    model = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True)
    +# model = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True)
    +model.eval()
    +predictions = model(img)
    +
    +

    In the near future we plan to publish an article that covers the details of how the above models were trained and discuss their tradeoffs and design choices. Until then we encourage you to try out the new models and provide your feedback.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/mlops-workflow/index.html b/blog/mlops-workflow/index.html new file mode 100644 index 000000000000..a26a1c581ec8 --- /dev/null +++ b/blog/mlops-workflow/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + MLOps Workflow Simplified for PyTorch with Arm and GitHub Collaboration | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Eric Sondhi, Arm + +

    +

    PyTorch is one of the most widely used and most powerful deep learning frameworks for training and deploying complex neural networks. It has never been easier to train and deploy AI applications, and low-cost, high-performance, energy-efficient hardware, tools, and technology for creating optimized workflows are more accessible than ever. But data science, machine learning, and devops can be deep topics unto themselves, and it can be overwhelming for developers with one specialty to see how they all come together in the real world, or even to know where to get started.

    + +

    To that end, we at Arm have collaborated with our friends at GitHub to decompose the basic elements of real world MLOps pipelines that use PyTorch models and create a simplified workflow and MLOps tutorial that anyone with a GitHub and a Docker Hub account can leverage.

    + +

    MLOps Overview

    + +

    The software development lifecycle for machine learning applications typically starts from training data, which is used to train sophisticated neural networks (NNs) that are optimized, integrated into software images, and then deployed onto compute clusters and even fleets of devices in the field. These devices are typically continuously collecting data and are managed by cloud services, which actively monitor performance of the ML algorithm(s) and feedback data for retraining in the next iteration of the lifecycle – enabling continuous improvement of the algorithms, as well as supporting deployment of new AI features.

    + +

    process flow chart

    + +

    Example of a typical ML software development lifecycle.

    + +

    Scott Arbeit from GitHub recently published an excellent blog that highlights the importance of MLOps in machine learning and describes automation via simplified GitHub actions for several key tasks including:

    + +
      +
    • Data preprocessing: cleaning and preparation of data for training.
    • +
    • Model training and validation: automatic execution of training scripts when new data is pushed or when changes are made to the model code.
    • +
    • Deployment: automatic packaging and deployment of models to production environments upon successful training and validation.
    • +
    • Monitoring and alerts: workflows to monitor model performance and send alerts if certain thresholds are breached.
    • +
    + +

    The article also describes a conceptual efficient MLOps pipeline that takes advantage of new, low-cost Arm Runners natively integrated into GitHub Actions to train and validate PyTorch models. It also uses containerization for consistent deployment across different environments.

    + +

    Our team at Arm put GitHub’s ideas and conceptual workflow into practice and created a tutorial to help you get started today.

    + +

    Optimizing Your PyTorch MLOps Workflow

    + +

    A new Arm Learning Path unpacks each of the key phases described in Scott’s blog, and demonstrates each key task in detail, providing prescriptive instructions and code examples to leverage several aspects of the PyTorch framework to implement each phase.

    + +

    process flow chart

    + +

    Key ML tasks to setup and automate with GitHub Actions.

    + +

    With this learning path you will be able to take advantage of the following strategies with a real-world object detection use case to make your own streamlined MLOps workflow:

    + +
      +
    • Containerization: Package your PyTorch model and its dependencies into a Docker container to help ensure consistent performance across different environments.
    • +
• Efficient Data Loading: Optimize data loading pipelines to help minimize I/O bottlenecks and maximize GPU utilization (a minimal sketch follows this list).
    • +
    • Model Optimization: Explore techniques like model quantization, pruning, and knowledge distillation to help reduce model size and improve inference speed.
    • +
    • Leverage PyTorch’s Ecosystem: Utilize libraries like TorchVision to help streamline common deep learning tasks.
    • +
    • Monitor and Profile: Monitor resource utilization and identify potential bottlenecks to further optimize your workflow.
    • +
    + +
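
To make the data-loading point above concrete, here is a minimal sketch, assuming a placeholder torchvision dataset and illustrative batch/worker settings (none of this comes from the Arm tutorial itself), of a DataLoader configured to reduce I/O stalls:

+ +
import torch
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+
+# Placeholder dataset; any torchvision dataset or custom Dataset works the same way.
+transform = transforms.Compose([transforms.Resize((64, 64)), transforms.ToTensor()])
+dataset = datasets.FakeData(size=1000, image_size=(3, 64, 64), transform=transform)
+
+# num_workers parallelizes decoding/augmentation, pin_memory speeds host-to-device
+# copies when a GPU is present, and persistent_workers avoids re-forking each epoch.
+loader = DataLoader(
+    dataset,
+    batch_size=64,
+    shuffle=True,
+    num_workers=4,
+    pin_memory=torch.cuda.is_available(),
+    persistent_workers=True,
+)
+
+for images, labels in loader:
+    pass  # training step goes here
+
+ +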

    An End-to-End MLOps Workflow

    + +

    The best part of this learning path is not just that it takes you through each task in detail, but it brings it all together into a unified automated workflow.

    + +

With GitHub Actions, you can build an end-to-end custom MLOps workflow that combines and automates the individual workflows for each ML task. To demonstrate this, the repository contains a workflow in a boilerplate .yml file that automates the individual steps.

    + +

    You can run an MLOps workflow using GitHub Actions natively for managing all the steps in your ML application’s lifecycle.

    + +

    process flow chart

    + +

    A successful run of this MLOps workflow in GitHub Actions.

    + +

    Try It Yourself!

    + +

Our Arm team has battle-tested this tutorial in the field and delivered it as a workshop at GitHub Universe 2024 earlier this year. Now it’s time for you to take it for a spin and get hands-on with PyTorch and MLOps.

    + +

    Try the Arm Learning Path Here!

    + +

By the end of this tutorial, you will be able to:

    + +
      +
• Set up a new GitHub Arm runner to natively build an arm64 image, taking advantage of the lowest-cost, most power-efficient compute available.
    • +
• Train and test a PyTorch ML model with the German Traffic Sign Recognition Benchmark (GTSRB) dataset (a minimal sketch follows this list).
    • +
• Compare the performance of two trained PyTorch ML models: one compiled with OpenBLAS (Open Basic Linear Algebra Subprograms Library) and oneDNN (Deep Neural Network Library), and the other compiled with the Arm Compute Library (ACL).
    • +
• Containerize an ML model and push the container to Docker Hub.
    • +
• Automate each task into a single MLOps pipeline using GitHub Actions.
    • +
    + +
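
As a minimal sketch of the train-and-test step listed above (the tiny classifier, transforms, and hyperparameters are illustrative placeholders, not the tutorial's actual model), training on GTSRB with torchvision could look roughly like this:

+ +
import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+
+transform = transforms.Compose([transforms.Resize((32, 32)), transforms.ToTensor()])
+train_set = datasets.GTSRB(root="data", split="train", transform=transform, download=True)
+test_set = datasets.GTSRB(root="data", split="test", transform=transform, download=True)
+train_loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=2)
+test_loader = DataLoader(test_set, batch_size=64, num_workers=2)
+
+# A deliberately tiny classifier for illustration; GTSRB has 43 traffic-sign classes.
+model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 256), nn.ReLU(), nn.Linear(256, 43))
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+loss_fn = nn.CrossEntropyLoss()
+
+model.train()
+for images, labels in train_loader:  # one epoch of training
+    optimizer.zero_grad()
+    loss = loss_fn(model(images), labels)
+    loss.backward()
+    optimizer.step()
+
+model.eval()
+correct = total = 0
+with torch.no_grad():
+    for images, labels in test_loader:
+        correct += (model(images).argmax(dim=1) == labels).sum().item()
+        total += labels.size(0)
+print(f"test accuracy: {correct / total:.3f}")
+
+ +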

Combining the power of PyTorch with the simplicity of GitHub Actions and the efficiency of native Arm Runners can significantly accelerate your deep learning development and deployment processes. Following the best practices outlined in this blog post helps you achieve optimal performance and cost-effectiveness for your PyTorch projects.

    + +

    We’d love to see what you create based on this example. If you have created your own Arm Learning Path, you are invited to share it here.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/mobile-demo-apps-overview/index.html b/blog/mobile-demo-apps-overview/index.html new file mode 100644 index 000000000000..a4436a3fe8b8 --- /dev/null +++ b/blog/mobile-demo-apps-overview/index.html @@ -0,0 +1,775 @@ + + + + + + + + + + + + + An Overview of the PyTorch Mobile Demo Apps | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Jeff Tang and Mark Saroufim + +

    +

    PyTorch Mobile provides a runtime environment to execute state-of-the-art machine learning models on mobile devices. Latency is reduced, privacy preserved, and models can run on mobile devices anytime, anywhere.

    + +

    In this blog post, we provide a quick overview of 10 currently available PyTorch Mobile powered demo apps running various state-of-the-art PyTorch 1.9 machine learning models spanning images, video, audio and text.

    + +

    It’s never been easier to deploy a state-of-the-art ML model to a phone. You don’t need any domain knowledge in Machine Learning and we hope one of the below examples resonates enough with you to be the starting point for your next project.

    + +
    + +
    + +

    Computer Vision

    +

    Image Classification

    +

This app demonstrates how to use the PyTorch C++ libraries on iOS and Android to classify a static image with MobileNetV2/V3 models.

    + +

    iOS #1 iOS #2 Android #1 Android #2

    + +

    iOS Android

    + +
    + +
    + +

    Live Image Classification

    +

This app demonstrates how to run quantized MobileNetV2 and ResNet18 models to classify images in real time using an iOS or Android device camera.

    + +

    iOS Android

    + +
    + + +
    + +

    Image Segmentation

    +

    This app demonstrates how to use the PyTorch DeepLabV3 model to segment images. The updated app for PyTorch 1.9 also demonstrates how to create the model using the Mobile Interpreter and load the model with the LiteModuleLoader API.
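
For illustration, a minimal sketch of the Python-side model preparation (model choice and file name are placeholders; this follows the general TorchScript plus optimize_for_mobile pattern rather than quoting the app's own script) might be:

+ +
import torch
+import torchvision
+from torch.utils.mobile_optimizer import optimize_for_mobile
+
+model = torchvision.models.segmentation.deeplabv3_resnet50(pretrained=True).eval()
+
+# Script the model, apply mobile-specific optimizations, and save it in the
+# Lite Interpreter format that the iOS/Android LiteModuleLoader APIs consume.
+scripted = torch.jit.script(model)
+optimized = optimize_for_mobile(scripted)
+optimized._save_for_lite_interpreter("deeplabv3_scripted.ptl")
+
+ +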

    + +

    iOS Android

    + +

    iOS Android

    + +
    + +
    + +

    Vision Transformer for Handwritten Digit Recognition

    +

    This app demonstrates how to use Facebook’s latest optimized Vision Transformer DeiT model to do image classification and handwritten digit recognition.

    + +

    iOS Android

    + +

    Android

    + +
    + +
    + +

    Object Detection

    +

This app demonstrates how to convert the popular YOLOv5 model and use it in an iOS or Android app that detects objects in pictures from your photo library, photos taken with the camera, or the live camera feed.

    + +

    iOS Android

    + +

    iOS Android

    + +
    + +
    + +

    D2Go

    +

This app demonstrates how to create and use a much lighter and faster Facebook D2Go model to detect objects in pictures from your photo library, photos taken with the camera, or the live camera feed.

    + +

    iOS Android

    + +

    iOS Android

    + +
    + +
    + +

    Video

    +

    Video Classification

    +

This app demonstrates how to use a pre-trained PyTorchVideo model to perform video classification on test videos, videos from the Photos library, or even real-time video.

    + +

    iOS Android

    + +

    iOS Android Deep Dive

    + +
    + +
    + +

    Natural Language Processing

    +

    Text Classification

    +

    This app demonstrates how to use a pre-trained Reddit model to perform text classification.

    + +

    iOS Android

    + +
    + +
    + +

    Machine Translation

    +

This app demonstrates how to convert a sequence-to-sequence neural machine translation model trained with the code in the PyTorch NMT tutorial for French-to-English translation.

    + +

    iOS Android

    + +

    iOS Android

    + +
    + +
    + +

    Question Answering

    +

This app demonstrates how to use the DistilBERT Hugging Face transformer model to answer questions about PyTorch Mobile itself.

    + +

    iOS Android

    + +

    iOS Android

    + +
    + +
    + +

    Audio

    +

    Speech Recognition

    +

This app demonstrates how to convert Facebook AI’s torchaudio-powered wav2vec 2.0, one of the leading models in speech recognition, to TorchScript before deploying it.
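
A rough sketch of such a conversion, assuming the torchaudio pipeline bundle WAV2VEC2_ASR_BASE_960H and a placeholder output file name, might look like:

+ +
import torch
+import torchaudio
+
+# Pre-trained wav2vec 2.0 fine-tuned for ASR, shipped with torchaudio
+bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
+model = bundle.get_model().eval()
+
+# Convert to TorchScript so the model can run on device without a Python runtime
+scripted = torch.jit.script(model)
+scripted.save("wav2vec2.pt")
+
+ +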

    + +

    iOS Android

    + +
    + +
    + +

    We really hope one of these demo apps stood out for you. For the full list, make sure to visit the iOS and Android demo app repos. You should also definitely check out the video An Overview of the PyTorch Mobile Demo Apps which provides both an overview of the PyTorch mobile demo apps and a deep dive into the PyTorch Video app for iOS and Android.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/model-serving-in-pyorch/index.html b/blog/model-serving-in-pyorch/index.html new file mode 100644 index 000000000000..1326adba574f --- /dev/null +++ b/blog/model-serving-in-pyorch/index.html @@ -0,0 +1,712 @@ + + + + + + + + + + + + + Model Serving in PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    May 08, 2019

    +

    + Model Serving in PyTorch +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Jeff Smith + +

    +

PyTorch has seen a lot of adoption in research, but people can get confused about how well PyTorch models can be taken into production. This blog post is meant to clear up any confusion people might have about the road to production in PyTorch. +When people talk about taking a model “to production,” they usually mean performing inference, sometimes called model evaluation, prediction, or serving. At the level of a function call, in PyTorch, inference looks something like this:

    + +
      +
    • In Python +
        +
      • module(input)
      • +
      +
    • +
    • In traced modules +
        +
      • module(input)
      • +
      +
    • +
    • In C++ +
        +
      • at::Tensor output = module->forward(inputs).toTensor();
      • +
      +
    • +
    + +

    Since we at Facebook perform inference operations using PyTorch hundreds of trillions of times per day, we’ve done a lot to make sure that inference runs as efficiently as possible.

    + +

    Serving Strategies

    + +

    That zoomed-in view of how you use models in inference isn’t usually the whole story, though. In a real world machine learning system, you often need to do more than just run a single inference operation in the REPL or Jupyter notebook. Instead, you usually need to integrate your model into a larger application in some way. Depending on what you need to do, you can usually take one of the following approaches.

    + +

    Direct embedding

    + +

    In application settings like mobile, we often just directly call the model as part of a larger program. This isn’t just for apps; usually this is how robotics and dedicated devices work as well. At a code-level, the call to the model is exactly the same as what is shown above in the section about inference shown above. A key concern is often that a Python interpreter is not present in such environments, which is why PyTorch allows you to call your models from C++ and ship a model without the need for a Python runtime.

    + +

    Model microservices

    + +

If you’re using your model in a server-side context and you’re managing multiple models, you might choose to treat each individual model (or each individual model version) as a separate service, usually using some sort of packaging mechanism like a Docker container. That service is then made network accessible, either using JSON over HTTP or an RPC technology like gRPC. The key characteristic of this approach is that you’re defining a service with a single endpoint that just calls your model. You then do all of your model management (promotion, rollback, etc.) via whatever system you already use to manage your services (e.g. Kubernetes, ECS).

    + +

    Model servers

    + +

    An additional possible solution is to use a model server. This is an application built to manage and serve models. It allows you to upload multiple models and get distinct prediction endpoints for each of them. Typically such systems include a number of other features to help solve more of the whole problem of managing and serving models. This can include things like metrics, visualization, data pre-processing, and more. Even something as simple as having a system for automatically versioning models can make building important features like model rollbacks much easier.

    + +

    Evolving Patterns

    + +

The above is a somewhat arbitrary breakdown of different approaches based on a snapshot in time. Design patterns are still evolving. Recently, model server designs have started to adopt more of the technologies of general service infrastructure, such as Docker containers and Kubernetes, so many model servers have started to share properties of the model microservice design discussed above. For a deeper dive into the general concepts of model server designs, you can check out my book on machine learning systems.

    + +

    Serving PyTorch Models

    + +

    So, if you’re a PyTorch user, what should you use if you want to take your models to production?

    + +

    If you’re on mobile or working on an embedded system like a robot, direct embedding in your application is often the right choice. +For mobile specifically, your use case might be served by the ONNX export functionality. +Note that ONNX, by its very nature, has limitations and doesn’t support all of the functionality provided by the larger PyTorch project. +You can check out this tutorial on deploying PyTorch models to mobile using ONNX to see if this path might suit your use case. +That said, we’ve heard that there’s a lot more that PyTorch users want to do on mobile, so look for more mobile-specific functionality in PyTorch in the future. +For other embedded systems, like robots, running inference on a PyTorch model from the C++ API could be the right solution.

    + +

    If you can’t use the cloud or prefer to manage all services using the same technology, you can follow this example to build a simple model microservice using the Flask web framework.
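
As a rough sketch of what such a microservice can look like (this is not the linked example itself; the model, route name, and preprocessing are illustrative), a single-endpoint Flask service might be:

+ +
import io
+
+import torch
+import torchvision
+from flask import Flask, jsonify, request
+from PIL import Image
+from torchvision import transforms
+
+app = Flask(__name__)
+model = torchvision.models.resnet18(pretrained=True).eval()
+preprocess = transforms.Compose([
+    transforms.Resize(256),
+    transforms.CenterCrop(224),
+    transforms.ToTensor(),
+])
+
+@app.route("/predict", methods=["POST"])
+def predict():
+    # Expect an image file uploaded as "file" in a multipart POST to /predict
+    image = Image.open(io.BytesIO(request.files["file"].read())).convert("RGB")
+    with torch.no_grad():
+        logits = model(preprocess(image).unsqueeze(0))
+    return jsonify({"class_index": int(logits.argmax(dim=1))})
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=5000)
+
+ +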

    + +

    If you want to manage multiple models within a non-cloud service solution, there are teams developing PyTorch support in model servers like MLFlow, Kubeflow, and RedisAI. We’re excited to see innovation from multiple teams building OSS model servers, and we’ll continue to highlight innovation in the PyTorch ecosystem in the future.

    + +

If you can use the cloud for your application, there are several great choices for working with models in the cloud. For AWS SageMaker, you can find a guide to all of the resources from AWS for working with PyTorch, including docs on how to use the SageMaker Python SDK. You can also see some talks we’ve given on using PyTorch on SageMaker. Finally, if you happen to be using PyTorch via FastAI, then they’ve written a really simple guide to getting up and running on SageMaker.

    + +

    The story is similar across other major clouds. On Google Cloud, you can follow these instructions to get access to a Deep Learning VM with PyTorch pre-installed. On Microsoft Azure, you have a number of ways to get started from Azure Machine Learning Service to Azure Notebooks showing how to use PyTorch.

    + +

    Your Models

    + +

Whichever approach you take to bringing your PyTorch models to production, we want to support you and enable your success. Do you love one of the options above? Are you having difficulty with that one crucial feature you can’t find support for? We’d love to discuss more in the deployment category on the PyTorch Discuss forums. We’re happy to help, and where you’re seeing success, to amplify your story.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-executive-director/index.html b/blog/new-executive-director/index.html new file mode 100644 index 000000000000..9ba2fc054e72 --- /dev/null +++ b/blog/new-executive-director/index.html @@ -0,0 +1,661 @@ + + + + + + + + + + + + + PyTorch Foundation Welcomes New Executive Director | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

The PyTorch Foundation is excited to welcome Matt White, our new executive director. The PyTorch Foundation formed in 2022 with the goal of driving adoption of AI tooling by fostering and sustaining an ecosystem of open source, vendor-neutral projects with PyTorch. Over the past two years, we’ve seen excellent growth across the project – in both contributors and members.

    + +

    “I am honored to be a part of the PyTorch Foundation, working with such a passionate and skilled community,” said Matt White. “I am looking forward to working with our contributors and members to advance the PyTorch ecosystem through research, cutting edge technologies and open source best practices.”

    + +

    Matt is a career technologist, researcher and innovator and has over 25 years of experience in AI, data, autonomous systems and simulations. He is the Co-founder and Chair of the Open Metaverse Foundation, a part of the Linux Foundation. Previously, Matt was the Director of the Generative AI Commons at the Linux Foundation, leading the advancement of open science and open-source artificial intelligence projects. He is also the GM of AI at the Linux Foundation.

    + +

    Learn more about the PyTorch Foundation:

    + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-features-for-ai/index.html b/blog/new-features-for-ai/index.html new file mode 100644 index 000000000000..fb3037290de1 --- /dev/null +++ b/blog/new-features-for-ai/index.html @@ -0,0 +1,1238 @@ + + + + + + + + + + + + + PyTorch 2.1 Contains New Performance Features for AI Developers | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Intel + +

    +

    We are excited to see the release of PyTorch 2.1. In this blog, we discuss the five features for which Intel made significant contributions to PyTorch 2.1:

    + +
      +
    1. TorchInductor-CPU optimizations including Bfloat16 inference path for torch.compile
    2. +
    3. CPU dynamic shape inference path for torch.compile
    4. +
    5. C++ wrapper (prototype)
    6. +
    7. Flash-attention-based scaled dot product algorithm for CPU
    8. +
9. PyTorch 2 export post-training quantization with an x86 back end through Inductor
    10. +
    + +

    At Intel, we are delighted to be part of the PyTorch community and appreciate the collaboration with and feedback from our colleagues at Meta* as we co-developed these features.

    + +

    Let’s get started.

    + +

    TorchInductor-CPU Optimizations

    + +

    This feature optimizes bfloat16 inference performance for TorchInductor. The 3rd and 4th generation Intel® Xeon® Scalable processors have built-in hardware accelerators for speeding up dot-product computation with the bfloat16 data type. Figure 1 shows a code snippet of how to specify the BF16 inference path.

    + +
    user_model = ...
    +
    +user_model.eval()
    +with torch.no_grad(), torch.autocast("cpu"):
    +	compiled_model = torch.compile(user_model)
    +	y = compiled_model(x)
    +
    + +

Figure 1. Code snippet showing the use of BF16 inference with TorchInductor

    + +

    We measured the performance on three TorchInductor benchmark suites—TorchBench, Hugging Face, and TIMM—and the results are as follows in Table 1. Here we see that performance in graph mode (TorchInductor) outperforms eager mode by factors ranging from 1.25x to 2.35x.

    + +

    Table 1. Bfloat16 performance geometric mean speedup in graph mode, compared with eager mode

    + + + + + + + + + + + + + + + + + +
    +Bfloat16 Geometric Mean Speedup (Single-Socket Multithreads) +
    +Compiler + +torchbench + +huggingface + +timm_models +
    +inductor + +1.81x + +1.25x + +2.35x +
    + + + + + + + + + + + + + + + + + +
    +Bfloat16 Geometric Mean Speedup (Single-Core Single Thread) +
    +Compiler + +torchbench + +huggingface + +timm_models +
    +inductor + +1.74x + +1.28x + +1.29x +
    + +

    Developers can fully deploy their models on 4th generation Intel Xeon processors to take advantage of the Intel® Advanced Matrix Extensions (Intel® AMX) feature to get peak performance for torch.compile. Intel AMX has two primary components: tiles and tiled matrix multiplication (TMUL). The tiles store large amounts of data in eight two-dimensional registers, each one kilobyte in size. TMUL is an accelerator engine attached to the tiles that contain instructions to compute larger matrices in a single operation.

    + +

    CPU Dynamic Shapes Inference Path for torch.compile

    + +

Dynamic shapes is one of the key features in PyTorch 2.0. PyTorch 2.0 assumes everything is static by default. If we recompile because a size changed, we will instead attempt to recompile that size as being dynamic (sizes that have changed are likely to change in the future). Dynamic shapes support is required for popular models like large language models (LLMs). Dynamic shapes support for a broad scope of models can help users get more benefit from torch.compile. For dynamic shapes, we provide post-op fusion for conv/gemm operators and vectorization code-gen for non-conv/gemm operators.

    + +

    Dynamic shapes is supported by both the inductor Triton back end for CUDA* and the C++ back end for CPU. The scope covers improvements for both functionality (as measured by model passing rate) and performance (as measured by inference latency/throughput). Figure 2 shows a code snippet for the use of dynamic shape inference with TorchInductor.

    + +
    user_model = ...
    +
    +# Training example
    +compiled_model = torch.compile(user_model)
    +y = compiled_model(x_size1)
    +# Here trigger the recompile because the input size changed
    +y = compiled_model(x_size2)
    +
    +
    +# Inference example
    +user_model.eval()
    +compiled_model = torch.compile(user_model)
    +with torch.no_grad():
    +	y = compiled_model(x_size1)
    + # Here trigger the recompile because the input size changed
    + y = compiled_model(x_size2)
    +
    + +

    Figure 2. Code snippet showing the use of dynamic shape inference with TorchInductor

    + +

    We again measured the performance on the three TorchInductor benchmark suites—TorchBench, Hugging Face, and TIMM—and the results are in Table 2. Here we see that performance in graph mode outperforms eager mode by factors ranging from 1.15x to 1.79x.

    + +

    Table 2. Dynamic shape geometric mean speedup compared with Eager mode

    + + + + + + + + + + + + + + + + + +
    +Dynamic Shape Geometric Mean Speedup (Single-Socket Multithreads) +
    +Compiler + +torchbench + +huggingface + +timm_models +
    +inductor + +1.35x + +1.15x + +1.79x +
    + + + + + + + + + + + + + + + + + +
    +Dynamic Shape Geometric Mean Speedup (Single-Core Single-Thread) +
    +Compiler + +torchbench + +huggingface + +timm_models +
    +inductor + +1.48x + +1.15x + +1.48x +
    + +

    C++ Wrapper (Prototype)

    + +

    The feature generates C++ code instead of Python* code to invoke the generated kernels and external kernels in TorchInductor to reduce Python overhead. It is also an intermediate step to support deployment in environments without Python.

    + +

    To enable this feature, use the following configuration:

    + +
    import torch
    +import torch._inductor.config as config
    +config.cpp_wrapper = True
    +
    + +

    For light workloads where the overhead of the Python wrapper is more dominant, C++ wrapper demonstrates a higher performance boost ratio. We grouped the models in TorchBench, Hugging Face, and TIMM per the average inference time of one iteration and categorized them into small, medium, and large categories. Table 3 shows the geometric mean speedups achieved by the C++ wrapper in comparison to the default Python wrapper.

    + +

    Table 3. C++ wrapper geometric mean speedup compared with Eager mode

    + + + + + + + + + + + + + + + + + +
    +FP32 Static Shape Mode Geometric Mean Speedup (Single-Socket Multithreads) +
    +Compiler + +Small (t <= 0.04s) + +Medium (0.04s < t <= 1.5s) + +Large (t > 1.5s) +
    +inductor + +1.06x + +1.01x + +1.00x +
    + + + + + + + + + + + + + + + + + +
    +FP32 Static Shape Mode Geometric Mean Speedup (Single-Core Single-Thread) +
    +Compiler + +Small (t <= 0.04s) + +Medium (0.04s < t <= 1.5s) + +Large (t > 1.5s) +
    +inductor + +1.13x + +1.02x + +1.01x +
    + + + + + + + + + + + + + + + + + +
    +FP32 Dynamic Shape Mode Geometric Mean Speedup (Single-Socket Multithreads) +
    +Compiler + +Small (t <= 0.04s) + +Medium (0.04s < t <= 1.5s) + +Large (t > 1.5s) +
    +inductor + +1.05x + +1.01x + +1.00x +
    + + + + + + + + + + + + + + + + + +
    +FP32 Dynamic Shape Mode Geometric Mean Speedup (Single-Core Single-Thread) +
    +Compiler + +Small (t <= 0.04s) + +Medium (0.04s < t <= 1.5s) + +Large (t > 1.5s) +
    +inductor + +1.14x + +1.02x + +1.01x +
    + + + + + + + + + + + + + + + + + +
    +BF16 Static Shape Mode Geometric Mean Speedup (Single-Socket Multithreads) +
    +Compiler + +Small (t <= 0.04s) + +Medium (0.04s < t <= 1.5s) + +Large (t > 1.5s) +
    +inductor + +1.09x + +1.03x + +1.04x +
    + + + + + + + + + + + + + + + + + +
    +BF16 Static Shape Mode Geometric Mean Speedup (Single-Core Single-Thread) +
    +Compiler + +Small (t <= 0.04s) + +Medium (0.04s < t <= 1.5s) + +Large (t > 1.5s) +
    +inductor + +1.17x + +1.04x + +1.03x +
    + +

    Flash-Attention-Based Scaled Dot Product Algorithm for CPU

    + +

Scaled dot product attention (SDPA) is one of the flagship features of PyTorch 2.0 that helps speed up transformer models. It is accelerated with optimized CUDA kernels but, until now, lacked optimized CPU kernels. This flash-attention implementation targets both training and inference, with both FP32 and Bfloat16 data types supported. There is no front-end API change required for users to leverage this SDPA optimization: when SDPA is called, a specific implementation, including this new one, is chosen automatically.
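
For illustration, here is a minimal sketch of calling SDPA on CPU (shapes and the bfloat16 dtype are chosen arbitrarily); the best available kernel is selected automatically:

+ +
import torch
+import torch.nn.functional as F
+
+# (batch, heads, sequence length, head dimension)
+query = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
+key = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
+value = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
+
+# PyTorch dispatches to the best implementation for the device and dtype
+out = F.scaled_dot_product_attention(query, key, value)
+
+ +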

    + +

We have measured the SDPA-related models in Hugging Face, and the optimization proves effective when compared with the unfused SDPA. Table 4 shows the geometric mean speedups for the SDPA optimization.

    + +

    Table 4. SDPA optimization performance geometric mean speedup

    + + + + + + + + + + + + + + + +
    +SDPA Geometric Mean Speedup (Single-Socket Multithreads) +
    +Compiler + +Geometric Speedup FP32 + +Geometric Speedup BF16 +
    +inductor + +1.15x, 20/20 + +1.07x, 20/20 +
    + + + + + + + + + + + + + + + +
    +SDPA Geometric Mean Speedup (Single-Core Single-Thread) +
    +Compiler + +Geometric Speedup FP32 + +Geometric Speedup BF16 +
    +inductor + +1.02x, 20/20 + +1.04x, 20/20 +
    + +

    PyTorch 2 Export Post-Training Quantization with x86 Back End through Inductor

    + +

    PyTorch provides a new quantization flow in the PyTorch 2.0 export. This feature uses TorchInductor with an x86 CPU device as the back end for post-training static quantization with this new quantization flow. An example code snippet is shown in Figure 3.

    + +
    import torch
    +import torch._dynamo as torchdynamo
    +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    +import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
    +
    +model = ... 
    +
    +model.eval()
    +with torch.no_grad():
    + # Step 1: Trace the model into an FX graph of flattened ATen operators
    + exported_graph_module, guards = torchdynamo.export(
    +	 model,
    +	 *copy.deepcopy(example_inputs),
    +	 aten_graph=True,
    + )
    +
    + # Step 2: Insert observers or fake quantize modules
    + quantizer = xiq.X86InductorQuantizer()
    + operator_config = xiq.get_default_x86_inductor_quantization_config()
    + quantizer.set_global(operator_config)
    + prepared_graph_module = prepare_pt2e(exported_graph_module, quantizer)
    +
    + # Doing calibration here.
    +
    + # Step 3: Quantize the model
    + convert_graph_module = convert_pt2e(prepared_graph_module)
    +
    + # Step 4: Lower Quantized Model into the backend
    + compile_model = torch.compile(convert_graph_module)
    +
    + +

    Figure 3. Code snippet showing the use of Inductor as back end for PyTorch 2 export post-training quantization

    + +

    All convolutional neural networks (CNN) models from the TorchBench test suite have been measured and proven effective when compared with the Inductor FP32 inference path. Performance metrics are shown in Table 5.

    + + + + + + + + + + + + +
    +Compiler + +Geometric Speedup + +Geometric Related Accuracy Loss +
    +inductor + +3.25x, 12/12 + +0.44%, 12/12 +
    + +

    Next Steps

    + +

    Get the Software

    + +

    Try out PyTorch 2.1 and realize the performance benefits for yourself from these features contributed by Intel.

    + +

    We encourage you to check out Intel’s other AI Tools and framework optimizations and learn about the open, standards-based oneAPI multiarchitecture, multivendor programming model that forms the foundation of Intel’s AI software portfolio.

    + +

    For more details about the 4th generation Intel Xeon Scalable processor, visit the AI platform where you can learn how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines.

    + +

    PyTorch Resources

    + + + +

    Product and Performance Information

    + +

    1 Amazon EC2* m7i.16xlarge: 1-node, Intel Xeon Platinum 8488C processor with 256 GB memory (1 x 256 GB DDR5 4800 MT/s), microcode 0x2b000461, hyperthreading on, turbo on, Ubuntu* 22.04.3 LTS, kernel 6.2.0-1011-aws, GCC* 11.3.0, Amazon Elastic Block Store 200 GB, BIOS Amazon EC2 1.0 10/16/2017; Software: PyTorch 2.1.0_rc4, Intel® oneAPI Deep Neural Network Library (oneDNN) version 3.1.1, TorchBench, TorchVision, TorchText, TorchAudio, TorchData, TorchDynamo Benchmarks, tested by Intel on 9/12/2023.

    + +

    2 Amazon EC2 c6i.16xlarge: 1-node, Intel Xeon Platinum 8375C processor with 128 GB memory (1 x 128 GB DDR4 3200 MT/s), microcode 0xd0003a5, hyperthreading on, turbo on, Ubuntu 22.04.2 LTS, kernel 6.2.0-1011-aws, gcc 11.3.0, Amazon Elastic Block Store 200 GB, BIOS Amazon EC2 1.010/16/2017; Software: PyTorch 2.1.0_rc4, oneDNN version 3.1.1, TorchBench, TorchVision, TorchText, TorchAudio, TorchData, TorchDynamo Benchmarks, TorchBench cpu userbenchmark, tested by Intel on 9/12/2023.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-in-docs/index.html b/blog/new-in-docs/index.html new file mode 100644 index 000000000000..eec1c3da2f95 --- /dev/null +++ b/blog/new-in-docs/index.html @@ -0,0 +1,666 @@ + + + + + + + + + + + + + What's New in PyTorch Documentation | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    February 01, 2024

    +

    + What's New in PyTorch Documentation +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Greetings to the PyTorch community! Here is a quick update on PyTorch docs.

    + +

    In November 2023, we successfully conducted a PyTorch Docathon, a community event where PyTorch community members gathered together to improve PyTorch documentation and tutorials. This event saw a global participation of contributors who dedicated their time and effort to enhance our docs. We extend our sincere gratitude to everyone involved.

    + +

    A key accomplishment of the Docathon was the comprehensive work carried out on docstrings. Our community contributors meticulously reviewed and improved the docstrings based on the provided tasks.

    + +

    In addition to that, we’ve added three new tutorials that showcase real-world applications of PyTorch. We are particularly proud that two of these tutorials were contributed by PyTorch ecosystem partners.

    + +

Here are the new tutorials for you to explore:

    + +
      +
    • Whole Slide Image Classification Using PyTorch and TIAToolbox —This tutorial demonstrates how to classify Whole Slide Images (WSIs) using PyTorch deep learning models with TIAToolbox, which are images of human tissue samples used by pathologists and researchers to study diseases like cancer at the microscopic level.
    • +
    • Semi-Supervised Learning using USB built upon PyTorch – This tutorial introduces USB, a flexible and modular semi-supervised learning framework based on PyTorch, demonstrating its ease of use in training a FreeMatch/SoftMatch model on CIFAR-10 using pre-trained ViT and its adaptability to various algorithms and imbalanced datasets.
    • +
    • Deploying a PyTorch Stable Diffusion model as a Vertex AI Endpoint – This tutorial provides a step-by-step guide on how to streamline the deployment of a PyTorch Stable Diffusion model (v1.5) using Vertex AI, a fully-managed machine learning platform, by creating a custom TorchServe handler, uploading model artifacts to Google Cloud Storage, creating a Vertex AI model with the model artifacts and a prebuilt PyTorch container image, and finally deploying the model onto an endpoint.
    • +
    + +

    We’re planning more community events this year, so stay tuned!

    + +

    And finally, we just published new 2.2 PyTorch documentation and tutorials. Check it out!

    + +

    Best regards,
    +The PyTorch Team

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-library-updates-in-pytorch-1.13/index.html b/blog/new-library-updates-in-pytorch-1.13/index.html new file mode 100644 index 000000000000..4632b07dc91c --- /dev/null +++ b/blog/new-library-updates-in-pytorch-1.13/index.html @@ -0,0 +1,1093 @@ + + + + + + + + + + + + + New Library Updates in PyTorch 1.13 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 28, 2022

    +

    + New Library Updates in PyTorch 1.13 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Summary

    + +

    We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 1.13 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.

    + +

    Along with 1.13, we are releasing updates to the PyTorch Libraries, please find them below.

    + +

    TorchAudio

    + +

    (Beta) Hybrid Demucs Model and Pipeline

    + +

    Hybrid Demucs is a music source separation model that uses both spectrogram and time domain features. It has demonstrated state-of-the-art performance in the Sony® Music DeMixing Challenge. (citation: https://arxiv.org/abs/2111.03600)

    + +

    The TorchAudio v0.13 release includes the following features

    + +
      +
    • MUSDB_HQ Dataset, which is used in Hybrid Demucs training (docs)
    • +
    • Hybrid Demucs model architecture (docs)
    • +
    • Three factory functions suitable for different sample rate ranges
    • +
    • Pre-trained pipelines (docs)
    • +
    • SDR Results of pre-trained pipelines on MUSDB_HQ test set
    • +
    • Tutorial that steps through music source separation using the pretrained pipeline (docs)
    • +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    PipelineAllDrumsBassOtherVocals
    HDEMUCS_HIGH_MUSDB*6.427.766.514.476.93
    HDEMUCS_HIGH_MUSDB_PLUS**9.3711.3810.537.248.32
    + +

    * Trained on the training data of MUSDB-HQ dataset.
    ** Trained on both training and test sets of MUSDB-HQ and 150 extra songs from an internal database that were specifically produced for Meta.

    + +
    from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
    +
    +bundle = HDEMUCS_HIGH_MUSDB_PLUS
    +model = bundle.get_model()
    +sources_list = model.sources
    +
    +mixture, samplerate = torchaudio.load("song.wav")
    +sources = model(mixture)
+audios = dict(zip(sources_list, sources))
    +
    + +

    Special thanks to Alexandre Defossez for the guidance.

    + +

    (Beta) Datasets and Metadata Mode for SUPERB Benchmark

    + +

    TorchAudio adds support for various audio-related datasets used in downstream tasks for benchmarking self-supervised learning models. With the addition of several new datasets, there is now support for the downstream tasks in version 1 of the SUPERB benchmark, which can be found in the s3prl repository.

    + +

    For these datasets, we also add metadata support through a get_metadata function, enabling faster dataset iteration or preprocessing without the need to load waveforms. The function returns the same features as __getitem__, except it returns the relative waveform path rather than the loaded waveform.
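
For example, assuming LIBRISPEECH is one of the datasets exposing get_metadata and that the data has been downloaded, a sketch along these lines iterates metadata without decoding any audio:

+ +
from torchaudio.datasets import LIBRISPEECH
+
+dataset = LIBRISPEECH("data", url="test-clean", download=True)
+
+# __getitem__ would decode the waveform; get_metadata returns the relative
+# audio path plus the remaining fields, so no audio is loaded here.
+path, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset.get_metadata(0)
+print(path, sample_rate, transcript)
+
+ +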

    + +

    Datasets with metadata functionality

    + + + +

    (Beta) Custom Language Model support in CTC Beam Search Decoding

    + +

TorchAudio released a CTC beam search decoder in release 0.12, with KenLM language model support. In this release, we add functionality for creating custom Python language models that are compatible with the decoder, using the torchaudio.models.decoder.CTCDecoderLM wrapper.

    + +

    For more information on using a custom language model, please refer to the documentation and tutorial.

    + +

    (Beta) StreamWriter

    + +

    torchaudio.io.StreamWriter is a class for encoding media including audio and video. This can handle a wide variety of codecs, chunk-by-chunk encoding and GPU encoding.

    + +
    writer = StreamWriter("example.mp4")
    +writer.add_audio_stream(
    +    sample_rate=16_000,
    +    num_channels=2,
    +)
    +writer.add_video_stream(
    +    frame_rate=30,
    +    height=96,
    +    width=128,
    +    format="rgb24",
    +)
    +with writer.open():
    +    writer.write_audio_chunk(0, audio)
    +    writer.write_video_chunk(1, video)
    +
    + +

    For more information, refer to the documentation and the following tutorials

    + + +

    TorchData

    + +

    For a complete list of changes and new features, please visit our repository’s 0.5.0 release note.

    + +

    (Prototype) DataLoader2

    + +

DataLoader2 was introduced in the last release to execute the DataPipe graph, with support for dynamic sharding for multi-process/distributed data loading, multiple backend ReadingServices, and in-place modification of the DataPipe graph (e.g. shuffle control).

    + +

In this release, we further consolidated the API for DataLoader2, and detailed documentation is now available here. We continue to welcome early adopters and feedback, as well as potential contributors. If you are interested in trying it out, we encourage you to install the nightly version of TorchData.
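
A minimal sketch of the consolidated API (the DataPipe graph and worker count are arbitrary) looks roughly like this:

+ +
from torchdata.dataloader2 import DataLoader2, MultiProcessingReadingService
+from torchdata.datapipes.iter import IterableWrapper
+
+# Build a small DataPipe graph: wrap a range, shuffle it, and shard across workers
+datapipe = IterableWrapper(range(1000)).shuffle().sharding_filter()
+
+# The ReadingService decides how the graph is executed (here: 2 worker processes)
+rs = MultiProcessingReadingService(num_workers=2)
+dl = DataLoader2(datapipe, reading_service=rs)
+
+for item in dl:
+    pass  # consume items
+
+dl.shutdown()
+
+ +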

    + +

    (Beta) Data Loading from Cloud Service Providers

    + +

    We extended our support to load data from additional cloud storage providers via DataPipes, now covering AWS, Google Cloud Storage, and Azure. A tutorial is also available. We are open to feedback and feature requests.

    + +

    We also performed a simple benchmark, comparing the performance of data loading from AWS S3 and attached volume on an AWS EC2 instance.

    + +

    torch::deploy (Beta)

    + +

    torch::deploy is now in Beta! torch::deploy is a C++ library for Linux based operating systems that allows you to run multiple Python interpreters in a single process. You can run your existing eager PyTorch models without any changes for production inference use cases. Highlights include:

    + +
      +
    • Existing models work out of the box–no need to modify your python code to support tracing.
    • +
    • Full support for your existing Python environment including C extensions.
    • +
    • No need to cross process boundaries to load balance in multi-GPU serving environments.
    • +
    • Model weight can be shared between multiple Python interpreters.
    • +
    • A vastly improved installation and setup process.
    • +
    + +
    torch::deploy::InterpreterManager manager(4);
    +
    +// access one of the 4 interpreters
    +auto I = manager.acquireOne();
    +
    +// run infer from your_model.py
    +I.global("your_model", "infer")({at::randn({10, 240, 320})});
    +
    + +

    Learn more here.

    + +

    (Beta) CUDA/ROCm/CPU Backends

    + +

    torch::deploy now links against standard PyTorch Python distributions so all accelerators that PyTorch core supports such as CUDA and AMD/HIP work out of the box.

    + + + +

    (Prototype) aarch64/arm64 support

    + +

    torch::deploy now has basic support for aarch64 Linux systems.

    + + + +

    TorchEval

    + +

    (Prototype) Introducing Native Metrics Support for PyTorch

    + +

    TorchEval is a library built for users who want highly performant implementations of common metrics to evaluate machine learning models. It also provides an easy to use interface for building custom metrics with the same toolkit. Building your metrics with TorchEval makes running distributed training loops with torch.distributed a breeze.
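
As a quick sketch of the interface (the metric choice and tensors are arbitrary):

+ +
import torch
+from torcheval.metrics import MulticlassAccuracy
+
+metric = MulticlassAccuracy()
+
+# Accumulate predictions batch by batch, then compute the final value
+preds = torch.tensor([0, 2, 1, 3])
+targets = torch.tensor([0, 1, 2, 3])
+metric.update(preds, targets)
+print(metric.compute())  # 2 of 4 correct -> 0.5
+
+ +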

    + +

    Learn more with our docs, see our examples, or check out our GitHub repo.

    + +

    TorchMultimodal Release (Beta)

    + +

Please watch for upcoming blogs in early November that will introduce TorchMultimodal, a PyTorch domain library for training SoTA multi-task multimodal models at scale, in more detail; in the meantime, play around with the library and models through our tutorial.

    + +

    TorchRec

    + +

    (Prototype) Simplified Optimizer Fusion APIs

    + +

We’ve provided a simplified and more intuitive API for setting fused optimizer settings via apply_optimizer_in_backward. This new approach enables the ability to specify optimizer settings on a per-parameter basis, and sharded modules will configure FBGEMM’s TableBatchedEmbedding modules accordingly. Additionally, this now lets TorchRec’s planner account for optimizer memory usage. This should alleviate reports of sharding jobs OOMing when using Adam with a plan generated by the planner.

    + +

    (Prototype) Simplified Sharding APIs

    + +

    We’re introducing the shard API, which now allows you to shard only the embedding modules within a model, and provides an alternative to the current main entry point - DistributedModelParallel. This lets you have a finer grained control over the rest of the model, which can be useful for customized parallelization logic, and inference use cases (which may not require any parallelization on the dense layers). We’re also introducing construct_module_sharding_plan, providing a simpler interface to the TorchRec sharder.

    + +

    (Beta) Quantized Comms

    + +

Applying quantization or mixed precision to tensors in a collective call during model parallel training greatly improves training efficiency, with little to no effect on model quality. TorchRec now integrates with the quantized comms library provided by FBGEMM GPU and provides an interface to construct encoders and decoders (codecs) that surround the all_to_all and reduce_scatter collective calls in the output_dist of a sharded module. We also allow you to construct your own codecs to apply to your sharded module. The codecs provided by FBGEMM allow FP16, BF16, FP8, and INT8 compressions, and you may use different quantizations for the forward pass and backward pass.

    + +

    TorchSnapshot (Beta)

    + +

    Along with PyTorch 1.13, we are releasing the beta version of TorchSnapshot, which is a performant, memory-efficient checkpointing library for PyTorch applications, designed with large, complex distributed workloads in mind. Highlights include:

    + +
      +
    • Performance: TorchSnapshot provides a fast checkpointing implementation employing various optimizations, including zero-copy serialization for most tensor types, overlapped device-to-host copy and storage I/O, parallelized storage I/O
    • +
    • Memory Use: TorchSnapshot’s memory usage adapts to the host’s available resources, greatly reducing the chance of out-of-memory issues when saving and loading checkpoints
    • +
    • Usability: Simple APIs that are consistent between distributed and non-distributed workloads
    • +
    + +

    Learn more with our tutorial.
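
A minimal sketch of the API (the path and model are placeholders) looks like this:

+ +
import torch
+import torchsnapshot
+
+model = torch.nn.Linear(128, 10)
+
+# Take a snapshot of the application state (any object exposing state_dict works)
+app_state = {"model": model}
+snapshot = torchsnapshot.Snapshot.take(path="/tmp/my_snapshot", app_state=app_state)
+
+# ...later, restore the state in place
+snapshot.restore(app_state=app_state)
+
+ +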

    + +

    TorchVision

    + +

We are happy to introduce torchvision v0.14 (release note). This version introduces a new model registration API to help users retrieve and list models and weights. It also includes new image and video classification models such as MViT, S3D, Swin Transformer V2, and MaxViT. Last but not least, we also have new primitives and augmentations such as the PolynomialLR scheduler and SimpleCopyPaste.

    + +

    (Beta) Model Registration API

    + +

    Following up on the multi-weight support API that was released on the previous version, we have added a new model registration API to help users retrieve models and weights. There are now 4 new methods under the torchvision.models module: get_model, get_model_weights, get_weight, and list_models. Here are examples of how we can use them:

    + +
    import torchvision
    +from torchvision.models import get_model, get_model_weights, list_models
    +
    +
    +max_params = 5000000
    +
    +tiny_models = []
    +for model_name in list_models(module=torchvision.models):
    +    weights_enum = get_model_weights(model_name)
    +    if len([w for w in weights_enum if w.meta["num_params"] <= max_params]) > 0:
    +        tiny_models.append(model_name)
    +
    +print(tiny_models)
    +# ['mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mobilenet_v2', ...]
    +
    +model = get_model(tiny_models[0], weights="DEFAULT")
    +print(sum(x.numel() for x in model.state_dict().values()))
    +# 2239188
    +
    + +

    (Beta) New Video Classification Models

    + +

We added two new video classification models, MViT and S3D. MViT is a state-of-the-art video classification transformer model which has 80.757% accuracy on the Kinetics400 dataset, while S3D is a relatively small model with good accuracy for its size. These models can be used as follows:

    + +
    import torch
    +from torchvision.models.video import *
    +
    +video = torch.rand(3, 32, 800, 600)
    +model = mvit_v2_s(weights="DEFAULT")
    +# model = s3d(weights="DEFAULT")
    +model.eval()
+prediction = model(video)
    +
    + +

    Here is the table showing the accuracy of the new video classification models tested in the Kinetics400 dataset.

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ModelAcc@1Acc@5
    mvit_v1_b81.47495.776
    mvit_v2_s83.19696.36
    s3d83.58296.64
    + +

    We would like to thank Haoqi Fan, Yanghao Li, Christoph Feichtenhofer and Wan-Yen Lo for their work on PyTorchVideo and their support during the development of the MViT model. We would like to thank Sophia Zhi for her contribution implementing the S3D model in torchvision.

    + +

    (Stable) New Architecture and Model Variants

    + +

    For Classification Models, we’ve added the Swin Transformer V2 architecture along with pre-trained weights for its tiny/small/base variants. In addition, we have added support for the MaxViT transformer. Here is an example on how to use the models:

    + +
    import torch
    +from torchvision.models import *
    +
    +image = torch.rand(1, 3, 224, 224)
    +model = swin_v2_t(weights="DEFAULT").eval()
    +# model = maxvit_t(weights="DEFAULT").eval()
    +prediction = model(image)
    +
    + +

    Here is the table showing the accuracy of the models tested on ImageNet1K dataset.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ModelAcc@1Acc@1 change over V1Acc@5Acc@5 change over V1
    swin_v2_t82.072+ 0.59896.132+ 0.356
    swin_v2_s83.712+ 0.51696.816+ 0.456
    swin_v2_b84.112+ 0.53096.864+ 0.224
    maxvit_t83.700-96.722-
    + +

    We would like to thank Ren Pang and Teodor Poncu for contributing the 2 models to torchvision.

    + +

    (Stable) New Primitives & Augmentations

    + +

    In this release we’ve added the SimpleCopyPaste augmentation in our reference scripts and we up-streamed the PolynomialLR scheduler to PyTorch Core. We would like to thank Lezwon Castelino and Federico Pozzi for their contributions. We are continuing our efforts to modernize TorchVision by adding more SoTA primitives, Augmentations and architectures with the help of our community. If you are interested in contributing, have a look at the following issue.

    + +

    Torch-TensorRT

    + +

    (Prototype) TensorRT with FX2TRT frontend

    + +

    Torch-TensorRT is the PyTorch integration for TensorRT, providing high performance inference on NVIDIA GPUs. Torch-TRT allows for optimizing models directly in PyTorch for deployment providing up to 6x performance improvement.

    + +

Torch-TRT is an AoT compiler which ingests an nn.Module or TorchScript module, optimizes compatible subgraphs with TensorRT, and leaves the rest to run in PyTorch. This gives users the performance of TensorRT, but the usability and familiarity of Torch.
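
As a rough sketch of compiling a model through the TorchScript path (the model, input shape, and precision are illustrative; an NVIDIA GPU and a Torch-TensorRT install are assumed):

+ +
import torch
+import torch_tensorrt
+import torchvision
+
+model = torchvision.models.resnet50(pretrained=True).eval().cuda()
+
+# Compatible subgraphs run in TensorRT; unsupported ops fall back to PyTorch
+trt_model = torch_tensorrt.compile(
+    model,
+    inputs=[torch_tensorrt.Input((1, 3, 224, 224))],
+    enabled_precisions={torch.half},
+)
+
+x = torch.randn(1, 3, 224, 224, device="cuda")
+with torch.no_grad():
+    print(trt_model(x).shape)
+
+ +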

    + +

    Torch-TensorRT is part of the PyTorch ecosystem, and was released as v1.0 in November ‘21. There are currently two distinct front-ends: Torchscript & FX. Each provides the same value proposition and underlying operation with the primary difference being the input & output formats (TS vs FX / Python).

    + +

    The Torchscript front-end was included in v1.0 and should be considered stable. The FX front-end is first released in v1.2 and should be considered a Beta.

    + +

    Relevant Links:

    + + + +

    (Stable) Introducing Torch-TensorRT

    + +

Torch-TensorRT is an integration for PyTorch that leverages inference optimizations of TensorRT on NVIDIA GPUs. It takes advantage of TensorRT optimizations such as FP16 and INT8 reduced precision, graph optimization, and operation fusion, while offering a fallback to native PyTorch when TensorRT does not support the model subgraphs. Currently, there are two front-end paths in the library that help convert a PyTorch model to a TensorRT engine: one through TorchScript (TS) and the other through the FX front end. In either case, the model is traced by TS or FX into its IR graph and then converted to TensorRT.

    + +

    Learn more with our tutorial.

    + +

    TorchX

    + +

    TorchX 0.3 updates include a new list API, experiment tracking, elastic training and improved scheduler support. There’s also a new Multi-Objective NAS tutorial using TorchX + Ax.

    + +

    (Prototype) List

    + +

    The newly added list command and API allows you to list recently launched jobs and their statuses for a given scheduler directly from within TorchX.

    + +
      +
    • This removes the need for using secondary tools to list the jobs.
    • +
    • Full programmatic access to recent jobs for integration with custom tools.
    • +
    + +
    $ torchx list -s kubernetes
    +APP HANDLE                                                       APP STATUS
    +-----------------------------------------------            -----------------
    +kubernetes://torchx/default:train-f2nx4459p5crr   SUCCEEDED
    +
    + +

    Learn more with our documentation.

    + +

    (Prototype) Tracker

    + +

    TorchX Tracker is a new prototype library that provides a flexible and customizable experiment and artifact tracking interface. This allows you to track inputs and outputs for jobs across multiple steps to make it easier to use TorchX with pipelines and other external systems.

    + +
    from torchx import tracker
    +
    +app_run = tracker.app_run_from_env()
    +app_run.add_metadata(lr=lr, gamma=gamma) # hyper parameters
    +app_run.add_artifact("model", "storage://path/mnist_cnn.pt") # logs / checkpoints
    +app_run.add_source(parent_run_id, "model") # lineage
    +
    + +

    Example:

    + + + +

    (Prototype) Elastic Training and Autoscaling

    + +

    Elasticity on Ray and Kubernetes – automatic scale up of distributed training jobs when using a supported scheduler. Learn more with our documentation.

    + +

    (Prototype) Scheduler Improvements: IBM® Spectrum LSF

    + +

    Added prototype support for the IBM Spectrum LSF scheduler.

    + +

    (Beta) AWS Batch Scheduler

    + +

    The AWS Batch scheduler integration is now in beta.

    + + + +

    (Prototype) AnyPrecision Optimizer

    + +

    A drop-in replacement for the AdamW optimizer that reduces GPU memory usage and enables two main features:

    + +
      +
    • Ability to train the entire model pipeline in full BFloat16, with Kahan summation to preserve precision. This can improve training throughput, especially on huge models, through reduced memory use and increased computation speed.
    • +
    • Ability to change the variance state to BFloat16. This can reduce overall memory required for model training with additional speed improvements.
    • +
    + +

    Find more information here.
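    As a minimal sketch of how a drop-in swap might look, the example below replaces AdamW with AnyPrecisionAdamW; the torchdistx import path and the keyword argument names shown here are assumptions based on the prototype and may differ in your installed version.

    import torch
    from torchdistx.optimizers import AnyPrecisionAdamW  # prototype package; import path is an assumption

    model = torch.nn.Linear(1024, 1024).bfloat16()

    optimizer = AnyPrecisionAdamW(
        model.parameters(),
        lr=1e-3,
        use_kahan_summation=True,        # keeps precision while training in pure BFloat16 (assumed kwarg)
        momentum_dtype=torch.bfloat16,   # assumed kwarg
        variance_dtype=torch.bfloat16,   # reduces optimizer-state memory further (assumed kwarg)
    )

    loss = model(torch.randn(8, 1024, dtype=torch.bfloat16)).sum()
    loss.backward()
    optimizer.step()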

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-library-updates-in-pytorch-2.0/index.html b/blog/new-library-updates-in-pytorch-2.0/index.html new file mode 100644 index 000000000000..adba26639d84 --- /dev/null +++ b/blog/new-library-updates-in-pytorch-2.0/index.html @@ -0,0 +1,845 @@ + + + + + + + + + + + + + New Library Updates in PyTorch 2.0 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    March 15, 2023

    +

    + New Library Updates in PyTorch 2.0 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Summary

    + +

    We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 2.0 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.

    + +

    Along with 2.0, we are also releasing a series of beta updates to the PyTorch domain libraries, including those that are in-tree, and separate libraries including TorchAudio, TorchVision, and TorchText. An update for TorchX is also being released as it moves to community supported mode. Please find the list of the latest stable versions and updates below.

    + +

    Latest Stable Library Versions (Full List)

    + + + + + + + + + + + + + + + + +
    TorchArrow 0.1.0 | TorchRec 0.4.0 | TorchVision 0.15
    TorchAudio 2.0 | TorchServe 0.7.1 | TorchX 0.4.0
    TorchData 0.6.0 | TorchText 0.15.0 | PyTorch on XLA Devices 1.14
    + +

    *To see prior versions or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’.

    + +

    TorchAudio

    + +

    [Beta] Data augmentation operators

    + +

    The release adds several data augmentation operators under torchaudio.functional and torchaudio.transforms:

    +
      +
    • torchaudio.functional.add_noise
    • +
    • torchaudio.functional.convolve
    • +
    • torchaudio.functional.deemphasis
    • +
    • torchaudio.functional.fftconvolve
    • +
    • torchaudio.functional.preemphasis
    • +
    • torchaudio.functional.speed
    • +
    • torchaudio.transforms.AddNoise
    • +
    • torchaudio.transforms.Convolve
    • +
    • torchaudio.transforms.Deemphasis
    • +
    • torchaudio.transforms.FFTConvolve
    • +
    • torchaudio.transforms.Preemphasis
    • +
    • torchaudio.transforms.Speed
    • +
    • torchaudio.transforms.SpeedPerturbation
    • +
    + +

    The operators can be used to synthetically diversify training data to improve the generalizability of downstream models.
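    For instance, a minimal sketch of mixing noise into a waveform at a chosen signal-to-noise ratio might look like the following (the tensors here are random placeholders):

    import torch
    import torchaudio.functional as F

    waveform = torch.randn(1, 16000)           # placeholder: 1 second of audio at 16 kHz
    noise = torch.randn(1, 16000)              # placeholder noise of the same length
    snr = torch.tensor([10.0])                 # desired signal-to-noise ratio in dB

    noisy = F.add_noise(waveform, noise, snr)  # scales and adds the noise at 10 dB SNR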

    + +

    For usage details, please refer to the functional and transform documentation and Audio Data Augmentation tutorial.

    + +

    [Beta] WavLM and XLS-R models

    + +

    The release adds two self-supervised learning models for speech and audio.

    + +
      +
    • WavLM that is robust to noise and reverberation.
    • +
    • XLS-R that is trained on cross-lingual datasets.
    • +
    + +

    Besides the model architectures, torchaudio also supports corresponding pre-trained pipelines:

    + +
      +
    • torchaudio.pipelines.WAVLM_BASE
    • +
    • torchaudio.pipelines.WAVLM_BASE_PLUS
    • +
    • torchaudio.pipelines.WAVLM_LARGE
    • +
    • torchaudio.pipelines.WAV2VEC_XLSR_300M
    • +
    • torchaudio.pipelines.WAV2VEC_XLSR_1B
    • +
    • torchaudio.pipelines.WAV2VEC_XLSR_2B
    • +
    + +

    For usage details, please refer to the factory function and pre-trained pipelines documentation.
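    As a quick sketch of the pipeline API, the snippet below loads a WavLM bundle and extracts features from a placeholder waveform; real audio should be resampled to the bundle's expected sample rate.

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.WAVLM_BASE_PLUS
    model = bundle.get_model()

    waveform = torch.randn(1, int(bundle.sample_rate))  # placeholder: 1 second of audio
    features, _ = model.extract_features(waveform)      # list of per-layer feature tensors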

    + +

    TorchRL

    + +

    The initial release of torchrl includes several features that span across the entire RL domain. TorchRL can already be used in online, offline, multi-agent, multi-task and distributed RL settings, among others. See below:

    + +

    [Beta] Environment wrappers and transforms

    + +

    torchrl.envs includes several wrappers around common environment libraries. This allows users to swap one library with another without effort. These wrappers build an interface between these simulators and torchrl:

    + +
      +
    • dm_control:
    • +
    • Gym
    • +
    • Brax
    • +
    • EnvPool
    • +
    • Jumanji
    • +
    • Habitat
    • +
    + +

    It also comes with many commonly used transforms and vectorized environment utilities that allow for a fast execution across simulation libraries. Please refer to the documentation for more detail.
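    For example, a minimal sketch of wrapping a Gym environment and stepping it through the common TorchRL interface (assuming gym and the Pendulum-v1 environment are installed) could look like this:

    from torchrl.envs.libs.gym import GymEnv

    env = GymEnv("Pendulum-v1")   # same interface regardless of the underlying simulator
    td = env.reset()              # returns a TensorDict holding the initial observation
    td = env.rand_step(td)        # sample a random action and step the environment
    print(td)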

    + +

    [Beta] Datacollectors

    + +

    Data collection in RL is made easy via the usage of single process or multiprocessed/distributed data collectors that execute the policy in the environment over a desired duration and deliver samples according to the user’s needs. These can be found in torchrl.collectors and are documented here.

    + +

    [Beta] Objective modules

    + +

    Several objective functions are included in torchrl.objectives, among which:

    + +
      +
    • A generic PPOLoss class and derived ClipPPOLoss and KLPPOLoss
    • +
    • SACLoss and DiscreteSACLoss
    • +
    • DDPGLoss
    • +
    • DQNLoss
    • +
    • REDQLoss
    • +
    • A2CLoss
    • +
    • TD3Loss
    • +
    • ReinforceLoss
    • +
    • Dreamer
    • +
    + +

    Vectorized value function operators also appear in the library. Check the documentation here.

    + +

    [Beta] Models and exploration strategies

    + +

    We provide multiple models, modules and exploration strategies. Get a detailed description in the doc.

    + +

    [Beta] Composable replay buffer

    + +

    A composable replay buffer class is provided that can be used to store data in multiple contexts, including single- and multi-agent, on- and off-policy, and many more. Components include:

    + +
      +
    • Storages (list, physical or memory-based contiguous storages)
    • +
    • Samplers (Prioritized, sampler without repetition)
    • +
    • Writers
    • +
    • Possibility to add transforms
    • +
    + +

    Replay buffers and other data utilities are documented here.

    + +

    [Beta] Logging tools and trainer

    + +

    We support multiple logging tools including tensorboard, wandb and mlflow.

    + +

    We provide a generic Trainer class that allows for easy code recycling and checkpointing.

    + +

    These features are documented here.

    + +

    TensorDict

    + +

    TensorDict is a new data carrier for PyTorch.

    + +

    [Beta] TensorDict: specialized dictionary for PyTorch

    + +

    TensorDict allows you to execute many common operations across batches of tensors carried by a single container. TensorDict supports many shape and device or storage operations, and can readily be used in distributed settings. Check the documentation to know more.
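    A minimal sketch of the idea: a single container carries a batch of tensors, and shape or device operations apply to every entry at once.

    import torch
    from tensordict import TensorDict

    td = TensorDict(
        {"obs": torch.randn(4, 3, 84, 84), "reward": torch.zeros(4, 1)},
        batch_size=[4],
    )

    first = td[0]              # indexing slices every tensor along the batch dimension
    flat = td.reshape(2, 2)    # shape operations are applied container-wide
    if torch.cuda.is_available():
        td = td.to("cuda")     # device moves apply to all entries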

    + +

    [Beta] @tensorclass: a dataclass for PyTorch

    + +

    Like TensorDict, tensorclass provides the opportunity to write dataclasses with built-in torch features such as shape or device operations.
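    A short sketch, assuming tensorclass is importable from the tensordict package in your version:

    import torch
    from tensordict import tensorclass

    @tensorclass
    class ImageBatch:
        images: torch.Tensor
        labels: torch.Tensor

    batch = ImageBatch(
        images=torch.randn(8, 3, 32, 32),
        labels=torch.randint(0, 10, (8,)),
        batch_size=[8],
    )

    half = batch[:4]   # shape and device operations behave like TensorDict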

    + +

    [Beta] tensordict.nn: specialized modules for TensorDict

    + +

    The tensordict.nn module provides specialized nn.Module subclasses that make it easy to build arbitrarily complex graphs that can be executed with TensorDict inputs. It is compatible with the latest PyTorch features such as functorch, torch.fx and torch.compile.

    + +

    TorchRec

    + +

    [Beta] KeyedJaggedTensor All-to-All Redesign and Input Dist Fusion

    + +

    We observed a performance regression due to a bottleneck in sparse data distribution for models that have multiple, large KJTs to redistribute.

    + +

    To combat this, we altered the comms pattern to transport the minimum data required in the initial collective to support the collective calls for the actual KJT tensor data. Sending this ‘splits’ data in the initial collective means more data is transmitted over the comms stream overall, but the CPU is blocked for significantly less time, leading to better overall QPS.

    + +

    Furthermore, we altered the TorchRec train pipeline to group the initial collective calls for the splits together before launching the more expensive KJT tensor collective calls. This fusion minimizes the CPU blocked time as launching each subsequent input distribution is no longer dependent on the previous input distribution.

    + +

    With this feature, variable batch sizes are now natively supported across ranks. These features are documented here.

    + +

    TorchVision

    + +

    [Beta] Extending TorchVision’s Transforms to Object Detection, Segmentation & Video tasks

    + +

    TorchVision is extending its Transforms API! Here is what’s new:

    + +
      +
    • You can use them not only for Image Classification but also for Object Detection, Instance & Semantic Segmentation and Video Classification.
    • +
    • You can use new functional transforms for transforming Videos, Bounding Boxes and Segmentation Masks.
    • +
    + +

    Learn more about these new transforms from our docs, and submit any feedback in our dedicated issue.
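    As a rough sketch of transforming an image together with its bounding boxes, the snippet below uses the 0.15 beta API; the datapoints namespace and the argument names shown here (format, spatial_size) reflect that beta and may change in later releases.

    import torch
    from torchvision import datapoints
    from torchvision.transforms import v2

    transforms = v2.Compose([
        v2.RandomHorizontalFlip(p=0.5),
        v2.Resize(size=(224, 224), antialias=True),
    ])

    image = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
    boxes = datapoints.BoundingBox(
        torch.tensor([[10, 10, 100, 100]]),
        format="XYXY",
        spatial_size=(480, 640),
    )

    out_image, out_boxes = transforms(image, boxes)  # boxes are updated consistently with the image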

    + +

    TorchText

    + +

    [Beta] Adding scriptable T5 and Flan-T5 to the TorchText library with incremental decoding support!

    + +

    TorchText has added the T5 model architecture with pre-trained weights for both the original T5 paper and Flan-T5. The model is fully torchscriptable and features an optimized multiheaded attention implementation. We include several examples of how to utilize the model including summarization, classification, and translation.
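    A brief sketch of how the bundle-style API is typically used for summarization follows; the bundle name (T5_BASE_GENERATION), the prototype GenerationUtils helper, and the generate arguments shown are assumptions based on the TorchText tutorial pattern, so please check the linked docs for the authoritative interface.

    from torchtext.models import T5_BASE_GENERATION              # assumed bundle name
    from torchtext.prototype.generate import GenerationUtils     # prototype namespace; may move

    bundle = T5_BASE_GENERATION
    transform = bundle.transform()   # text -> token ids
    model = bundle.get_model()
    model.eval()

    generator = GenerationUtils(model)
    tokens = transform(["summarize: PyTorch 2.0 ships new library updates across domains."])
    output = generator.generate(tokens, eos_idx=1, num_beams=1)  # eos_idx for T5 is an assumption
    print(transform.decode(output.tolist()))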

    + +

    For more details, please refer to our docs.

    + +

    TorchX

    + +

    TorchX is moving to community supported mode. More details will be coming in at a later time.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-library-updates/index.html b/blog/new-library-updates/index.html new file mode 100644 index 000000000000..c07483e10cca --- /dev/null +++ b/blog/new-library-updates/index.html @@ -0,0 +1,843 @@ + + + + + + + + + + + + + New Library Updates in PyTorch 2.1 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 04, 2023

    +

    + New Library Updates in PyTorch 2.1 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Summary

    + +

    We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 2.1 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. 

    + +

    Along with 2.1, we are also releasing a series of beta updates to the PyTorch domain libraries including TorchAudio and TorchVision. Please find the list of the latest stable versions and updates below.

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Latest Stable Library Versions (Full List)*
    TorchArrow 0.1.0 | TorchRec 0.5.0 | TorchVision 0.16
    TorchAudio 2.1 | TorchServe 0.8.2 | TorchX 0.5.0
    TorchData 0.7.0 | TorchText 0.16.0 | PyTorch on XLA Devices 1.14
    + +

    *To see prior versions or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’.

    + +

    TorchAudio

    + +

    TorchAudio v2.1 introduces the following new features and backward-incompatible changes:

    + +

    [Beta] A new API to apply filter, effects and codec

    + +

    `torchaudio.io.AudioEffector` can apply filters, effects and encodings to waveforms in online/offline fashion. You can use it as a form of augmentation.

    + +

    Please refer to https://pytorch.org/audio/2.1/tutorials/effector_tutorial.html for the usage and examples.
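    A small sketch of applying an effect with AudioEffector is shown below; the effect string follows FFmpeg filter syntax, the waveform is a random placeholder laid out as (time, channels), and the exact constructor arguments may vary by version.

    import torch
    from torchaudio.io import AudioEffector

    sample_rate = 16000
    waveform = torch.randn(sample_rate, 1)            # placeholder: 1 second, 1 channel

    effector = AudioEffector(effect="lowpass=f=300")  # FFmpeg-style filter description
    filtered = effector.apply(waveform, sample_rate)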

    + +

    [Beta] Tools for Forced alignment

    + +

    New functions and a pre-trained model for forced alignment were added. `torchaudio.functional.forced_align` computes alignment from an emission and `torchaudio.pipelines.MMS_FA` provides access to the model trained for multilingual forced alignment in MMS: Scaling Speech Technology to 1000+ languages project.

    + +

    Please refer to https://pytorch.org/audio/2.1/tutorials/ctc_forced_alignment_api_tutorial.html for the usage of `forced_align` function, and https://pytorch.org/audio/2.1/tutorials/forced_alignment_for_multilingual_data_tutorial.html for how one can use `MMS_FA` to align transcript in multiple languages.
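    A condensed sketch of the forced-alignment flow with the MMS_FA pipeline follows; the waveform is a placeholder, details such as device handling and the star token are omitted, and whether the emission already has log-softmax applied depends on the pipeline, so see the linked tutorials for the full, authoritative flow.

    import torch
    import torchaudio
    import torchaudio.functional as F

    bundle = torchaudio.pipelines.MMS_FA
    model = bundle.get_model()
    dictionary = bundle.get_dict()                      # maps characters to token ids

    waveform = torch.randn(1, 16000)                    # placeholder 1-second clip at 16 kHz
    transcript = "hello world".split()

    with torch.no_grad():
        emission, _ = model(waveform)

    tokens = [dictionary[c] for word in transcript for c in word]
    targets = torch.tensor([tokens], dtype=torch.int32)

    alignments, scores = F.forced_align(emission, targets, blank=0)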

    + +

    [Beta] TorchAudio-Squim : Models for reference-free speech assessment

    + +

    Model architectures and pre-trained models from the paper TorchAudio-Squim: Reference-less Speech Quality and Intelligibility measures in TorchAudio were added.

    + +

    You can use the pre-trained models `torchaudio.pipelines.SQUIM_SUBJECTIVE` and `torchaudio.pipelines.SQUIM_OBJECTIVE`. They can estimate the various speech quality and intelligibility metrics (e.g. STOI, wideband PESQ, Si-SDR, and MOS). This is helpful when evaluating the quality of speech generation models, such as Text-to-Speech (TTS).

    + +

    Please refer to https://pytorch.org/audio/2.1/tutorials/squim_tutorial.html for the details.
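    A short sketch of estimating objective metrics with the SQUIM pipeline; the waveform is a random placeholder here, so the numbers themselves are meaningless.

    import torch
    import torchaudio

    model = torchaudio.pipelines.SQUIM_OBJECTIVE.get_model()

    waveform = torch.randn(1, 16000)              # placeholder speech clip at 16 kHz
    with torch.no_grad():
        stoi, pesq, si_sdr = model(waveform)      # estimated STOI, wideband PESQ, Si-SDR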

    + +

    [Beta] CUDA-based CTC decoder

    + +

    `torchaudio.models.decoder.CUCTCDecoder` performs CTC beam search on CUDA devices. The beam search is fast and eliminates the need to move data from the CUDA device to the CPU when performing automatic speech recognition. With PyTorch’s CUDA support, it is now possible to perform the entire speech recognition pipeline on CUDA.

    + +

    Please refer to https://pytorch.org/audio/2.1/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.html for the details.

    + +

    [Prototype] Utilities for AI music generation

    + +

    We are working to add utilities that are relevant to music AI. Since the last release, the following APIs were added to the prototype.

    + +

    Please refer to respective documentation for the usage.

    + + +

    New recipes for training models

    + +

    Recipes for Audio-visual ASR, multi-channel DNN beamforming and TCPGen context-biasing were added.

    + +

    Please refer to the recipes

    + + +

    Update to FFmpeg support

    + +

    The version of supported FFmpeg libraries was updated. TorchAudio v2.1 works with FFmpeg 6, 5 and 4.4. Support for 4.3, 4.2 and 4.1 has been dropped.

    + +

    Please refer to https://pytorch.org/audio/2.1/installation.html#optional-dependencies for the details of the new FFmpeg integration mechanism.

    + +

    Update to libsox integration

    + +

    TorchAudio now depends on a libsox installation that is separate from torchaudio. The SoX I/O backend no longer supports file-like objects (these are still supported by the FFmpeg backend and soundfile).

    + +

    Please refer to https://pytorch.org/audio/2.1/installation.html#optional-dependencies for the details.

    + +

    TorchRL

    + +

    Our RLHF components make it easy to build an RLHF training loop with limited RL knowledge. TensorDict enables an easy interaction between datasets (e.g., Hugging Face datasets) and RL models. The new algorithms we provide deliver a wide range of solutions for offline RL training, which is more data efficient.

    + +

    Through RoboHive and IsaacGym, TorchRL now provides a built-in interface with hardware (robots), tying training at scale with policy deployment on device. Thanks to SMAC, VMAS, and PettingZoo and related MARL-oriented losses, TorchRL is now fully capable of training complex policies in multi-agent settings.

    + +

    New algorithms

    +
      +
    • [BETA] We integrate some RLHF components and examples: we provide building blocks for data formatting in RL frameworks, reward model design, specific transforms that enable efficient learning (eg. KL correction) and training scripts
    • +
    • [Stable] New algorithms include Decision Transformers, CQL, and multi-agent losses such as MAPPO and QMixer. New features include [Stable] new transforms such as Visual Cortex 1 (VC1), a foundational model for RL.
    • +
    • We widened the panel of libraries covered by TorchRL: +
        +
      • [Beta] IsaacGym, a powerful GPU-based simulator that allows interaction and rendering of thousands of vectorized environments by NVIDIA.
      • +
      • [Stable] PettingZoo, a multi-agent library by the Farama Foundation.
      • +
      • [Stable] SMAC-v2, the new Starcraft Multi-agent simulator
      • +
      • [Stable] RoboHive, a collection of environments/tasks simulated with the MuJoCo physics engine.
      • +
      +
    • +
    + +

    Performance improvements

    + +

    We provide faster data collection through refactoring and integration of SB3 and Gym asynchronous environments execution. We also made our value functions faster to execute.

    + +

    TorchRec

    + +

    [Prototype] Zero Collision / Managed Collision Embedding Bags

    + +

    A common constraint in recommender systems is that the sparse id input range is larger than the number of embeddings the model can learn for a given parameter size. To resolve this issue, the conventional solution is to hash sparse ids into the same size range as the embedding table. This ultimately leads to hash collisions, with multiple sparse ids sharing the same embedding space. We have developed a performant alternative algorithm that attempts to address this problem by tracking the N most common sparse ids and ensuring that they have a unique embedding representation. The module is defined here and an example can be found here.

    + +

    [Prototype] UVM Caching - Prefetch Training Pipeline

    + +

    For tables where on-device memory is insufficient to hold the entire embedding table, it is common to leverage a caching architecture where part of the embedding table is cached on device and the full embedding table is kept in host memory (typically DDR SDRAM). However, in practice, cache misses are common and hurt performance due to the relatively high latency of going to host memory. Building on TorchRec’s existing data pipelining, we developed a new Prefetch Training Pipeline to avoid these cache misses by prefetching the relevant embeddings for the upcoming batch from host memory, effectively eliminating cache misses in the forward path.

    + +

    TorchVision 

    +

    Transforms and augmentations

    + +

    Major speedups

    + +

    The new transforms in torchvision.transforms.v2 are now 10%-40% faster than before! This is mostly achieved thanks to 2X-4X improvements made to v2.Resize(), which now supports native uint8 tensors for Bilinear and Bicubic mode. Output results are also now closer to PIL’s! Check out our performance recommendations to learn more.

    + +

    Additionally, torchvision now ships with libjpeg-turbo instead of libjpeg, which should significantly speed up the jpeg decoding utilities (read_image, decode_jpeg), and avoid compatibility issues with PIL.

    + +

    CutMix and MixUp

    + +

    Long-awaited support for the CutMix and MixUp augmentations is now here! Check our tutorial to learn how to use them.
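    A minimal sketch of applying CutMix or MixUp to a batch (assuming a 10-class problem): both transforms operate on batched images and integer labels and return soft labels.

    import torch
    from torchvision.transforms import v2

    NUM_CLASSES = 10
    cutmix_or_mixup = v2.RandomChoice([v2.CutMix(num_classes=NUM_CLASSES),
                                       v2.MixUp(num_classes=NUM_CLASSES)])

    images = torch.rand(8, 3, 224, 224)               # placeholder batch
    labels = torch.randint(0, NUM_CLASSES, (8,))

    images, labels = cutmix_or_mixup(images, labels)  # labels become (8, NUM_CLASSES) soft targets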

    + +

    Towards stable V2 transforms

    + +

    In the previous release 0.15 we BETA-released a new set of transforms in torchvision.transforms.v2 with native support for tasks like segmentation, detection, or videos. We have now stabilized the design decisions of these transforms and made further improvements in terms of speedups, usability, new transforms support, etc.

    + +

    We’re keeping the torchvision.transforms.v2 and torchvision.tv_tensors namespaces as BETA until 0.17 out of precaution, but we do not expect disruptive API changes in the future.

    + +

    Whether you’re new to Torchvision transforms, or you’re already experienced with them, we encourage you to start with Getting started with transforms v2 in order to learn more about what can be done with the new v2 transforms.

    + +

    Browse our main docs for general information and performance tips. The available transforms and functionals are listed in the API reference. Additional information and tutorials can also be found in our example gallery, e.g. Transforms v2: End-to-end object detection/segmentation example or How to write your own v2 transforms.

    + +

    [BETA] MPS support

    + +

    The nms and roi-align kernels (roi_align, roi_pool, ps_roi_align, ps_roi_pool) now support MPS. Thanks to Li-Huai (Allan) Lin for this contribution!

    + +

    TorchX

    + +

    Schedulers

    +
      +
    • +

      [Prototype] Kubernetes MCAD Scheduler: Integration for easily scheduling jobs on Multi-Cluster-Application-Dispatcher (MCAD)

      +
    • +
    • +

      AWS Batch 

      + +
        +
      • Add privileged option to enable running containers on EFA enabled instances with elevated networking permissions
      • +
      +
    • +
    + +

    TorchX Tracker

    +
      +
    • [Prototype] MLFlow backend for TorchX Tracker: in addition to fsspec based tracker, TorchX can use MLFlow instance to track metadata/experiments 
    • +
    + +

    Components

    +
      +
    • dist.spmd component to support Single-Process-Multiple-Data style applications
    • +
    + +

    Workspace

    +
      +
    • Add ability to access image and workspace path from Dockerfile while building docker workspace
    • +
    + +

    The release also includes a number of other bug fixes.

    + +

    To learn more about TorchX, visit https://pytorch.org/torchx/latest/

    + +

    TorchText and TorchData

    + +

    As of September 2023 we have paused active development of TorchText and TorchData as we re-evaluate how we want to serve the needs of the community in this space.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/one-year-pytorch/index.html b/blog/one-year-pytorch/index.html new file mode 100644 index 000000000000..e5071fd0dfbd --- /dev/null +++ b/blog/one-year-pytorch/index.html @@ -0,0 +1,667 @@ + + + + + + + + + + + + + One Year of PyTorch Foundation | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    September 12, 2023

    +

    + One Year of PyTorch Foundation +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    It’s been one year since we announced the formation of the PyTorch Foundation! 🎉

    + +

    In its inaugural year, the PyTorch Foundation made a significant impact by launching PyTorch 2.0, growing contributors and adding new member companies. We’re grateful to our founding members for their support to move the foundation forward.

    + +

    A few milestones in the past year include:

    + +

    💻 Over 600,000 repositories on GitHub
    +✅ 60% of AI implementations choosing PyTorch
    +📈 More than 20% year over year growth in new repositories
    +🤝 Over 12,000 commits since last year

    + +

    And a look at what the foundation has been up to this past year:

    + +

    PyTorch project timeline

    + +

    We look forward to growing our community for the years to come through supporting our contributors, democratizing the AI field, and creating new innovations.

    + +

    We invite you to join us at this year’s PyTorch Conference on October 16-17 in San Francisco. Conference registration is filling up quickly, so take advantage of your chance to be part of this exciting event.

    + +

    Join us to stay informed about the latest announcements and have the opportunity to connect with both the founding members and new additions to the PyTorch community.

    + +

    With thanks and gratitude,
    +The PyTorch Foundation Team

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml/index.html b/blog/openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml/index.html new file mode 100644 index 000000000000..8dfde8497443 --- /dev/null +++ b/blog/openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml/index.html @@ -0,0 +1,690 @@ + + + + + + + + + + + + + OpenMined and PyTorch partner to launch fellowship funding for privacy-preserving ML community | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Andrew Trask (OpenMined/U.Oxford), Shubho Sengupta, Laurens van der Maaten, Joe Spisak + +

    +
    + +
    + +

    Many applications of machine learning (ML) pose a range of security and privacy challenges. In particular, users may not be willing or allowed to share their data, which prevents them from taking full advantage of ML platforms like PyTorch. To take the field of privacy-preserving ML (PPML) forward, OpenMined and PyTorch are announcing plans to jointly develop a combined platform to accelerate PPML research as well as new funding for fellowships.

    + +

    There are many techniques attempting to solve the problem of privacy in ML, each at various levels of maturity. These include (1) homomorphic encryption, (2) secure multi-party computation, (3) trusted execution environments, (4) on-device computation, (5) federated learning with secure aggregation, and (6) differential privacy. Additionally, a number of open source projects implementing these techniques were created with the goal of enabling research at the intersection of privacy, security, and ML. Among them, PySyft and CrypTen have taken an “ML-first” approach by presenting an API that is familiar to the ML community, while masking the complexities of privacy and security protocols. We are excited to announce that these two projects are now collaborating closely to build a mature PPML ecosystem around PyTorch.

    + +

    Additionally, to bolster this ecosystem and take the field of privacy preserving ML forward, we are also calling for contributions and supporting research efforts on this combined platform by providing funding to support the OpenMined community and the researchers that contribute, build proofs of concepts and desire to be on the cutting edge of how privacy-preserving technology is applied. We will provide funding through the RAAIS Foundation, a non-profit organization with a mission to advance education and research in artificial intelligence for the common good. We encourage interested parties to apply to one or more of the fellowships listed below.

    + +

    Tools Powering the Future of Privacy-Preserving ML

    + +

    The next generation of privacy-preserving open source tools enable ML researchers to easily experiment with ML models using secure computing techniques without needing to be cryptography experts. By integrating with PyTorch, PySyft and CrypTen offer familiar environments for ML developers to research and apply these techniques as part of their work.

    + +

    PySyft is a Python library for secure and private ML developed by the OpenMined community. It is a flexible, easy-to-use library that makes secure computation techniques like multi-party computation (MPC) and privacy-preserving techniques like differential privacy accessible to the ML community. It prioritizes ease of use and focuses on integrating these techniques into end-user use cases like federated learning with mobile phones and other edge devices, encrypted ML as a service, and privacy-preserving data science.

    + +

    CrypTen is a framework built on PyTorch that enables private and secure ML for the PyTorch community. It is the first step along the journey towards a privacy-preserving mode in PyTorch that will make secure computing techniques accessible beyond cryptography researchers. It currently implements secure multiparty computation with the goal of offering other secure computing backends in the near future. Other benefits to ML researchers include:

    + +
      +
    • It is ML first and presents secure computing techniques via a CrypTensor object that looks and feels exactly like a PyTorch Tensor. This allows the user to use automatic differentiation and neural network modules akin to those in PyTorch.
    • +
    • The framework focuses on scalability and performance and is built with real-world challenges in mind.
    • +
    + +

    The focus areas for CrypTen and PySyft are naturally aligned and complement each other. The former focuses on building support for various secure and privacy preserving techniques on PyTorch through an encrypted tensor abstraction, while the latter focuses on end user use cases like deployment on edge devices and a user friendly data science platform.

    + +

    Working together will enable PySyft to use CrypTen as a backend for encrypted tensors. This can lead to an increase in performance for PySyft and the adoption of CrypTen as a runtime by PySyft’s userbase. In addition to this, PyTorch is also adding cryptography friendly features such as support for cryptographically secure random number generation. Over the long run, this allows each library to focus exclusively on its core competencies while enjoying the benefits of the synergistic relationship.

    + +

    New Funding for OpenMined Contributors

    + +

    We are especially excited to announce that the PyTorch team has invested $250,000 to support OpenMined in furthering the development and proliferation of privacy-preserving ML. This gift will be facilitated via the RAAIS Foundation and will be available immediately to support paid fellowship grants for the OpenMined community.

    + +

    How to get involved

    + +

    Thanks to the support from the PyTorch team, OpenMined is able to offer three different opportunities for you to participate in the project’s development. Each of these fellowships furthers our shared mission to lower the barrier-to-entry for privacy-preserving ML and to create a more privacy-preserving world.

    + +

    Core PySyft CrypTen Integration Fellowships

    + +

    During these fellowships, we will integrate CrypTen as a supported backend for encrypted computation in PySyft. This will allow for the high-performance, secure multi-party computation capabilities of CrypTen to be used alongside other important tools in PySyft such as differential privacy and federated learning. For more information on the roadmap and how to apply for a paid fellowship, check out the project’s call for contributors.

    + +

    Federated Learning on Mobile, Web, and IoT Devices

    + +

    During these fellowships, we will be extending PyTorch with the ability to perform federated learning across mobile, web, and IoT devices. To this end, a PyTorch front-end will be able to coordinate across federated learning backends that run in Javascript, Kotlin, Swift, and Python. Furthermore, we will also extend PySyft with the ability to coordinate these backends using peer-to-peer connections, providing low latency and the ability to run secure aggregation as a part of the protocol. For more information on the roadmap and how to apply for a paid fellowship, check out the project’s call for contributors.

    + +

    Development Challenges

    + +

    Over the coming months, we will issue regular open competitions for increasing the performance and security of the PySyft and PyGrid codebases. For performance-related challenges, contestants will compete (for a cash prize) to make a specific PySyft demo (such as federated learning) as fast as possible. For security-related challenges, contestants will compete to hack into a PyGrid server. The first to demonstrate their ability will win the cash bounty! For more information on the challenges and to sign up to receive emails when each challenge is opened, sign up here.

    + +

    To apply, select one of the above projects and identify a role that matches your strengths!

    + +

    Cheers,

    + +

    Andrew, Laurens, Joe, and Shubho

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/optimize-llms/index.html b/blog/optimize-llms/index.html new file mode 100644 index 000000000000..254165119dc7 --- /dev/null +++ b/blog/optimize-llms/index.html @@ -0,0 +1,790 @@ + + + + + + + + + + + + + Optimize LLMs for Efficiency & Sustainability | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    February 19, 2025

    +

    + Optimize LLMs for Efficiency & Sustainability +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Zach Lasiuk, Arm + +

    +

    The rapid growth of large language model (LLM) applications is linked to rapid growth in energy demand. According to the International Energy Agency (IEA), data center electricity consumption is projected to roughly double by 2026 primarily driven by AI. This is due to the energy-intensive training requirements for massive LLMs – however, the increase in AI Inferencing workloads also plays a role. For example, compared with traditional search queries, a single AI inference can consume about 10x more energy.

    + +

    As developers, we directly affect how energy-intensive our AI solution is. There are technical decisions we can take to help make our AI solution more environmentally sustainable. Minimizing compute to deliver LLM solutions is not the only requirement for creating sustainable AI use; systemic changes, such as policy interventions, may be needed, but utilizing energy-efficient solutions is an important factor and an impactful intervention we can adopt right away.

    + +

    With that said, minimizing your LLM inference cloud compute requirements also leads to reducing your cloud bill and makes your app more energy efficient, creating a win-win situation. In this blog, we will take you through the steps to creating an LLM chatbot by optimizing and deploying a Llama 3.1 model on PyTorch, quantifying the computational efficiency benefits of specific architecture decisions.

    + +

    What will we evaluate?

    + +

    For this blog, our goal is to create an immersive fantasy storytelling app where users enter a fantasy world by chatting with a Generative AI. The first location is the land of Wicked, allowing people to role-play walking around the Emerald City and observe the sights and scenes in real-time. We’ll implement this via a chatbot and a custom system prompt.

    + +

    We will be evaluating LLM performance on CPUs. You can see the advantages of CPU vs GPU inference here. In general, leveraging CPUs in the cloud for LLM inference is a great choice for models around 10B parameters or less like the Llama series.

    + +

    We will also be using Arm-based CPUs, specifically the AWS Graviton series. Based on studies, Arm-based Graviton3 servers can provide 67.6 percent lower built-in workload carbon intensity. While this study was based on a simulation, it is an excellent start to showing the possibilities for minimizing our app’s energy requirements.

    + +

    First, you’ll see how to run a simple LLM chatbot on PyTorch, then explore three techniques to optimize your application for computational efficiency:

    + +
      +
    1. Model optimization: Utilizing 4-bit quantization and added KleidiAI kernels.
    2. +
    3. Shortcut optimization: Implementing a vector database to handle common queries.
    4. +
    5. Architecture optimization: Adopting a serverless architecture.
    6. +
    + +

    Let’s get started.

    + +

    Run Llama-3.1 via PyTorch on AWS Graviton4

    + +

    To maximize energy efficiency, we will only use the minimum server resources needed to support this LLM chatbot. For this Llama-3.1 8-billion-parameter model, 16 cores, 64GB of RAM, and 50GB of disk space are required. We will use the r8g.4xlarge Graviton4 instance running Ubuntu 24.04, as it meets these specifications.

    + +

    Spin up this EC2 instance, connect to it, and start installing the requirements:

    + +
        sudo apt-get update
    +    sudo apt install gcc g++ build-essential python3-pip python3-venv google-perftools -y
    +
    + +

    Then install Torchchat, the library developed by the PyTorch team that enables running LLMs across devices:

    + +
        git clone https://github.com/pytorch/torchchat.git
    +    cd torchchat
    +    python3 -m venv .venv
    +    source .venv/bin/activate
    +    ./install/install_requirements.sh 
    +
    + +

    Next, install the Llama-3.1-8b model from Hugging Face through the CLI. You will first need to make a Hugging Face access token on your HF account. This will download the 16GB model to your instance, which may take a few minutes:

    + +
        pip install -U "huggingface_hub[cli]"
    +    huggingface-cli login
    +    	<enter your access token when prompted>
    +    python torchchat.py export llama3.1 --output-dso-path exportedModels/llama3.1.so --device cpu --max-seq-length 1024
    +
    + +

    Now you are ready to run the LLM model, adding a system prompt to be a guiding storyteller in the land of Wicked:

    + +
        LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPER=1 TORCHINDUCTOR_FREEZING=1 OMP_NUM_THREADS=16 python torchchat.py generate llama3.1 --device cpu --chat
    +
    + +

    Type ‘y’ to enter a system prompt and enter the following prompt:

    + +

    You are the guiding storyteller for a fantasy adventure application. Immerse users in the enchanting world of Wicked, guiding them through interactive, real-time experiences in the Emerald City. Describe vivid sights, dynamic scenes, and engage users in storytelling that feels alive and responsive. Allow users to make choices that shape their journey while maintaining the magical tone of the Wicked universe.

    + +

    Then enter your user query:

    + +

    I walk through the Emerald City gates and look up

    + +

    The output will show on the screen, taking about 7 seconds to generate the first token with less than 1 token per second.

    + +

    terminal

    + +

    This example took 245 seconds, or 4 minutes, to generate its complete reply—not very fast. The first optimization we’ll look at will speed up the LLM generation, reducing its computational footprint.

    + +

    Optimization 1: KleidiAI and Quantization

    + +

    Several optimizations are possible from the basic implementation above. The simplest and quickest one to do is to quantize the model from FP16 to INT4. This approach trades off some accuracy while cutting the model size from 16GB to about 4GB, increasing the inference speed in the process.

    + +

    Another common optimization comes in leveraging TorchAO (Torch Architecture Optimization), the PyTorch library that works seamlessly with TorchChat to enhance model performance through various quantization and sparsity methods.
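    As an illustration of the torchao API, the sketch below applies int8 weight-only quantization to a toy model; the actual recipe in the linked instructions uses 4-bit quantization together with KleidiAI kernels, so treat this as a flavor of the API rather than the exact configuration.

    import torch
    from torchao.quantization import quantize_, int8_weight_only

    model = torch.nn.Sequential(
        torch.nn.Linear(4096, 4096),
        torch.nn.Linear(4096, 4096),
    ).eval()

    quantize_(model, int8_weight_only())   # swaps Linear weights for quantized versions in place

    x = torch.randn(1, 4096)
    with torch.no_grad():
        y = model(x)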

    + +

    Lastly, we’ll use Arm KleidiAI optimizations. These are micro-kernels written in assembly that lead to significant performance improvements for LLM inference on Arm CPUs. You can read more about how KleidiAI kernels work if interested.

    + +

    To implement these optimizations, spin up a fresh EC2 instance and follow the instructions on how to run a Large Language Model (LLM) chatbot with PyTorch. When ready, run the model and enter the same system prompt and user query as above. You’ll get results that significantly speed up the inference: Less than 1 second to first token, and about 25 tokens per second.

    + +

    This cuts the inference time from 245 seconds to about 10 seconds. It also results in less power draw from your server, as it spends more time idle instead of running a power-hungry inference. All else being equal, this is a more carbon-friendly solution than the non-optimized app. The next two approaches go beyond model inference optimization, modifying the solution architecture to further reduce computational load.

    + +

    Optimization 2: FAISS to match database for common questions

    + +

    As stated in the introduction, model inferences are typically more computationally expensive than other search techniques. What if you could automatically respond to common user queries without performing an LLM inference? Using a query/response database is an option to bypass LLM inference and respond efficiently. For this interactive storytelling app, you can imagine common questions about specific characters, the world itself, and rules about what the chatbot is/is not capable of that can have pre-generated answers.

    + +

    However, a traditional exact-match database isn’t sufficient, as users can phrase the same query in many ways. For example, questions about the chatbot’s capabilities could all invite the same answer but be phrased differently:

    + +
      +
    • “What are you capable of?”
    • +
    • “Tell me what you can do.”
    • +
    • “How can I interact with you?”
    • +
    + +

    Implementing semantic search solves this issue by matching a user’s query to the most relevant pre-generated answer by understanding the user’s intent. The FAISS library is a great option to implement semantic search.
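    A minimal sketch of this shortcut layer follows; the sentence-transformers model name and the similarity threshold are illustrative choices, not part of the original app.

    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer("all-MiniLM-L6-v2")   # any sentence-embedding model works

    canned_questions = [
        "What are you capable of?",
        "Who is the wizard?",
    ]
    canned_answers = [
        "I can guide you through the Emerald City and answer questions about its inhabitants.",
        "The wizard rules the Emerald City from his palace at its center.",
    ]

    embeddings = embedder.encode(canned_questions, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])        # inner product equals cosine on normalized vectors
    index.add(np.asarray(embeddings, dtype="float32"))

    query = embedder.encode(["Tell me what you can do."], normalize_embeddings=True)
    scores, ids = index.search(np.asarray(query, dtype="float32"), k=1)

    if scores[0][0] > 0.8:                                # threshold is a tunable assumption
        reply = canned_answers[ids[0][0]]
    else:
        reply = None                                      # fall back to the LLM inference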

    + +

    The computational savings of this approach depends on three factors:

    + +
      +
    1. Percentage of user queries that can be serviced by semantic search instead of LLM.
    2. +
    3. Computational cost of running the LLM inference.
    4. +
    5. Computational cost of running the semantic search.
    6. +
    + +

    With the savings equation being:

    + +
        Computational_savings = (% of queries) * (LLM_cost – search_cost).
    +
    + +

    This type of architecture makes sense in a few situations. One is if your system receives common queries with many repeat questions. Another is large-scale systems with hundreds of thousands of incoming queries, where small percentage savings add up to meaningful changes. Lastly, it helps when your LLM inference is very computationally expensive compared to the search cost, particularly with larger-parameter models.

    + +

    The final optimization approach is transitioning from server to serverless.

    + +

    Optimization 3: Serverless approach

    + +

    Serverless architectures are popular for many reasons, one being that you only pay for active compute time, eliminating the cost of idle servers. Idle servers require a non-trivial amount of power to keep on, wasting energy while waiting.

    + +

    This cost efficiency translates into an inherently more environmentally friendly architecture, as it reduces wasteful energy consumption. Further, multiple applications share the underlying physical infrastructure, improving resource efficiency.

    + +

    To set up your own serverless chatbot, you first need to containerize the quantized Llama-3.1-8b model, with the TorchChat, TorchAO, and Arm KleidiAI optimizations, along with a Python script containing a Lambda entry function lambda_handler. One deployment option is to upload your container to AWS ECR and attach the container to your Lambda function. Then set up an API Gateway WebSocket or similar to interact with your Lambda through an API.
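    A bare-bones sketch of what that Lambda entry point might look like follows; generate_reply is a hypothetical stub standing in for the torchchat-based generation call, which in a real deployment would load the quantized model once at module import time.

    import json

    def generate_reply(prompt: str) -> str:
        # Hypothetical placeholder for the torchchat-based generation call.
        return f"[stubbed reply to: {prompt}]"

    def lambda_handler(event, context):
        body = json.loads(event.get("body") or "{}")
        user_query = body.get("query", "")
        return {
            "statusCode": 200,
            "body": json.dumps({"reply": generate_reply(user_query)}),
        }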

    + +

    There are two notable limitations to using a serverless architecture to host your LLM, the first being token generation speed. Recall that the server-based approach delivered about 25 tokens/second with KleidiAI optimizations. The serverless approach is an order of magnitude slower, which we measured at about 2.5 tokens/second. This limitation mainly results from Lambda functions deploying onto Graviton2 servers. When deployment moves to CPUs with more SIMD channels, like Graviton3 and Graviton4, the tokens/second should increase over time. Learn more about architecture optimizations introduced in Graviton3 via the Arm Neoverse-V1 CPU here.

    + +

    This slower speed restricts the viable use cases for serverless LLM architectures, but there are certain cases where it can be seen as an advantage. In our use case of interactive storytelling, slowly revealing information creates a sense of immersion, building anticipation and mimicking real-time narration. Other use cases include:

    + +
      +
    • Guided meditation apps with slow, relaxing word delivery
    • +
    • A virtual friend engaging in thoughtful or therapeutic conversation.
    • +
    • Poetry generation or interactive art, where slow delivery creates a contemplative aesthetic.
    • +
    + +

    Users may have a better experience with slower token generation in the right applications. When prioritizing a more sustainable solution, restrictions end up becoming strengths. As an analogy, a common critique of modern movies is that an overreliance on visual effects leads to less compelling storylines than in older movies. The cost restrictions on VFX meant older movies had to craft captivating dialog, leveraging skillful camera angles and character positioning to fully engage viewers. Similarly, focusing on sustainable AI architectures can lead to more engaging, immersive experiences when done thoughtfully.

    + +

    The second serverless limitation on LLM inferences is the cold-start time of about 50 seconds. If implemented poorly, a user waiting 50 seconds with no alternative will likely leave the app. You can turn this limitation into a feature in our Wicked-based experience with several design tricks:

    + +
      +
    • Create a “prologue experience” where you guide users through hard-coded questions and answers, priming them for where they will land in Emerald City and collecting input to shape their upcoming experience.
    • +
    • Make the waiting period a countdown timer, revealing hard-coded text snippets of the story or world-building. A character, like the wizard, could communicate with the user with fragmented lines to build suspense and prime the user into the right mindset.
    • +
    • Create an audio intro with music from the movie or musical, along with rotating visuals to draw users into the atmosphere of the Wicked world.
    • +
    + +

    Thinking outside the box

    + +

    Implementing a sustainability-minded solution architecture includes and goes beyond optimizing your AI inferences. Understand how users will interact with your system, and right-size your implementation accordingly. Always optimizing for fast tokens per second or time to first token will hide opportunities for engaging features.

    + +

    With that said, you should be leveraging straightforward optimizations when possible. Using TorchAO and Arm KleidiAI micro-kernels is a great way to speed up your LLM chatbot. By combining creative solution architectures and optimizing where possible, you can build more sustainable LLM-based applications. Happy coding!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/optimized-pytorch-w-graviton/index.html b/blog/optimized-pytorch-w-graviton/index.html new file mode 100644 index 000000000000..3b5b4bbb69d9 --- /dev/null +++ b/blog/optimized-pytorch-w-graviton/index.html @@ -0,0 +1,808 @@ + + + + + + + + + + + + + Optimized PyTorch 2.0 Inference with AWS Graviton processors | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Sunita Nadampalli from AWS & Ankith Gunapal from Meta + +

    +

    New generations of CPUs offer significant performance improvement in machine learning (ML) inference due to specialized built-in instructions. Combined with their flexibility, high speed of development, and low operating cost, these general-purpose processors offer an alternative ML inference solution to other existing hardware solutions.

    + +

    AWS, Arm, Meta, and others helped optimize the performance of PyTorch 2.0 inference for Arm-based processors. As a result, we are delighted to announce that Arm-based AWS Graviton instance inference performance for PyTorch 2.0 is up to 3.5 times the speed for ResNet-50 compared to the previous PyTorch release, and up to 1.4 times the speed for BERT, making Graviton-based instances the fastest compute optimized instances on AWS for these models (see the following graph).

    + +

    Relative speed improvement achieved by upgrading PyTorch to 2.0

    + +

    Image 1: Relative speed improvement achieved by upgrading from PyTorch version 1.13 to 2.0 (higher is better). The performance is measured on c7g.4xlarge instances.

    + +

    As shown in the next graph, we measured up to 50% cost savings for PyTorch inference with Graviton3-based c7g instances across Torch Hub ResNet-50 and multiple Hugging Face models compared to comparable x86-based compute optimized Amazon EC2 instances. For that graph, we first measured the cost per million inference for the five instance types. Then, we normalized the cost per million inference results to a c5.4xlarge instance, which is the baseline measure of “1” on the Y-axis of the chart.

    + +

    Relative cost of PyTorch inference running on different AWS instances

    + +

    Image 2: Relative cost of PyTorch inference running on different AWS instances (lower is better).
    Source: AWS ML Blog on Graviton PyTorch2.0 inference performance.

    + +

    Similar to the preceding inference cost comparison graph, the following graph shows the model p90 latency for the same five instance types. We normalized the latency results to the c5.4xlarge instance, which is the baseline measure of “1” on the Y-axis of the chart. The c7g.4xlarge (AWS Graviton3) model inference latency is up to 50% better than the latencies measured on c5.4xlarge, c6i.4xlarge, and c6a.4xlarge.

    + +

    Relative latency (p90) of PyTorch inference running on different AWS instances

    + +

    Image 3: Relative latency (p90) of PyTorch inference running on different AWS instances (lower is better).
    Source: AWS ML Blog on Graviton PyTorch2.0 inference performance.

    + +

    Optimization details

    + +

    PyTorch supports Compute Library for the Arm® Architecture (ACL) GEMM kernels via the oneDNN backend (previously called “MKL-DNN”) for AArch64 platforms. The optimizations are primarily for PyTorch ATen CPU BLAS, ACL kernels for fp32 and bfloat16, and oneDNN primitive caching. There are no frontend API changes, so no changes are required at the application level to get these optimizations working on Graviton3-based instances.

    + +

    PyTorch level optimizations

    + +

    We extended the ATen CPU BLAS interface to accelerate more operators and tensor configurations via oneDNN backend for aarch64 platform. The following diagram highlights (in orange) the optimized components that improved the PyTorch inference performance on aarch64 platform.

    + +

    PyTorch software stack highlighting (in orange) the components optimized for inference performance improvement on AArch64 platform

    + +

    Image 4: PyTorch software stack highlighting (in orange) the components optimized for inference performance improvement on AArch64 platform

    + +

    ACL kernels and BFloat16 FPmath mode

    + +

The ACL library provides Neon and SVE optimized GEMM kernels for both fp32 and bfloat16 formats. These kernels improve SIMD hardware utilization and reduce end-to-end inference latencies. The bfloat16 support in Graviton3 allows efficient deployment of models trained using bfloat16, fp32, and Automatic Mixed Precision (AMP). Standard fp32 models use the bfloat16 kernels via the oneDNN FPmath mode without model quantization, providing up to two times faster performance compared to existing fp32 model inference without bfloat16 FPmath support. For more details on ACL GEMM kernel support, refer to the Arm Compute Library github.

    + +
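    Because the FPmath mode is driven by an environment variable, a standard fp32 inference script picks up the bfloat16 fast-math kernels without any code changes. As a minimal sketch (the script name below is a placeholder), oneDNN's verbose mode can additionally be used to confirm which primitives are dispatched at runtime:

    # Enable bfloat16 fast-math kernels for standard fp32 models (no code changes needed)
    export DNNL_DEFAULT_FPMATH_MODE=BF16

    # Optional: print the oneDNN primitives dispatched at runtime to verify
    # that the ACL bfloat16 kernels are actually being used
    DNNL_VERBOSE=1 python3 your_inference_script.py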

    Primitive Caching

    + +

    The following call sequence diagram shows how ACL operators are integrated into oneDNN backend. As shown in the diagram, ACL objects are handled as oneDNN resources instead of the primitive objects. This is because the ACL objects are stateful and mutable. Since the ACL objects are handled as resource objects, they are not cacheable with the default primitive caching feature supported in oneDNN. We implemented primitive caching at ideep operator level for “convolution”, “matmul” and “inner product” operators to avoid redundant GEMM kernel initialization and tensor allocation overhead.

    + +

    Call sequence diagram showing how the Compute Library for the Arm® Architecture (ACL) GEMM kernels are integrated into oneDNN backend

    + +

    Image 5: Call sequence diagram showing how the Compute Library for the Arm® Architecture (ACL) GEMM kernels are integrated into oneDNN backend

    + +

    How to take advantage of the optimizations

    + +

    Install the PyTorch 2.0 wheel from the official repo and set environment variables to enable the additional optimizations.

    + +
    # Install Python
    +sudo apt-get update
    +sudo apt-get install -y python3 python3-pip
    +
    +# Upgrade pip3 to the latest version
    +python3 -m pip install --upgrade pip
    +
    +# Install PyTorch and extensions
    +python3 -m pip install torch
    +python3 -m pip install torchvision torchaudio torchtext
    +
    +# Turn on Graviton3 optimization
    +export DNNL_DEFAULT_FPMATH_MODE=BF16
    +export LRU_CACHE_CAPACITY=1024
    +
    + +

    Running an inference

    + +

    You can use PyTorch torchbench to measure the CPU inference performance improvements, or to compare different instance types.

    + +
    # Pre-requisite:
    +# pip install PyTorch2.0 wheels and set the above mentioned environment variables
    +
    +# Clone PyTorch benchmark repo
    +git clone https://github.com/pytorch/benchmark.git
    +
    +# Setup ResNet-50 benchmark
    +cd benchmark
    +python3 install.py resnet50
    +
    +# Install the dependent wheels
    +python3 -m pip install numba
    +
    +# Run ResNet-50 inference in jit mode. On successful completion of the inference runs,
    +# the script prints the inference latency and accuracy results
    +python3 run.py resnet50 -d cpu -m jit -t eval --use_cosine_similarity
    +
    + +

    Performance Analysis

    + +

    Now, we will analyze the inference performance of ResNet-50 on a Graviton3-based c7g instance using the PyTorch profiler. We run the code below with both PyTorch 1.13 and PyTorch 2.0, and run the inference for a few iterations as a warmup before measuring the performance.

    + +
    # Turn on Graviton3 optimization
    +export DNNL_DEFAULT_FPMATH_MODE=BF16
    +export LRU_CACHE_CAPACITY=1024
    +
    + +
    import torch
    +from torchvision import models
    +sample_input = [torch.rand(1, 3, 224, 224)]
    +eager_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
    +model = torch.jit.script(eager_model, example_inputs=[sample_input, ])
    +
    +model = model.eval()
    +model = torch.jit.optimize_for_inference(model)
    +
    +with torch.no_grad():
    +    # warmup runs
    +    for i in range(10):
    +        model(*sample_input)
    +    prof = torch.profiler.profile(
    +      on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'), record_shapes=True, with_stack=True)
    +    # profile after warmup
    +    prof.start()
    +    model(*sample_input)
    +    prof.stop()
    +
    + +

    We use TensorBoard to view the profiler results and analyze model performance.

    + +

    Install the PyTorch Profiler TensorBoard plugin as follows:

    + +
    pip install torch_tb_profiler
    +
    + +

    Launch TensorBoard using:

    + +
    tensorboard --logdir=./logs
    +
    + +

    Open the following URL in the browser to view the profiler output. The profiler supports ‘Overview’, ‘Operator’, ‘Trace’ and ‘Module’ views to get insight into the inference execution.

    + +
    http://localhost:6006/#pytorch_profiler
    +
    + +

    The following diagram is the profiler ‘Trace’ view which shows the call stack along with the execution time of each function. In the profiler, we selected the forward() function to get the overall inference time. As shown in the diagram, the inference time for the ResNet-50 model on Graviton3-based c7g instance is around 3 times faster in PyTorch 2.0 compared to PyTorch 1.13.

    + +

    Profiler Trace view: Forward pass wall duration on PyTorch 1.13 and PyTorch 2.0

    + +

    Image 6: Profiler Trace view: Forward pass wall duration on PyTorch 1.13 and PyTorch 2.0

    + +

    The next diagram is the ‘Operator’ view which shows the list of PyTorch operators and their execution time. Similar to the preceding Trace view, the Operator view shows that the operator host duration for the ResNet-50 model on Graviton3-based c7g instance is around 3 times faster in PyTorch 2.0 compared to PyTorch 1.13.

    + +

    Profiler Operator view: Forward operator Host duration on PyTorch 1.13 and PyTorch 2.0

    + +

    Image 7: Profiler Operator view: Forward operator Host duration on PyTorch 1.13 and PyTorch 2.0

    + +

    Benchmarking Hugging Face models

    + +

    You can use the Amazon SageMaker Inference Recommender utility to automate performance benchmarking across different instances. With Inference Recommender, you can find the real-time inference endpoint that delivers the best performance at the lowest cost for a given ML model. We collected the preceding data using the Inference Recommender notebooks by deploying the models on production endpoints. For more details on Inference Recommender, refer to the amazon-sagemaker-examples GitHub repo. We benchmarked the following models for this post: ResNet50 image classification, DistilBERT sentiment analysis, RoBERTa fill mask, and RoBERTa sentiment analysis.

    + +

    Conclusion

    + +

    For PyTorch 2.0, the Graviton3-based C7g instance is the most cost-effective compute optimized Amazon EC2 instance for inference. These instances are available on SageMaker and Amazon EC2. The AWS Graviton Technical Guide provides the list of optimized libraries and best practices that will help you achieve cost benefit with Graviton instances across different workloads.

    + +

    If you find use cases where similar performance gains are not observed on Graviton, please open an issue on the aws-graviton-getting-started GitHub repo to let us know. We will continue to add more performance improvements to make AWS Graviton-based instances the most cost-effective and efficient general-purpose processors for inference using PyTorch.

    + +

    Acknowledgments

    + +

    We would like to thank Ali Saidi (Sr. Principal Engineer) and Csaba Csoma (Sr. Manager, Software Development) from AWS, Ashok Bhat (Sr. Product Manager), Nathan Sircombe (Sr. Engineering Manager) and Milos Puzovic (Principal Software Engineer) from Arm for their support during the Graviton PyTorch inference optimization work. We would also like to thank Geeta Chauhan (Engineering Leader, Applied AI) from Meta for her guidance on this blog.

    + +

    About the authors

    + +

    Sunita Nadampalli is an ML Engineer and Software Development Manager at AWS.

    + +

    Ankith Gunapal is an AI Partner Engineer at Meta (PyTorch).

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/optimizing-cuda-rnn-with-torchscript/index.html b/blog/optimizing-cuda-rnn-with-torchscript/index.html new file mode 100644 index 000000000000..c53ab248b9ae --- /dev/null +++ b/blog/optimizing-cuda-rnn-with-torchscript/index.html @@ -0,0 +1,874 @@ + + + + + + + + + + + + + Optimizing CUDA Recurrent Neural Networks with TorchScript | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + The PyTorch Team + +

    +

    This week, we officially released PyTorch 1.1, a large feature update to PyTorch 1.0. One of the new features we’ve added is better support for fast, custom Recurrent Neural Networks (fastrnns) with TorchScript (the PyTorch JIT) (https://pytorch.org/docs/stable/jit.html).

    + +

    RNNs are popular models that come in many shapes and sizes and have shown good performance on a variety of NLP tasks. PyTorch implements a number of the most popular ones, the Elman RNN, GRU, and LSTM, as well as multi-layered and bidirectional variants.

    + +

    However, many users want to implement their own custom RNNs, taking ideas from recent literature. Applying Layer Normalization to LSTMs is one such use case. Because the PyTorch CUDA LSTM implementation uses a fused kernel, it is difficult to insert normalizations or even modify the base LSTM implementation. Many users have turned to writing custom implementations using standard PyTorch operators, but such code suffers from high overhead: most PyTorch operations launch at least one kernel on the GPU and RNNs generally run many operations due to their recurrent nature. However, we can apply TorchScript to fuse operations and optimize our code automatically, launching fewer, more optimized kernels on the GPU.

    + +

    Our goal is for users to be able to write fast, custom RNNs in TorchScript without writing specialized CUDA kernels to achieve similar performance. In this post, we’ll provide a tutorial for how to write your own fast RNNs with TorchScript. To better understand the optimizations TorchScript applies, we’ll examine how those work on a standard LSTM implementation but most of the optimizations can be applied to general RNNs.

    + +

    Writing custom RNNs

    + +

    To get started, you can use this file as a template to write your own custom RNNs.

    + +

    We are constantly improving our infrastructure to make performance better. If you want to gain the speed/optimizations that TorchScript currently provides (like operator fusion, batch matrix multiplications, etc.), here are some guidelines to follow. The next section explains the optimizations in depth.

    + +
      +
    1. +

      If the customized operations are all element-wise, that’s great because you can get the benefits of the PyTorch JIT’s operator fusion automatically!

      +
    2. +
    3. +

      If you have more complex operations (e.g. reduce ops mixed with element-wise ops), consider grouping the reduce operations and element-wise ops separately in order to fuse the element-wise operations into a single fusion group.

      +
    4. +
    5. +

      If you want to know what has been fused in your custom RNN, you can inspect the operation’s optimized graph by using graph_for. Using LSTMCell as an example:

      + +
       # get inputs and states for LSTMCell
      +
      + inputs = get_lstm_inputs()
      +
      + # instantiate a ScriptModule
      +
      + cell = LSTMCell(input_size, hidden_size)
      +
      + # print the optimized graph using graph_for
      +
      + out = cell(inputs)
      + print(cell.graph_for(inputs))
      +
      +
      + +

      This will generate the optimized TorchScript graph (a.k.a. PyTorch JIT IR) for the specialized inputs that you provide:

      + +
       graph(%x : Float(*, *),
      +         %hx : Float(*, *),
      +         %cx : Float(*, *),
      +         %w_ih : Float(*, *),
      +         %w_hh : Float(*, *),
      +         %b_ih : Float(*),
      +         %b_hh : Float(*)):
      +     %hy : Float(*, *), %cy : Float(*, *) = prim::DifferentiableGraph_0(%cx, %b_hh, %b_ih, %hx, %w_hh, %x, %w_ih)
      +     %30 : (Float(*, *), Float(*, *)) = prim::TupleConstruct(%hy, %cy)
      +     return (%30)
      +     with prim::DifferentiableGraph_0 = graph(%13 : Float(*, *),
      +         %29 : Float(*),
      +         %33 : Float(*),
      +         %40 : Float(*, *),
      +         %43 : Float(*, *),
      +         %45 : Float(*, *),
      +         %48 : Float(*, *)):
      +     %49 : Float(*, *) = aten::t(%48)
      +     %47 : Float(*, *) = aten::mm(%45, %49)
      +     %44 : Float(*, *) = aten::t(%43)
      +     %42 : Float(*, *) = aten::mm(%40, %44)
      +     ...some broadcast sizes operations...
      +     %hy : Float(*, *), %287 : Float(*, *), %cy : Float(*, *), %outgate.1 : Float(*, *), %cellgate.1 : Float(*, *), %forgetgate.1 : Float(*, *), %ingate.1 : Float(*, *) = prim::FusionGroup_0(%13, %346, %345, %344, %343)
      +     ...some broadcast sizes operations...
      +     return (%hy, %cy, %49, %44, %196, %199, %340, %192, %325, %185, %ingate.1, %forgetgate.1, %cellgate.1, %outgate.1, %395, %396, %287)
      +     with prim::FusionGroup_0 = graph(%13 : Float(*, *),
      +         %71 : Tensor,
      +         %76 : Tensor,
      +         %81 : Tensor,
      +         %86 : Tensor):
      +     ...some chunks, constants, and add operations...
      +     %ingate.1 : Float(*, *) = aten::sigmoid(%38)
      +     %forgetgate.1 : Float(*, *) = aten::sigmoid(%34)
      +     %cellgate.1 : Float(*, *) = aten::tanh(%30)
      +     %outgate.1 : Float(*, *) = aten::sigmoid(%26)
      +     %14 : Float(*, *) = aten::mul(%forgetgate.1, %13)
      +     %11 : Float(*, *) = aten::mul(%ingate.1, %cellgate.1)
      +     %cy : Float(*, *) = aten::add(%14, %11, %69)
      +     %4 : Float(*, *) = aten::tanh(%cy)
      +     %hy : Float(*, *) = aten::mul(%outgate.1, %4)
      +     return (%hy, %4, %cy, %outgate.1, %cellgate.1, %forgetgate.1, %ingate.1)
      +
      +
    6. +
    + +

    From the above graph we can see that it has a prim::FusionGroup_0 subgraph that fuses all element-wise operations in LSTMCell (transpose and matrix multiplication are not element-wise ops). Some graph nodes might be hard to understand at first, but we will explain some of them in the optimization section. We have also omitted some long, verbose operators in this post that are there just for correctness.

    + +

    Variable-length sequences best practices

    + +

    TorchScript does not support PackedSequence. Generally, when one is handling variable-length sequences, it is best to pad them into a single tensor and send that tensor through a TorchScript LSTM. Here’s an example:

    + +
    sequences = [...] # List[Tensor], each Tensor is T' x C
+padded = torch.nn.utils.rnn.pad_sequence(sequences)
    +lengths = [seq.size(0) for seq in sequences]
    +padded  # T x N x C, where N is batch size and T is the max of all T'
    +
    +model = LSTM(...)
    +output, hiddens = model(padded)
    +output  # T x N x C
    +
    + +

    Of course, output may have some garbage data in the padded regions; use lengths to keep track of which part you don’t need.

    + +

    Optimizations

    + +

    We will now explain the optimizations performed by the PyTorch JIT to speed up custom RNNs. We will use a simple custom LSTM model in TorchScript to illustrate the optimizations, but many of these are general and apply to other RNNs.

    + +

    To illustrate the optimizations we did and how we benefit from them, we will run a simple custom LSTM model written in TorchScript (you can refer to the code in custom_lstm.py or the code snippets below) and time our changes.

    + +

    We set up the environment on a machine equipped with two Intel Xeon chips and one Nvidia P100 GPU, with cuDNN v7.3 and CUDA 9.2 installed. The basic setup for the LSTM model is as follows:

    + +
    input_size = 512
    +hidden_size = 512
    +mini_batch = 64
    +numLayers = 1
    +seq_length = 100 
    +
    + +

    The most important thing the PyTorch JIT does is compile the Python program into a PyTorch JIT IR, an intermediate representation that models the program’s graph structure. This IR can then benefit from whole-program optimization and hardware acceleration, and overall has the potential to provide large computation gains. In this example, we run the initial TorchScript model with only the compiler optimization passes provided by the JIT, including common subexpression elimination, constant pooling, constant propagation, dead code elimination and some peephole optimizations. We run the model training 100 times after a warmup and average the training time. The initial results are a forward time of around 27ms and a backward time of around 64ms, which is quite far from what the PyTorch cuDNN LSTM provides. Next we will explain the major optimizations we made to improve training and inference performance, starting with LSTMCell and LSTMLayer, followed by some misc optimizations.

    + +

    LSTM Cell (forward)

    + +

    Almost all the computations in an LSTM happen in the LSTMCell, so it’s important for us to take a look at the computations it contains and how we can improve their speed. Below is a sample LSTMCell implementation in TorchScript:

    + +
    class LSTMCell(jit.ScriptModule):
    +    def __init__(self, input_size, hidden_size):
    +        super(LSTMCell, self).__init__()
    +        self.input_size = input_size
    +        self.hidden_size = hidden_size
    +        self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size))
    +        self.weight_hh = Parameter(torch.randn(4 * hidden_size, hidden_size))
    +        self.bias_ih = Parameter(torch.randn(4 * hidden_size))
    +        self.bias_hh = Parameter(torch.randn(4 * hidden_size))
    +
    +    @jit.script_method
    +    def forward(self, input, state):
    +        # type: (Tensor, Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]
    +        hx, cx = state
    +        gates = (torch.mm(input, self.weight_ih.t()) + self.bias_ih +
    +                 torch.mm(hx, self.weight_hh.t()) + self.bias_hh)
    +        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
    +
    +        ingate = torch.sigmoid(ingate)
    +        forgetgate = torch.sigmoid(forgetgate)
    +        cellgate = torch.tanh(cellgate)
    +        outgate = torch.sigmoid(outgate)
    +
    +        cy = (forgetgate * cx) + (ingate * cellgate)
    +        hy = outgate * torch.tanh(cy)
    +
    +        return hy, (hy, cy)
    +
    + +

    This graph representation (IR) that TorchScript generates enables several optimizations and scalable computations. In addition to the typical compiler optimizations that we could do (CSE, constant propagation, etc.), we can also run other IR transformations to make our code run faster.

    + +
      +
    • Element-wise operator fusion. The PyTorch JIT will automatically fuse element-wise ops, so when you have adjacent operators that are all element-wise, the JIT will group all of those operations together into a single FusionGroup. This FusionGroup can then be launched as a single GPU/CPU kernel and performed in one pass, which avoids expensive memory reads and writes for each operation.
    • +
    • Reordering chunks and pointwise ops to enable more fusion. An LSTM cell adds gates together (a pointwise operation), and then chunks the gates into four pieces: the ifco gates. Then, it performs pointwise operations on the ifco gates as above. This leads to two fusion groups in practice: one fusion group for the element-wise ops pre-chunk, and one group for the element-wise ops post-chunk. The interesting thing to note here is that pointwise operations commute with torch.chunk: instead of performing pointwise ops on some input tensors and chunking the output, we can chunk the input tensors and then perform the same pointwise ops on the output tensors. By moving the chunk to before the first fusion group, we can merge the first and second fusion groups into one big group.
    • +
    + +
    + +
    + +
      +
    • Tensor creation on the CPU is expensive, but there is ongoing work to make it faster. At this point, an LSTMCell runs three CUDA kernels: two gemm kernels and one for the single pointwise group. One of the things we noticed was that there was a large gap between the finish of the second gemm and the start of the single pointwise group. This gap was a period of time when the GPU was idling and not doing anything. Looking into it more, we discovered that the problem was that torch.chunk constructs new tensors, and that tensor construction was not as fast as it could be. Instead of constructing new Tensor objects, we taught the fusion compiler how to manipulate a data pointer and strides to do the torch.chunk before sending it into the fused kernel, shrinking the amount of idle time between the second gemm and the launch of the element-wise fusion group. This gives us around a 1.2x speedup on the LSTM forward pass.
    • +
    + +

    By doing the above tricks, we are able to fuse almost the entire LSTMCell forward graph (except the two gemm kernels) into a single fusion group, which corresponds to the prim::FusionGroup_0 in the above IR graph. It will then be launched as a single fused kernel for execution. With these optimizations the model performance improves significantly, with the average forward time reduced by around 17ms (1.7x speedup) to 10ms, and the average backward time reduced by 37ms to 27ms (1.37x speedup).

    + +

    LSTM Layer (forward)

    + +
    class LSTMLayer(jit.ScriptModule):
    +    def __init__(self, cell, *cell_args):
    +        super(LSTMLayer, self).__init__()
    +        self.cell = cell(*cell_args)
    +
    +    @jit.script_method
    +    def forward(self, input, state):
    +        # type: (Tensor, Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]
    +        inputs = input.unbind(0)
    +        outputs = torch.jit.annotate(List[Tensor], [])
    +        for i in range(len(inputs)):
    +            out, state = self.cell(inputs[i], state)
    +            outputs += [out]
    +        return torch.stack(outputs), state
    +
    + +

    We applied several tricks to the IR we generate for the TorchScript LSTM to boost the performance. Some example optimizations we did:

    + +
      +
    • Loop Unrolling: We automatically unroll loops in the code (for big loops, we unroll a small subset of it), which then empowers us to do further optimizations on the for loops control flow. For example, the fuser can fuse together operations across iterations of the loop body, which results in a good performance improvement for control flow intensive models like LSTMs.
    • +
    • Batch Matrix Multiplication: For RNNs where the input is pre-multiplied (i.e. the model has a lot of matrix multiplies with the same LHS or RHS), we can efficiently batch those operations together into a single matrix multiply while chunking the outputs to achieve equivalent semantics.
    • +
    + +

    By applying these techniques, we reduced the time in the forward pass by an additional 1.6ms to 8.4ms (1.2x speedup) and the backward time by 7ms to around 20ms (1.35x speedup).

    + +

    LSTM Layer (backward)

    + +
      +
    • +

      “Tree” Batch Matrix Multiplication: It is often the case that a single weight is reused multiple times in the LSTM backward graph, forming a tree where the leaves are matrix multiplies and the nodes are adds. These nodes can be combined together by concatenating the LHSs and RHSs in different dimensions, then computed as a single matrix multiplication (a quick numerical check follows this list). The formula of equivalence can be denoted as follows:

      + +

      $L1 * R1 + L2 * R2 = torch.cat((L1, L2), dim=1) * torch.cat((R1, R2), dim=0)$

      +
    • +
    • +

      Autograd is a critical component of what makes PyTorch such an elegant ML framework. As such, we carried this through to the PyTorch JIT, but using a new Automatic Differentiation (AD) mechanism that works on the IR level. JIT automatic differentiation will slice the forward graph into symbolically differentiable subgraphs and generate backward nodes for those subgraphs. Taking the above IR as an example, we group the graph nodes into a single prim::DifferentiableGraph_0 for the operations that have AD formulas. For operations that have not been added to AD formulas, we fall back to Autograd during execution.

      +
    • +
    • +

      Optimizing the backward path is hard, and the implicit broadcasting semantics make the optimization of automatic differentiation harder. PyTorch makes it convenient to write tensor operations without worrying about the shapes by broadcasting the tensors for you. For performance, the pain point in the backward pass is that we need a summation for such broadcastable operations. This results in the derivative of every broadcastable op being followed by a summation. Since we cannot currently fuse reduce operations, this causes FusionGroups to break into multiple small groups, leading to bad performance. To deal with this, refer to this great post written by Thomas Viehmann.

      +
    • +
    + +
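    As a quick numerical check of the “tree” batch matrix multiplication equivalence stated above (the shapes here are arbitrary illustrative choices):

    import torch

    L1, L2 = torch.randn(4, 3), torch.randn(4, 5)
    R1, R2 = torch.randn(3, 6), torch.randn(5, 6)

    # Two matrix multiplies followed by an add...
    separate = L1 @ R1 + L2 @ R2
    # ...equal a single matrix multiply on the concatenated operands
    combined = torch.cat((L1, L2), dim=1) @ torch.cat((R1, R2), dim=0)

    print(torch.allclose(separate, combined, atol=1e-6))  # True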

    Misc Optimizations

    + +
      +
    • In addition to the steps laid out above, we also eliminated overhead between CUDA kernel launches and unnecessary tensor allocations. One example is the tensor device lookup, which initially caused poor performance and a lot of unnecessary allocations. Removing these reduced the time between kernel launches from milliseconds to nanoseconds.
    • +
    • Lastly, there might be normalization applied in the custom LSTMCell, like LayerNorm. Since LayerNorm and other normalization ops contain reduce operations, it is hard to fuse them in their entirety. Instead, we automatically decompose LayerNorm into a statistics computation (reduce operations) plus element-wise transformations, and then fuse those element-wise parts together. As of this post, there are some limitations on our auto differentiation and graph fuser infrastructure which limit the current support to inference mode only. We plan to add backward support in a future release.
    • +
    + +

    With the above optimizations on operation fusion, loop unrolling, batch matrix multiplication and some misc optimizations, we can see a clear performance increase on our custom TorchScript LSTM forward and backward from the following figure:

    + +
    + +
    + +

    There are a number of additional optimizations that we did not cover in this post. In addition to the ones laid out in this post, we now see that our custom LSTM forward pass is on par with cuDNN. We are also working on optimizing the backward pass further and expect to see improvements in future releases. Besides the speed that TorchScript provides, we introduced a much more flexible API that enables you to hand-craft many more custom RNNs, which cuDNN could not provide.

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/optimizing-libtorch/index.html b/blog/optimizing-libtorch/index.html new file mode 100644 index 000000000000..cc14385d3151 --- /dev/null +++ b/blog/optimizing-libtorch/index.html @@ -0,0 +1,722 @@ + + + + + + + + + + + + + Optimizing LibTorch-based inference engine memory usage and thread-pooling | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Himalay Mohanlal Joriwal, Pierre-Yves Aquilanti, Vivek Govindan, Hamid Shojanazeri, Ankith Gunapal, Tristan Rice + +

    +

    Outline

    + +

    In this blog post we show how to optimize a LibTorch-based inference engine to maximize throughput by reducing memory usage and optimizing the thread-pooling strategy. We apply these optimizations to Pattern Recognition engines for audio data, for example, music and speech recognition or acoustic fingerprinting. The optimizations discussed in this blog post reduce memory usage by 50% and end-to-end inference latency by 37.5%. These optimizations are also applicable to computer vision and natural language processing.

    + +

    Audio Recognition Inferencing

    + +

    Audio Recognition (AR) engines can be used to recognize and identify sound patterns. Examples include identifying the type and species of a bird from audio recordings, distinguishing music from a singer’s voice, or detecting an abnormal sound indicating a breach in a building. To identify sounds of interest, AR engines process audio through 4 stages:

    + +
      +
    1. File Validation: The AR engine validates the input audio file.
    2. +
    3. Feature Extraction: Features are extracted from each segment within the audio file.
    4. +
    5. Inference: LibTorch performs inference using CPUs or accelerators. In our case Intel processors on an Elastic Cloud Compute (EC2) instance.
    6. +
    7. Post-processing: A post-processing model decodes the results and calculates scores that are used to convert inference output into tags or transcripts.
    8. +
    + +

    Of these 4 steps, inference is the most computationally intensive and can take up to 50% of the pipeline processing time depending on the model complexity. This means that any optimization at this stage has a significant impact on the overall pipeline. 

    + +

    Optimizing the Audio Recognition engine with concurrency…is not so simple

    + +

    Our objective for this processing pipeline is to turn audio segments into tags or transcripts. The input data is an audio file composed of several short sound segments (S1 to S6 in Figure 1). The output data corresponds to tags or transcripts ordered by timestamps.

    + +

    Figure 1: Example audio file with segment boundaries

    + +

    Figure 1: Example audio file with segment boundaries

    + +

    Each segment can be processed independently and in an out-of-order fashion. This offers the opportunity to process segments concurrently and in parallel to optimize the overall inference throughput as well as maximize the usage of the resources.

    + +

    Parallelization on an instance can be achieved through multi-threading (pThreads, std::threads, OpenMP) or multi-processing. The advantage of multi-threading over multi-processing is the ability to use shared memory. It enables developers to minimize data duplication by sharing data across threads, the AR models in our case (Figure 2). Furthermore, a reduction in memory allows us to run more pipelines in parallel by increasing the number of engine threads in order to utilize all vCPUs on our Amazon EC2 instance (c5.4xlarge in our case, which offers 16 vCPUs). In theory, we expect to see higher hardware utilization and higher throughput for our AR engine as a result.

    + +

    Figure 2: Multi-threaded AR Engine

    + +

    Figure 2: Multi-threaded AR Engine

    + +

    But we found these assumptions to be wrong. Indeed, we found that increasing the number of threads of the application led to an increase of the end-to-end latency for each audio segment and to a decrease of the engine throughput. For example, increasing the concurrency from 1 to 5 threads led to a 4x increase in latency, which had a proportional effect on decreasing the throughput. In fact, metrics showed that within the pipeline, the latency of the inference stage alone was 3x higher than its single-thread baseline.

    + +

    Using a profiler, we found that the CPU Spin Time increased, potentially due to CPU oversubscription which impacts system and application performance. Given our control over the application’s multi-thread implementation, we chose to dive deeper into the stack and identify potential conflicts with LibTorch’s default settings.

    + +

    Diving deeper on LibTorch’s multi-threading and its impact on concurrency

    + +

    LibTorch’s parallel implementations on CPU for inference are based on global thread pools. Examples of implementations are inter-op and intra-op parallelism, which can be chosen depending on the model’s properties. In both cases, it is possible to set the number of threads in each thread pool to optimize the latency and throughput.

    + +

    To test whether LibTorch’s default parallel implementation settings had a counter effect on our inference latency, we ran an experiment on a 16 vCPU machine with a 35-minute audio file, keeping the LibTorch inter-op threads constant at 1 (because our models didn’t utilize the inter-op thread pool). We collected the data shown in Figures 3 and 4.

    + +

    Figure 3: CPU Utilization for different number of engine threads

    + +

    Figure 3: CPU Utilization for different number of engine threads

    + +

    Figure 4: Processing times for different number of engine threads

    + +

    Figure 4: Processing times for different number of engine threads

    + +

    Execution time in Figure 4 is the end-to-end processing time for processing all the segments of the given audio file. We have 4 different LibTorch intra-thread configurations, which are 1, 4, 8, and 16, and we change the number of engine threads from 1 to 16 for each intra-thread LibTorch configuration. As we see in Figure 3, CPU utilization increases with the number of engine threads for all LibTorch intra-thread configurations. But as we see in Figure 4, an increase in CPU utilization doesn’t translate into lower execution time. We found that in all but one case, as the number of engine threads shot up, so did execution time. The one exception was the case where the intra-thread pool size was 1.

    + +

    Resolving the global thread pool issue

    + +

    Using too many threads with a global thread pool led to performance degradation and caused an over-subscription problem. Without disabling LibTorch global thread pools, it was difficult to match the performance of the multi-process engine.

    + +

    Disabling the LibTorch global thread pool is as simple as setting the intra-op/inter-op parallelism threads to 1, as shown here:

    + +
at::set_num_threads(1);          // Disables the intra-op thread pool.
+at::set_num_interop_threads(1); // Disables the inter-op thread pool.
    +
    + +

    As shown in Figure 4, the lowest processing time was measured when the LibTorch global thread pool was disabled.

    + +

    This solution improved AR engine throughput in several cases. However, when evaluating long datasets (audio files longer than 2 hours in load test), we found that the memory footprint of the engine gradually started to increase.

    + +

    Optimizing memory usage

    + +

    We ran a load test on the system with two-hour-long audio files and found that the observed memory increase was the result of memory fragmentation within multi-threaded LibTorch inference. We resolved this using jemalloc, a general-purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support. Using jemalloc, our peak memory usage decreased by an average of 34% and average memory usage decreased by 53%.

    + +

    Figure 5: Memory usage over time using the same input file with and without jemalloc

    + +

    Figure 5: Memory usage over time using the same input file with and without jemalloc

    + +
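    One common way to route an existing engine’s allocations through jemalloc, without rebuilding, is to preload the library at launch time. The commands below are only a sketch: the package name and library path are Ubuntu examples that vary by distribution, and ./ar_engine stands in for your inference binary.

    # Install jemalloc (Ubuntu example; package name and path vary by distribution)
    sudo apt-get install -y libjemalloc2

    # Preload jemalloc so LibTorch and the engine allocate through it
    LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 ./ar_engine --input audio.wav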

    Summary

    + +

    To optimize the performance of multi-threaded LibTorch-based inference engines, we recommend verifying that there is no oversubscription problem in LibTorch. In our case, all threads in the multi-threaded engine were sharing the LibTorch global thread pool, which caused an oversubscription problem. This was remedied by disabling the global thread pool: we disabled the interop and intraop global thread pool by setting threads to 1. To optimize the memory of a multi-threaded engine, we recommend using Jemalloc as a memory allocator tool rather than the default malloc function.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/optimizing-production-pytorch-performance-with-graph-transformations/index.html b/blog/optimizing-production-pytorch-performance-with-graph-transformations/index.html new file mode 100644 index 000000000000..f63e3d7bb6ba --- /dev/null +++ b/blog/optimizing-production-pytorch-performance-with-graph-transformations/index.html @@ -0,0 +1,846 @@ + + + + + + + + + + + + + Optimizing Production PyTorch Models’ Performance with Graph Transformations | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Jade Nie, CK Luk, Xiaodong Wang, Jackie (Jiaqi) Xu + +

    +

    1. Introduction

    + +

    PyTorch supports two execution modes [1]: eager mode and graph mode. In eager mode, operators in a model are immediately executed as they are encountered. In contrast, in graph mode, operators are first synthesized into a graph, which will then be compiled and executed as a whole. Eager mode is easier to use, more suitable for ML researchers, and hence is the default mode of execution. On the other hand, graph mode typically delivers higher performance and hence is heavily used in production.

    + +

    Specifically, graph mode enables operator fusion [2], wherein one operator is merged with another to reduce/localize memory reads as well as total kernel launch overhead. Fusion can be horizontal—taking a single operation (e.g., BatchNorm) that is independently applied to many operands and merging those operands into an array; and vertical—merging a kernel with another kernel that consumes the output of the first kernel (e.g., Convolution followed by ReLU).

    + +

    Torch.FX [3, 4] (abbreviated as FX) is a publicly available toolkit as part of the PyTorch package that supports graph mode execution. In particular, it (1) captures the graph from a PyTorch program and (2) allows developers to write transformations on the captured graph. It is used inside Meta to optimize the training throughput of production models. By introducing a number of FX-based optimizations developed at Meta, we demonstrate the approach of using graph transformation to optimize PyTorch’s performance for production.

    + +

    2. Background

    + +

    Embedding tables are ubiquitous in recommendation systems. Section 3 will discuss three FX transformations that optimize accesses to embedding tables. In this section, we provide some background on FX (Section 2.1) and embedding tables (Section 2.2).

    + +

    2.1 FX

    + +

    Figure 1 is a simple example adapted from [3] which illustrates using FX to transform a PyTorch program. It contains three steps: (1) capturing the graph from a program, (2) modifying the graph (in this example, all uses of RELU are replaced by GELU), and (3) generating a new program from the modified graph.

    + +

    + +

    + +

    Figure 1: A FX example which replaces all uses of RELU by GELU in a PyTorch module.

    + +
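    As a rough illustration of those three steps using the public torch.fx API (a generic sketch, not Meta’s internal transformation code), the following replaces every functional RELU call in a traced module with GELU:

    import torch
    import torch.nn.functional as F
    from torch import fx, nn

    class MyModule(nn.Module):
        def forward(self, x):
            return F.relu(x) + 1.0

    # Step 1: capture the graph from the program
    traced = fx.symbolic_trace(MyModule())

    # Step 2: modify the graph (replace all uses of relu by gelu)
    for node in traced.graph.nodes:
        if node.op == "call_function" and node.target is F.relu:
            node.target = F.gelu
    traced.graph.lint()

    # Step 3: generate a new program from the modified graph
    traced.recompile()
    print(traced.code)  # the regenerated forward() now calls gelu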

    The FX API [4] provides many more functionalities for inspecting and transforming PyTorch program graphs.

    + +

    2.2 Embedding Tables

    + +

    + +

    + +

    Figure 2: Illustration of an embedding table for a sparse feature with batch size = 1

    + +

    In a recommendation system, sparse features (e.g., User ID, Story ID) are represented by embedding tables. An embedding table E is an HxD matrix, where H is the hash size, D is the embedding dimension. Each row of E is a vector of floats. Feature hashing [5] is used to map a sparse feature to a list of indices to E, say [S1,S2, …, Sk], where 0<=Si<H. Its output value is computed as f(E[S1], E[S2], …, E[Sk]), where E[Si] is the vector at row Si, and f is called the pooling function, which is typically one of the following functions: sum, average, maximum. See Figure 2 for an illustration.

    + +

    To fully utilize the GPU, sparse features are usually processed in a batch. Each entity in a batch has its own list of indices. If a batch has B entities, a naive representation has B lists of indices. A more compact representation is to combine the B lists of indices into a single list of indices and add a list of the lengths of indices (one length for each entity in the batch). For example, if a batch has 3 entities whose lists of indices are as follows:

    + +
      +
    • Entity 1: indices = [10, 20]
    • +
    • Entity 2: indices = [5, 9, 77, 81]
    • +
    • Entity 3: indices = [15, 20, 45]
    • +
    + +

    Then the indices and lengths for the entire batch will be:

    + +
      +
    • Indices = [10, 20, 5, 9, 77, 81, 15, 20, 45]
    • +
    • Lengths = [2, 4, 3]
    • +
    + +

    And the output of the embedding table lookup for the whole batch is a BxD matrix.

    + +
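    This indices-plus-lengths batching maps closely onto PyTorch’s nn.EmbeddingBag, which takes a flat index list plus per-entity offsets (the cumulative sum of the lengths). The snippet below only illustrates the lookup described above using the example batch; it is not the production kernel:

    import torch
    import torch.nn as nn

    H, D = 100, 4                              # hash size and embedding dimension
    table = nn.EmbeddingBag(H, D, mode="sum")  # pooling function f = sum

    indices = torch.tensor([10, 20, 5, 9, 77, 81, 15, 20, 45])
    lengths = torch.tensor([2, 4, 3])
    # EmbeddingBag expects offsets (start position of each entity) rather than lengths
    offsets = torch.cat([torch.zeros(1, dtype=torch.long), lengths.cumsum(0)[:-1]])

    output = table(indices, offsets)           # BxD matrix, here (3, 4)
    print(output.shape)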

    3. Three FX Transformations

    + +

    We have developed three FX transformations that accelerate accesses to embedding tables. Section 3.1 discusses a transformation that combines multiple small input tensors into a single big tensor; Section 3.2 a transformation that fuses multiple, parallel compute chains into a single compute chain; and Section 3.3 a transformation that overlaps communication with computation.

    + +

    3.1 Combining Input Sparse Features

    + +

    Recall that an input sparse feature in a batch is represented by two lists: a list of indices and a list of B lengths, where B is the batch size. In PyTorch, these two lists are implemented as two tensors. When a PyTorch model is run on a GPU, embedding tables are commonly stored in the GPU memory (which is closer to the GPU and has much higher read/write bandwidth than the CPU memory). To use an input sparse feature, its two tensors need to be first copied from CPU to GPU. Nevertheless, each host-to-device memory copy requires a kernel launch, which is relatively expensive compared to the actual data transfer time. If a model uses many input sparse features, this copying could become a performance bottleneck (e.g., 1000 input sparse features would require copying 2000 tensors from host to device).

    + +

    An optimization that reduces the number of host-to-device memcpy is to combine multiple input sparse features before sending them to the device. For instance, given the following three input features:

    + +
      +
    • Feature_A: indices = [106, 211, 7], lengths = [2, 1]
    • +
    • Feature_B: indices = [52, 498, 616, 870, 1013], lengths = [3, 2]
    • +
    • Feature_C: indices = [2011, 19, 351, 790], lengths = [1, 3]
    • +
    + +

    The combined form is:

    + +
      +
    • Features_A_B_C: indices = [106, 211, 7, 52, 498, 616, 870, 1013, 2011, 19, 351, 790], lengths = [2, 1, 3, 2, 1, 3]
    • +
    + +

    So, instead of copying 3x2=6 tensors from host to device, we only need to copy 2 tensors.

    + +

    Figure 3(b) describes an implementation of this optimization, which has two components:

    + +
      +
    • On the CPU side: The input pipeline is modified to combine all the indices of sparse features into a single tensor and similarly all the lengths into another tensor. Then the two tensors are copied to the GPU.
    • +
    • On the GPU side: Using FX, we insert a Permute_and_Split op into the model graph to recover the indices and lengths tensors of individual features from the combined tensors, and route them to the corresponding nodes downstream.
    • +
    + +

    + +

    + +

    (a). Without the optimization

    + +

    + +

    + +

    (b). With the optimization

    + +

    Figure 3: Combining input sparse features

    + +
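    Permute_and_Split is an internal op, but the combine-and-split idea can be sketched with stock PyTorch ops. In the sketch below the feature names and values come from the example above, and the split uses the known per-feature element counts to recover the individual tensors on the device:

    import torch

    # CPU side: combine all indices and all lengths into two tensors
    indices = {
        "A": torch.tensor([106, 211, 7]),
        "B": torch.tensor([52, 498, 616, 870, 1013]),
        "C": torch.tensor([2011, 19, 351, 790]),
    }
    lengths = {
        "A": torch.tensor([2, 1]),
        "B": torch.tensor([3, 2]),
        "C": torch.tensor([1, 3]),
    }
    combined_indices = torch.cat(list(indices.values()))
    combined_lengths = torch.cat(list(lengths.values()))

    # One host-to-device copy per combined tensor instead of one per feature tensor
    device = "cuda" if torch.cuda.is_available() else "cpu"
    combined_indices = combined_indices.to(device, non_blocking=True)
    combined_lengths = combined_lengths.to(device, non_blocking=True)

    # GPU side: split back into per-feature tensors for the downstream lookups
    idx_A, idx_B, idx_C = torch.split(combined_indices, [len(v) for v in indices.values()])
    len_A, len_B, len_C = torch.split(combined_lengths, [len(v) for v in lengths.values()])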

    3.2 Horizontal fusion of computation chains started with accesses to embedding tables

    + +

    In a production model, it is fairly common to have 10s of embedding tables residing on each GPU. For performance reasons, lookups to these tables are grouped together so that their outputs are concatenated in a single big tensor (see the red part in Figure 4(a)). To apply computations to individual feature outputs, a Split op is used to divide the big tensors into N smaller tensors (where N is the number of features) and then the desired computations are applied to each tensor. This is shown in Figure 4(a), where the computation applied to each feature output O is Tanh(LayerNorm(O)). All the computation results are concatenated back to a big tensor, which is then passed to downstream ops (Op1 in Figure 4(a)).

    + +

    The main runtime cost here is the GPU kernel launch overhead. For instance, the number of GPU kernel launches in Figure 4(a) is 2*N + 3 (each oval in the figure is a GPU kernel). This could become a performance issue because execution times of LayerNorm and Tanh on the GPU are short compared to their kernel launch times. In addition, the Split op may create an extra copy of the embedding output tensor, consuming additional GPU memory.

    + +

    We use FX to implement an optimization called horizontal fusion which dramatically reduces the number of GPU kernel launches (in this example, the optimized number of GPU kernel launches is 5, see Figure 4(b)). Instead of doing an explicit Split, we use the Add_middle_dim op to reshape the 2D embedding tensor of shape (B, NxD) to a 3D tensor of shape (B, N, D). Then a single LayerNorm is applied to the last dimension of it. Then a single Tanh is applied to the result of the LayerNorm. At the end, we use the Remove_middle_dim op to reshape the Tanh’s result back to a 2D tensor. In addition, since Add_middle_dim and Remove_middle_dim only reshape the tensor without creating an extra copy, the amount of GPU memory consumption could be reduced as well.

    + +

    + +

    + +

    (a). Without the optimization

    + +

    + +

    + +

    (b). With the optimization

    + +

    Figure 4: Horizontal fusion

    + +
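    Add_middle_dim and Remove_middle_dim are internal ops; with stock PyTorch the same reshaping trick can be sketched with view(), assuming the N feature outputs share the same dimension D:

    import torch
    import torch.nn.functional as F

    B, N, D = 8, 16, 32
    emb_out = torch.randn(B, N * D)  # concatenated embedding outputs, shape (B, NxD)

    # Unfused: split into N tensors and launch LayerNorm + Tanh once per feature
    unfused = torch.cat(
        [torch.tanh(F.layer_norm(t, (D,))) for t in torch.split(emb_out, D, dim=1)],
        dim=1,
    )

    # Horizontally fused: reshape once, apply a single LayerNorm and a single Tanh,
    # then reshape back; view() does not copy the underlying data
    fused = torch.tanh(F.layer_norm(emb_out.view(B, N, D), (D,))).view(B, N * D)

    print(torch.allclose(unfused, fused, atol=1e-6))  # True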

    3.3 Overlapping Computation with Communication

    + +

    Training of a production recommendation model is typically done on a distributed GPU system. Since the capacity of the device memory per GPU is not big enough to hold all the embedding tables in the model, they need to be distributed among the GPUs.

    + +

    Within a training step, a GPU needs to read/write feature values from/to the embedding tables on the other GPUs. This is known as all-to-all communication [6] and can be a major performance bottleneck.

    + +

    We use FX to implement a transformation that can overlap computation with all-to-all communication. Figure 5(a) shows the example of a model graph which has embedding table accesses (EmbeddingAllToAll) and other ops. Without any optimization, they are sequentially executed on a GPU stream, as shown in Figure 5(b). Using FX, we break EmbeddingAllToAll into EmbeddingAllToAll_Request and EmbeddingAllToAll_Wait, and schedule independent ops in between them.

    + +

    + +

    + +

    (a) Model graph

    + +

    + +

    + +

    (b) Original execution order

    + +

    + +

    + +

    (c) Optimized execution order

    + +

    Figure 5: Overlapping Computation with Communication

    + +
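    The request/wait split can be expressed with the public torch.distributed API, where an asynchronous collective returns a work handle that is waited on only after the independent ops have been launched. This is a hand-written sketch assuming an initialized process group (model_part and the inputs are placeholders), not the FX transformation itself:

    import torch
    import torch.distributed as dist

    def forward_step(embedding_input, other_input, model_part):
        output = torch.empty_like(embedding_input)

        # "EmbeddingAllToAll_Request": start the all-to-all asynchronously
        work = dist.all_to_all_single(output, embedding_input, async_op=True)

        # Independent ops scheduled between request and wait overlap with the
        # communication
        intermediate = model_part(other_input)

        # "EmbeddingAllToAll_Wait": block only when the result is needed
        work.wait()
        return intermediate, output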

    3.4 Summary

    + +

    Table 1 summarizes the optimizations discussed in this section and the corresponding performance bottlenecks addressed.

    Optimization | Performance Bottleneck Addressed
    Combining Input Sparse Features | Host-to-device memory copy
    Horizontal fusion | GPU kernel launch overhead
    Overlapping Computation with Communication | Embedding all-to-all access time
    + +

    Table 1: Summary of the optimizations and the performance bottlenecks addressed

    + +

    We have also developed other FX transformations which are not discussed in this section due to space limitations.

    + +

    To discover which models would benefit from these transformations, we analyzed the performance data collected by MAIProf [7] from the models that run at Meta’s data centers. Altogether, these transformations provide up to 2-3x of speedups compared to eager mode on a set of production models.

    + +

    4. Concluding Remarks

    + +

    The graph mode in PyTorch is preferred over the eager mode for production use for performance reasons. FX is a powerful tool for capturing and optimizing the graph of a PyTorch program. We demonstrate three FX transformations that are used to optimize production recommendation models inside Meta. We hope that this blog can motivate other PyTorch model developers to use graph transformations to boost their models’ performance.

    + +

    References

    + +

    [1] End-to-end Machine Learning Framework

    + +

    [2] DNNFusion: Accelerating Deep Neural Networks Execution with Advanced Operator Fusion

    + +

    [3] Torch.FX: Practical Program Capture and Transformation for Deep Learning In Python, MLSys 2022.

    + +

    [4] Torch.fx—PyTorch 1.12 documentation

    + +

    [5] Feature Hashing for Large Scale Multitask Learning

    + +

    [6] NVIDIA Collective Communication Library Documentation

    + +

    [7] Performance Debugging of Production PyTorch Models at Meta

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/out-of-the-box-acceleration/index.html b/blog/out-of-the-box-acceleration/index.html new file mode 100644 index 000000000000..bc6a12ae2a12 --- /dev/null +++ b/blog/out-of-the-box-acceleration/index.html @@ -0,0 +1,813 @@ + + + + + + + + + + + + + Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Felix Marty, Younes Belkada, Hamid Shojanazeri, Driss Guessous + +

    +

    As part of PyTorch 2.0 release, an accelerated implementation of the attention mechanism as part of the “Better Transformer” project (and known in PyTorch as Accelerated Transformers) has been added natively into PyTorch as torch.nn.functional.scaled_dot_product_attention. This implementation leverages fused kernels from FlashAttention and Memory-efficient attention, and supports both training and inference.

    + +

    We also release a notebook showcasing an example of this integration here

    + +

    After seeing 20-30% speedups at inference for diffusion models, we went ahead and implemented an integration with 🤗 Transformers models through the 🤗 Optimum library. Similar to the previous integration for encoder models, the integration replaces modules from Transformers with efficient implementations that use torch.nn.functional.scaled_dot_product_attention. The usage is as follow:

    + +
import torch
+from optimum.bettertransformer import BetterTransformer
+from transformers import AutoModelForCausalLM
+
+with torch.device("cuda"):
+    model = AutoModelForCausalLM.from_pretrained("gpt2-large", torch_dtype=torch.float16)
+
+model = BetterTransformer.transform(model)
+
+# do your inference or training here
+
+# if training and want to save the model
+model = BetterTransformer.reverse(model)
+model.save_pretrained("fine_tuned_model")
+model.push_to_hub("fine_tuned_model")
    +
    + +

    Summarizing our findings below about torch.nn.functional.scaled_dot_product_attention:

    +
      +
    • It is most useful to fit larger models, sequence length, or batch size to train on a given hardware.
    • +
    • Memory footprint savings on GPU during training range from 20% to 110%+.
    • +
    • Speedups during training range from 10% to 70%.
    • +
    • Speedups during inference range from 5% to 20%.
    • +
    • Standalone, for small head dimensions, scaled_dot_product_attention speedups go up to 3x, memory savings go as high as 40x (depending on the sequence length).
    • +
    + +

    You may be surprised by the wide range of memory savings and speedups. In this blog post, we discuss our benchmarks, where this feature shines and upcoming improvements in future PyTorch releases.

    + +

    In the next release of transformers you will just need to install the proper version of optimum and run:

    +
    model = model.to_bettertransformer()
    +
    +

    to convert your model using the BetterTransformer API. You can already try this feature out by installing transformers from source.

    + +

    Benchmark and usage with 🤗 Transformers

    + +

    torch.nn.functional.scaled_dot_product_attention is usable with any architecture that uses standard attention, and namely replaces the boiler-plate code:

    + +
# native scaled_dot_product_attention is equivalent to the following:
+def eager_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale):
+	L, S = query.size(-2), key.size(-2)
+	scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+	attn_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) if is_causal else attn_mask
+	attn_mask = attn_mask.masked_fill(attn_mask.logical_not(), -float('inf')) if attn_mask.dtype == torch.bool else attn_mask
+	attn_weight = torch.softmax((query @ key.transpose(-2, -1) * scale_factor) + attn_mask, dim=-1)
+	attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+	return attn_weight @ value
    +
    + +

    In the 🤗 Optimum integration with Transformers models, the following architectures are supported for now: gpt2, gpt-neo, gpt-neox, gptj, t5, bart, codegen, pegasus, opt, LLaMA, blenderbot, m2m100. You can expect this list to be extended in the near future!

    + +

    To validate the benefits from the native scaled dot-product attention, we ran inference and training benchmarks, whose results are presented below.

    + +

Inference benchmark on a single A10G GPU, AWS g5.4xlarge instance

    + +

    + +

Training benchmark on a single A10G GPU, AWS g5.4xlarge instance

    + +

    + +

Training benchmark on a single A100-SXM4-80GB, Nvidia DGX

    + +

    + +

From these benchmarks, the most interesting finding is that native SDPA allows for the use of longer sequence lengths and larger batch sizes without running into out-of-memory issues. Moreover, up to 20% speedups can be seen during inference, and even larger ones during training.

    + +

As seen in the training benchmarks, it appears that smaller head dimensions bring higher speedups and memory savings, which we discuss in the following section.

    + +

The implementation supports multi-GPU settings as well, thanks to the 🤗 Accelerate library, by passing device_map="auto" to the from_pretrained method. Here are some results for training on two A100-SXM4-80GB.

    + +

Training benchmark on two A100-SXM4-80GB, Nvidia DGX, using 🤗 Accelerate library for distributed training

    + +

    + +

Note that some kernels support only the sm_80 compute capability (which is the one of A100 GPUs), which limits usability on a wide range of hardware, notably if the head dimension is not a power of two. For example, as of PyTorch 2.0.0 during training, opt-2.7b (headdim=80) and gpt-neox-20b (headdim=96) cannot dispatch to a kernel using flash attention, unless run on an A100 GPU. Better kernels may be developed in the future: https://github.com/pytorch/pytorch/issues/98140#issuecomment-1518101895

    + +

    Flash Attention, Memory-efficient attention & math differences

    + +

    The native scaled_dot_product_attention relies on three possible backend implementations: flash attention, memory-efficient attention, and the so-called math implementation which provides a hardware-neutral fallback for all PyTorch platforms.

    + +

When fused kernels are available for a given problem size, flash attention or memory-efficient attention will be used, effectively allowing for a lower memory footprint: in the memory-efficient attention case, O(N) memory allocations are done in GPU global memory instead of the classic O(N²) of the traditional eager attention implementation. With flash attention, a reduced number of memory accesses (reads and writes) is expected, which gives both speedups and memory savings.

    + +

The “math” implementation is simply an implementation using PyTorch’s C++ API. It is interesting to note that in this implementation the query and key tensors are scaled individually for numerical stability, thus launching two aten::div operations instead of possibly only one in an eager implementation that does not contain this numerical-stability optimization.

    + +

    Head dimension influence on speedups, memory savings

    + +

Benchmarking torch.nn.functional.scaled_dot_product_attention, we notice a decrease in the speedup / memory gains as the head dimension increases. This is an issue for some architectures like EleutherAI/gpt-neo-2.7B, which has a relatively large head dimension of 128, or EleutherAI/gpt-j-6B (and derived models such as PygmalionAI/pygmalion-6b), which has a head dimension of 256 (and which currently does not dispatch to fused kernels because the head dimension is too large).

    + +

This trend can be seen in the figures below, where torch.nn.functional.scaled_dot_product_attention is benchmarked standalone against the eager implementation above. Moreover, we use the torch.backends.cuda.sdp_kernel context manager to force the usage of the math, flash attention, and memory-efficient attention implementations, respectively.

    + +
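As an illustration of how a specific backend can be forced with this context manager (the shapes, dtype and CUDA device are illustrative assumptions):

import torch
import torch.nn.functional as F
from torch.backends.cuda import sdp_kernel

# Illustrative shapes and dtype; assumes a CUDA device.
q = k = v = torch.rand(1, 16, 512, 64, dtype=torch.float16, device="cuda")

# Allow only the flash attention backend; enable_math / enable_mem_efficient
# can be toggled the same way to exercise the other implementations.
with sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)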

Using memory-efficient attention SDP kernel (forward-only), A100

    + +

    + +

Using math (without dropout), A100

    + +

    + +

Using flash attention SDP kernel (without dropout), A100

    + +

    + +

Using memory-efficient attention SDP kernel (without dropout), A100

    + +

    + +

    We see that for the same problem size, be it for inference-only or training, the speedup decreases with higher head dimension, e.g. from 3.4x for headdim=8 to 1.01x for headdim=128 using flash attention kernel.

    + +

    The reduced memory saving is expected with larger head dimensions. Recall the standard attention computation:

    + +

S = QKᵀ (an N × N matrix),   P = softmax(S) (an N × N matrix),   O = PV (an N × d matrix)

    + +

Due to the intermediate computations, the global memory footprint is 2 * N * N + N * d in this standard step-by-step computation. Memory-efficient attention proposes to iteratively update the softmax renormalization constant and to move its computation to the very end, allowing for only a constant output memory allocation of N * d.

    + +

    Thus, the memory saving ratio is 2 * N / d + 1, which decreases with larger head dimension.

    + +
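As a quick sanity check of this formula, the small computation below evaluates the ratio for an illustrative sequence length N = 1024 and a few head dimensions:

# Memory footprint ratio of standard vs. memory-efficient attention,
# for an illustrative sequence length N = 1024 and a few head dimensions d.
N = 1024
for d in (8, 64, 128, 256):
    standard = 2 * N * N + N * d   # intermediate N x N matrices + N x d output
    mem_efficient = N * d          # only the N x d output allocation
    print(d, standard / mem_efficient)   # equals 2 * N / d + 1: 257.0, 33.0, 17.0, 9.0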

    In flash attention, the tradeoff is between the head dimension d and the shared memory size M of a GPU streaming multiprocessor, with a total number of memory accesses of O(N² * d²/M). Thus, the memory accesses scale quadratically in the head dimension, contrary to the standard attention that scales linearly. The reason is that in flash attention, for larger head dimension d, the key and value K, V need to be split into more blocks to fit into shared memory, and in turn each block needs to load the full query Q and output O.

    + +

    Thus, the highest speedups for flash attention are in a regime where the ratio d² / M is small enough.

    + +

    Current limitations as of PyTorch 2.0.0

    + +

    Absence of a scale argument

    + +

    As of PyTorch 2.0.0, torch.nn.functional.scaled_dot_product_attention has no scale argument and uses the default square root of the hidden size sqrt(d_k).

    + +

Attention(Q, K, V) = softmax(QKᵀ / √d_k) V

    + +

However, some architectures such as OPT or T5 do not use a scaling in the attention, which as of PyTorch 2.0.0 forces us to artificially rescale before the scaled_dot_product_attention call. This introduces an unnecessary overhead, as an additional multiplication is necessary, on top of unneeded divisions in the attention.

    + +

    A fix for this issue has been merged in PyTorch repository.

    + +

    Support of flash attention / memory-efficient attention with custom mask

    + +

    As of PyTorch 2.0.0, when passing a custom attention mask, flash attention and memory-efficient attention can not be used. In this case, scaled_dot_product_attention automatically dispatches to the C++ implementation.

    + +

However, as we have seen, some architectures require a custom attention mask, such as T5, which uses positional bias. Moreover, in the case of a batch size larger than one where some inputs may be padded, a custom attention mask also needs to be passed. For this latter case, an alternative would be to use NestedTensor, which SDPA supports.

    + +
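As a rough sketch of that alternative (illustrative shapes; assuming the nested layout accepted by the memory-efficient backend), variable-length sequences can be packed into a NestedTensor instead of being padded:

import torch
import torch.nn.functional as F

# Variable-length sequences packed into a NestedTensor instead of a padded batch.
# Layout assumed here: (batch, num_heads, ragged seq_len, head_dim); CUDA device assumed.
seq_lens = [512, 287, 1024]
device, dtype = "cuda", torch.float16
query = torch.nested.nested_tensor(
    [torch.rand(8, s, 64, device=device, dtype=dtype) for s in seq_lens]
)

# Self-attention over the ragged batch, without materializing padding tokens.
out = F.scaled_dot_product_attention(query, query, query)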

    This limited support for custom masks thus limits the benefits from SDPA in these specific cases, although we can hope for an extended support in the future.

    + +

    Note that xformers, from which PyTorch’s SDPA partially takes inspiration, currently supports arbitrary attention masks: https://github.com/facebookresearch/xformers/blob/658ebab39545f180a6075385b3897921623d6c3b/xformers/ops/fmha/cutlass.py#L147-L156 . HazyResearch implementation of flash attention also supports an equivalent implementation of padding, as a cumulative sequence length array is used along with packed query/key/values - similar in essence to NestedTensor.

    + +

    In conclusion

    + +

Using torch.nn.functional.scaled_dot_product_attention is a free-lunch optimization: it makes your code more readable, uses less memory, and is in most common cases faster.

    + +

Although the implementation in PyTorch 2.0.0 still has minor limitations, inference and training already benefit massively from SDPA in most cases. We encourage you to use this native implementation, be it to train or deploy your PyTorch models, and for 🤗 Transformers models as a one-line transformation!

    + +

    In the future, we would like to adapt the API to enable users to use SDPA in encoder-based models as well.

    + +

    We thank Benjamin Lefaudeux, Daniel Haziza and Francisco Massa for their advice on the head dimension influence, as well as Michael Gschwind, Christian Puhrsch and Driss Guessous for their feedback on the blog post!

    + +

    Benchmark reproduction

    + +

    The benchmark presented in this post was done using torch==2.0.0, transformers==4.27.4, accelerate==0.18.0 and optimum==1.8.0.

    + +

    The benchmarks can be easily reproduced using the scripts for inference, training for 🤗 Transformers models, and standalone SDPA.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/overview-of-pytorch-autograd-engine/index.html b/blog/overview-of-pytorch-autograd-engine/index.html new file mode 100644 index 000000000000..3387674bc9d7 --- /dev/null +++ b/blog/overview-of-pytorch-autograd-engine/index.html @@ -0,0 +1,803 @@ + + + + + + + + + + + + + Overview of PyTorch Autograd Engine | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    June 08, 2021

    +

    + Overview of PyTorch Autograd Engine +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Preferred Networks, Inc. + +

    +

    This blog post is based on PyTorch version 1.8, although it should apply for older versions too, since most of the mechanics have remained constant.

    + +

    To help understand the concepts explained here, it is recommended that you read the awesome blog post by @ezyang: PyTorch internals if you are not familiar with PyTorch architecture components such as ATen or c10d.

    + +

    What is autograd?

    + +

    Background

    + +

    PyTorch computes the gradient of a function with respect to the inputs by using automatic differentiation. Automatic differentiation is a technique that, given a computational graph, calculates the gradients of the inputs. Automatic differentiation can be performed in two different ways; forward and reverse mode. Forward mode means that we calculate the gradients along with the result of the function, while reverse mode requires us to evaluate the function first, and then we calculate the gradients starting from the output. While both modes have their pros and cons, the reverse mode is the de-facto choice since the number of outputs is smaller than the number of inputs, which allows a much more efficient computation. Check [3] to learn more about this.

    + +

    Automatic differentiation relies on a classic calculus formula known as the chain-rule. The chain rule allows us to calculate very complex derivatives by splitting them and recombining them later.

    + +

Formally speaking, given a composite function h(x) = f(g(x)), we can calculate its derivative as h′(x) = f′(g(x)) · g′(x). This result is what makes automatic differentiation work. By combining the derivatives of the simpler functions that compose a larger one, such as a neural network, it is possible to compute the exact value of the gradient at a given point rather than relying on a numerical approximation, which would require multiple perturbations of the input to obtain a value.

    + +

To get the intuition of how the reverse mode works, let’s look at a simple function f(x, y) = log(x * y). Figure 1 shows its computational graph, where the inputs x, y on the left flow through a series of operations to generate the output w.

    + +
    + +

    Figure 1: Computational graph of f(x, y) = log(x*y)

    +
    + +

    The automatic differentiation engine will normally execute this graph. It will also extend it to calculate the derivatives of w with respect to the inputs x, y, and the intermediate result v.

    + +

The example function can be decomposed into f and g, where v = g(x, y) = x · y and w = f(v) = log(v). Every time the engine executes an operation in the graph, the derivative of that operation is added to the graph to be executed later in the backward pass. Note that the engine knows the derivatives of the basic functions.

    + +

In the example above, when multiplying x and y to obtain v, the engine will extend the graph to calculate the partial derivatives of the multiplication by using the multiplication derivative definition that it already knows: ∂v/∂x = y and ∂v/∂y = x. The resulting extended graph is shown in Figure 2, where the MultDerivative node also calculates the product of the resulting gradients by an input gradient to apply the chain rule; this will be explicitly seen in the following operations. Note that the backward graph (green nodes) will not be executed until all the forward steps are completed.

    + +
    + +

Figure 2: Computational graph extended after executing the multiplication

    +
    + +

Continuing, the engine now calculates the w = log(v) operation and extends the graph again with the log derivative that it knows to be ∂w/∂v = 1/v. This is shown in Figure 3. This operation generates the result ∂w/∂v that, when propagated backward and multiplied by the multiplication derivative as in the chain rule, generates the derivatives ∂w/∂x = 1/x and ∂w/∂y = 1/y.

    + +
    + +

    Figure 3: Computational graph extended after executing the logarithm

    +
    + +

The original computation graph is extended with a new dummy variable z that is the same as w. The derivative of z with respect to w is 1, as they are the same variable; this trick allows us to apply the chain rule to calculate the derivatives of the inputs. After the forward pass is complete, we start the backward pass by supplying the initial value of 1.0 for ∂z/∂w. This is shown in Figure 4.

    + +
    + +

    Figure 4: Computational graph extended for reverse auto differentiation

    +
    + +

Then, following the green graph, we execute the LogDerivative operation that the auto differentiation engine introduced, and multiply its result by ∂z/∂w to obtain the gradient ∂z/∂v, as the chain rule states. Next, the multiplication derivative is executed in the same way, and the desired derivatives ∂z/∂x, ∂z/∂y are finally obtained.

    + +

Formally, what we are doing here, and what the PyTorch autograd engine also does, is computing a Jacobian-vector product (Jvp) to calculate the gradients of the model parameters, since the model parameters and inputs are vectors.

    + +

    The Jacobian-vector product

    + +

When we calculate the gradient of a vector-valued function (a function whose inputs and outputs are vectors), we are essentially constructing a Jacobian matrix.

    + +

    Thanks to the chain rule, multiplying the Jacobian matrix of a function by a vector with the previously calculated gradients of a scalar function results in the gradients of the scalar output with respect to the vector-valued function inputs.

    + +

    As an example, let’s look at some functions in python notation to show how the chain rule applies.

    +
    + + +
from math import log, sin
+
+def f(x1, x2):
+    a = x1 * x2
+    y1 = log(a)
+    y2 = sin(x2)
+    return (y1, y2)
+
+def g(y1, y2):
+    return y1 * y2
+
    + +
    + +

Now, if we derive this by hand using the chain rule and the definition of the derivatives, we obtain the following set of identities that we can directly plug into the Jacobian matrix of f:

∂y1/∂x1 = 1/x1        ∂y1/∂x2 = 1/x2
∂y2/∂x1 = 0           ∂y2/∂x2 = cos(x2)
    + +

Next, let’s consider the gradients of the scalar function g:

∂g/∂y1 = y2        ∂g/∂y2 = y1
    + +

If we now calculate the transpose-Jacobian vector product obeying the chain rule, we obtain the following expression:

Jᵀ · v = [[1/x1, 0], [1/x2, cos(x2)]] · [y2, y1]ᵀ = [y2/x1, y2/x2 + y1·cos(x2)]ᵀ

where v = (∂g/∂y1, ∂g/∂y2) = (y2, y1). Evaluating the Jvp for x1 = 0.5, x2 = 0.75 yields the result [1.3633, 0.1912]. We can execute the same expression in PyTorch and calculate the gradient of the input:

    +
    +
>>> import torch
+>>> x = torch.tensor([0.5, 0.75], requires_grad=True)
+>>> y = torch.log(x[0] * x[1]) * torch.sin(x[1])
+>>> y.backward(torch.tensor(1.0))
+>>> x.grad
+tensor([1.3633, 0.1912])
    + +

The result is the same as our hand-calculated Jacobian-vector product! However, PyTorch never constructed the Jacobian matrix, as it could grow prohibitively large; instead, it created a graph of operations that is traversed backward while applying the Jacobian-vector products defined in tools/autograd/derivatives.yaml.

    + +
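To double-check this result, the short computation below evaluates the hand-derived gradient formulas of y = log(x1 * x2) * sin(x2) at the same point:

import math

# Hand-derived gradients of y = log(x1 * x2) * sin(x2), evaluated at the same point as above.
x1, x2 = 0.5, 0.75
dy_dx1 = math.sin(x2) / x1
dy_dx2 = math.sin(x2) / x2 + math.log(x1 * x2) * math.cos(x2)
print(round(dy_dx1, 4), round(dy_dx2, 4))  # 1.3633 0.1912 -- matches x.grad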

    Going through the graph

    + +

Every time PyTorch executes an operation, the autograd engine constructs the graph to be traversed backward. The reverse mode auto differentiation starts by adding a scalar variable z at the end so that z = w, as we saw in the introduction. This is the initial gradient value that is supplied to the Jvp engine calculation as we saw in the section above.

    + +

In PyTorch, the initial gradient is explicitly set by the user when they call the backward method.

    + +

Then, the Jvp calculation starts but it never constructs the Jacobian matrix. Instead, when PyTorch records the computational graph, the derivatives of the executed forward operations are added (Backward Nodes). Figure 5 shows a backward graph generated by the execution of the functions f and g seen before.

    + +
    + +

    Figure 5: Computational Graph extended with the backward pass

    +
    + +

    Once the forward pass is done, the results are used in the backward pass where the derivatives in the computational graph are executed. The basic derivatives are stored in the tools/autograd/derivatives.yaml file and they are not regular derivatives but the Jvp versions of them [3]. They take their primitive function inputs and outputs as parameters along with the gradient of the function outputs with respect to the final outputs. By repeatedly multiplying the resulting gradients by the next Jvp derivatives in the graph, the gradients up to the inputs will be generated following the chain rule.

    + +
    + +

    Figure 6: How the chain rule is applied in backward differentiation

    +
    + +

Figure 6 represents the process by showing the chain rule. We start with a value of 1.0, as detailed before, which is the already calculated gradient, highlighted in green, and we move to the next node in the graph. The backward function registered in derivatives.yaml will calculate the associated local derivative value, highlighted in red, and multiply it by the incoming gradient. By the chain rule, this results in the gradient that will be the already calculated gradient (green) when we process the next backward node in the graph.

    + +

    You may also have noticed that in Figure 5 there is a gradient generated from two different sources. When two different functions share an input, the gradients with respect to the output are aggregated for that input, and calculations using that gradient can’t proceed unless all the paths have been aggregated together.

    + +

    Let’s see an example of how the derivatives are stored in PyTorch.

    + +

Suppose that we are currently processing the backward propagation of the log function, in the LogBackward node in Figure 2. The derivative of log in derivatives.yaml is specified as grad.div(self.conj()). grad is the already calculated gradient and self.conj() is the complex conjugate of the input vector. For complex numbers PyTorch calculates a special derivative called the conjugate Wirtinger derivative [6]. This derivative takes the complex number and its conjugate and, by operating some magic that is described in [6], they are the direction of steepest descent when plugged into optimizers.

    + +

This code translates to multiplying the incoming gradient by the log derivative 1/v, the corresponding green and red squares in Figure 3. Continuing, the autograd engine will execute the next operation: the backward of the multiplication. As before, the inputs are the original function’s inputs and the gradient calculated from the backward step. This step will keep repeating until we reach the gradient with respect to the inputs and the computation will be finished. The gradient of x2 is only completed once the multiplication and sin gradients are added together. As you can see, we computed the equivalent of the Jvp but without constructing the matrix.

    + +

In the next post we will dive inside PyTorch code to see how this graph is constructed and where the relevant pieces are, should you want to experiment with it!

    + +

    References

    + +
      +
1. https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
2. https://web.stanford.edu/class/cs224n/readings/gradient-notes.pdf
3. https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec10.pdf
4. https://mustafaghali11.medium.com/how-pytorch-backward-function-works-55669b3b7c62
5. https://indico.cern.ch/event/708041/contributions/3308814/attachments/1813852/2963725/automatic_differentiation_and_deep_learning.pdf
6. https://pytorch.org/docs/stable/notes/autograd.html#complex-autograd-doc
   Recommended: shows why the backprop is formally expressed with the Jacobian
7. https://cs.ubc.ca/~fwood/CS340/lectures/AD1.pdf
    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/path-achieve-low-inference-latency/index.html b/blog/path-achieve-low-inference-latency/index.html new file mode 100644 index 000000000000..7eee42c27ab1 --- /dev/null +++ b/blog/path-achieve-low-inference-latency/index.html @@ -0,0 +1,957 @@ + + + + + + + + + + + + + The Path to Achieve Ultra-Low Inference Latency With LLaMA 65B on PyTorch/XLA | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Milad Mohammadi, Jiewen Tan, Liyang Lu, Siyuan Liu, Yeounoh Chung, Wonjoo Lee, Manfei Bai, Steven Krawczyk, Shauheen Zahirazami, Alex Wertheim, Meghan Cowan, Jack Cao, Joe Spisak + +

    +

    Background & State of the Art

    + +

    In the natural language processing (NLP) space, language models are designed to generate a token (e.g. word) using a sequence of past input tokens. Large Language Models (LLMs) are the latest deep learning innovation in this space built to generate text in a human-like fashion. These models generally use transformers to improve their attention over a large sequence of input tokens.

    + +

    LLaMA, open sourced by Meta AI, is a powerful foundation LLM trained on over 1T tokens. LLaMA is competitive with many best-in-class models such as GPT-3, Chinchilla, PaLM. LLaMA (13B) outperforms GPT-3 (175B) highlighting its ability to extract more compute from each model parameter.

    + +

    In this blog post, we use LLaMA as an example model to demonstrate the capabilities of PyTorch/XLA for LLM inference. We discuss how the computation techniques and optimizations discussed here improve inference latency by 6.4x on 65B parameter LLaMA models powered by Google Cloud TPU v4 (v4-16).

    + +

    Model Overview

    + +

    We demonstrate the performance capabilities of PyTorch/XLA on LLaMA, the latest LLM from Meta. We showcase performance optimizations on a series of common LLaMA configurations. Notice the 175B parameter model configuration is absent in the public domain. For the 175B parameter model mentioned below, we apply OPT 175B model configuration to the LLaMA code base. Unless stated otherwise, in all configurations, we use max_seq_len=256 and dtype=bfloat16 for weights and activations.

    + +

    Table 1: Model Configurations Explored in this article

LLaMA Model Hyper Parameters

# Parameters | Dimensions | N Heads | N Layers | Max Seq Len
7B           | 4,096      | 32      | 32       | 256
33B          | 6,656      | 52      | 60       | 256
65B          | 8,192      | 64      | 80       | 256
175B         | 12,288     | 96      | 96       | 256
    + +

    Performance Challenges of LLMs

    + +

LLMs have a few properties that make them challenging for compiler optimizations. (a) LLMs use autoregressive decoding to generate the next token based on the previous ones; this means prompt tensors and caches have a dynamic shape. (b) LLMs must work with variable input prompt lengths without triggering recompilation due to input tensor shape changes; input tensors must be properly bucketized and padded to avoid recompilation. (c) LLMs often require more memory than a single TPU (or GPU) device can support. A model-sharding scheme is required to fit the model across a distributed compute architecture. For instance, a LLaMA model with 65B parameters can fit on a v4-16 Cloud TPU, which is comparable to 8 A100 GPUs. (d) Running LLMs in production can be expensive; one way to improve performance per total cost of ownership (Perf/TCO) is via quantization; quantization can potentially reduce hardware requirements.

    + +

    Inference Tech Stack in PyTorch/XLA

    + +

    Our goal is to offer the AI community a high performance inference stack. PyTorch/XLA integrates with TorchDynamo, PjRt, OpenXLA, and various model parallelism schemes. TorchDynamo eliminates tracing overhead at runtime, PjRt enables efficient host-device communication; PyTorch/XLA traceable collectives enable model and data parallelism on LLaMA via TorchDynamo. To try our results, please use our custom torch, torch-xla wheels to reproduce our LLaMA inference solution. PyTorch/XLA 2.1 will support the features discussed in this post by default.

    + +

    Parallel Computing

    + +

    FairScale Sharding

    + +

    LLaMA uses FairScale model sharding API (fairscale.nn.model_parallel.layers). We built an equivalent representation of this API using PyTorch/XLA communication collective (CC) ops such as all-reduce to communicate program state (e.g. activations) between accelerators. TorchDynamo does not fully support capturing CC ops currently (a.k.a. traceable collectives). Without this support, a TorchDynamo FX graph would be cut at every device communication, meaning at every model layer. Graph cuts lead to performance loss as the underlying XLA compiler loses full graph optimization opportunities. To resolve this, we offer PyTorch/XLA traceable collectives by integrating the dispatcher collectives into our existing CC APIs. The difference is we don’t need to insert c10d.wait() ops after collectives, given the lazy execution nature of PyTorch/XLA. With support for traceable collectives, PyTorch/XLA allows singular FX graph generation in TorchDynamo.

    + +

    Autoregressive Decoding on PyTorch/XLA

    + +

    LLMs need autoregressive decoding to feed the previous word as a prompt to predict the next token. Autoregressive decoding leads to unbounded dynamic shape problems, which in turn causes recompilation of every prompt. We optimized the LLaMA autoregressive decoder to operate with fixed shapes that in-place updates the KV-cache, output sequences, and attention masks during every token generation. With a combination of padding, masking, and index ops, we avoided excessive graph recompilation, thereby achieving efficient autoregressive decoding.

    + +

    KV-Cache Optimization

    + +

    LLaMA implements autoregressive decoding with KV-cache. For every generated token, the KV-cache stores the attention key/value activations of each Transformer layer. Thus, upon decoding a new token, the key/values of prior tokens no longer need recomputation.

    + +

    In LLaMA, the KV-cache tensor slices are updated in-place; this leads to recompilation events every time a token is generated. To address this issue, we use index tensors and tensor.index_copy() ops to replace the in-place slice updates. Attention masks and output sequences also benefit from the same optimization.

    + +
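To make the pattern concrete, here is a small CPU-sized sketch of a fixed-shape cache update with index_copy_; the shapes and names are illustrative, not the actual LLaMA/XLA code:

import torch

# cache: [max_batch_size, max_seq_len, n_heads, head_dim]; pos is the current decode position.
cache = torch.zeros(1, 256, 8, 128)
new_kv = torch.randn(1, 1, 8, 128)
pos = 10

# An in-place slice assignment such as `cache[:, pos : pos + 1] = new_kv` changes the traced
# program as pos moves, which is what triggers recompilation under a tracing backend.
# The fixed-shape alternative copies into the cache through an index tensor instead:
index = torch.tensor([pos])
cache.index_copy_(1, index, new_kv)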

    Input Prompt Optimization

    + +

    Variable length input prompts are common in LLM applications. This property causes input tensor shape dynamism and in turn recompilation events. When processing a prompt to fill the KV-cache, we either (a) process the input prompt token-by-token, or (b) process the whole prompt in one iteration. The pros and cons of each method are:

    + +
      +
1. Pre-compile 1 graph and process a prompt token-by-token
   • Practical: 1 graph is compiled during warm-up
   • Slow: O(L) to process an input prompt length L - a disadvantage for long prompts
2. Pre-compile all graphs with input lengths ranging from 1 to max_seq_len (e.g. 2,048)
   • Impractical: pre-compile and cache max_seq_len graphs during warm-up time
   • Fast: 1 graph execution to process the full prompt
    + +

    We introduce prompt length bucketization, an optimization to strike a balance between the two alternatives. We define a set of ascending bucket sizes, (b0,b1,b2,…,bB-1), and then pre-compile program graphs with input sizes according to these bucket values, (G0,G1,G2,…,GB-1); B is the number of buckets. For a given input prompt, we round up the prompt length to the closest bucket value bn, pad the sequence, and use Gn to process the prompt in one iteration. The computation on the padding tokens is discarded. For prompts larger than the largest bucket size, we process them section-by-section.

    + +

    The optimal bucket sizes should be determined by prompt length distribution in a target application. Here, we adopt bucket lengths: 128, 256, 384, 512. Any input prompt with up to 2,047 tokens requires up to 4 graph executions. For example, a 1,500 input prompt with generation length of 256 requires 260 graph executions - 4 to process the input, and 256 to generate the output.

    + +
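The snippet below is an illustrative sketch of the rounding-and-padding step, using the bucket values above; the section-by-section path for prompts longer than the largest bucket is omitted, and the function name is hypothetical:

import torch

BUCKETS = [128, 256, 384, 512]  # bucket lengths adopted in this post

def pad_to_bucket(prompt_ids: torch.Tensor, pad_id: int = 0) -> torch.Tensor:
    """Round the prompt length up to the closest bucket and right-pad,
    so one of the pre-compiled graphs G0..GB-1 can be reused."""
    length = prompt_ids.shape[-1]
    bucket = next(b for b in BUCKETS if b >= length)  # longer prompts: processed section-by-section (omitted)
    padded = torch.full((bucket,), pad_id, dtype=prompt_ids.dtype)
    padded[:length] = prompt_ids
    return padded

print(pad_to_bucket(torch.arange(200)).shape)  # torch.Size([256])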

    Quantization

    + +

    Quantization reduces the number of bits necessary to represent a value; it reduces the bandwidth to communicate data across multiple accelerator nodes (via collectives) and lowers the hardware requirements to serve a specific model size.

    + +

    Normally, with BF16 weights, a 175B parameter model would consume about 351GB of memory, and therefore require a v4-32 instance to accommodate the model. By quantizing the weights to INT8, we reduced the model size by roughly 50%, allowing it to run on a smaller v4-16 instance. Because LLaMA shards model activations, quantization offers negligible communication gain.

    + +

In our experiments, we quantized the linear layer. Since LLaMA model checkpoints are unavailable publicly, and our goal is to evaluate performance, the quantized model is initialized with random weights. Recent literature such as AWQ and Integer or Floating Point? offer insights into the performance properties of LLaMA under various low-bit quantization schemes.

    + +
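For illustration, a minimal weight-only INT8 scheme for a linear layer could look like the sketch below; it uses a per-channel symmetric scheme chosen for the example and float32 activations for portability, and is not necessarily the exact scheme used in these experiments:

import torch

def quantize_weight_int8(w: torch.Tensor):
    # Per-output-channel symmetric scales so that weights map into [-127, 127].
    scale = w.abs().amax(dim=1, keepdim=True) / 127.0
    w_int8 = torch.clamp((w / scale).round(), -127, 127).to(torch.int8)
    return w_int8, scale

def int8_linear(x: torch.Tensor, w_int8: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Weights are stored in INT8 (half the bytes of BF16) and dequantized on the fly;
    # activations stay in their original dtype (BF16 in the actual setup, float32 here).
    return x @ (w_int8.to(x.dtype) * scale.to(x.dtype)).T

w = torch.randn(4096, 4096)
w_int8, scale = quantize_weight_int8(w)
y = int8_linear(torch.randn(1, 4096), w_int8, scale)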

    Effect of Batch Size on Quantization Performance

    + +

    TPU v4 is programmed to run matmul on the Matrix Multiply Unit (MXU) when the model batch size (BS) > 1. For BS = 1, matmul runs on the Vector Processor Unit (VPU). Since MXU is more efficient than VPU, INT8 quantization gains performance at BS>1. See Performance Analysis section for details.

    + +

    Op Support

    + +

    Occasionally, new models introduce new mathematical operations that require PyTorch/XLA to extend its supported op set for compilation. For LLaMA, we supported: multinomial.

    + +

    Methodology

    + +

    LLaMA works on PyTorch/XLA out of the box on LazyTensorCore. We use this configuration as a baseline for our follow up analysis. All experiments assume 256-long input prompts. In the absence of a publicly available model checkpoint, we used random tensor initialization for this inference stack optimization effort. A model checkpoint is not expected to change latency results discussed here.

    + +

    Model Sizing

    + +

    Assuming N is the number of parameters, dimensions is the hidden size, n_layers is the number of layers, n_heads is the number of attention heads, the equation below can be used to approximate the model size. See the Model Overview section for details.

    + +
    N = (dimensions)^2 * n_layers * 12
    +
    + +

    n_heads doesn’t affect N, but the following equation holds for the open sourced model configs.

    + +
    dim = 128 * n_heads
    +
    + +
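As a quick sanity check, these formulas approximately reproduce the parameter counts of the configurations listed in Table 1:

# Approximate parameter counts from the formulas above, checked against Table 1.
def approx_params(dim: int, n_layers: int) -> float:
    return dim ** 2 * n_layers * 12

for name, dim, n_heads, n_layers in [("7B", 4096, 32, 32), ("33B", 6656, 52, 60),
                                     ("65B", 8192, 64, 80), ("175B", 12288, 96, 96)]:
    assert dim == 128 * n_heads
    print(name, f"~{approx_params(dim, n_layers) / 1e9:.0f}B")  # ~6B, ~32B, ~64B, ~174B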

    Cache Sizing

    + +

    Both model parameters and the cache layers in the Attention block contribute to memory consumption. Since the default LLaMA model uses BF16 weights, the memory consumption calculation in this section is based on BF16 weights.

    + +

    The size of the cache layer is calculated by cache_size = max_batch_size * max_seq_len * dimensions. max_batch_size = 1 and max_seq_len = 256 are used as an example configuration in the following calculations. There are 2 cache layers in each Attention block. So, the total LLaMA cache size (in Bytes) is total_cache_size = n_layers * 2 * cache_size * (2 bytes).

    + +
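The short computation below reproduces the cache column of Table 2 from these formulas (for example, ~671 MB for the 65B configuration):

# Cache size from the formulas above (BF16: 2 bytes per element).
def total_cache_mb(dim: int, n_layers: int, max_batch_size: int = 1, max_seq_len: int = 256) -> float:
    cache_size = max_batch_size * max_seq_len * dim      # elements per cache layer
    total_bytes = n_layers * 2 * cache_size * 2          # 2 cache layers per Attention block, 2 bytes each
    return total_bytes / 1e6

print(total_cache_mb(8192, 80))   # ~671 MB for the 65B configuration, matching Table 2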

    TPU v4 Hardware Sizing

    + +

    Each TPU v4 chip has 32GB of available High-Bandwidth Memory (HBM). Table 2 has the details on memory consumption and the number of required TPU chips to hold a LLaMA model.

    + +

    Table 2: LLaMA TPU v4 HBM requirements (i.e. TPU v4 chip requirements)

# Parameters | Parameter (MB) | Cache (MB) | Total (GB) | Min # of TPU v4 Chips
7B           | 14,000         | 134        | 14.128     | 1
33B          | 66,000         | 408        | 66.41      | 3
65B          | 130,000        | 671        | 130.67     | 5
175B         | 350,000        | 1,208      | 351.21     | 11
    + +

    Metrics

    + +

    Below are useful metrics to measure inference speed. Assuming T is the total time, B is the batch size, L is the decoded sequence length.

    + +

    Latency Definition

    + +

    Latency is the time it takes to get the decoded result at target length L, regardless of the batch size B. Latency represents how long the user should wait to get the response from the generation model.

    + +
    Latency = T (s)
    +
    + +

    Per-token latency

    + +

    One step of autoregressive decoding generates a token for each sample in the batch. Per-token latency is the average time for that one step.

    + +
    Per-token latency = T / L (s/token)
    +
    + +

    Throughput

    + +

    Throughput measures how many tokens are generated per unit time. While it’s not a useful metric for evaluating online serving it is useful to measure the speed of batch processing.

    + +
    Throughput = B * L / T (tokens/s)
    +
    + +

    To minimize confusion and misinterpretation, it’s better to avoid metrics like T / (B * L), which mixes latency and throughput.

    + +
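As a worked example of these metrics, using the ~14.5 ms/token figure for the 65B model on v4-32 quoted in the Results section below, with an illustrative B = 1 and L = 256:

# Worked example: 65B model at ~14.5 ms/token (v4-32 result quoted below), B = 1, L = 256.
B, L = 1, 256
per_token_latency = 0.0145          # seconds per token
T = per_token_latency * L           # latency: ~3.7 s to decode the full sequence
throughput = B * L / T              # ~69 tokens/s
print(f"{T:.2f} s, {throughput:.1f} tokens/s")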

    Results

    + +

    Figure 1 shows latency / token results for LLaMA 7B to 175B models. In each case, the model is run on a range of TPU v4 configurations. For instance, LLaMA 7B shows 4.7ms/token and 3.8ms/token on v4-8 and v4-16 respectively. For more comparison, visit the HuggingFace LLM performance leaderboard.

    + +

    In the absence of the features discussed in this blog post, the LLaMA 65B running on v4-32 delivers 120ms/token instead of 14.5ms/token obtained here, leading to 8.3x speedup. As discussed earlier, developers are encouraged to try our custom torch, torch-xla wheels that unlock the repro of LLaMA inference results shared here.

    + +


    + +

    Figure 1: LLaMA Inference Performance on TPU v4 hardware

    + +

    PyTorch/XLA:GPU performance is better than PyTorch:GPU eager and similar to PyTorch Inductor. PyTorch/XLA:TPU performance is superior to PyTorch/XLA:GPU. In the near future, XLA:GPU will deliver optimizations that bring parity with XLA:TPU. The single A100 configuration only fits LLaMA 7B, and the 8-A100 doesn’t fit LLaMA 175B.

    + +


    + +

    Figure 2: LLaMA Inference Performance on GPU A100 hardware

    + +

    As the batch size increases, we observe a sublinear increase in per-token latency highlighting the tradeoff between hardware utilization and latency.

    + +


    + +

    Figure 3: LLaMA Inference Performance across different batch sizes

    + +

    Our studies suggest the impact of maximum sequence input length (max_seq_len) on inference latency is relatively minimal. We attribute this to the sequential and iterative nature of token generation. The small difference in performance can be due to KV cache access latency changes as the storage size increases.

    + +


    + +

    Figure 4: LLaMA Inference Performance across different prompt lengths

    + +

LLMs are often memory bound applications; thus, by quantizing model parameters we enable loading and executing a larger tensor on MXUs per unit time (i.e. HBM ⇒ CMEM and CMEM ⇒ MXU data movement). Figure 5 shows INT8 weight-only quantization offers 1.6x-1.9x speedup, allowing a larger model to run on a given hardware.

    + +

When BS=1, INT8 tensors are dispatched to the VPU, which is smaller than the MXU (see the TPU v4 paper); otherwise, the MXU is used. As a result, when BS=1, quantization memory bandwidth gains are offset by the lack of MXU utilization. When BS>1, however, memory gains deliver superior latency on the quantized model. For example, in the case of the 175B parameter LLaMA, v4-16 with quantization and v4-32 without quantization deliver similar performance. Note we do not provide FP8 comparisons because PyTorch is yet to offer this data type.

    + +


    + +

    Figure 5: LLaMA Inference Performance vs. weight-only quantization. The missing blue bars suggest the model size doesn’t fit in the specified TPU hardware.

    + +

    Figure 6 demonstrates the steady performance advantage of PyTorch/XLA as the input prompt length grows from 10 tokens to 1,500 tokens. This strong scaling capability suggests minimal PyTorch/XLA recompilation events enabling a wide range of real-world applications. In this experiment, the maximum length is 2,048 and maximum generation length is 256.

    + +


    + +

    Figure 6: LLaMA Inference Performance vs. Input Prompt Length

    + +

    Final Thoughts

    + +

    We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to GitHub so that we can openly collaborate. You can also try out PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs.

    + +

    Cheers,
    +The PyTorch/XLA Team at Google
    +#PoweredByPyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/peak-performance-minimized-memory/index.html b/blog/peak-performance-minimized-memory/index.html new file mode 100644 index 000000000000..74c8b6a7a70c --- /dev/null +++ b/blog/peak-performance-minimized-memory/index.html @@ -0,0 +1,774 @@ + + + + + + + + + + + + + Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + LinkedIn and Meta + +

    +

    LinkedIn: Shivam Sahni, Byron Hsu, Yanning Chen
    +Meta: Ankith Gunapal, Evan Smothers

    + +

    This blog explores the integration of a custom triton kernel, Liger Kernel with torch.compile to enhance the performance of fine-tuning large language models (LLMs) using torchtune. torchtune, a PyTorch-native library, offers modular building blocks and customizable finetuning recipes which include torch.compile support for various LLMs, while Liger Kernel provides optimized Triton kernels to improve training efficiency and reduce memory usage. The integration involves modifying the TransformerDecoder module in torchtune to bypass the linear layer computation, allowing the Liger Fused Linear Cross Entropy Loss to handle the forward projection weights. Experiments conducted on an NVIDIA A100 instance demonstrate that torch.compile outperforms PyTorch Eager in throughput and memory efficiency, with Liger Kernel further reducing peak memory allocation and enabling larger batch sizes. The results show a 47% reduction in peak memory at batch size 256 and a marginal increase in throughput with meta-llama/Llama-3.2-1B , confirming the effectiveness of the integration without affecting the loss curves.

    + +

    Introduction to torchtune

    + +

    torchtune is a PyTorch-native library which has been designed for finetuning LLMs. torchtune provides composable and modular building blocks along with finetuning recipes that can be easily customized for your use case, as will be shown in this blog.
    +torchtune provides:

    + +
      +
• PyTorch implementations of popular LLM model architectures from Llama, Gemma, Mistral, Phi, and Qwen model families
• Hackable training recipes for full finetuning, LoRA, QLoRA, DPO, PPO, QAT, knowledge distillation, and more
• Out-of-the-box memory efficiency, performance improvements, and scaling with the latest PyTorch APIs, including torch.compile
• YAML configs for easily configuring training, evaluation, quantization or inference recipes
• Built-in support for many popular dataset formats and prompt templates
    + +

    Introduction to Liger Kernel

    + +

    Liger Kernel is an open source library of optimized Triton kernels designed to enhance the efficiency and scalability of training Large Language Models (LLMs). It focuses on kernel-level optimizations such as operation fusing and input chunking, achieving significant improvements in training throughput and GPU memory usage compared to existing implementations like those from HuggingFace. By using a single line of code, Liger Kernel can improve training throughput by 20% and reduce memory usage by 60%.

    + +

    Fused Linear Cross Entropy

    + + + +

The bulk of Liger Kernel’s performance improvement comes from the Fused Linear Cross Entropy (FLCE) Loss, whose core idea is as follows:

    + +

    In LLMs, the vocabulary size has increased significantly, leading to a large logit tensor during cross-entropy (CE) loss computation. This logit tensor consumes excessive memory, causing a bottleneck in training. For example, when training with a batch size of 8 and sequence length of 4096, the 256k vocabulary size results in a 16.8 GB logit tensor. The FLCE kernel breaks down the computation into smaller chunks, reducing memory consumption.

    + +

    Here’s how it works:

    + +
      +
1. Flattens the 3D hidden states into a 2D matrix by collapsing the batch size and sequence length dimensions.
2. Applies the linear projection head sequentially on the chunked hidden states.
3. Computes the partial loss and returns the chunked logits gradient using the Liger CE kernel.
4. Derives the chunked hidden states gradients and accumulates the projection head gradients.
    + +

    Torchtune’s recipes provide torch.compile support out of the box. It has been shown that utilizing torch.compile with FLCE makes FLCE 2x faster.

    + +
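To make the chunking idea concrete, here is a plain-PyTorch sketch of a chunked linear-projection-plus-cross-entropy loss; it is illustrative only, with made-up sizes, and the actual Liger FLCE is a fused Triton kernel that additionally computes gradients chunk by chunk:

import torch
import torch.nn.functional as F

def chunked_linear_ce(hidden: torch.Tensor, proj_weight: torch.Tensor,
                      labels: torch.Tensor, num_chunks: int = 4) -> torch.Tensor:
    # hidden: [batch * seq, hidden_dim] (3D hidden states already flattened to 2D)
    # proj_weight: [vocab_size, hidden_dim]; labels: [batch * seq]
    losses = []
    for h_chunk, y_chunk in zip(hidden.chunk(num_chunks), labels.chunk(num_chunks)):
        logits = h_chunk @ proj_weight.T                       # only a chunk of the logits is materialized
        losses.append(F.cross_entropy(logits, y_chunk, reduction="sum"))
    # Note: autograd still keeps the chunked logits for backward in this sketch; the fused
    # kernel also computes the gradients per chunk so they never all live at once.
    return torch.stack(losses).sum() / labels.numel()

hidden = torch.randn(8 * 512, 2048, requires_grad=True)        # illustrative sizes
proj_weight = torch.randn(32_000, 2048, requires_grad=True)
labels = torch.randint(0, 32_000, (8 * 512,))
loss = chunked_linear_ce(hidden, proj_weight, labels)
loss.backward()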

    Integrating Liger Kernel with torch.compile & torchtune

    + +

    We demonstrate integration of Liger Kernel with torch.compile & torchtune by running a full fine-tuning recipe with meta-llama/Llama-3.2-1B. To make this integration happen, we have defined a custom full finetuning recipe, the details of the changes are mentioned below.

    + +
    CUDA_VISIBLE_DEVICES=0,1,2,3 tune run --nproc_per_node 4 recipes/full_finetune_distributed.py --config llama3_2/1B_full optimizer=torch.optim.AdamW optimizer.fused=True optimizer_in_bwd=False gradient_accumulation_steps=1  dataset.packed=True compile=True enable_activation_checkpointing=True tokenizer.max_seq_len=512  batch_size=128
    +
    + +

    One of the inputs to the LCE Kernel is the forward projection weights. torchtune is designed as a modular library with composable blocks. There is a TransformerDecoder block where at the end of the block, we pass the final hidden state through a linear layer to get the final output. Since the linear layer is combined with the CE loss in LCE Kernel, we write a custom forward function for TransformerDecoder where we skip the computation through the linear layer.

    + +

    In the full finetuning recipe, we override the model’s forward method with this custom method

    + +
    import types
    +from liger_kernel.torchtune.modules.transformers import decoder_forward
    +self._model.forward = types.MethodType(decoder_forward, self._model)
    +
    + +

    We then pass the model’s forward projection weights to calculate the loss with LCE Kernel

    + +
    from liger_kernel.transformers.fused_linear_cross_entropy import (
    +    LigerFusedLinearCrossEntropyLoss,
    +)
    +
    +# Use LCE loss instead of CE loss
    +self._loss_fn = LigerFusedLinearCrossEntropyLoss()
    +
    +# call torch.compile on the loss function
    +if self._compile:
    +    training.compile_loss(self._loss_fn, verbose=self._is_rank_zero)
    +
    +# pass the model's forward projection weights for loss computation
    +current_loss = (
    +     self._loss_fn(
    +         self._model.output.tied_module.weight,
    +         logits,
    +         labels,
    +     )
    +     * current_num_tokens
    + )
    +
    + +

    The complete code and instructions can be found in the GitHub repo.

    + +

    Experiments & Benchmarking Results

    + +

    We conduct 3 types of experiments to demonstrate how Liger Kernel integration with torch.compile enhances the performance of torchtune. We set up our experiments on an instance running NVIDIA A100. We fine-tune a small LLM meta-llama/Llama-3.2-1B with differing batch sizes. We record the throughput in terms of tokens/second and measure the peak memory allocated during finetuning. Since it’s a small model, we only use 4 A100 GPUs for the benchmarking. The following are the experiments we conducted:

    + +
      +
1. Increase batch_size in powers of 2 with PyTorch eager
2. Increase batch_size in powers of 2 with torch.compile
3. Increase batch_size in powers of 2 with torch.compile & Liger integration
    + +

    We notice that with PyTorch Eager, throughput increases with increasing batch_size till we hit OOM at batch_size 256. With torch.compile, the throughput is higher than PyTorch Eager for each batch_size. We see that the peak memory allocation reduces drastically with increasing batch_size and more than 50% reduction in peak memory at batch_size 128. This results in torch.compile being able to support batch_size 256 and hence, the overall throughput with torch.compile being 36% greater than PyTorch Eager. Integrating Liger Kernel with torch.compile doesn’t drop the throughput at lower batch_size but with increasing batch_size, we notice that torchtune is consuming less memory compared to torch.compile. At batch_size 256, we see a 47% reduction in peak memory allocation with the Liger kernel. This allows us to use batch_size 512 with torch.compile & Liger. We notice that there is a marginal 1-2% increase in throughput compared to torch.compile without custom triton kernels.

    + +


    + +
    +

    Figure 2: Plot of tokens/sec per rank vs batch_size

    +
    + +


    + +
    +

    Figure 3: Peak memory allocated vs batch_size

    +
    + +

    To rule out any potential functional issues with our integration of Liger Kernel with torchtune, we plot the loss curve against training steps with & without Liger. We see that there is no visible difference in the loss curves.

    + +


    + +
    +

    Figure 4: Plot of loss vs training steps for batch_size=128

    +
    + +

    Next Steps

    + + + +

    Acknowledgments

    + +

    We thank Hamid Shojanazeri (Meta), Less Wright (Meta), Horace He (Meta) & Gregory Chanan (Meta) for their feedback and support in making this blog post happen.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/performance-boost-windows/index.html b/blog/performance-boost-windows/index.html new file mode 100644 index 000000000000..656fe2525892 --- /dev/null +++ b/blog/performance-boost-windows/index.html @@ -0,0 +1,2086 @@ + + + + + + + + + + + + + The Path to Achieve PyTorch Performance Boost on Windows CPU | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Intel Corporation + +

    +

PyTorch’s lower CPU performance on Windows compared to Linux has been a significant, long-standing issue, with multiple factors contributing to the disparity. Through our investigation, we’ve identified several reasons for the poor CPU performance on Windows; two primary issues have been pinpointed: the inefficiency of the Windows default malloc memory allocator and the absence of SIMD vectorization optimizations on the Windows platform. In this article, we show how PyTorch CPU performance on Windows has improved from the previous releases and where it stands as of PyTorch 2.4.1.

    + +

    Memory Allocation Optimization in PyTorch 2.1.2 and later

    + +

In versions prior to PyTorch 2.1.2, PyTorch relied on the operating system’s default malloc function for memory allocation. The default malloc memory allocator on the Windows platform is less efficient than its Linux counterpart, leading to increased memory allocation times and reduced performance. To address this, we have substituted the default Windows malloc with mimalloc, a more efficient memory allocator developed by Microsoft. This update, included with the release of PyTorch 2.1.2 and later, has significantly enhanced the CPU performance of PyTorch on Windows, as shown in Figure 1.1.

    + +


    + +

    PyTorch CPU Performance Improvement on Windows with Memory Allocation Optimization

    + +

    Figure 1.1: Relative throughput improvement achieved by upgrading from Windows PyTorch version 2.0.1 to 2.1.2 (higher is better).

    + +

    The graph illustrates that with the release of PyTorch 2.1.2, there has been a notable enhancement in CPU performance on the Windows platform. The degree of improvement varies across different models, which can be attributed to the diverse mix of operations they perform and their corresponding memory access patterns. While the BERT model shows a modest performance gain, models like ResNet50 and MobileNet-v3 Large benefit from more pronounced improvements.

    + +

    On a high-performance CPU, memory allocation becomes a performance bottleneck. This is also why addressing this issue has led to such significant performance improvements.

    + +

As shown in the graphs below, PyTorch CPU performance on Windows has improved significantly. However, there is still a noticeable gap when compared to its performance on Linux. The absence of vectorization optimizations in the Windows variant of PyTorch CPU is a key factor behind the remaining performance gap.

    + +


    + +

    Windows vs Linux Performance on PyTorch 2.0.1

    + +

    Figure 1.2: Relative performance of Windows vs Linux with PyTorch version 2.0.1 (higher is better).

    + +


    + +

    Windows vs Linux Performance on PyTorch 2.1.2

    + +

    Figure 1.3: Relative performance of Windows vs Linux with PyTorch version 2.1.2 (higher is better).

    + +

    Vectorization Optimization in PyTorch 2.4.1 and later

    + +

Prior to PyTorch 2.4.1, the Windows build of PyTorch lacked SIMD vectorization optimizations, a feature that the Linux build leveraged for improved performance. This discrepancy was due to integration issues on Windows with the SLEEF library, a SIMD Library for Evaluating Elementary Functions that provides a vectorized libm and DFT and is essential for efficient trigonometric calculations. Through a collaborative effort with engineers from ARM and Qualcomm, these challenges were resolved, enabling the integration of SIMD into PyTorch for Windows. The PyTorch 2.4.1 update has thus significantly enhanced PyTorch’s CPU performance on Windows, as shown in Figure 2.1.

    + +


    + +

PyTorch CPU Performance Improvement on Windows with Vectorization Optimization

    + +

    Figure 2.1: Relative throughput improvement achieved by upgrading from PyTorch CPU version 2.1.2 to 2.4.1 (higher is better).

    + +

As shown in the graph below, we see that PyTorch CPU performance on Windows has caught up with the performance on Linux.

    + +


    + +

    Windows vs Linux Performance on PyTorch 2.4.1

    + +

    Figure 2.2: Relative performance of Windows vs Linux with PyTorch version 2.4.1 (higher is better).

    + +

    CONCLUSION

    + +

    From PyTorch 2.0.1 to PyTorch 2.4.1, the CPU performance gap between Windows and Linux has been continuously narrowing. We compared the ratio of CPU performance on Windows to CPU performance on Linux across different versions, and the results are shown in the following graph.

    + +

    performance comparison chart

    + +

    Windows vs Linux Performance on different versions of PyTorch

    + +

    Figure 3: Performance ratio of Windows to Linux with different versions of PyTorch (higher is better).

    + +

    The graph shows that with PyTorch 2.4.1, CPU performance on Windows has nearly converged with that on Linux, and on some models, it has even surpassed Linux. For example, in the case of the DistilBERT and RoBERTa models, the CPU performance ratio of Windows to Linux has reached a remarkable 102%. However, certain models, including MobileNet-v3, still show a performance discrepancy. Intel engineers will continue to collaborate with Meta engineers to reduce the performance gap of PyTorch CPU between Windows and Linux.

    + +

    HOW TO TAKE ADVANTAGE OF THE OPTIMIZATIONS

    + +

    Install PyTorch CPU 2.4.1 or later on Windows from the official repository, and you will automatically benefit from the memory allocation and vectorization optimizations.
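    As a quick sanity check of the gains on your own machine, you can time a simple inference loop before and after upgrading. The sketch below assumes torchvision is installed; the model and batch size are arbitrary illustrative choices, not the benchmark configuration used for the figures above.

import time
import torch
import torchvision

model = torchvision.models.resnet50().eval()
x = torch.randn(32, 3, 224, 224)

with torch.inference_mode():
    for _ in range(5):                      # warm-up iterations
        model(x)
    iters = 20
    start = time.perf_counter()
    for _ in range(iters):
        model(x)
    elapsed = time.perf_counter() - start

print(f"Throughput: {32 * iters / elapsed:.1f} images/s")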

    + +

    ACKNOWLEDGMENTS

    + +

    The results presented in this blog post were achieved through the collaborative effort of the Intel PyTorch team and Meta. We would like to express our sincere gratitude to Xu Han, Jiong Gong, Haozhe Zhu, Mingfei Ma, Chuanqi Wang, Guobing Chen and Eikan Wang; their expertise and dedication have been instrumental in achieving the optimizations and performance improvements discussed here. Thanks to Jiachen Pu from the community for his participation in the issue discussion and for suggesting the use of mimalloc. We would also like to thank Microsoft for providing such an easily integrated and performant memory allocation library. Thanks to Pierre Blanchard and Nathan Sircombe from ARM and Alex Reinking from Adobe for their contributions in overcoming the compatibility issues with integrating SLEEF into PyTorch for Windows. Finally, we want to thank Jing Xu, Weizhuo Zhang and Zhaoqiong Zheng for their contributions to this blog.

    + +

    Product and Performance Information

    + +

    The configurations in the table are collected with svr-info. Test by Intel on August 30, 2024.

    Specification | Configuration1 | Configuration2
    Name | ThinkBook 14 G5+ IRH | ThinkBook 14 G5+ IRH
    Time | Fri Aug 30 02:43:02 PM UTC 2024 | Fri Aug 30 02:43:02 PM UTC 2024
    System | LENOVO | LENOVO
    Baseboard | LENOVO | LENOVO
    Chassis | LENOVO | LENOVO
    CPU Model | 13th Gen Intel(R) Core(TM) i7-13700H | 13th Gen Intel(R) Core(TM) i7-13700H
    Microarchitecture | Unknown Intel | Unknown Intel
    Sockets | 1 | 1
    Cores per Socket | 14 | 14
    Hyperthreading | Enabled | Enabled
    CPUs | 20 | 20
    Intel Turbo Boost | Enabled | Enabled
    Base Frequency | 2.4GHz | 2.4GHz
    All-core Maximum Frequency | 4.7GHz | 4.7GHz
    Maximum Frequency | 4.8GHz | 4.8GHz
    NUMA Nodes | 1 | 1
    Prefetchers | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled
    PPINs | - | -
    Accelerators | DLB, DSA, IAA, QAT | DLB, DSA, IAA, QAT
    Installed Memory | 32GB (8x4GB LPDDR4 7400 MT/s [5200 MT/s]) | 32GB (8x4GB LPDDR4 7400 MT/s [5200 MT/s])
    Hugepagesize | 2048kb | 2048kb
    Transparent Huge Pages | madvise | madvise
    Automatic NUMA Balancing | Disabled | Disabled
    NIC | “1. Raptor Lake PCH CNVi WiFi 2. Intel Corporation” | “1. Raptor Lake PCH CNVi WiFi 2. Intel Corporation”
    Disk | Micron MTFDKBA512TFH 500G | Micron MTFDKBA512TFH 500G
    BIOS | LBCN22WW | LBCN22WW
    Microcode | 0x411c | 0x411c
    OS | Windows 11 Desktop | Ubuntu 23.10
    Kernel | OS Build 19045.4412 | 6.5.0-27-generic
    TDP | 200 watts | 200 watts
    Power & Perf Policy | Normal Powersave (7) | Normal Powersave (7)
    Frequency Governor | performance | performance
    Frequency Driver | intel_pstate | intel_pstate
    Max C-State | 9 | 9
    + +

    Notices and Disclaimers

    + +

    Performance varies by use, configuration and other factors. Learn more on the Performance Index site.

    + +

    Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates. See backup for configuration details. No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

    + +

    Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/performance-debugging-of-production-pytorch-models-at-meta/index.html b/blog/performance-debugging-of-production-pytorch-models-at-meta/index.html new file mode 100644 index 000000000000..15072908948f --- /dev/null +++ b/blog/performance-debugging-of-production-pytorch-models-at-meta/index.html @@ -0,0 +1,787 @@ + + + + + + + + + + + + + Performance Debugging of Production PyTorch Models at Meta | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + CK Luk, Lei Tian + +

    +

    1. Meta’s AI Performance Profiling (MAIProf)

    + +

    + +

    + +

    +Figure 1: A simplified illustration of Meta’s AI performance profiling (MAIProf) infrastructure. +

    + +

    Figure 1 gives a simplified illustration of the AI performance profiling infrastructure at Meta. ML research and performance engineers submit a profiling request for a training job through the User Portal to the Profiling Service, which subsequently broadcasts the request to all the GPU hosts running the training job. When the Monitoring Daemon on a GPU host receives the profiling request, it notifies the Kineto GPU tracer (built on top of NVIDIA’s libcupti) inside the PyTorch program corresponding to the training job. As a result, Kineto traces are collected and uploaded to the Object Store asynchronously (in more detail: one Kineto trace is collected for each individual GPU, and each is treated and stored as a blob; an example will be given in Section 2). Meanwhile, MAIProf also collects a variety of aggregated performance metrics: the Monitoring Daemon on every GPU host continuously reads performance counters from NVIDIA’s DCGM/NVML and logs them to a Time Series DB.

    + +

    Once both trace and metrics collections are completed, the Profiling Service will automatically download traces from the Object Store for trace analysis and performance metrics from the Time Series DB for metric analysis. Finally, an overall profiling report with detailed and insightful analysis is delivered to the user.

    + +

    To serve production uses, we deliberately made the following design choices for MAIProf:

    + +
      +
    • No source-code change required in the PyTorch models: profiling is triggered by sampling the execution of an unmodified model for a user-specified amount of time.
    • +
    • Provide a holistic view of performance: MAIProf performs system-wide analysis that covers both CPU and GPU. Under the hood, it invokes various CPU tools (e.g., Python tracer, Autograd Observer) and GPU tools (e.g., Kineto, DCGM) and correlates their results.
    • +
    • Provide multiple tools that target a wide range of AI practitioners: At Meta, there are engineers with different backgrounds who may need to tune their AI workload performance. Some of them are AI experts while others are general software engineers. Therefore, MAIProf provides a variety of tools for different levels of performance debugging, from high-level automatic trace comprehension to low-level trace analysis.
    • +
    • Support distributed GPU profiling: MAIProf can collect profiling data from multiple hosts, each with multiple GPUs. It then shows a combined view/analysis of the entire system.
    • +
    • Highly scalable: MAIProf is built as a service on top of existing infrastructures in Meta data centers, such as a scalable storage system called Manifold. Its profiling capability can be easily scaled by adding more machines to the service pool as workloads increase.
    • +
    + +

    2. Case Study: Optimizing a Protection PyTorch Model

    + +

    To be concrete, we use a case study on a protection PyTorch model used in production. First, we discuss our steps for identifying the performance bottlenecks in the model with MAIProf. Then we describe the corresponding optimizations applied and their impacts.

    + +

    2.1 Performance Bottlenecks

    + +

    Step 1:

    + +

    Inspect the CPU and GPU utilization on the same timeline, as shown in Figure 2.

    + +

    + +

    + +

    +Figure 2: CPU usage over time (the top) vs. GPU usage over time (the bottom). +

    + +

    The first performance anomaly we noticed in Figure 2 is the pattern: “GPU-idle, GPU-active, GPU-idle, GPU-active …” throughout the training. Overall, the GPU is idle for more than half of the training time (this is bad for performance because the GPU is a higher-performance device and so we want it to be utilized as much as possible).

    + +

    Step 2:

    + +

    Collect a Python function call trace on the CPU with MAIProf while the GPU is idle, which is shown in Figure 3.

    + +

    + +

    + +

    +Figure 3: A Python call trace. +

    + +

    The Python trace shows that most of the CPU time is spent inside a Python function sharded_iterrows(). From the source code of the model, we learned that this function processes a big feature table in parallel. The number of worker threads used is controlled by a configurable parameter (num_worker_threads). Also, after investigating how the feature table is generated, we understood the performance anomaly: the training dataset is too large to fit in the CPU memory all at once; it needs to be broken into multiple sub-datasets, each with sufficient data for running 10 epochs. Consequently, a new sub-dataset needs to be read from the disk to memory every 10 epochs, during which the GPU is totally idle.

    + +

    Step 3:

    + +

    Collect GPU performance metrics, which are shown in Figure 4.

    + +

    + +

    + +

    +Figure 4: GPU performance metrics in MAIProf. +

    + +

    We made the following observations from Figure 4:

    + +
      +
    • The streaming multiprocessor (SM) runs the model’s CUDA kernels. Its utilization [1] is 9.1%, indicating that the parallel compute units on the GPU are not well utilized.
    • +
    • Tensor Core utilization is 0, meaning that Tensor Core (the mixed-precision compute unit on GPU) [2] is not used at all.
    • +
    • Max GPU memory utilization is 47.13%, indicating that half of the GPU memory is left unused.
    • +
    + +

    Step 4:

    + +

    Collect a GPU trace (aka Kineto trace) of the training loop as shown in Figure 5.

    + +

    + +

    + +

    +Figure 5: A GPU trace (aka Kineto trace) of the training loop. +

    + +

    Since commonly used PyTorch functions are already annotated, their names are automatically shown on the trace. With them, we can roughly divide the trace into the four phases in a training iteration: (1) data loading, (2) forward pass, (3) backward pass, (4) gradient optimization (note: In Figure 5, the “optimizer” phase is from the previous batch while the other three phases are from the current batch).
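    Outside of Meta’s infrastructure, a roughly comparable trace can be collected with the open-source torch.profiler, which also uses Kineto under the hood. The sketch below is an approximation only: MAIProf gathers these traces remotely without source changes, and model, optimizer, and loader are placeholders for your own training objects.

import torch
from torch.profiler import profile, schedule, ProfilerActivity, tensorboard_trace_handler

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=1, warmup=1, active=3),
    on_trace_ready=tensorboard_trace_handler("./trace"),  # writes a Kineto/Chrome trace
) as prof:
    for step, (inputs, labels) in enumerate(loader):
        loss = model(inputs.cuda()).sum()   # placeholder forward pass and loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        prof.step()                         # advance the profiler schedule
        if step >= 5:
            break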

    + +

    2.2 Optimizations

    + +

    We performed four simple optimizations that target the bottlenecks identified above, each requiring only a change in a config parameter or at most a few source lines. They are listed in Figure 6.

    Optimization | Amount of changes | Bottlenecks addressed
    Tune num_worker_threads by trying a few possible values within the number of CPU cores on each host. | 1 source line | GPU totally idle time
    Double the batch sizes | 2 config parameters | GPU memory under-utilization
    Use automatic mixed precision in PyTorch | 13 source lines | Zero Tensor Core utilization
    Use multi-tensor optimizer in PyTorch | 1 source line | Many small GPU kernels in the optimizer
    + +

    +Figure 6: Four simple optimizations applied. +
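    For reference, the sketch below shows what two of these changes typically look like in PyTorch code: enabling automatic mixed precision and switching to a multi-tensor (foreach) optimizer. It is illustrative only; model and loader are placeholders, and the actual edits in the production model differ.

import torch

# Multi-tensor (foreach) optimizer path fuses many small per-parameter kernels.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, foreach=True)
scaler = torch.cuda.amp.GradScaler()

for inputs, labels in loader:
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast():          # run the forward pass in mixed precision
        logits = model(inputs.cuda())
        loss = torch.nn.functional.cross_entropy(logits, labels.cuda())
    scaler.scale(loss).backward()            # scale loss to avoid fp16 gradient underflow
    scaler.step(optimizer)
    scaler.update()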

    + +

    3. Concluding Remarks

    + +

    Performance tuning for PyTorch in production environments is increasingly important. A capable performance-debugging tool is a key to this process. We demonstrate with a case study on a production model that MAIProf is a powerful infrastructure for identifying optimization opportunities.

    + +

    At Meta, MAIProf has been used by hundreds of engineers, from performance novices to experts, to identify many more types of bottlenecks. These include slow data loading, small and/or slow GPU kernels, and distributed training issues such as load imbalance and excessive communication. MAIProf covers major classes of models, including recommendation, vision, and natural language processing. In summary, it is now an indispensable tool for tuning the performance of production PyTorch workloads.

    + +

    References

    + +

    [1] https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm

    + +

    [2] https://www.nvidia.com/en-us/data-center/tensor-cores/

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/performant-distributed-checkpointing/index.html b/blog/performant-distributed-checkpointing/index.html new file mode 100644 index 000000000000..a285314e775e --- /dev/null +++ b/blog/performant-distributed-checkpointing/index.html @@ -0,0 +1,698 @@ + + + + + + + + + + + + + Performant Distributed checkpointing in Production with IBM | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Meta: Iris Zhang, Less Wright, Rodrigo Kumpera, Chien-Chin Huang, IBM: Davis Wertheimer, Supriyo Chakraboty, Sophia Wen, Raghu Ganti, Mudhakar Srivatsa, Seethrami Seelam + +

    +

    Params saved per minute

    + +

    Last year, IBM Research began collaborating with us to onboard Fully Sharded Data Parallelism (FSDP) for their large foundation models. They became interested as FSDP is a PyTorch native offering for scaling their distributed training efforts on IBM Cloud.

    + +

    We are pleased to share that, in collaboration with IBM, we have achieved substantial checkpointing speedups for large models (72x vs the original PyTorch 1.13 save speed), proven model and optimizer checkpoint scaling to 30B parameters, and enabled cloud first training using FSDP + Distributed Checkpoint on S3 backends.

    + +

    What is a Distributed Checkpoint?

    + +

    Distributed checkpointing is the PyTorch native solution for saving and loading PyTorch models and optimizer states from multiple ranks, as well as supporting dynamically changing world sizes between reloads.

    + +

    Checkpoint time vs model params

    + +

    PyTorch Distributed Checkpoint (DCP) APIs were introduced in PyTorch 1.13, and are included as an official prototype feature in PyTorch 2.0.

    + +

    Distributed checkpoint is different from torch.save() and torch.load() in a few significant ways:

    + +
      +
    1. DCP produces multiple files per checkpoint, with at least one file per rank,
    2. +
    3. DCP operates in place, meaning that the model should allocate its data first, and DCP will then use that storage.
    4. +
    + +

    A major improvement from 1.13 to 2.0 includes adding sharded_state_dict support for checkpointing FSDP models. This allows checkpointing for larger sized models, as well as adding support for load-time resharding. Load time resharding enables saving in one cluster topology, and loading into another. This feature was highly requested as it allows training jobs to be run on one cluster, saved, and then continued on a different cluster with different world size.

    + +

    Another major change is that we decouple the storage layer from the checkpoint planning layer and separate implementation from the interface for both layers. With this change, users can now specify how their state_dict should be chunked or transformed during the checkpoint planning phase. Additionally, the customizable storage layer can easily accommodate different backends.

    + +

    More information on the Distributed Checkpoint package can be found here.
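    As a concrete illustration, the sketch below saves and reloads an FSDP-wrapped model with DCP using the PyTorch 2.0-era prototype APIs (save_state_dict/load_state_dict with a FileSystemWriter/Reader). Module paths and recommended entry points have evolved in later releases, so treat this as illustrative rather than canonical; model is assumed to be an already-wrapped FSDP module running under an initialized torch.distributed process group.

import torch.distributed.checkpoint as dist_cp
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType

# Save: each rank contributes its own shards of the sharded state dict.
with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
    state_dict = {"model": model.state_dict()}

# By default FileSystemWriter produces a single file per rank, which is the
# behavior discussed in the next section.
dist_cp.save_state_dict(
    state_dict=state_dict,
    storage_writer=dist_cp.FileSystemWriter("checkpoint/step_100"),
)

# Load: allocate the model first (DCP loads in place), then read into it.
with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
    state_dict = {"model": model.state_dict()}
    dist_cp.load_state_dict(
        state_dict=state_dict,
        storage_reader=dist_cp.FileSystemReader("checkpoint/step_100"),
    )
    model.load_state_dict(state_dict["model"])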

    + +

    Performant Distributed checkpointing in Production with IBM

    + +

    IBM at Think 2023 announced its watsonx.ai platform for development and deployment of foundation models for the enterprise. Built on Hybrid Cloud, the platform enables use cases across multiple modalities such as NLP, timeseries, weather, chemistry, tabular data, and cybersecurity, with model sizes from 100s of millions to 10s of billions of parameters. Model architectures range from vision transformers, to multi-modal RoBERTa-style feature extractors, to large-scale generative language models similar to T5, GPT and Llama.

    + +

    As of today, IBM has now enabled checkpointing for T5-style architectures up to 11B parameters, and decoder architectures (GPT style) up to 30B.

    + +

    IBM helped us identify that this limits the scaling power of DCP from both memory and performance standpoints. With their suggestion, we enhanced our FileSystemWriter to produce a single checkpoint file per rank to reduce read/write overhead.

    + +

    With this option as the new default, DCP now creates a single file per rank during checkpoint saving, which would then be sliced when reading parameters at load time.

    + +

    By combining sharded_state_dict support with the single-file-per-rank writer, distributed checkpoint was able to accelerate checkpoint saving time over 72x vs the original PyTorch 1.13 save speed, and enable rapid checkpointing for model sizes over 15B, which would previously simply time out.

    + +

    “Looking back, it’s really astounding the speedups we’ve seen, handling training for many of these models. We went from taking almost half an hour to write a single 11B checkpoint in PyTorch 1.13, to being able to handle a 30B parameter model, with optimizer and dataloader state - so that’s over eight times the raw data - in just over 3 minutes. That’s done wonders for both the stability and efficiency of our jobs, as we scale up training to hundreds of gpus.” – Davis Wertheimer, IBM Research

    + +

    IBM’s adoption has also helped us validate and improve our solutions in a real-world, large-scale training environment. As an example, IBM discovered that DCP was working well for them on a single node with multiple GPUs, but errored out when used on multiple nodes.

    + +

    Upon investigating the issue, we realized that we had been assuming writes to an NFS-like shared file system, which provides strong read-after-write consistency. Object stores with file system APIs such as S3FS provide eventual consistency semantics, causing distributed checkpointing in such a setting to fail. Working together with IBM, we identified this issue and fixed it with a one-line code change, enabling object storage backends for DCP! Such storage approaches are typically an order of magnitude cheaper than shared file systems, thus enabling finer-grained checkpointing.

    + +

    Looking for Collaboration

    + +

    If you are interested in trying Distributed Checkpoint, feel free to reach out to us!

    + +

    If you run into any issue when trying it, you can open an issue at our Github repo.

    + +

    Acknowledgements

    + +

    This project would not have been possible without the assistance from many collaborators. We would like to thank Yanli Zhao, Andrew Gu, Rohan Varma for their support of FSDP. Thanks to Pritam Damania, Junjie Zhao, and Wanchao Liang for their support of ShardedTensor.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pipetransformer-automated-elastic-pipelining/index.html b/blog/pipetransformer-automated-elastic-pipelining/index.html new file mode 100644 index 000000000000..f91a9b07a9bb --- /dev/null +++ b/blog/pipetransformer-automated-elastic-pipelining/index.html @@ -0,0 +1,991 @@ + + + + + + + + + + + + + PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Chaoyang He, Shen Li, Mahdi Soltanolkotabi, and Salman Avestimehr + +

    +

    In this blog post, we describe the first peer-reviewed research paper that explores accelerating the hybrid of PyTorch DDP (torch.nn.parallel.DistributedDataParallel) [1] and Pipeline (torch.distributed.pipeline) - PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models (Transformers such as BERT [2] and ViT [3]), published at ICML 2021.

    + +

    PipeTransformer leverages automated elastic pipelining for efficient distributed training of Transformer models. In PipeTransformer, we designed an adaptive on-the-fly freeze algorithm that can identify and freeze some layers gradually during training and an elastic pipelining system that can dynamically allocate resources to train the remaining active layers. More specifically, PipeTransformer automatically excludes frozen layers from the pipeline, packs active layers into fewer GPUs, and forks more replicas to increase data-parallel width. We evaluate PipeTransformer using Vision Transformer (ViT) on ImageNet and BERT on SQuAD and GLUE datasets. Our results show that compared to the state-of-the-art baseline, PipeTransformer attains up to 2.83-fold speedup without losing accuracy. We also provide various performance analyses for a more comprehensive understanding of our algorithmic and system-wise design.

    + +

    Next, we will introduce the background, motivation, our idea, design, and how we implement the algorithm and system with PyTorch Distributed APIs.

    + + + +

    Introduction

    +

    +Model Size +
    +Figure 1: the Parameter Number of Transformer Models Increases Dramatically. +

    + +

    Large Transformer models [4][5] have powered accuracy breakthroughs in both natural language processing and computer vision. GPT-3 [4] hit a new record high accuracy for nearly all NLP tasks. Vision Transformer (ViT) [3] also achieved 89% top-1 accuracy in ImageNet, outperforming state-of-the-art convolutional networks ResNet-152 and EfficientNet. To tackle the growth in model sizes, researchers have proposed various distributed training techniques, including parameter servers [6][7][8], pipeline parallelism [9][10][11][12], intra-layer parallelism [13][14][15], and zero redundancy data-parallel [16].

    + +

    Existing distributed training solutions, however, only study scenarios where all model weights are required to be optimized throughout the training (i.e., computation and communication overhead remains relatively static over different iterations). Recent works on progressive training suggest that parameters in neural networks can be trained dynamically:

    + +
      +
    • Freeze Training: Singular Vector Canonical Correlation Analysis for Deep Learning Dynamics and Interpretability. NeurIPS 2017
    • +
    • Efficient Training of BERT by Progressively Stacking. ICML 2019
    • +
    • Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. NeurIPS 2020.
    • +
    • On the Transformer Growth for Progressive BERT Training. NACCL 2021
    • +
    + +

    +Freeze Training +
    +

    +

    Figure 2. Interpretable Freeze Training: DNNs converge bottom-up (Results on CIFAR10 using ResNet). Each pane shows layer-by-layer similarity using SVCCA [17][18]

    + +

    For example, in freeze training [17][18], neural networks usually converge from the bottom-up (i.e., not all layers need to be trained all the way through training). Figure 2 shows an example of how weights gradually stabilize during training in this approach. This observation motivates us to utilize freeze training for distributed training of Transformer models to accelerate training by dynamically allocating resources to focus on a shrinking set of active layers. Such a layer freezing strategy is especially pertinent to pipeline parallelism, as excluding consecutive bottom layers from the pipeline can reduce computation, memory, and communication overhead.

    + +

    + +
    +Figure 3. The process of PipeTransformer’s automated and elastic pipelining to accelerate distributed training of Transformer models +

    + +

    We propose PipeTransformer, an elastic pipelining training acceleration framework that automatically reacts to frozen layers by dynamically transforming the scope of the pipelined model and the number of pipeline replicas. To the best of our knowledge, this is the first paper that studies layer freezing in the context of both pipeline and data-parallel training. Figure 3 demonstrates the benefits of such a combination. First, by excluding frozen layers from the pipeline, the same model can be packed into fewer GPUs, leading to both fewer cross-GPU communications and smaller pipeline bubbles. Second, after packing the model into fewer GPUs, the same cluster can accommodate more pipeline replicas, increasing the width of data parallelism. More importantly, the speedups acquired from these two benefits are multiplicative rather than additive, further accelerating the training.

    + +

    The design of PipeTransformer faces four major challenges. First, the freeze algorithm must make on-the-fly and adaptive freezing decisions; however, existing work [17][18] only provides a posterior analysis tool. Second, the efficiency of pipeline re-partitioning results is influenced by multiple factors, including partition granularity, cross-partition activation size, and the chunking (the number of micro-batches) in mini-batches, which require reasoning and searching in a large solution space. Third, to dynamically introduce additional pipeline replicas, PipeTransformer must overcome the static nature of collective communications and avoid potentially complex cross-process messaging protocols when onboarding new processes (one pipeline is handled by one process). Finally, caching can save time for repeated forward propagation of frozen layers, but it must be shared between existing pipelines and newly added ones, as the system cannot afford to create and warm up a dedicated cache for each replica.

    + +

    +Freeze Training +
    +Figure 4: An Animation to Show the Dynamics of PipeTransformer +

    + +

    As shown in the animation (Figure 4), PipeTransformer is designed with four core building blocks to address the aforementioned challenges. First, we design a tunable and adaptive algorithm to generate signals that guide the selection of layers to freeze over different iterations (Freeze Algorithm). Once triggered by these signals, our elastic pipelining module (AutoPipe), then packs the remaining active layers into fewer GPUs by taking both activation sizes and variances of workloads across heterogeneous partitions (frozen layers and active layers) into account. It then splits a mini-batch into an optimal number of micro-batches based on prior profiling results for different pipeline lengths. Our next module, AutoDP, spawns additional pipeline replicas to occupy freed-up GPUs and maintains hierarchical communication process groups to attain dynamic membership for collective communications. Our final module, AutoCache, efficiently shares activations across existing and new data-parallel processes and automatically replaces stale caches during transitions.

    + +

    Overall, PipeTransformer combines the Freeze Algorithm, AutoPipe, AutoDP, and AutoCache modules to provide a significant training speedup. +We evaluate PipeTransformer using Vision Transformer (ViT) on ImageNet and BERT on GLUE and SQuAD datasets. Our results show that PipeTransformer attains up to 2.83-fold speedup without losing accuracy. We also provide various performance analyses for a more comprehensive understanding of our algorithmic and system-wise design. +Finally, we have also developed open-source flexible APIs for PipeTransformer, which offer a clean separation among the freeze algorithm, model definitions, and training accelerations, allowing for transferability to other algorithms that require similar freezing strategies.

    + +

    Overall Design

    + +

    Suppose we aim to train a massive model in a distributed training system where the hybrid of pipelined model parallelism and data parallelism is used to target scenarios where either the memory of a single GPU device cannot hold the model, or if loaded, the batch size is small enough to avoid running out of memory. More specifically, we define our settings as follows:

    + +

    Training task and model definition. We train Transformer models (e.g., Vision Transformer, BERT) on large-scale image or text datasets. The Transformer model \mathcal{F} has L layers, in which the i-th layer is composed of a forward computation function f_i and a corresponding set of parameters.

    + +

    Training infrastructure. Assume the training infrastructure contains a GPU cluster that has N GPU servers (i.e. nodes). Each node has I GPUs. Our cluster is homogeneous, meaning that each GPU and server have the same hardware configuration. Each GPU’s memory capacity is M_\text{GPU}. Servers are connected by a high bandwidth network interface such as InfiniBand interconnect.

    + +

    Pipeline parallelism. In each machine, we load a model \mathcal{F} into a pipeline \mathcal{P} which has K partitions (K also represents the pipeline length). The k-th partition p_k consists of consecutive layers. We assume each partition is handled by a single GPU device. 1 \leq K \leq I, meaning that we can build multiple pipelines for multiple model replicas in a single machine. We assume all GPU devices in a pipeline belong to the same machine. Our pipeline is a synchronous pipeline, which does not involve stale gradients, and the number of micro-batches is M. In the Linux OS, each pipeline is handled by a single process. We refer the reader to GPipe [10] for more details.

    + +

    Data parallelism. DDP is a cross-machine distributed data-parallel process group within R parallel workers. Each worker is a pipeline replica (a single process). The r-th worker’s index (ID) is rank r. For any two pipelines in DDP, they can belong to either the same GPU server or different GPU servers, and they can exchange gradients with the AllReduce algorithm.

    + +

    Under these settings, our goal is to accelerate training by leveraging freeze training, which does not require all layers to be trained throughout the duration of the training. Additionally, it may help save computation, communication, memory cost, and potentially prevent overfitting by consecutively freezing layers. However, these benefits can only be achieved by overcoming the four challenges of designing an adaptive freezing algorithm, dynamical pipeline re-partitioning, efficient resource reallocation, and cross-process caching, as discussed in the introduction.

    + +

    +Overview +
    +Figure 5. Overview of PipeTransformer Training System +

    + +

    PipeTransformer co-designs an on-the-fly freeze algorithm and an automated elastic pipelining training system that can dynamically transform the scope of the pipelined model and the number of pipeline replicas. The overall system architecture is illustrated in Figure 5. To support PipeTransformer’s elastic pipelining, we maintain a customized version of PyTorch Pipeline. For data parallelism, we use PyTorch DDP as a baseline. Other libraries are standard mechanisms of an operating system (e.g.,multi-processing) and thus avoid specialized software or hardware customization requirements. To ensure the generality of our framework, we have decoupled the training system into four core components: freeze algorithm, AutoPipe, AutoDP, and AutoCache. The freeze algorithm (grey) samples indicators from the training loop and makes layer-wise freezing decisions, which will be shared with AutoPipe (green). AutoPipe is an elastic pipeline module that speeds up training by excluding frozen layers from the pipeline and packing the active layers into fewer GPUs (pink), leading to both fewer cross-GPU communications and smaller pipeline bubbles. Subsequently, AutoPipe passes pipeline length information to AutoDP (purple), which then spawns more pipeline replicas to increase data-parallel width, if possible. The illustration also includes an example in which AutoDP introduces a new replica (purple). AutoCache (orange edges) is a cross-pipeline caching module, as illustrated by connections between pipelines. The source code architecture is aligned with Figure 5 for readability and generality.

    + +

    Implementation Using PyTorch APIs

    + +

    As can be seen from Figure 5, PipeTransformer contains four components: Freeze Algorithm, AutoPipe, AutoDP, and AutoCache. Among them, AutoPipe and AutoDP rely on PyTorch Pipeline (torch.distributed.pipeline) and DDP (torch.nn.parallel.DistributedDataParallel) [1], respectively. In this blog, we only highlight the key implementation details of AutoPipe and AutoDP. For details of the Freeze Algorithm and AutoCache, please refer to our paper.

    + +

    AutoPipe: Elastic Pipelining

    + +

    AutoPipe can accelerate training by excluding frozen layers from the pipeline and packing the active layers into fewer GPUs. This section elaborates on the key components of AutoPipe that dynamically 1) partition pipelines, 2) minimize the number of pipeline devices, and 3) optimize mini-batch chunk size accordingly.

    + +

    Basic Usage of PyTorch Pipeline

    + +

    Before diving into details of AutoPipe, let us warm up with the basic usage of PyTorch Pipeline (torch.distributed.pipeline.sync.Pipe, see this tutorial). More specifically, we present a simple example to understand the design of Pipeline in practice:

    + +
    import os
    +import torch
    +import torch.nn as nn
    +from torch.distributed.pipeline.sync import Pipe
    +
    +# Pipe requires the RPC framework to be initialized (single process here)
    +os.environ['MASTER_ADDR'] = 'localhost'
    +os.environ['MASTER_PORT'] = '29500'
    +torch.distributed.rpc.init_rpc('worker', rank=0, world_size=1)
    +
    +# Step 1: build a model including two linear layers
    +fc1 = nn.Linear(16, 8).cuda(0)
    +fc2 = nn.Linear(8, 4).cuda(1)
    +
    +# Step 2: wrap the two layers with nn.Sequential
    +model = nn.Sequential(fc1, fc2)
    +
    +# Step 3: build Pipe (torch.distributed.pipeline.sync.Pipe)
    +model = Pipe(model, chunks=8)
    +
    +# do training/inference
    +input = torch.rand(16, 16).cuda(0)
    +output_rref = model(input)
    +
    + +

    In this basic example, we can see that before initializing Pipe, we need to partition the model nn.Sequential across multiple GPU devices and set the optimal chunk number (chunks). Balancing computation time across partitions is critical to pipeline training speed, as skewed workload distributions across stages can lead to stragglers, forcing devices with lighter workloads to wait. The chunk number may also have a non-trivial influence on the throughput of the pipeline.

    + +

    Balanced Pipeline Partitioning

    + +

    In dynamic training system such as PipeTransformer, maintaining optimally balanced partitions in terms of parameter numbers does not guarantee the fastest training speed because other factors also play a crucial role:

    + +

    + +
    +Figure 6. The partition boundary is in the middle of a skip connection +

    + +
      +
    1. +

    Cross-partition communication overhead. Placing a partition boundary in the middle of a skip connection leads to additional communications since tensors in the skip connection must now be copied to a different GPU. For example, with the BERT partitions in Figure 6, partition k must take intermediate outputs from both partition k-2 and partition k-1. In contrast, if the boundary is placed after the addition layer, the communication overhead between partition k-1 and k is visibly smaller. Our measurements show that having cross-device communication is more expensive than having slightly imbalanced partitions (see the Appendix in our paper). Therefore, we do not consider breaking skip connections (highlighted separately as an entire attention layer and MLP layer in green at line 7 in Algorithm 1).

      +
    2. +
    3. +

    Frozen layer memory footprint. During training, AutoPipe must recompute partition boundaries several times to balance two distinct types of layers: frozen layers and active layers. The frozen layer’s memory cost is a fraction of that of an active layer, given that the frozen layer does not need backward activation maps, optimizer states, or gradients. Instead of launching intrusive profilers to obtain thorough metrics on memory and computational cost, we define a tunable cost factor \lambda_{\text{frozen}} to estimate the memory footprint ratio of a frozen layer over the same active layer. Based on empirical measurements on our experimental hardware, we set it to \frac{1}{6}.

      +
    4. +
    + +

    + +
    +

    + +

    Based on the above two considerations, AutoPipe balances pipeline partitions based on parameter sizes. More specifically, AutoPipe uses a greedy algorithm to allocate all frozen and active layers so that partitioned sublayers are evenly distributed across K GPU devices. Pseudocode is described as the load_balance() function in Algorithm 1. The frozen layers are extracted from the original model and kept in a separate model instance \mathcal{F}_{\text{frozen}} on the first device of a pipeline.

    + +

    Note that the partition algorithm employed in this paper is not the only option; PipeTransformer is modularized to work with any alternatives.
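    To make the balancing idea concrete, here is a hypothetical greedy pass over per-layer parameter counts, with frozen layers discounted by the cost factor \lambda_{\text{frozen}}. It is an illustration of the approach under stated assumptions, not PipeTransformer’s actual load_balance() from Algorithm 1; layer_params and frozen are assumed inputs.

def greedy_partition(layer_params, frozen, num_devices, lambda_frozen=1 / 6):
    # Discount frozen layers: they need no backward activations, optimizer
    # states, or gradients, so they cost a fraction of an active layer.
    costs = [p * (lambda_frozen if f else 1.0) for p, f in zip(layer_params, frozen)]
    target = sum(costs) / num_devices          # ideal per-device load
    partitions, current, load = [], [], 0.0
    for i, c in enumerate(costs):
        current.append(i)
        load += c
        remaining_devices = num_devices - len(partitions) - 1
        # Close this partition once the target load is reached, as long as
        # enough layers remain for the remaining devices.
        if remaining_devices > 0 and load >= target and len(costs) - i - 1 >= remaining_devices:
            partitions.append(current)
            current, load = [], 0.0
    partitions.append(current)                 # last device takes the rest
    return partitions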

    + +

    Pipeline Compression

    + +

    Pipeline compression helps to free up GPUs to accommodate more pipeline replicas and reduce the number of cross-device communications between partitions. To determine the timing of compression, we can estimate the memory cost of the largest partition after compression, and then compare it with that of the largest partition of a pipeline at timestep T=0. To avoid extensive memory profiling, the compression algorithm uses the parameter size as a proxy for the training memory footprint. Based on this simplification, the criterion of pipeline compression is as follows:

    + +

    + +
    +

    + +

    Once the freeze notification is received, AutoPipe will always attempt to divide the pipeline length K by 2 (e.g., from 8 to 4, then 2). By using \frac{K}{2} as the input, the compression algorithm can verify if the result satisfies the criterion in Equation (1). Pseudocode is shown in lines 25-33 in Algorithm 1. Note that this compression makes the acceleration ratio exponentially increase during training, meaning that if a GPU server has a larger number of GPUs (e.g., more than 8), the acceleration ratio will be further amplified.
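    Reusing the greedy_partition sketch above, the compression decision can be approximated as follows: tentatively re-balance with pipeline length K/2 and accept it only if the largest resulting partition (by parameter size, the memory proxy in Equation (1)) does not exceed the largest partition at timestep T=0. Again, this is a hypothetical sketch rather than the actual implementation.

def should_compress(layer_params, frozen, k, max_partition_size_t0, lambda_frozen=1 / 6):
    # Tentatively re-partition the remaining layers onto K/2 devices
    # (greedy_partition is the sketch from the previous section).
    candidate = greedy_partition(layer_params, frozen, k // 2, lambda_frozen)
    # Parameter size of the largest candidate partition, used as a memory proxy.
    largest = max(sum(layer_params[i] for i in part) for part in candidate)
    return largest <= max_partition_size_t0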

    + +

    + +
    +Figure 7. Pipeline Bubble: F_{d,b}, B_{d,b}, and U_d denote forward, backward, and the optimizer update of micro-batch b on device d, respectively. The total bubble size in each iteration is K-1 times per micro-batch forward and backward cost. +

    + +

    Additionally, such a technique can also speed up training by shrinking the size of pipeline bubbles. To explain bubble sizes in a pipeline, Figure 7 depicts how 4 micro-batches run through a 4-device pipeline K = 4. In general, the total bubble size is (K-1) times per micro-batch forward and backward cost. Therefore, it is clear that shorter pipelines have smaller bubble sizes.

    + +

    Dynamic Number of Micro-Batches

    + +

    Prior pipeline parallel systems use a fixed number of micro-batches per mini-batch (M). GPipe suggests M \geq 4 \times K, where K is the number of partitions (pipeline length). However, given that PipeTransformer dynamically configures K, we find it sub-optimal to maintain a static M during training. Moreover, when integrated with DDP, the value of M also has an impact on the efficiency of DDP gradient synchronization. Since DDP must wait for the last micro-batch to finish its backward computation on a parameter before launching its gradient synchronization, finer micro-batches lead to a smaller overlap between computation and communication. Hence, instead of using a static value, PipeTransformer searches for the optimal M on the fly in the hybrid DDP environment by enumerating M values ranging from K to 6K. For a specific training environment, this profiling only needs to be done once (see Algorithm 1, line 35).
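    A hypothetical sketch of this one-off profiling step follows: for a given pipeline length K, time a short run for each candidate chunk count M in [K, 6K] and keep the fastest. build_pipe and run_steps are placeholders standing in for constructing Pipe(model, chunks=m) and executing a few timed training steps.

import time

def find_optimal_chunks(build_pipe, run_steps, k, num_steps=8):
    best_m, best_throughput = k, 0.0
    for m in range(k, 6 * k + 1):
        pipe_model = build_pipe(chunks=m)            # e.g. Pipe(seq_model, chunks=m)
        start = time.perf_counter()
        samples = run_steps(pipe_model, num_steps)   # returns number of samples processed
        throughput = samples / (time.perf_counter() - start)
        if throughput > best_throughput:
            best_m, best_throughput = m, throughput
    return best_m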

    + +

    For the complete source code, please refer to https://github.com/Distributed-AI/PipeTransformer/blob/master/pipe_transformer/pipe/auto_pipe.py.

    + +

    AutoDP: Spawning More Pipeline Replicas

    +

    As AutoPipe compresses the same pipeline into fewer GPUs, AutoDP can automatically spawn new pipeline replicas to increase data-parallel width.

    + +

    Despite the conceptual simplicity, subtle dependencies on communications and states require careful design. The challenges are threefold:

    + +
      +
    1. +

      DDP Communication: Collective communications in PyTorch DDP requires static membership, which prevents new pipelines from connecting with existing ones;

      +
    2. +
    3. +

      State Synchronization: newly activated processes must be consistent with existing pipelines in the training progress (e.g., epoch number and learning rate), weights and optimizer states, the boundary of frozen layers, and pipeline GPU range;

      +
    4. +
    5. +

      Dataset Redistribution: the dataset should be re-balanced to match a dynamic number of pipelines. This not only avoids stragglers but also ensures that gradients from all DDP processes are equally weighted.

      +
    6. +
    + +

    + +
    +Figure 8. AutoDP: handling dynamical data-parallel with messaging between double process groups (Process 0-7 belong to machine 0, while process 8-15 belong to machine 1) +

    + +

    To tackle these challenges, we create double communication process groups for DDP. As in the example shown in Figure 8, the message process group (purple) is responsible for light-weight control messages and covers all processes, while the active training process group (yellow) only contains active processes and serves as a vehicle for heavy-weight tensor communications during training. The message group remains static, whereas the training group is dismantled and reconstructed to match active processes. +In T0, only processes 0 and 8 are active. During the transition to T1, process 0 activates processes 1 and 9 (newly added pipeline replicas) and synchronizes necessary information mentioned above using the message group. The four active processes then form a new training group, allowing static collective communications adaptive to dynamic memberships. +To redistribute the dataset, we implement a variant of DistributedSampler that can seamlessly adjust data samples to match the number of active pipeline replicas.

    + +

    The above design also naturally helps to reduce DDP communication overhead. More specifically, when transitioning from T0 to T1, processes 0 and 1 destroy the existing DDP instances, and active processes construct a new DDP training group using a cached pipelined model (AutoPipe stores frozen model and cached model separately).

    + +

    We use the following APIs to implement the design above.

    + +
    import torch.distributed as dist
    +from torch.nn.parallel import DistributedDataParallel as DDP
    +
    +# initialize the process group (this must be called in the initialization of PyTorch DDP)
    +dist.init_process_group(init_method='tcp://' + str(self.config.master_addr) + ':' +
    +str(self.config.master_port), backend=Backend.GLOO, rank=self.global_rank, world_size=self.world_size)
    +...
    +
    +# create active process group (yellow color)
    +self.active_process_group = dist.new_group(ranks=self.active_ranks, backend=Backend.NCCL, timeout=timedelta(days=365))
    +...
    +
    +# create message process group (purple color)
    +self.comm_broadcast_group = dist.new_group(ranks=[i for i in range(self.world_size)], backend=Backend.GLOO, timeout=timedelta(days=365))
    +...
    +
    +# create DDP-enabled model when the number of data-parallel workers is changed. Note:
    +# 1. The process group to be used for distributed data all-reduction.
    +#    If None, the default process group, which is created by torch.distributed.init_process_group, will be used.
    +#    In our case, we set it to self.active_process_group
    +# 2. device_ids should be set when the pipeline length = 1 (the model resides on a single CUDA device).
    +
    +self.pipe_len = gpu_num_per_process
    +if gpu_num_per_process > 1:
    +    model = DDP(model, process_group=self.active_process_group, find_unused_parameters=True)
    +else:
    +    model = DDP(model, device_ids=[self.local_rank], process_group=self.active_process_group, find_unused_parameters=True)
    +
    +# to broadcast message among processes, we use dist.broadcast_object_list
    +def dist_broadcast(object_list, src, group):
    +    """Broadcasts a given object to all parties."""
    +    dist.broadcast_object_list(object_list, src, group=group)
    +    return object_list
    +
    +

    For the complete source code, please refer to https://github.com/Distributed-AI/PipeTransformer/blob/master/pipe_transformer/dp/auto_dp.py.
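    For the dataset redistribution step mentioned above, a sketch of re-balancing with the stock DistributedSampler is shown below as an approximation of PipeTransformer’s custom variant. train_dataset, active_ranks, and my_active_rank are placeholders for the state AutoDP maintains.

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def rebuild_dataloader(train_dataset, active_ranks, my_active_rank, batch_size, epoch):
    sampler = DistributedSampler(
        train_dataset,
        num_replicas=len(active_ranks),   # one shard per active pipeline replica
        rank=my_active_rank,
        shuffle=True,
    )
    sampler.set_epoch(epoch)              # keep shuffling consistent across replicas
    return DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)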

    + +

    Experiments

    + +

    This section first summarizes experiment setups and then evaluates PipeTransformer using computer vision and natural language processing tasks.

    + +

    Hardware. Experiments were conducted on 2 identical machines connected by InfiniBand CX353A (5GB/s), where each machine is equipped with 8 NVIDIA Quadro RTX 5000 (16GB GPU memory). GPU-to-GPU bandwidth within a machine (PCI 3.0, 16 lanes) is 15.754GB/s.

    + +

    Implementation. We used PyTorch Pipe as a building block. The BERT model definition, configuration, and related tokenizer are from HuggingFace 3.5.0. We implemented Vision Transformer using PyTorch by following its TensorFlow implementation. More details can be found in our source code.

    + +

    Models and Datasets. Experiments employ two representative Transformers in CV and NLP: Vision Transformer (ViT) and BERT. ViT was run on an image classification task, initialized with pre-trained weights on ImageNet21K and fine-tuned on ImageNet and CIFAR-100. BERT was run on two tasks, text classification on the SST-2 dataset from the General Language Understanding Evaluation (GLUE) benchmark, and question answering on the SQuAD v1.1 Dataset (Stanford Question Answering), which is a collection of 100k crowdsourced question/answer pairs.

    + +

    Training Schemes. Given that large models normally would require thousands of GPU-days (e.g., GPT-3) if trained from scratch, fine-tuning downstream tasks using pre-trained models has become a trend in the CV and NLP communities. Moreover, PipeTransformer is a complex training system that involves multiple core components. Thus, for the first version of PipeTransformer system development and algorithmic research, it is not cost-efficient to develop and evaluate from scratch using large-scale pre-training. Therefore, the experiments presented in this section focus on pre-trained models. Note that since the model architectures in pre-training and fine-tuning are the same, PipeTransformer can serve both. We discuss pre-training results in the Appendix.

    + +

    Baseline. Experiments in this section compare PipeTransformer to the state-of-the-art framework, a hybrid scheme of PyTorch Pipeline (PyTorch’s implementation of GPipe) and PyTorch DDP. Since this is the first paper that studies accelerating distributed training by freezing layers, there are no perfectly aligned counterpart solutions yet.

    + +

    Hyper-parameters. Experiments use ViT-B/16 (12 transformer layers, 16 \times 16 input patch size) for ImageNet and CIFAR-100, BERT-large-uncased (24 layers) for SQuAD 1.1, and BERT-base-uncased (12 layers) for SST-2. With PipeTransformer, ViT and BERT training can set the per-pipeline batch size to around 400 and 64, respectively. Other hyperparameters (e.g., epoch, learning rate) for all experiments are presented in Appendix.

    + +

    Overall Training Acceleration

    +

    + +
    +

    + +

    We summarize the overall experimental results in the table above. Note that the speedup we report is based on a conservative \alpha value (\frac{1}{3}) that can obtain comparable or even higher accuracy. A more aggressive \alpha (\frac{2}{5}, \frac{1}{2}) can obtain a higher speedup but may lead to a slight loss in accuracy. Note that the model size of BERT (24 layers) is larger than ViT-B/16 (12 layers), thus it takes more time for communication.

    + +

    Performance Analysis

    + +

    Speedup Breakdown

    + +

    This section presents evaluation results and analyzes the performance of the different components in PipeTransformer. More experimental results can be found in the Appendix.

    + +

    + +
    +Figure 9. Speedup Breakdown (ViT on ImageNet) +

    + +

    To understand the efficacy of all four components and their impacts on training speed, we experimented with different combinations and used their training sample throughput (samples/second) and speedup ratio as metrics. Results are illustrated in Figure 9. Key takeaways from these experimental results are:

    + +
      +
    1. the main speedup is the result of elastic pipelining which is achieved through the joint use of AutoPipe and AutoDP;
    2. +
    3. AutoCache’s contribution is amplified by AutoDP;
    4. +
    5. freeze training alone without system-wise adjustment even downgrades the training speed.
    6. +
    + +

    Tuning \alpha in Freezing Algorithm

    + +

    + +
    +Figure 10. Tuning \alpha in Freezing Algorithm +

    + +

    We ran experiments to show how the \alpha in the freeze algorithms influences training speed. The result clearly demonstrates that a larger \alpha (excessive freeze) leads to a greater speedup but suffers from a slight performance degradation. In the case shown in Figure 10, where \alpha=1/5, freeze training outperforms normal training and obtains a 2.04-fold speedup. We provide more results in the Appendix.

    + +

    Optimal Chunks in the elastic pipeline

    + +

    + +
    +Figure 11. Optimal chunk number in the elastic pipeline +

    + +

    We profiled the optimal number of micro-batches M for different pipeline lengths K. Results are summarized in Figure 11. As we can see, different K values lead to different optimal M, and the throughput gaps across different M values are large (as shown when K=8), which confirms the necessity of an anterior profiler in elastic pipelining.

    + +

    Understanding the Timing of Caching

    + +

    + +
    +Figure 12. the timing of caching +

    + +

    To evaluate AutoCache, we compared the sample throughput of training that activates AutoCache from epoch 0 (blue) with the training job without AutoCache (red). Figure 12 shows that enabling caching too early can slow down training, as caching can be more expensive than the forward propagation on a small number of frozen layers. After more layers are frozen, caching activations clearly outperform the corresponding forward propagation. As a result, AutoCache uses a profiler to determine the proper timing to enable caching. In our system, for ViT (12 layers), caching starts from 3 frozen layers, while for BERT (24 layers), caching starts from 5 frozen layers.

    + +

    For more detailed experimental analysis, please refer to our paper.

    + +

    Summarization

    +

    This blog introduces PipeTransformer, a holistic solution that combines elastic pipeline-parallel and data-parallel for distributed training using PyTorch Distributed APIs. More specifically, PipeTransformer incrementally freezes layers in the pipeline, packs remaining active layers into fewer GPUs, and forks more pipeline replicas to increase the data-parallel width. Evaluations on ViT and BERT models show that compared to the state-of-the-art baseline, PipeTransformer attains up to 2.83× speedups without accuracy loss.

    + +

    Reference

    + +

    [1] Li, S., Zhao, Y., Varma, R., Salpekar, O., Noordhuis, P., Li,T., Paszke, A., Smith, J., Vaughan, B., Damania, P., et al. Pytorch Distributed: Experiences on Accelerating Dataparallel Training. Proceedings of the VLDB Endowment,13(12), 2020

    + +

    [2] Devlin, J., Chang, M. W., Lee, K., and Toutanova, K. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT, 2019

    + +

    [3] Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al. An image is Worth 16x16 words: Transformers for Image Recognition at Scale.

    + +

    [4] Brown, T. B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. Language Models are Few-shot Learners.

    + +

    [5] Lepikhin, D., Lee, H., Xu, Y., Chen, D., Firat, O., Huang, Y., Krikun, M., Shazeer, N., and Chen, Z. Gshard: Scaling Giant Models with Conditional Computation and Automatic Sharding.

    + +

    [6] Li, M., Andersen, D. G., Park, J. W., Smola, A. J., Ahmed, A., Josifovski, V., Long, J., Shekita, E. J., and Su, B. Y. Scaling Distributed Machine Learning with the Parameter Server. In 11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14), pp. 583–598, 2014.

    + +

    [7] Jiang, Y., Zhu, Y., Lan, C., Yi, B., Cui, Y., and Guo, C. A Unified Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU/CPU Clusters. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pp. 463–479. USENIX Association, November 2020. ISBN 978-1-939133-19- 9.

    + +

    [8] Kim, S., Yu, G. I., Park, H., Cho, S., Jeong, E., Ha, H., Lee, S., Jeong, J. S., and Chun, B. G. Parallax: Sparsity-aware Data Parallel Training of Deep Neural Networks. In Proceedings of the Fourteenth EuroSys Conference 2019, pp. 1–15, 2019.

    + +

    [9] Kim, C., Lee, H., Jeong, M., Baek, W., Yoon, B., Kim, I., Lim, S., and Kim, S. TorchGPipe: On-the-fly Pipeline Parallelism for Training Giant Models.

    + +

    [10] Huang, Y., Cheng, Y., Bapna, A., Firat, O., Chen, M. X., Chen, D., Lee, H., Ngiam, J., Le, Q. V., Wu, Y., et al. Gpipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism.

    + +

    [11] Park, J. H., Yun, G., Yi, C. M., Nguyen, N. T., Lee, S., Choi, J., Noh, S. H., and ri Choi, Y. Hetpipe: Enabling Large DNN Training on (whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism. In 2020 USENIX Annual Technical Conference (USENIX ATC 20), pp. 307–321. USENIX Association, July 2020. ISBN 978-1-939133- 14-4.

    + +

    [12] Narayanan, D., Harlap, A., Phanishayee, A., Seshadri, V., Devanur, N. R., Ganger, G. R., Gibbons, P. B., and Zaharia, M. Pipedream: Generalized Pipeline Parallelism for DNN Training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles, SOSP ’19, pp. 1–15, New York, NY, USA, 2019. Association for Computing Machinery. ISBN 9781450368735. doi: 10.1145/3341301.3359646.

    + +

    [13] Lepikhin, D., Lee, H., Xu, Y., Chen, D., Firat, O., Huang, Y., Krikun, M., Shazeer, N., and Chen, Z. Gshard: Scaling Giant Models with Conditional Computation and Automatic Sharding.

    + +

    [14] Shazeer, N., Cheng, Y., Parmar, N., Tran, D., Vaswani, A., Koanantakool, P., Hawkins, P., Lee, H., Hong, M., Young, C., Sepassi, R., and Hechtman, B. Mesh-Tensorflow: Deep Learning for Supercomputers. In Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., and Garnett, R. (eds.), Advances in Neural Information Processing Systems, volume 31, pp. 10414–10423. Curran Associates, Inc., 2018.

    + +

    [15] Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., and Catanzaro, B. Megatron-LM: Training Multi-billion Parameter Language Models using Model Parallelism.

    + +

    [16] Rajbhandari, S., Rasley, J., Ruwase, O., and He, Y. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models.

    + +

    [17] Raghu, M., Gilmer, J., Yosinski, J., and Sohl Dickstein, J. Svcca: Singular Vector Canonical Correlation Analysis for Deep Learning Dynamics and Interpretability. In NIPS, 2017.

    + +

    [18] Morcos, A., Raghu, M., and Bengio, S. Insights on Representational Similarity in Neural Networks with Canonical Correlation. In Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., and Garnett, R. (eds.), Advances in Neural Information Processing Systems 31, pp. 5732–5741. Curran Associates, Inc., 2018.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds/index.html b/blog/prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds/index.html new file mode 100644 index 000000000000..2c0b4fe55474 --- /dev/null +++ b/blog/prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds/index.html @@ -0,0 +1,685 @@ + + + + + + + + + + + + + Prototype Features Now Available - APIs for Hardware Accelerated Mobile and ARM64 Builds | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we are announcing four PyTorch prototype features. The first three of these will enable mobile machine learning developers to execute models on the full set of hardware (HW) engines making up a system-on-chip (SoC). This gives developers options to optimize their model execution for unique performance, power, and system-level concurrency requirements.

    + +

    These features include enabling execution on the following on-device HW engines:

    +
      +
    • DSP and NPUs using the Android Neural Networks API (NNAPI), developed in collaboration with Google
    • +
    • GPU execution on Android via Vulkan
    • +
    • GPU execution on iOS via Metal
    • +
    + +

    This release also includes developer efficiency benefits with newly introduced support for ARM64 builds for Linux.

    + +

    Below, you’ll find brief descriptions of each feature with links to get you started. These features are available through our nightly builds. Reach out to us on the PyTorch Forums with any comments or feedback; we would love to hear how you are using them!

    + +

    NNAPI Support with Google Android

    + +

    The Google Android and PyTorch teams collaborated to enable support for Android’s Neural Networks API (NNAPI) via PyTorch Mobile. Developers can now unlock high-performance execution on Android phones as their machine-learning models will be able to access additional hardware blocks on the phone’s system-on-chip. NNAPI allows Android apps to run computationally intensive neural networks on the most powerful and efficient parts of the chips that power mobile phones, including DSPs (Digital Signal Processors) and NPUs (specialized Neural Processing Units). The API was introduced in Android 8 (Oreo) and significantly expanded in Android 10 and 11 to support a richer set of AI models. With this integration, developers can now seamlessly access NNAPI directly from PyTorch Mobile. This initial release includes fully-functional support for a core set of features and operators, and Google and Facebook will be working to expand capabilities in the coming months.
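
    As a minimal sketch of what the conversion flow can look like, assuming the prototype helper torch.backends._nnapi.prepare.convert_model_to_nnapi exposed in recent nightlies (see the linked tutorial below for the authoritative, up-to-date API):

    + +
    # Sketch only: prototype NNAPI conversion flow, names may change between nightlies.
    +import torch
    +import torchvision
    +from torch.backends._nnapi.prepare import convert_model_to_nnapi
    +
    +# Trace a float model on an example input; NNAPI expects NHWC (channels-last) layout.
    +model = torchvision.models.mobilenet_v2(pretrained=True).eval()
    +example = torch.zeros(1, 3, 224, 224).contiguous(memory_format=torch.channels_last)
    +example.nnapi_nhwc = True  # prototype flag marking the input as NHWC (assumption)
    +traced = torch.jit.trace(model, example)
    +
    +# Convert the traced module into an NNAPI-backed module and save it for mobile.
    +nnapi_model = convert_model_to_nnapi(traced, example)
    +nnapi_model._save_for_lite_interpreter("mobilenet_v2_nnapi.ptl")
    +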

    + +

    Links

    + + +

    PyTorch Mobile GPU support

    + +

    Inferencing on GPU can provide great performance on many model types, especially those utilizing high-precision floating-point math. Leveraging the GPUs found in SoCs from Qualcomm, MediaTek, and Apple for ML model execution allows for CPU offload, freeing up the mobile CPU for non-ML use cases. This initial prototype-level support for on-device GPUs is provided via the Metal API specification for iOS and the Vulkan API specification for Android. As this feature is at an early stage, performance is not yet optimized and model coverage is limited. We expect this to improve significantly over the course of 2021 and would like to hear from you which models and devices you would like to see performance improvements on.
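
    For example, preparing a scripted model for the mobile GPU backends can look roughly like the following, assuming the optional backend argument of torch.utils.mobile_optimizer.optimize_for_mobile and a Vulkan- or Metal-enabled build of PyTorch:

    + +
    # Sketch only: targeting the mobile GPU backends via the mobile optimizer.
    +import torch
    +import torchvision
    +from torch.utils.mobile_optimizer import optimize_for_mobile
    +
    +model = torchvision.models.mobilenet_v2(pretrained=True).eval()
    +scripted = torch.jit.script(model)
    +
    +# Target the Android GPU via Vulkan; passing backend="metal" targets the iOS GPU instead.
    +gpu_model = optimize_for_mobile(scripted, backend="vulkan")
    +gpu_model._save_for_lite_interpreter("mobilenet_v2_vulkan.ptl")
    +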

    + +

    Links

    + + +

    ARM64 Builds for Linux

    + +

    We now provide prototype-level PyTorch builds for ARM64 devices on Linux, reflecting the growing Arm usage in our community on platforms such as Raspberry Pi at the edge and Graviton(2) instances on servers. This feature is available through our nightly builds.
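
    Once a nightly wheel is installed on an ARM64 Linux machine (for example a Raspberry Pi or a Graviton instance), a quick sanity check could look like this:

    + +
    # Quick sanity check of an ARM64 Linux build.
    +import platform
    +import torch
    +
    +print(platform.machine(), platform.system())  # expect "aarch64 Linux" on ARM64
    +print(torch.__version__)                      # nightly version string
    +print(torch.rand(2, 3) @ torch.rand(3, 2))    # basic tensor math on CPU
    +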

    + +

    We value your feedback on these features and look forward to collaborating with you to continuously improve them further!

    + +

    Thank you,

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pt-day-china-2025-cfp/index.html b/blog/pt-day-china-2025-cfp/index.html new file mode 100644 index 000000000000..389deb82124e --- /dev/null +++ b/blog/pt-day-china-2025-cfp/index.html @@ -0,0 +1,694 @@ + + + + + + + + + + + + + PyTorch Day China 2025 Call for Proposals Open | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We’re excited to announce the first-ever PyTorch Day China! This new event, hosted by the PyTorch Foundation, will take place on June 7 in Beijing, China, bringing together AI practitioners, researchers, and industry professionals to explore the latest advancements in open source AI and machine learning. Co-located with the BAAI Conference, PyTorch Day China is a chance to connect with the community, share knowledge, and help shape the future of deep learning.

    + +

    PyTorch Day China 2025 Call for Proposals Open

    + +

    Why Submit a Proposal?

    + +

    PyTorch Day China offers a platform for AI practitioners and researchers to showcase their work, exchange ideas, and connect with others in the community. If you’re working on innovative applications, tools, or research in the PyTorch ecosystem, we encourage you to share your expertise.

    + +

    Topics for Submission:

    + +
      +
    • AI Applications and Use Cases
    • +
    • Core PyTorch Framework
    • +
    • DL Compilers and Kernel Authoring
    • +
    • Edge AI and On-Device
    • +
    • Ethical AI, Governance, and Regulation
    • +
    • Generative AI and Large Language Models (LLMs) with PyTorch
    • +
    • Open Source Collaboration, Education, and Community Building
    • +
    • Optimization for Training and Inference
    • +
    • PyTorch on Accelerator Hardware
    • +
    • PyTorch Ecosystem and Tools
    • +
    • PyTorch in Research and Academia
    • +
    • Performance Measurement and Benchmarking
    • +
    • Scaling Training and Inference
    • +
    + +

    The submission deadline is April 13. Submit and learn more here: https://www.lfasiallc.com/pytorch-day-china/call-for-proposals-cfp/

    + +

    Why Attend?

    + +

    PyTorch Day China will feature technical talks, discussions, and poster sessions that highlight real-world applications and developments in AI and machine learning. Attendees will have the opportunity to learn from experts, contribute to the open source community, and engage with fellow PyTorch users. Registration information will be available in April.

    + +

    Event Details

    + +
      +
    • Date: June 7, 2025
    • +
    • Location: Zhongguancun Exhibition Center, Beijing, China
    • +
    • Address: 索家坟, Hai Dian Qu, Bei Jing Shi, China, 100080
    • +
    • Co-located with: BAAI Conference
    • +
    + +

    Travel Information

    + +

    The venue, Zhongguancun Exhibition Center, is approximately 39 km from Beijing International Airport. More details on travel and accommodation will be available on the BAAI Conference website and updated here as they become available.

    + +

    Have Questions?

    + +

    For inquiries, please contact pytorchevents@linuxfoundation.org.

    + +

    Submit your proposal by April 13 and join the conversation shaping the future of PyTorch.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pt-day-france-cfp/index.html b/blog/pt-day-france-cfp/index.html new file mode 100644 index 000000000000..82b961b9b3c1 --- /dev/null +++ b/blog/pt-day-france-cfp/index.html @@ -0,0 +1,691 @@ + + + + + + + + + + + + + PyTorch Day France 2025: Call For Proposals Open | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We’re pleased to announce PyTorch Day France 2025, a dedicated gathering of the PyTorch community held 7 May 2025 in Paris, France. Proudly hosted by the PyTorch Foundation and co-located with GOSIM AI Paris 2025, this event will bring together developers, researchers, and practitioners driving innovation in open source AI and machine learning.

    + +

    Whether you’re building cutting-edge models or contributing to the ecosystem, PyTorch Day France is your opportunity to connect, collaborate, and help shape the future of deep learning.

    + +

    PT Day CFP

    + +

    Why Attend?

    + +

    Set in the vibrant atmosphere of STATION F, the world’s largest startup campus, PyTorch Day France will offer a full day of:

    + +
      +
    • Insightful Technical Talks
    • +
    • Interactive Discussions
    • +
    • Engaging Poster Sessions
    • +
    + +

    The event is designed to foster open exchange across the PyTorch ecosystem, providing a space to learn from peers, share practical insights, and explore the latest research and applications in AI.

    + +

    Submit a Proposal

    + +

    We are currently accepting proposals for talks. If you have a project, idea, or research story you’d like to share with the PyTorch community, we want to hear from you.

    + +

    📩 Email your talk title and abstract to pytorchevents@linuxfoundation.org for consideration.

    + +

    Registration

    + +

    To register for PyTorch Day France, please visit the GOSIM AI Paris website, and use the code PYTORCHFRIEND to receive 25% off.

    + +

    👉 https://paris2025.gosim.org/

    + +

    We encourage early registration to secure your spot and ensure access to both PyTorch Day France and the broader GOSIM AI Paris programming.

    + +

    Venue

    + +

    STATION F
    +5 Parv. Alan Turing, 75013 Paris, France
    +A landmark of innovation and entrepreneurship in the heart of Paris.

    + +

    Travel and Accommodations

    + +

    Participants are responsible for their own travel and lodging. For those arriving internationally, Paris Charles de Gaulle Airport is approximately 38.4 km from STATION F. Additional information about accommodations and transportation may be available on the GOSIM AI Paris website.

    + +

    Questions?

    + +

    For any inquiries, please contact us at pytorchevents@linuxfoundation.org.

    + +

    We look forward to welcoming the PyTorch community to Paris this May for a day of collaboration, learning, and open source AI innovation.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pt-day-france-featured-sessions/index.html b/blog/pt-day-france-featured-sessions/index.html new file mode 100644 index 000000000000..2e0b9a3a62e6 --- /dev/null +++ b/blog/pt-day-france-featured-sessions/index.html @@ -0,0 +1,685 @@ + + + + + + + + + + + + + PyTorch Day France Featured Sessions: A Defining Moment for Open Source AI | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    PyTorch Day France offers a front-row seat to the future of open source AI. Taking place 7 May at Station F in Paris and co-located with GOSIM AI Paris, this one-day event will bring together developers, researchers, and industry leaders for a day of technical sessions, real-world insights, and community exchange.

    + +

    🌍 A Major Milestone for the PyTorch Foundation

    + +

    This event marks the very first PyTorch Day, launching a new international series hosted annually in different regions to convene AI researchers, developers, engineers, and enthusiasts. PyTorch Days are designed to spotlight open source AI advancements, foster community collaboration, and provide a forum to learn about active, high-impact AI projects built using PyTorch.

    + +

    PyTorch Day France also represents a pivotal moment in the PyTorch Foundation’s journey. With its recent expansion into an umbrella foundation, PyTorch is now positioned to support a broader ecosystem of trusted, community-driven AI projects across the full AI lifecycle.

    + +

    At PyTorch Day France, you’ll hear directly from PyTorch Foundation Executive Director, Matt White, about this transition—and get a first look at some exciting announcements.

    + +

    🎟️ Registration Details

    + +

    Register now with code PYTORCH for free access to the full day of PyTorch Day France sessions, plus GOSIM AI Paris.

    + +

    🔗Two events, one registration—double the sessions, double the innovation.
    +Register here

    + + + +

    The day’s agenda includes deep technical dives and applied AI use cases from across the community, including the following talks:

    + + + +

    View the full schedule.

    + +

    Whether you’re a contributor, practitioner, or simply curious about what’s ahead, PyTorch Day France is an opportunity to connect with the community and shape what’s next for our ecosystem.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pt-executorch-ethos-u85/index.html b/blog/pt-executorch-ethos-u85/index.html new file mode 100644 index 000000000000..6808b1a0fb9a --- /dev/null +++ b/blog/pt-executorch-ethos-u85/index.html @@ -0,0 +1,728 @@ + + + + + + + + + + + + + Getting started with PyTorch, ExecuTorch, and Ethos-U85 in three easy steps | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Robert Elliott, Fredrik Knutsson, and Mark Quartermain + +

    +

    ExecuTorch support for Ethos-U85

    + +

    In the rapidly evolving landscape of machine learning, PyTorch has emerged as a leading framework for model development, given its flexibility and comprehensive ecosystem. Arm has worked with Meta to introduce support for Arm platforms in ExecuTorch, which further simplifies this process, making it seamless to deploy PyTorch models on edge devices.

    + +

    The Arm Ethos-U85 NPU is the highest-performing Ethos NPU, addressing the growing demand for running advanced AI inference workloads at the edge, including transformer-based networks like LLMs. Arm offers reference designs, including the Corstone-320 IoT reference design platform, around the Ethos-U to accelerate and simplify the chip development cycle. The reference design platform includes, among other items, a Fixed Virtual Platform (FVP) that simulates an entire system, enabling cutting-edge embedded software development and neural network deployment for the Ethos-U85.

    + +

    Today, Arm is extending its support for developers building IoT edge applications by supporting the ExecuTorch beta on Ethos-U85. Leveraging ExecuTorch, developers can now efficiently deploy their natively developed PyTorch models to enable intelligent and responsive IoT solutions built on Arm.

    + +

    With this package now available, thousands of developers looking to create Edge AI applications can start their model and application development months before the platforms arrive on the market.

    + +

    Getting started with ExecuTorch on Ethos-U85

    + +

    A full development environment has been provided in the public ExecuTorch GitHub repository. This provides an integrated and tested development flow with all necessary components.

    + +

    The three simple steps are:

    + +
      +
    1. Set up ExecuTorch
    2. Set up the Arm Build environment
    3. Compile and Run models on the arm_executor_runner
    + +

    You can then build on this flow for compiling and running models, to capture runtime behavior from the Ethos-U85 driver, such as cycle count information.

    + +

    To make the process easier for end users, we have also added scripts to the ExecuTorch repository:

    + +
      +
    1. Set up ExecuTorch
    2. setup.sh: downloads the necessary software.
    3. run.sh: compiles and runs the model on the Corstone-320 FVP.
    + +

    To build other models, you can use the ahead-of-time compiler script aot_arm_compiler.py, which lowers a PyTorch program (nn.Module) to an ExecuTorch program (a .pte flatbuffer file). To write custom applications that use ExecuTorch, you can follow the application flow in the example executor_runner application.

    + +

    We support approximately 40 core ATen operators and already support end-to-end deployment of models such as MobileNetV2. Ongoing work to support further operators will enable more PyTorch models every week.

    + +

    As more functionality is added, it will be demonstrated through the tutorial materials for Ethos-U on pytorch.org

    + +

    How this deployment flow works in more detail

    + +

    Leveraging the extensibility of ExecuTorch and the expressiveness of Arm’s Tensor Operator Set Architecture (TOSA), we have enabled Ethos-U support in ExecuTorch. The Ethos-U compiler, Vela, has been enhanced with a TOSA front-end, making it possible to compile models for all products in the Ethos-U family. Combining these components into a cohesive workflow involves the following steps.

    + +
      +
    1. Converting a PyTorch model into a deployable ExecuTorch program (AOT flow)
    2. Compiling the ExecuTorch program into an executable, which can be deployed on Corstone-320 (runtime flow)
    + +

    The ExecuTorch Ahead of time (AOT) flow

    + +

    The process begins by converting a PyTorch model into a quantized TOSA representation using the PyTorch dynamo export flow. This allows us to generate an Ethos-U set of machine instructions, known as a command stream, utilizing the Vela compiler TOSA frontend. The command stream is bundled into an ExecuTorch program, represented by a flatbuffer file (.pte). This file contains everything the ExecuTorch runtime needs to perform inference using Ethos-U hardware.
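
    As a rough sketch of these ahead-of-time steps in Python, here is the generic ExecuTorch export path; the Arm/TOSA-specific quantization and Vela delegation hooks are omitted, and exact APIs may differ between releases, so treat the repository examples as the source of truth:

    + +
    # Sketch: generic ExecuTorch ahead-of-time flow producing a .pte file.
    +import torch
    +from executorch.exir import to_edge
    +
    +class TinyModel(torch.nn.Module):
    +    def forward(self, x):
    +        return torch.nn.functional.relu(x + 1.0)
    +
    +model = TinyModel().eval()
    +example_inputs = (torch.randn(1, 8),)
    +
    +# 1. Capture the PyTorch program with the dynamo export flow.
    +exported = torch.export.export(model, example_inputs)
    +
    +# 2. Lower to the Edge dialect; this is where an Arm/TOSA partitioner would
    +#    delegate supported subgraphs to an Ethos-U command stream.
    +edge = to_edge(exported)
    +
    +# 3. Serialize everything the runtime needs into a .pte flatbuffer file.
    +et_program = edge.to_executorch()
    +with open("tiny_model.pte", "wb") as f:
    +    f.write(et_program.buffer)
    +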

    + +

    flow diagram

    + +

    The ExecuTorch Runtime flow

    + +

    The ExecuTorch runtime, written in C/C++, is designed to support multiple backends. We have extended it to include support for the Ethos-U device driver. Following this flow will produce a self-contained compiled executable. Deploying the executable on the Corstone-320 FVP is straightforward and requires only the appropriate flags when calling the FVP.

    + +

    flow diagram

    + +

    Ethos-U85 and Corstone-320

    + +

    The Ethos-U family of NPUs offers high performance and energy-efficient solutions for edge AI. The Ethos-U55 (also supported by ExecuTorch) is widely deployed in many Cortex-M heterogeneous systems, while the Ethos-U65 extends the applicability of the Ethos-U family to Cortex-A-based systems and increases the performance.

    + +

    Ethos-U85 further extends the Ethos-U product line, supporting current and future workloads on the edge using transformer-based networks. Ethos-U85 delivers a 4x performance uplift and 20% higher energy efficiency compared to its predecessor, with up to 85% utilization on popular networks. Notable features of Ethos-U85 include:

    + +
      +
    • Configurations from 128 to 2048 MACs/cycle, delivering up to 4 TOP/s at 1 GHz (see the quick check after this list)
    • +
    • Compatible with Cortex-A and Cortex-M based systems
    • +
    • Native support for major neural networks through support for TOSA
    • +
    • Full hardware acceleration of all major neural networks
    • +
    • For a full list of features, see the Ethos-U85 Technical Overview
    • +
    + +
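
    As a quick check of the headline throughput figure above, counting one MAC as two operations (multiply plus accumulate):

    + +
    # Back-of-the-envelope check of the "up to 4 TOP/s at 1 GHz" figure.
    +macs_per_cycle = 2048   # largest Ethos-U85 configuration
    +ops_per_mac = 2         # one multiply + one accumulate
    +clock_hz = 1e9          # 1 GHz
    +
    +tops = macs_per_cycle * ops_per_mac * clock_hz / 1e12
    +print(f"{tops:.1f} TOP/s")  # ~4.1 TOP/s
    +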

    A typical compute subsystem design with Ethos-U85

    + +

    A typical compute subsystem design with Ethos-U85

    + +

    What’s next

    + +

    We are adding new operator support every week, extending ExecuTorch core ATen operator coverage, and enabling a wider range of models to run on Ethos-U. Our ongoing efforts focus on improving performance to ensure models run as optimally as possible on Ethos-U.

    + +

    The ExecuTorch delegate framework supports fallback to running operators not supported by Ethos-U on the CPU using reference kernel implementations. We will work towards optimal performance on Cortex-M CPUs using CMSIS-NN, providing the best possible support for fallback operators and ensuring optimal performance for devices without Ethos-U capability.

    + +

    The package above, together with the Corstone-320 FVP, is another step toward simplifying application development, so please go ahead, check out the code and build process, and send us feedback. Meanwhile, we will be busy making weekly releases to enable more features and models and to extract the maximum performance out of the hardware.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pt-fedora-os-communities/index.html b/blog/pt-fedora-os-communities/index.html new file mode 100644 index 000000000000..7f0db2e8b3fb --- /dev/null +++ b/blog/pt-fedora-os-communities/index.html @@ -0,0 +1,677 @@ + + + + + + + + + + + + + Powering AI with PyTorch, Fedora, and Open Source Communities | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Sudhir Dharanendraiah + +

    +

    man speaking at a conference

    + +

    At DevConf.IN 2025 in Pune, I had the opportunity to host a PyTorch Meetup on February 28th. The session, titled “Powering AI with PyTorch, Fedora, and Open Source Communities” was aimed at introducing PyTorch to students and professionals, explaining why PyTorch+Fedora form an ideal AI development platform. The other key aspect I covered was collaboration between open source communities.

    + +

    Introduction to PyTorch

    + +

    The Power of Deep Learning made simple

    + +

    With the explosion of GPTs, there is renewed interest in the field of AI and ML. The myth that developing AI/ML technologies and their applications is rocket science and far-fetched needs correction. Only open source has the power to dispel this myth and further evolve the technology to make it versatile and developer friendly. Since its inception, PyTorch has evolved and has been a driving force in making AI/ML development extremely simple. I covered PyTorch’s key components, its features, and why PyTorch is the best choice as a deep learning framework.

    + +

    man speaking at a conference

    + +

    The code walkthrough was designed to showcase how easy and simple it is to utilize the power of GPUs by creating a simple neural network and training the model. It was very well received, and it was great to hear from attendees that they had never realized how powerful PyTorch is for deep learning. The real-world examples cited showed how this framework can be used beyond the common GPTs and can influence a broad spectrum of applications.
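
    A minimal example in the spirit of that walkthrough (not the exact code shown at the meetup) is a tiny network trained on random data, with the device picked automatically:

    + +
    # Tiny end-to-end training loop; runs on GPU when available, otherwise on CPU.
    +import torch
    +from torch import nn
    +
    +device = "cuda" if torch.cuda.is_available() else "cpu"
    +
    +model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 2)).to(device)
    +optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    +loss_fn = nn.CrossEntropyLoss()
    +
    +x = torch.randn(256, 10, device=device)
    +y = torch.randint(0, 2, (256,), device=device)
    +
    +for epoch in range(5):
    +    optimizer.zero_grad()
    +    loss = loss_fn(model(x), y)
    +    loss.backward()
    +    optimizer.step()
    +    print(f"epoch {epoch}: loss={loss.item():.4f}")
    +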

    + +

    Fedora+PyTorch the Ideal AI/ML Development Platform

    + +

    man speaking at a conference

    + +

    man speaking at a conference

    + +

    One of the highlights of the event was the discussion on Fedora’s role as an AI platform. Fedora’s reliability, flexibility, and strong community support make it an ideal partner for PyTorch, allowing developers to focus on model-building without worrying about infrastructure. The students were intrigued by the idea of contributing to Fedora’s AI/ML ecosystem while building their own projects. Sumantro Mukherjee spoke about the AI policy in Fedora and how one can start contributing to the AI/ML using Fedora as a platform. He highlighted how Fedora is evolving to meet the needs of AI practitioners. The idea that an open-source operating system could provide the perfect foundation for AI research sparked an engaging conversation.

    + +

    Innovation in Open Source When Communities Come Together

    + +

    charts

    + +

    It is important that we learn from history and repeat the good things! When open source communities come together they can create seismic shifts in the industry. To drive this home, I took the audience on a journey through history, revisiting a pivotal moment when Apache and Linux came together, solving common problems and fundamentally reshaping enterprise computing. That moment was not just about technology; it was about collaboration. It was about two powerful communities recognizing that they were stronger together. Today, we stand at the cusp of another such moment - PyTorch and Linux, particularly Fedora, are coming together to shape the future of AI/ML. This is not just an opportunity but a responsibility for contributors, developers, and AI/ML enthusiasts to be part of this movement.

    + +

    Looking Ahead

    + +

    man speaking at a conference

    + +

    One of the best parts of the event was the enthusiasm it generated among a diverse audience of students, AI enthusiasts, and industry professionals. Notably, Vincent Caldeira (CTO, APAC, Red Hat) and Chris Butler (Senior Principal Chief Architect, Red Hat) were present, reinforcing the growing interest in open-source AI/ML. Many students were eager to explore PyTorch and Fedora, contribute to open-source AI projects, and start their own AI experiments. Industry experts saw the potential for scalable, community-driven AI innovation. The session sparked curiosity and conversations that continued long after the event ended.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pt-foundation-expands/index.html b/blog/pt-foundation-expands/index.html new file mode 100644 index 000000000000..3b94b847eaec --- /dev/null +++ b/blog/pt-foundation-expands/index.html @@ -0,0 +1,693 @@ + + + + + + + + + + + + + PyTorch Foundation Expands to an Umbrella Foundation to Accelerate AI Innovation | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Matt White, Executive Director, PyTorch Foundation + +

    +

    Today, I am thrilled to announce a significant milestone for the PyTorch Foundation: we are expanding our scope to become an umbrella foundation, allowing us to host additional projects. This expansion positions the PyTorch Foundation to foster a broader ecosystem of high-value, trusted, and innovative AI projects that cater to all stages of the AI lifecycle—from training and inference to industry-specific applications.

    + +

    Why Expand?

    + +

    Since its inception at the Linux Foundation two and a half years ago, the PyTorch Foundation has rapidly grown, now encompassing over 30 member organizations and 120 vibrant ecosystem projects. PyTorch itself has become the framework of choice for AI researchers, practitioners, and industry leaders worldwide. Our flagship PyTorch Conference has seen attendance multiply sixfold over just two years, reflecting the community’s tremendous enthusiasm and engagement.

    + +

    With new initiatives such as PyTorch Day events, global community meetups, the PyTorch Ambassador Program, Open Source Program Office (OSPO) outreach, the Speaker’s Bureau, and our upcoming training and certification programs, we have significantly deepened our community’s expertise and collaboration capabilities. To sustain and accelerate this momentum, the logical next step was to expand the PyTorch Foundation into an umbrella organization.

    + +

    What Does an Umbrella Foundation Mean?

    + +

    By transitioning into an umbrella foundation, PyTorch will now host a range of diverse, high-quality AI and ML projects beyond PyTorch Core. These include foundation-hosted projects in two categories:

    + +
      +
    • Platform Projects: Domain-agnostic solutions essential across various stages of the AI lifecycle, such as training, inference, model optimization, and deployment as well as agentic systems.
    • +
    • Vertical Projects: Domain-specific projects tailored to particular industries or applications, such as biomedical imaging, protein folding, and geospatial analysis.
    • +
    + +

    Projects under our umbrella gain immediate access to vendor-neutral governance, enhanced visibility, increased funding opportunities, and robust community engagement and support.

    + +

    Foundation-Hosted vs. Ecosystem Projects

    + +

    As we expand, it’s important to clarify the distinction between foundation-hosted and ecosystem projects:

    + +
      +
    • Foundation-Hosted Projects are projects that fall under the umbrella; they are officially governed and administered under the PyTorch Foundation’s neutral and transparent governance model. Project maintainers continue to oversee their project, and they transfer assets to the Linux Foundation for independent stewardship and adopt an open governance model, significantly reducing vendor bias and encouraging broader community contributions and adoption. These projects have greater stability and longevity and integrate with the larger PyTorch community.
    • +
    • Ecosystem Projects remain independently managed but receive recognition and increased visibility by aligning themselves closely with the PyTorch Foundation community standards. These projects meet specific quality and maturity criteria but retain full independence in governance and asset management.
    • +
    + +

    How to Join the PyTorch Ecosystem or Become a Foundation-Hosted Project

    + +

    We have clearly defined pathways for projects looking to become part of the PyTorch community:

    + +
      +
    1. Ecosystem Project Status: Projects must meet defined criteria, such as active development, comprehensive documentation, CI/CD infrastructure, clear governance, and community engagement. Approved ecosystem projects benefit from increased exposure and official recognition on the PyTorch Landscape.
    2. Candidate Project Status: Ecosystem projects aspiring to foundation-hosted status can become candidates by securing sponsorship from a PyTorch Foundation Technical Advisory Council (TAC) voting member. Candidates receive guidance on meeting all necessary governance, technical, and strategic criteria.
    3. Foundation-Hosted Project Status: Candidate projects demonstrating high maturity, stability, multi-platform support, security best practices, and strategic value to the PyTorch community can be approved by the TAC. These projects gain extensive benefits, including neutral trademark hosting, foundation support, marketing and events resources, governance guidance, and strategic funding opportunities.
    + +

    Ensuring Long-Term Success and Innovation

    + +

    By expanding our scope to become an umbrella foundation, the PyTorch Foundation is uniquely positioned to enhance collaboration, innovation, and sustained growth across the entire AI community. Our mission is clear: create a vendor-neutral, open source environment where the best AI and ML tools can thrive, benefiting users, contributors, and industry stakeholders worldwide.

    + +

    “PyTorch is absolutely the foundation of the innovation happening in AI today and with projects like Llama, ChatGPT, and hundreds of thousands of open projects built on PyTorch, it has cemented itself as a critical ingredient to the world of AI. This move to create an umbrella foundation enables PyTorch to significantly expand its ecosystem both horizontally and vertically in this new era of agentic systems. I am very excited about this opportunity to take the PyTorch community to the next level!” - Joe Spisak, Product Director for PyTorch at Meta.

    + +

    “PyTorch sits at the very core of AI today. Meanwhile, the depth of the AI stack has grown dramatically—evolving from enabling accelerated compute to powering fully autonomous systems. Broadening the PyTorch Foundation is a key step in keeping the AI revolution open and accessible to all, across the stack and aligned with the principles PyTorch was built on.” - Luca Antiga, CTO at Lightning AI.

    + +

    We are incredibly optimistic about the opportunities ahead and excited to welcome new projects into our growing family. The PyTorch Foundation remains deeply committed to driving AI innovation forward, and together, we will continue to build the future of open source artificial intelligence.

    + +

    Stay tuned for more updates, announcements, and opportunities to participate!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pt-korea-user-group-recap/index.html b/blog/pt-korea-user-group-recap/index.html new file mode 100644 index 000000000000..92c64658b660 --- /dev/null +++ b/blog/pt-korea-user-group-recap/index.html @@ -0,0 +1,703 @@ + + + + + + + + + + + + + Recap of the PyTorch Korea User Group Meetup: A Technical Conference with a PyTorch Core Maintainer | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Jiho Kim, PyTorch Korea User Group + +

    +

    At the end of March, the PyTorch Korea User Group hosted a special meetup that brought together prominent speakers for deep discussions on the PyTorch core and its broader ecosystem. With the event more than doubling in size compared to past gatherings, we were able to connect with even more developers and share insights. Huge thanks to goorm for sponsoring the fantastic venue! 😄

    + +

    people at a conference

    + +

    This recap is for those who couldn’t attend in person, as well as for participants who want to revisit the energy and insights of the day. The event featured experts in core PyTorch, AI accelerators, inference optimization, and large language model development. Below is a quick overview of the key sessions that anchored the conference.

    + +

    1️⃣ Jerry Lee | PyTorch Foundation

    + +

    Representing the PyTorch Foundation, part of the Linux Foundation, Jaeung provided an overview of how PyTorch is driving core open source technologies forward. He shared PyTorch’s growth story, the many global projects currently in motion, and the ecosystem’s impressive 20%+ annual growth. The session also covered how the foundation operates, how member organizations are involved, and upcoming plans that are particularly useful for practitioners.

    + +

    people at a conference

    + +

    2️⃣ Alban Desmaison | PyTorch Roadmap

    + +

    Alban shared the design philosophy behind PyTorch and Meta’s official contribution roadmap (link). He provided a deep technical dive into the differences between Eager and Compiled modes, especially breaking down the backend architecture of device Eager execution. Practical tools and improvements were also introduced—such as memory profilers, enhanced custom operator support, and pinned memory optimizations.

    + +

    people at a conference

    + +

    3️⃣ Hongseok Kim | PyTorch on Rebellions AI Accelerators: Status

    + +

    Rebellions is building runtime integration for their proprietary NPU architecture, fully aligned with the structural changes in PyTorch 2.0. This talk introduced the performance and scalability of their upcoming chip, their integration strategy with the PyTorch runtime, and challenges in supporting Eager Mode. Hongseok also previewed their roadmap toward releasing these features within the year.

    + +

    people at a conference

    + +

    4️⃣ Kyujin Cho | Backend.AI: A Unified Platform for All AI Accelerators

    + +

    Backend.AI abstracts and integrates various AI accelerators into a unified workflow. As the diversity of accelerator architectures grows, the need for portability and infrastructure unification becomes even more important. This session showcased features across development and operations—from NPU scheduling and resource allocation to monitoring. Backend.AI currently supports accelerators from NVIDIA, Intel, Tenstorrent, Rebellions, and more.

    + +

    people at a conference

    + +

    5️⃣ Taeho Kim | Optimizing & Deploying Models Across Multiple Chipsets Using NetsPresso

    + +

    This talk focused on the challenges of inference in real-world industrial applications of AI models. As new state-of-the-art models emerge rapidly, there’s a growing need for environments that can quickly validate device compatibility—ideally with one-click ease. NetsPresso is actively working on a static graph representation compatible with PyTorch, offering efficient support for model development, optimization, and testing.

    + +

    people at a conference

    + +

    6️⃣ Jungyeop Lee | The Journey to Reproduce Deepseek-R1

    + +

    Jungyeop took us through his journey of reproducing Deepseek, a large language model—an effort that involved 201 experiments. He shared real-world lessons from training with Korean data, tokenizer modifications, and fine-tuning strategies. His practical insights and next steps were especially valuable for those building or re-implementing large models from scratch.

    + +

    people at a conference

    + +

    7️⃣ Sol Kim | A journey from TCP architecture to production-level LLMs

    + +

    Sol presented an integrated optimization approach to deploying large models using the TCP(Tensor Contraction Processor) architecture, which supports tensor contraction at the hardware level. The talk highlighted optimization techniques built on hardware abstraction layers (HALs) and bottom-up integration strategies with PyTorch—offering a hybrid hardware-software perspective.

    + +

    people at a conference

    + +

    💡 Panel Talk & Q&A 💡

    + +

    The event wrapped up with an engaging panel discussion. Attendees asked sharp questions, and the speakers offered insightful answers. It was a powerful moment that captured the community’s enthusiasm for PyTorch and their hunger for deeper technical understanding.

    + +

    people at a conference

    + +

    Final Thoughts

    + +

    Since our first offline meetup in October 2022, the PyTorch Korea User Group has held five major technical conferences. Each event deepens our appreciation for the scale and depth of the PyTorch ecosystem. With perspectives from users, contributors, and ecosystem builders, the stories we share are only growing—and we’re committed to continuing this journey together.

    + +

    See you at the next conference—with even more exciting talks to come! 🙌

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pt-multidevice-integration/index.html b/blog/pt-multidevice-integration/index.html new file mode 100644 index 000000000000..8f0a1ead8140 --- /dev/null +++ b/blog/pt-multidevice-integration/index.html @@ -0,0 +1,773 @@ + + + + + + + + + + + + + Challenges and Efforts in PyTorch Multi-Device Integration: Compatibility, Portability, and Integration Efficiencies | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Zesheng Zong (Huawei), Jiawei Li (Huawei) | Co-authors: Jiong Gong (Intel), Bartosz Sochacki (Intel), Eikan Wang (Intel) + +

    +

    Introduction

    + +

    As the demand for diverse hardware accelerators grows, the need for a robust and adaptable deep learning framework becomes increasingly critical. While working through this integration, several challenges have surfaced in the PyTorch ecosystem, potentially affecting various hardware vendors. This blog aims to highlight these issues and propose solutions to enhance PyTorch’s adaptability, portability, and resilience across different hardware platforms.

    + +

    Improve Users’ Code Portability via Accelerator Autoloading

    + +

    Currently, users face additional work when running their code on different accelerators. One such task is manually importing modules for out-of-tree devices. This requires users to not only understand the different usage patterns between accelerators but also make their code aware of these differences. If you have projects originally running on GPU/CPU and want to migrate to other accelerators, this can lead to significant work and potential frustration.

    + +

    Examples of extra import:

    + +
    # Case 1: Use HPU
    +import torch
    +import torchvision.models as models
    +import habana_frameworks.torch # <-- extra import
    +model = models.resnet50().eval().to("hpu")
    +input = torch.rand(128, 3, 224, 224).to("hpu")
    +output = model(input)
    +
    +# Case 2: Use torch_npu
    +import torch
    +import torch_npu # <-- extra import
    +print(torch.ones(1, 2, device='npu'))
    +
    + +

    As a high-level machine learning framework, PyTorch’s ability to shield users from device differences is a competitive feature. Accelerator Autoloading allows users to continue using the familiar PyTorch device programming model without explicitly loading or importing device-specific extensions.

    + +

    How does it works?

    + +

    Utilize Python’s plugin architecture to enable automatic loading of device extensions via entry points in the PyTorch package.

    + +

    Python entry points provide a standardized way for Python packages to expose and discover components or plugins within an application. Via a definition in the accelerator package’s setup.py, PyTorch can automatically initialize accelerator modules when import torch is called, which gives users a consistent experience across different backend devices.
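
    Conceptually, the discovery side can be pictured with importlib.metadata; this is a simplified illustration of the mechanism, not PyTorch’s actual autoload code:

    + +
    # Simplified illustration of entry-point based autoloading.
    +from importlib.metadata import entry_points
    +
    +def autoload_device_extensions():
    +    # Find every installed package that registered a 'torch.backends' entry point
    +    # (e.g. torch_npu exposing torch_npu:_autoload) and invoke it.
    +    for ep in entry_points(group="torch.backends"):  # Python 3.10+ selection API
    +        autoload_hook = ep.load()
    +        autoload_hook()
    +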

    + +

    From the device side, the accelerator package only needs to declare the following entry point in its setup.py (using torch_npu as an example):

    + +
    # setup.py: pass this to the setuptools.setup() call
    +entry_points={
    +    'torch.backends': ['torch_npu = torch_npu:_autoload'],
    +}
    +
    + +

    When import torch is invoked, the accelerator module will be loaded automatically. This provides users with a consistent programming experience across out-of-tree devices, eliminating the need to be aware of differences between CUDA, HPU, and NPU.

    + +
    # Case 1: Use HPU 
    +import torch 
    +import torchvision.models as models 
    +model = models.resnet50().eval().to("hpu") 
    +input = torch.rand(128, 3, 224, 224).to("hpu") 
    +output = model(input) 
    +
    +# Case 2: Use torch_npu 
    +import torch 
    +print(torch.ones(1, 2, device='npu'))
    +
    + +

    Device Integration Optimization

    + +

    What is PrivateUse1?

    + +

    In PyTorch, the dispatcher is a crucial component of the framework’s backend that manages how operations are routed to the appropriate device-specific implementation. Dispatch keys are an integral part of this system, serving as identifiers that represent various execution contexts—such as the device (CPU, CUDA, XPU), layout (dense, sparse), and autograd functionality. These keys ensure that operations are directed to the correct implementation.

    + +

    PrivateUse1 is a customizable device dispatch key (similar to CUDA, CPU, XPU, etc.) reserved for out-of-tree devices. It provides developers with a way to extend PyTorch’s functionality without modifying the core framework, allowing for the integration of new devices, hardware accelerators, or other specialized computing environments.
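
    For illustration, an out-of-tree backend typically renames PrivateUse1 to a friendly device name and registers its module. Below is a minimal sketch using the public helpers; the C++ extension that actually registers kernels under PrivateUse1 is assumed and left out:

    + +
    # Sketch: exposing the PrivateUse1 key under a custom device name.
    +import torch
    +
    +# Make torch.device("npu") map onto the PrivateUse1 dispatch key.
    +torch.utils.rename_privateuse1_backend("npu")
    +
    +# Generate convenience methods and properties such as Tensor.npu() and Tensor.is_npu.
    +torch.utils.generate_methods_for_privateuse1_backend()
    +
    +# A real backend would also register its Python device module, e.g.:
    +# torch._register_device_module("npu", my_npu_extension.npu)  # hypothetical module
    +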

    + +

    Why do we need PrivateUse1?

    + +

    Internally, dispatch keys are represented as bit masks; each bit represents whether a certain key is active. This bit mask representation is efficient for quick lookup and combination of keys, but it inherently limits the number of distinct keys (typically to 64 or fewer).

    + +

    The current implementation of BackendComponent dispatch keys in PyTorch has encountered a critical bottleneck, which restricts the addition of new backends and, as a result, limits the expansion of the PyTorch ecosystem.

    + +

    bit diagram

    + +

    In response to this challenge, a series of optimizations have been applied to the PrivateUse1 mechanism to enhance its capacity.

    + +
      +
    • +

      PrivateUse1 integration mechanism

      + +

      Initially reserved as fallback options, PrivateUse1, along with PrivateUse2 and PrivateUse3, was designed to be activated only when existing key resources became scarce.

      + +

      PrivateUse1 is now being developed to match the robustness and versatility of established keys like CUDA and CPU. Achieving this required a deep integration across critical PyTorch modules. This integration wasn’t just a simple switch—it involved significant updates to core components such as AMP (Automatic Mixed Precision), Autograd, Distributed Training, Checkpointing, DataLoader, Optimization, and Quantization, etc.

      +
    • +
    + +

    flow diagram

    + +

    The activation of PrivateUse1 was a massive collaborative effort, culminating in over 100 pull requests aimed at turning it from a placeholder into a fully operational dispatch key.

    + +
      +
    • +

      PrivateUse1 UT/CI Quality Assurance

      + +

      While unit tests are essential for ensuring quality during the development of the PrivateUse1 mechanism, they are not sufficient on their own to prevent new pull requests from inadvertently affecting existing functionality or compatibility of out-of-tree devices.

      + +

      To mitigate this risk, the community has added the pytorch_openreg module to the test suite. This module leverages a CPU backend to simulate interactions with accelerators, creating a controlled environment for rigorous testing. Once implemented, this will enable automatic execution of device-generic test cases whenever relevant code is updated, allowing us to quickly detect and address any potential issues affecting the PrivateUse1 integration mechanism.

      +
    • +
    • +

      Comprehensive Documentation

      + +

      By providing comprehensive and easy-to-understand documentation, we aim to lower the barrier to entry for developers and encourage wider adoption of the PrivateUse1 mechanism in the PyTorch ecosystem. This documentation includes:

      +
        +
      • Step-by-step guides for integrating new backends using PrivateUse1
      • +
      • Clear explanations of PrivateUse1’s functionality and benefits
      • +
      • Code examples and best practices for efficient implementation
      • +
      +
    • +
    + +

    These enhancements aim to improve the robustness and reliability of the PrivateUse1 mechanism, facilitating better integration of new backends and expanding the capabilities of PyTorch.

    + +

    Compatibility Between Upstream and Downstream

    + +

    Device-Generic Unit Tests

    + +

    Most unit tests in PyTorch focus on CPU and CUDA devices, which limits participation from users with other hardware. To address this, there is a plan to modify PyTorch’s unit testing framework to better support non-CUDA devices. This plan includes removing existing device restrictions, implementing dynamic data type loading, and generalizing decorators to accommodate a broader range of devices. Additionally, we aim to enforce the use of universal device code and expand distributed testing to support non-NCCL backends.

    + +

    Through these improvements, we hope to significantly increase test coverage and pass rates for non-CUDA devices, integrating them into PyTorch’s continuous integration process. Initial changes have already been implemented, paving the way for new hardware support and creating a reference template for other devices.
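
    In its simplest form, a device-generic test avoids hard-coding "cuda" and instead iterates over whatever accelerators are present. Below is a small sketch; the "npu" entry assumes an out-of-tree backend such as torch_npu is installed:

    + +
    # Sketch: one test body exercised on every available device.
    +import unittest
    +import torch
    +
    +def available_devices():
    +    devices = ["cpu"]
    +    if torch.cuda.is_available():
    +        devices.append("cuda")
    +    if hasattr(torch, "npu") and torch.npu.is_available():  # out-of-tree backend (assumption)
    +        devices.append("npu")
    +    return devices
    +
    +class TestAddGeneric(unittest.TestCase):
    +    def test_add(self):
    +        for device in available_devices():
    +            a = torch.ones(4, device=device)
    +            b = torch.full((4,), 2.0, device=device)
    +            torch.testing.assert_close((a + b).cpu(), torch.full((4,), 3.0))
    +
    +if __name__ == "__main__":
    +    unittest.main()
    +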

    + +

    Ensuring Robust Device Integration through Automated Testing

    + +

    To uphold the high standards of quality assurance in PyTorch, an independent build repository and daily continuous integration (CI) workflows have been established, focusing on smoke and integration testing.

    + +

    The pytorch-integration-tests repository automates the testing of PyTorch’s device-specific functionalities, ensuring that they operate correctly and efficiently across a variety of hardware platforms (NPUs and other specialized devices). In this repository, we are building a fully automated system that continuously validates PyTorch’s compatibility with different hardware backends.

    + +
      +
    • Automated Integration Tests: Run automated tests across different devices using GitHub Actions. This automation ensures that every change in the codebase is thoroughly tested against multiple hardware platforms, catching potential issues early in the development process.
    • +
    • Reusable Workflows: Workflows in this repository are modular and reusable, which streamlines the testing process. Developers can easily adapt these workflows to new devices or testing scenarios, making the system both flexible and scalable as PyTorch evolves.
    • +
    • Awareness of Out-of-Tree Devices: The repository surfaces the existence and behavior of all out-of-tree devices, keeping the community informed. This visibility minimizes the risk of accidentally breaking downstream functionality and provides fast feedback on changes.
    • +
    + +

    Efforts to enhance multi-device integration are pivotal for its adaptability in the evolving deep learning landscape. These initiatives not only benefit current users but also lower entry barriers for new hardware vendors and developers, fostering innovation in AI and machine learning. As PyTorch continues to evolve, its commitment to flexibility, robustness, and inclusivity positions it as a leading framework capable of meeting the diverse needs of the deep learning community.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-0_4_0-migration-guide/index.html b/blog/pytorch-0_4_0-migration-guide/index.html new file mode 100644 index 000000000000..8037a04c348f --- /dev/null +++ b/blog/pytorch-0_4_0-migration-guide/index.html @@ -0,0 +1,1129 @@ + + + + + + + + + + + + + PyTorch 0.4.0 Migration Guide | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    April 22, 2018

    +

    + PyTorch 0.4.0 Migration Guide +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Welcome to the migration guide for PyTorch 0.4.0. In this release we introduced many exciting new features and critical bug fixes, with the goal of providing users a better and cleaner interface. In this guide, we will cover the most important changes in migrating existing code from previous versions:

    + +
      +
    • Tensors and Variables have merged
    • +
    • Support for 0-dimensional (scalar) Tensors
    • +
    • Deprecation of the volatile flag
    • +
    • dtypes, devices, and Numpy-style Tensor creation functions
    • +
    • Writing device-agnostic code
    • +
    • New edge-case constraints on names of submodules, parameters, and buffers in nn.Module
    • +
    + +

    Merging Tensor and Variable classes

    + +

    torch.Tensor and torch.autograd.Variable are now the same class. More precisely, torch.Tensor is capable of tracking history and behaves like the old Variable; Variable wrapping continues to work as before but returns an object of type torch.Tensor. This means that you don’t need the Variable wrapper everywhere in your code anymore.

    + +

    The type() of a Tensor has changed

    + +

    Note also that the type() of a Tensor no longer reflects the data type. Use isinstance() or x.type() instead:

    + +
    >>> x = torch.DoubleTensor([1, 1, 1])
    +>>> print(type(x))  # was torch.DoubleTensor
    +"<class 'torch.Tensor'>"
    +>>> print(x.type())  # OK: 'torch.DoubleTensor'
    +'torch.DoubleTensor'
    +>>> print(isinstance(x, torch.DoubleTensor))  # OK: True
    +True
    +
    + +

    When does autograd start tracking history now?

    + +

    requires_grad, the central flag for autograd, is now an attribute on Tensors. The same rules previously used for Variables now apply to Tensors; autograd starts tracking history when any input Tensor of an operation has requires_grad=True. For example,

    + +
    >>> x = torch.ones(1)  # create a tensor with requires_grad=False (default)
    +>>> x.requires_grad
    +False
    +>>> y = torch.ones(1)  # another tensor with requires_grad=False
    +>>> z = x + y
    +>>> # both inputs have requires_grad=False. so does the output
    +>>> z.requires_grad
    +False
    +>>> # then autograd won't track this computation. let's verify!
    +>>> z.backward()
    +RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
    +>>>
    +>>> # now create a tensor with requires_grad=True
    +>>> w = torch.ones(1, requires_grad=True)
    +>>> w.requires_grad
    +True
    +>>> # add to the previous result that has require_grad=False
    +>>> total = w + z
    +>>> # the total sum now requires grad!
    +>>> total.requires_grad
    +True
    +>>> # autograd can compute the gradients as well
    +>>> total.backward()
    +>>> w.grad
    +tensor([ 1.])
    +>>> # and no computation is wasted to compute gradients for x, y and z, which don't require grad
    +>>> z.grad == x.grad == y.grad == None
    +True
    +
    + +

    Manipulating requires_grad flag

    + +

    Other than directly setting the attribute, you can change this flag in-place using my_tensor.requires_grad_(), or, as in the above example, at creation time by passing it in as an argument (default is False), e.g.,

    + +
    >>> existing_tensor.requires_grad_()
    +>>> existing_tensor.requires_grad
    +True
    +>>> my_tensor = torch.zeros(3, 4, requires_grad=True)
    +>>> my_tensor.requires_grad
    +True
    +
    + +

    What about .data?

    + +

    .data was the primary way to get the underlying Tensor from a Variable. After this merge, calling y = x.data still has similar semantics. So y will be a Tensor that shares the same data with x, is unrelated to the computation history of x, and has requires_grad=False.

    + +

    However, .data can be unsafe in some cases. Any changes on x.data wouldn’t be tracked by autograd, and the computed gradients would be incorrect if x is needed in a backward pass. A safer alternative is to use x.detach(), which also returns a Tensor that shares data with x and has requires_grad=False, but will have its in-place changes reported by autograd if x is needed in backward.

    + +

    Here is an example of the difference between .data and x.detach() (and why we recommend using detach in general).

    + +

    If you use Tensor.detach(), the gradient computation is guaranteed to be correct.

    + +
    >>> a = torch.tensor([1,2,3.], requires_grad = True)
    +>>> out = a.sigmoid()
    +>>> c = out.detach()
    +>>> c.zero_()
    +tensor([ 0.,  0.,  0.])
    +
    +>>> out  # modified by c.zero_() !!
    +tensor([ 0.,  0.,  0.])
    +
    +>>> out.sum().backward()  # Requires the original value of out, but that was overwritten by c.zero_()
    +RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
    +
    + +

    However, using Tensor.data can be unsafe and can easily result in incorrect gradients when a tensor is required for gradient computation but modified in-place.

    + +
    >>> a = torch.tensor([1,2,3.], requires_grad = True)
    +>>> out = a.sigmoid()
    +>>> c = out.data
    +>>> c.zero_()
    +tensor([ 0.,  0.,  0.])
    +
    +>>> out  # out  was modified by c.zero_()
    +tensor([ 0.,  0.,  0.])
    +
    +>>> out.sum().backward()
    +>>> a.grad  # The result is very, very wrong because `out` changed!
    +tensor([ 0.,  0.,  0.])
    +
    + +

    Support for 0-dimensional (scalar) Tensors

    + +

    Previously, indexing into a Tensor vector (1-dimensional tensor) gave a Python number but indexing into a Variable vector gave (inconsistently!) a vector of size (1,)! Similar behavior existed with reduction functions, e.g. tensor.sum() would return a Python number, but variable.sum() would return a vector of size (1,).

    + +

    Fortunately, this release introduces proper scalar (0-dimensional tensor) support in PyTorch! Scalars can be created using the new torch.tensor function (which will be explained in more detail later; for now just think of it as the PyTorch equivalent of numpy.array). Now you can do things like:

    + +
    >>> torch.tensor(3.1416)         # create a scalar directly
    +tensor(3.1416)
    +>>> torch.tensor(3.1416).size()  # scalar is 0-dimensional
    +torch.Size([])
    +>>> torch.tensor([3]).size()     # compare to a vector of size 1
    +torch.Size([1])
    +>>>
    +>>> vector = torch.arange(2, 6)  # this is a vector
    +>>> vector
    +tensor([ 2.,  3.,  4.,  5.])
    +>>> vector.size()
    +torch.Size([4])
    +>>> vector[3]                    # indexing into a vector gives a scalar
    +tensor(5.)
    +>>> vector[3].item()             # .item() gives the value as a Python number
    +5.0
    +>>> mysum = torch.tensor([2, 3]).sum()
    +>>> mysum
    +tensor(5)
    +>>> mysum.size()
    +torch.Size([])
    +
    + +

    Accumulating losses

    + +

    Consider the widely used pattern total_loss += loss.data[0]. Before 0.4.0, loss was a Variable wrapping a tensor of size (1,), but in 0.4.0 loss is now a scalar and has 0 dimensions. Indexing into a scalar doesn’t make sense (it gives a warning now, but will be a hard error in 0.5.0). Use loss.item() to get the Python number from a scalar.

    + +

    Note that if you don’t convert to a Python number when accumulating losses, you may find increased memory usage in your program. This is because the right-hand-side of the above expression used to be a Python float, while it is now a zero-dim Tensor. The total loss is thus accumulating Tensors and their gradient history, which may keep around large autograd graphs for much longer than necessary.
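
    + +

    As a minimal sketch of the change (assuming model, criterion, and train_loader are defined elsewhere), only the accumulation line needs to be updated:

    + +
    total_loss = 0.0
    +for input, target in train_loader:
    +    loss = criterion(model(input), target)
    +    loss.backward()
    +    # 0.3.1 style: total_loss += loss.data[0]  (warns in 0.4.0, hard error in 0.5.0)
    +    total_loss += loss.item()  # convert the 0-dim loss Tensor to a Python float
    +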

    + +

    Deprecation of volatile flag

    + +

    The volatile flag is now deprecated and has no effect. Previously, any computation that involves a Variable with volatile=True wouldn’t be tracked by autograd. This has now been replaced by a set of more flexible context managers including torch.no_grad(), torch.set_grad_enabled(grad_mode), and others.

    + +
    >>> x = torch.zeros(1, requires_grad=True)
    +>>> with torch.no_grad():
    +...     y = x * 2
    +>>> y.requires_grad
    +False
    +>>>
    +>>> is_train = False
    +>>> with torch.set_grad_enabled(is_train):
    +...     y = x * 2
    +>>> y.requires_grad
    +False
    +>>> torch.set_grad_enabled(True)  # this can also be used as a function
    +>>> y = x * 2
    +>>> y.requires_grad
    +True
    +>>> torch.set_grad_enabled(False)
    +>>> y = x * 2
    +>>> y.requires_grad
    +False
    +
    + +

    dtypes, devices and NumPy-style creation functions

    + +

    In previous versions of PyTorch, we used to specify data type (e.g. float vs double), device type (cpu vs cuda) and layout (dense vs sparse) together as a “tensor type”. For example, torch.cuda.sparse.DoubleTensor was the Tensor type representing the double data type, living on CUDA devices, and with COO sparse tensor layout.

    + +

    In this release, we introduce torch.dtype, torch.device and torch.layout classes to allow better management of these properties via NumPy-style creation functions.

    + +

    torch.dtype

    + +

    Below is a complete list of available torch.dtypes (data types) and their corresponding tensor types.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Datatype torch.dtypeTensor types
    32-bit floating pointtorch.float32 or torch.floattorch.*.FloatTensor
    64-bit floating pointtorch.float64 or torch.doubletorch.*.DoubleTensor
    16-bit floating pointtorch.float16 or torch.halftorch.*.HalfTensor
    8-bit integer (unsigned)torch.uint8torch.*.ByteTensor
    8-bit integer (signed)torch.int8torch.*.CharTensor
    16-bit integer (signed)torch.int16 or torch.shorttorch.*.ShortTensor
    32-bit integer (signed)torch.int32 or torch.inttorch.*.IntTensor
    64-bit integer (signed)torch.int64 or torch.longtorch.*.LongTensor
    + +

    The dtype of a tensor can be accessed via its dtype attribute.

    + +

    torch.device

    + +

    A torch.device contains a device type ('cpu' or 'cuda') and optional device ordinal (id) for the device type. It can be initialized with torch.device('{device_type}') or torch.device('{device_type}:{device_ordinal}').

    + +

    If the device ordinal is not present, this represents the current device for the device type; e.g., torch.device('cuda') is equivalent to torch.device('cuda:X') where X is the result of torch.cuda.current_device().

    + +

    The device of a tensor can be accessed via its device attribute.

    + +

    torch.layout

    + +

    torch.layout represents the data layout of a Tensor. Currently torch.strided (dense tensors, the default) and torch.sparse_coo (sparse tensors with COO format) are supported.

    + +

    The layout of a tensor can be accessed via its layout attribute.
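
    + +

    For example, all three attributes can be inspected directly on a Tensor (outputs shown for a CPU tensor):

    + +
    >>> x = torch.randn(2, 2, dtype=torch.float64)
    +>>> x.dtype
    +torch.float64
    +>>> x.device
    +device(type='cpu')
    +>>> x.layout
    +torch.strided
    +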

    + +

    Creating Tensors

    + +

    Methods that create a Tensor now also take in dtype, device, layout, and requires_grad options to specify the desired attributes on the returned Tensor. For example,

    + +
    >>> device = torch.device("cuda:1")
    +>>> x = torch.randn(3, 3, dtype=torch.float64, device=device)
    +tensor([[-0.6344,  0.8562, -1.2758],
    +        [ 0.8414,  1.7962,  1.0589],
    +        [-0.1369, -1.0462, -0.4373]], dtype=torch.float64, device='cuda:1')
    +>>> x.requires_grad  # default is False
    +False
    +>>> x = torch.zeros(3, requires_grad=True)
    +>>> x.requires_grad
    +True
    +
    + +
    torch.tensor(data, ...)
    + +

    torch.tensor is one of the newly added tensor creation methods. It takes in array-like data of all kinds and copies the contained values into a new Tensor. As mentioned earlier, torch.tensor is the PyTorch equivalent of NumPy’s numpy.array constructor. Unlike the torch.*Tensor methods, you can also create zero-dimensional Tensors (aka scalars) this way (a single Python number is treated as a Size in the torch.*Tensor methods). Moreover, if a dtype argument isn’t given, it will infer the suitable dtype given the data. It is the recommended way to create a tensor from existing data like a Python list. For example,

    + +
    >>> cuda = torch.device("cuda")
    +>>> torch.tensor([[1], [2], [3]], dtype=torch.half, device=cuda)
    +tensor([[ 1],
    +        [ 2],
    +        [ 3]], device='cuda:0')
    +>>> torch.tensor(1)               # scalar
    +tensor(1)
    +>>> torch.tensor([1, 2.3]).dtype  # type inference
    +torch.float32
    +>>> torch.tensor([1, 2]).dtype    # type inference
    +torch.int64
    +
    + +

    We’ve also added more tensor creation methods. Some of them have torch.*_like and/or tensor.new_* variants.

    + +
      +
    • +

      torch.*_like takes in an input Tensor instead of a shape. It returns a Tensor with same attributes as the input Tensor by default unless otherwise specified:

      + +
       >>> x = torch.randn(3, dtype=torch.float64)
      + >>> torch.zeros_like(x)
      + tensor([ 0.,  0.,  0.], dtype=torch.float64)
      + >>> torch.zeros_like(x, dtype=torch.int)
      + tensor([ 0,  0,  0], dtype=torch.int32)
      +
      +
    • +
    • +

      tensor.new_* can also create Tensors with same attributes as tensor, but it always takes in a shape argument:

      + +
       >>> x = torch.randn(3, dtype=torch.float64)
      + >>> x.new_ones(2)
      + tensor([ 1.,  1.], dtype=torch.float64)
      + >>> x.new_ones(4, dtype=torch.int)
      + tensor([ 1,  1,  1,  1], dtype=torch.int32)
      +
      +
    • +
    + +

    To specify the desired shape, you can either use a tuple (e.g., torch.zeros((2, 3))) or variable arguments (e.g., torch.zeros(2, 3)) in most cases.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameReturned Tensortorch.*_like varianttensor.new_* variant
    torch.emptyuninitialized memory
    torch.zerosall zeros
    torch.onesall ones
    torch.fullfilled with a given value
    torch.randi.i.d. continuous Uniform[0, 1) 
    torch.randni.i.d. Normal(0, 1) 
    torch.randinti.i.d. discrete Uniform in given range 
    torch.randpermrandom permutation of {0, 1, ..., n - 1}  
    torch.tensorcopied from existing data (list, NumPy ndarray, etc.) 
    torch.from_numpy*from NumPy ndarray (sharing storage without copying)  
    torch.arange, torch.range, and torch.linspaceuniformly spaced values in a given range  
    torch.logspacelogarithmically spaced values in a given range  
    torch.eyeidentity matrix  
    + +

    *: torch.from_numpy only takes in a NumPy ndarray as its input argument.
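
    + +

    A short illustration of the storage sharing (NumPy must be installed); in-place changes to the ndarray are visible through the Tensor:

    + +
    >>> import numpy as np
    +>>> a = np.ones(3)
    +>>> t = torch.from_numpy(a)  # shares memory with `a`, no copy is made
    +>>> a[0] = 5                 # mutate the ndarray in place
    +>>> t                        # the change is visible through the Tensor
    +tensor([ 5.,  1.,  1.], dtype=torch.float64)
    +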

    + +

    Writing device-agnostic code

    + +

    Previous versions of PyTorch made it difficult to write code that was device agnostic (i.e. that could run on both CUDA-enabled and CPU-only machines without modification).

    + +

    PyTorch 0.4.0 makes this easier in two ways:

    + +
      +
    • The device attribute of a Tensor gives the torch.device for all Tensors (get_device only works for CUDA tensors)
    • +
    • The to method of Tensors and Modules can be used to easily move objects to different devices (instead of having to call cpu() or cuda() based on the context)
    • +
    + +

    We recommend the following pattern:

    + +
    # at beginning of the script
    +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    +
    +...
    +
    +# then whenever you get a new Tensor or Module
    +# this won't copy if they are already on the desired device
    +input = data.to(device)
    +model = MyModule(...).to(device)
    +
    + +

    New edge-case constraints on names of submodules, parameters, and buffers in nn.Module

    + +

    A name that is an empty string or that contains "." is no longer permitted in module.add_module(name, value), module.add_parameter(name, value) or module.add_buffer(name, value), because such names may cause data to be lost in the state_dict. If you are loading a checkpoint for modules containing such names, please update the module definition and patch the state_dict before loading it.
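
    + +

    As a hedged sketch of patching an old checkpoint, assuming the module definition has been updated to rename a submodule from "block.0" to "block_0" (both names, and the file path, are purely illustrative):

    + +
    state_dict = torch.load('old_checkpoint.pth')
    +# rewrite the offending keys to match the updated module definition
    +patched = {k.replace('block.0', 'block_0'): v for k, v in state_dict.items()}
    +model.load_state_dict(patched)
    +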

    + +

    Code Samples (Putting it all together)

    + +

    To get a flavor of the overall recommended changes in 0.4.0, let’s look at a quick example for a common code pattern in both 0.3.1 and 0.4.0:

    + +
      +
    • 0.3.1 (old): +
      model = MyRNN()
      +if use_cuda:
      +    model = model.cuda()
      +
      +# train
      +total_loss = 0
      +for input, target in train_loader:
      +    input, target = Variable(input), Variable(target)
      +    hidden = Variable(torch.zeros(*h_shape))  # init hidden
      +    if use_cuda:
      +        input, target, hidden = input.cuda(), target.cuda(), hidden.cuda()
      +    ...  # get loss and optimize
      +    total_loss += loss.data[0]
      +
      +# evaluate
      +for input, target in test_loader:
      +    input = Variable(input, volatile=True)
      +    if use_cuda:
      +        ...
      +    ...
      +
      +
    • +
    • 0.4.0 (new): +
      # torch.device object used throughout this script
      +device = torch.device("cuda" if use_cuda else "cpu")
      +
      +model = MyRNN().to(device)
      +
      +# train
      +total_loss = 0
      +for input, target in train_loader:
      +    input, target = input.to(device), target.to(device)
      +    hidden = input.new_zeros(*h_shape)  # has the same device & dtype as `input`
      +    ...  # get loss and optimize
      +    total_loss += loss.item()           # get Python number from 1-element Tensor
      +
      +# evaluate
      +with torch.no_grad():                   # operations inside don't track history
      +    for input, target in test_loader:
      +        ...
      +
      +
    • +
    + +

    Thank you for reading! Please refer to our documentation and release notes for more details.

    + +

    Happy PyTorch-ing!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors/index.html b/blog/pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors/index.html new file mode 100644 index 000000000000..e4ac6d2b6039 --- /dev/null +++ b/blog/pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors/index.html @@ -0,0 +1,778 @@ + + + + + + + + + + + + + PyTorch 1.3 adds mobile, privacy, quantization, and named tensors | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    PyTorch continues to gain momentum because of its focus on meeting the needs of researchers, its streamlined workflow for production use, and most of all because of the enthusiastic support it has received from the AI community. PyTorch citations in papers on ArXiv grew 194 percent in the first half of 2019 alone, as noted by O’Reilly, and the number of contributors to the platform has grown more than 50 percent over the last year, to nearly 1,200. Facebook, Microsoft, Uber, and other organizations across industries are increasingly using it as the foundation for their most important machine learning (ML) research and production workloads.

    + +

    We are now advancing the platform further with the release of PyTorch 1.3, which includes experimental support for features such as seamless model deployment to mobile devices, model quantization for better performance at inference time, and front-end improvements, like the ability to name tensors and create clearer code with less need for inline comments. We’re also launching a number of additional tools and libraries to support model interpretability and bringing multimodal research to production.

    + +

    Additionally, we’ve collaborated with Google and Salesforce to add broad support for Cloud Tensor Processing Units, providing a significantly accelerated option for training large-scale deep neural networks. Alibaba Cloud also joins Amazon Web Services, Microsoft Azure, and Google Cloud as supported cloud platforms for PyTorch users. You can get started now at pytorch.org.

    + +

    PyTorch 1.3

    + +

    The 1.3 release of PyTorch brings significant new features, including experimental support for mobile device deployment, eager mode quantization at 8-bit integer, and the ability to name tensors. With each of these enhancements, we look forward to additional contributions and improvements from the PyTorch community.

    + +

    Named tensors (experimental)

    + +

    Cornell University’s Sasha Rush has argued that, despite its ubiquity in deep learning, the traditional implementation of tensors has significant shortcomings, such as exposing private dimensions, broadcasting based on absolute position, and keeping type information in documentation. He proposed named tensors as an alternative approach.

    + +

    Today, we name and access dimensions by comment:

    + +
    # Tensor[N, C, H, W]
    + images = torch.randn(32, 3, 56, 56)
    + images.sum(dim=1)
    + images.select(dim=1, index=0)
    +
    + +

    But naming explicitly leads to more readable and maintainable code:

    + +
    NCHW = ['N', 'C', 'H', 'W']
    +   images = torch.randn(32, 3, 56, 56, names=NCHW)
    +   images.sum('C')
    +   images.select('C', index=0)
    +
    + +

    Quantization (experimental)

    + +

    It’s important to make efficient use of both server-side and on-device compute resources when developing ML applications. To support more efficient deployment on servers and edge devices, PyTorch 1.3 now supports 8-bit model quantization using the familiar eager mode Python API. Quantization refers to techniques used to perform computation and storage at reduced precision, such as 8-bit integer. This currently experimental feature includes support for post-training quantization, dynamic quantization, and quantization-aware training. It leverages the FBGEMM and QNNPACK state-of-the-art quantized kernel back ends, for x86 and ARM CPUs, respectively, which are integrated with PyTorch and now share a common API.
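
    + +

    For example, dynamic quantization of a model’s linear layers takes a single call. This is a minimal sketch; MyModel stands in for your own nn.Module:

    + +
    import torch
    +
    +model = MyModel().eval()  # hypothetical float32 model
    +quantized_model = torch.quantization.quantize_dynamic(
    +    model,              # module to quantize
    +    {torch.nn.Linear},  # layer types whose weights are quantized
    +    dtype=torch.qint8   # 8-bit integer weights
    +)
    +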

    + +

    To learn more about the design and architecture, check out the API docs here, and get started with any of the supported techniques using the tutorials available here.

    + +

    PyTorch mobile (experimental)

    + +

    Running ML on edge devices is growing in importance as applications continue to demand lower latency. It is also a foundational element for privacy-preserving techniques such as federated learning. To enable more efficient on-device ML, PyTorch 1.3 now supports an end-to-end workflow from Python to deployment on iOS and Android.
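
    + +

    The Python side of that workflow is to script or trace the model and serialize it for the mobile runtime, roughly as sketched below (the model choice and file name are illustrative); the saved file is then loaded from the Android or iOS APIs.

    + +
    import torch
    +import torchvision
    +
    +model = torchvision.models.mobilenet_v2(pretrained=True).eval()
    +example = torch.rand(1, 3, 224, 224)
    +traced = torch.jit.trace(model, example)  # produces a TorchScript module
    +traced.save("mobilenet_v2.pt")            # load this file on device
    +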

    + +

    This is an early, experimental release, optimized for end-to-end development. Coming releases will focus on:

    + +
      +
    • Optimization for size: Build level optimization and selective compilation depending on the operators needed for user applications (i.e., you pay binary size for only the operators you need)
    • +
    • Performance: Further improvements to performance and coverage on mobile CPUs and GPUs
    • +
    • High level API: Extend mobile native APIs to cover common preprocessing and integration tasks needed for incorporating ML in mobile applications, e.g., computer vision and NLP
    • +
    + +

    Learn more or get started on Android or iOS here.

    + +

    New tools for model interpretability and privacy

    + +

    Captum

    + +

    As models become ever more complex, it is increasingly important to develop new methods for model interpretability. To help address this need, we’re launching Captum, a tool to help developers working in PyTorch understand why their model generates a specific output. Captum provides state-of-the-art tools to understand how specific neurons and layers affect the predictions made by the models. Captum’s algorithms include integrated gradients, conductance, SmoothGrad and VarGrad, and DeepLift.

    + +

    The example below shows how to apply model interpretability algorithms on a pretrained ResNet model and then visualize the attributions for each pixel by overlaying them on the image.

    + +
    noise_tunnel = NoiseTunnel(integrated_gradients)
    +
    +attributions_ig_nt, delta = noise_tunnel.attribute(input, n_samples=10, nt_type='smoothgrad_sq', target=pred_label_idx)
    +_ = viz.visualize_image_attr_multiple(["original_image", "heat_map"],
    +                                      ["all", "positive"],
    +                                      np.transpose(attributions_ig_nt.squeeze().cpu().detach().numpy(), (1,2,0)),
    +                                      np.transpose(transformed_img.squeeze().cpu().detach().numpy(), (1,2,0)),
    +                                      cmap=default_cmap,
    +                                      show_colorbar=True)
    +
    + +
    + +
    +
    + +
    + +

    Learn more about Captum at captum.ai.

    + +

    CrypTen

    + +

    Practical applications of ML via cloud-based or machine-learning-as-a-service (MLaaS) platforms pose a range of security and privacy challenges. In particular, users of these platforms may not want or be able to share unencrypted data, which prevents them from taking full advantage of ML tools. To address these challenges, the ML community is exploring a number of technical approaches, at various levels of maturity. These include homomorphic encryption, secure multiparty computation, trusted execution environments, on-device computation, and differential privacy.

    + +

    To provide a better understanding of how some of these technologies can be applied, we are releasing CrypTen, a new community-based research platform for taking the field of privacy-preserving ML forward. Learn more about CrypTen here. It is available on GitHub here.

    + +

    Tools for multimodal AI systems

    + +

    Digital content is often made up of several modalities, such as text, images, audio, and video. For example, a single public post might contain an image, body text, a title, a video, and a landing page. Even one particular component may have more than one modality, such as a video that contains both visual and audio signals, or a landing page that is composed of images, text, and HTML sources.

    + +

    The ecosystem of tools and libraries that work with PyTorch offer enhanced ways to address the challenges of building multimodal ML systems. Here are some of the latest libraries launching today:

    + +

    Detectron2

    + +

    Object detection and segmentation are used for tasks ranging from autonomous vehicles to content understanding for platform integrity. To advance this work, Facebook AI Research (FAIR) is releasing Detectron2, an object detection library now implemented in PyTorch. Detectron2 provides support for the latest models and tasks, increased flexibility to aid computer vision research, and improvements in maintainability and scalability to support production use cases.

    + +

    Detectron2 is available here and you can learn more here.

    + +

    Speech extensions to fairseq

    + +

    Language translation and audio processing are critical components in systems and applications such as search, translation, speech, and assistants. There has been tremendous progress in these fields recently thanks to the development of new architectures like transformers, as well as large-scale pretraining methods. We’ve extended fairseq, a framework for sequence-to-sequence applications such as language translation, to include support for end-to-end learning for speech and audio recognition tasks. These extensions to fairseq enable faster exploration and prototyping of new speech research ideas while offering a clear path to production.

    + +

    Get started with fairseq here.

    + +

    Cloud provider and hardware ecosystem support

    + +

    Cloud providers such as Amazon Web Services, Microsoft Azure, and Google Cloud provide extensive support for anyone looking to develop ML on PyTorch and deploy in production. We’re excited to share the general availability of Google Cloud TPU support and a newly launched integration with Alibaba Cloud. We’re also expanding hardware ecosystem support.

    + +
      +
    • Google Cloud TPU support now broadly available. To accelerate the largest-scale machine learning (ML) applications deployed today and enable rapid development of the ML applications of tomorrow, Google created custom silicon chips called Tensor Processing Units (TPUs). When assembled into multi-rack ML supercomputers called Cloud TPU Pods, these TPUs can complete ML workloads in minutes or hours that previously took days or weeks on other systems. Engineers from Facebook, Google, and Salesforce worked together to enable and pilot Cloud TPU support in PyTorch, including experimental support for Cloud TPU Pods. PyTorch support for Cloud TPUs is also available in Colab. Learn more about how to get started with PyTorch on Cloud TPUs here.
    • +
    • Alibaba adds support for PyTorch in Alibaba Cloud. The initial integration involves a one-click solution for PyTorch 1.x, Data Science Workshop notebook service, distributed training with Gloo/NCCL, as well as seamless integration with Alibaba IaaS such as OSS, ODPS, and NAS. Together with the toolchain provided by Alibaba, we look forward to significantly reducing the overhead necessary for adoption, as well as helping Alibaba Cloud’s global customer base leverage PyTorch to develop new AI applications.
    • +
    • ML hardware ecosystem expands. In addition to key GPU and CPU partners, the PyTorch ecosystem has also enabled support for dedicated ML accelerators. Updates from Intel and Habana showcase how PyTorch, connected to the Glow optimizing compiler, enables developers to utilize these market-specific solutions.
    • +
    + +

    Growth in the PyTorch community

    + +

    As an open source, community-driven project, PyTorch benefits from a wide range of contributors bringing new capabilities to the ecosystem. Here are some recent examples:

    + +
      +
    • Mila SpeechBrain aims to provide an open source, all-in-one speech toolkit based on PyTorch. The goal is to develop a single, flexible, user-friendly toolkit that can be used to easily develop state-of-the-art systems for speech recognition (both end to end and HMM-DNN), speaker recognition, speech separation, multi-microphone signal processing (e.g., beamforming), self-supervised learning, and many others. Learn more
    • +
    • SpaCy is a new wrapping library with consistent and easy-to-use interfaces to several models, in order to extract features to power NLP pipelines. Support is provided via spaCy’s standard training API. The library also calculates an alignment so the transformer features can be related back to actual words instead of just wordpieces. Learn more
    • +
    • HuggingFace PyTorch-Transformers (formerly known as pytorch-pretrained-bert) is a library of state-of-the-art pretrained models for Natural Language Processing (NLP). The library currently contains PyTorch implementations, pretrained model weights, usage scripts, and conversion utilities for models such as BERT, GPT-2, RoBERTa, and DistilBERT. It has also grown quickly, with more than 13,000 GitHub stars and a broad set of users. Learn more
    • +
    • PyTorch Lightning is a Keras-like ML library for PyTorch. It leaves core training and validation logic to you and automates the rest. Reproducibility is a crucial requirement for many fields of research, including those based on ML techniques. As the number of research papers submitted to arXiv and conferences skyrockets into the tens of thousands, scaling reproducibility becomes difficult. Learn more.
    • +
    + +

    We recently held the first online Global PyTorch Summer Hackathon, where researchers and developers around the world were invited to build innovative new projects with PyTorch. Nearly 1,500 developers participated, submitting projects ranging from livestock disease detection to AI-powered financial assistants. The winning projects were:

    + +
      +
    • Torchmeta, which provides extensions for PyTorch to simplify the development of meta-learning algorithms in PyTorch. It features a unified interface inspired by TorchVision for both few-shot classification and regression problems, to allow easy benchmarking on multiple data sets to aid with reproducibility.
    • +
    • Open-Unmix, a system for end-to-end music demixing with PyTorch. Demixing separates the individual instruments or vocal track from any stereo recording.
    • +
    • Endless AI-Generated Tees, a store featuring AI-generated T-shirt designs that can be purchased and delivered worldwide. The system uses a state-of-the-art generative model (StyleGAN) that was built with PyTorch and then trained on modern art.
    • +
    + +

    Visit pytorch.org to learn more and get started with PyTorch 1.3 and the latest libraries and ecosystem projects. We look forward to the contributions, exciting research advancements, and real-world applications that the community builds with PyTorch.

    + +

    We’d like to thank the entire PyTorch team and the community for all their contributions to this work.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1-dot-4-released-and-domain-libraries-updated/index.html b/blog/pytorch-1-dot-4-released-and-domain-libraries-updated/index.html new file mode 100644 index 000000000000..31eb0c6173a4 --- /dev/null +++ b/blog/pytorch-1-dot-4-released-and-domain-libraries-updated/index.html @@ -0,0 +1,757 @@ + + + + + + + + + + + + + PyTorch 1.4 released, domain libraries updated | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we’re announcing the availability of PyTorch 1.4, along with updates to the PyTorch domain libraries. These releases build on top of the announcements from NeurIPS 2019, where we shared the availability of PyTorch Elastic, a new classification framework for image and video, and the addition of Preferred Networks to the PyTorch community. For those that attended the workshops at NeurIPS, the content can be found here.

    + +

    PyTorch 1.4

    + +

    The 1.4 release of PyTorch adds new capabilities, including the ability to do fine grain build level customization for PyTorch Mobile, and new experimental features including support for model parallel training and Java language bindings.

    + +

    PyTorch Mobile - Build level customization

    + +

    Following the open sourcing of PyTorch Mobile in the 1.3 release, PyTorch 1.4 adds additional mobile support including the ability to customize build scripts at a fine-grain level. This allows mobile developers to optimize library size by only including the operators used by their models and, in the process, reduce their on device footprint significantly. Initial results show that, for example, a customized MobileNetV2 is 40% to 50% smaller than the prebuilt PyTorch mobile library. You can learn more here about how to create your own custom builds and, as always, please engage with the community on the PyTorch forums to provide any feedback you have.

    + +

    Example code snippet for selectively compiling only the operators needed for MobileNetV2:

    + +
    # Dump list of operators used by MobileNetV2:
    +import torch, yaml
    +model = torch.jit.load('MobileNetV2.pt')
    +ops = torch.jit.export_opnames(model)
    +with open('MobileNetV2.yaml', 'w') as output:
    +    yaml.dump(ops, output)
    +
    + +
    # Build PyTorch Android library customized for MobileNetV2:
    +SELECTED_OP_LIST=MobileNetV2.yaml scripts/build_pytorch_android.sh arm64-v8a
    +
    +# Build PyTorch iOS library customized for MobileNetV2:
    +SELECTED_OP_LIST=MobileNetV2.yaml BUILD_PYTORCH_MOBILE=1 IOS_ARCH=arm64 scripts/build_ios.sh
    +
    + +

    Distributed model parallel training (Experimental)

    + +

    With the scale of models, such as RoBERTa, continuing to increase into the billions of parameters, model parallel training has become ever more important to help researchers push the limits. This release provides a distributed RPC framework to support distributed model parallel training. It allows for running functions remotely and referencing remote objects without copying the real data around, and provides autograd and optimizer APIs to transparently run backwards and update parameters across RPC boundaries.
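
    + +

    A minimal sketch of the RPC primitives is shown below; the worker names, world size, and the remote function are illustrative, and each worker runs in its own process with a matching init_rpc call.

    + +
    import torch
    +import torch.distributed.rpc as rpc
    +
    +rpc.init_rpc("worker0", rank=0, world_size=2)  # "worker1" does the same with rank=1
    +
    +# run a function remotely and block on the result
    +result = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2), torch.ones(2)))
    +
    +# or hold a remote reference (RRef) without copying the value back
    +rref = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1))
    +print(rref.to_here())  # fetch the value only when it is actually needed
    +
    +rpc.shutdown()
    +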

    + +

    To learn more about the APIs and the design of this feature, see the links below:

    + + + +

    For the full tutorials, see the links below:

    + + + +

    As always, you can connect with community members and discuss more on the forums.

    + +

    Java bindings (Experimental)

    + +

    In addition to supporting Python and C++, this release adds experimental support for Java bindings. Based on the interface developed for Android in PyTorch Mobile, the new bindings allow you to invoke TorchScript models from any Java program. Note that the Java bindings are only available for Linux for this release, and for inference only. We expect support to expand in subsequent releases. See the code snippet below for how to use PyTorch within Java:

    + +
    Module mod = Module.load("demo-model.pt1");
    +Tensor data =
    +    Tensor.fromBlob(
    +        new int[] {1, 2, 3, 4, 5, 6}, // data
    +        new long[] {2, 3} // shape
    +        );
    +IValue result = mod.forward(IValue.from(data), IValue.from(3.0));
    +Tensor output = result.toTensor();
    +System.out.println("shape: " + Arrays.toString(output.shape()));
    +System.out.println("data: " + Arrays.toString(output.getDataAsFloatArray()));
    +
    + +

    Learn more about how to use PyTorch from Java here, and see the full Javadocs API documentation here.

    + +

    For the full 1.4 release notes, see here.

    + +

    Domain Libraries

    + +

    PyTorch domain libraries like torchvision, torchtext, and torchaudio complement PyTorch with common datasets, models, and transforms. We’re excited to share new releases for all three domain libraries alongside the PyTorch 1.4 core release.

    + +

    torchvision 0.5

    + +

    The improvements to torchvision 0.5 mainly focus on adding support for production deployment including quantization, TorchScript, and ONNX. Some of the highlights include:

    + +
      +
    • All models in torchvision are now torchscriptable, making them easier to ship into non-Python production environments
    • +
    • ResNets, MobileNet, ShuffleNet, GoogleNet and InceptionV3 now have quantized counterparts with pre-trained models, and also include scripts for quantization-aware training.
    • +
    • In partnership with the Microsoft team, we’ve added ONNX support for all models including Mask R-CNN.
    • +
    + +
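
    For example, a pre-trained quantized classification model can be loaded and scripted in a few lines (a sketch; the input below is random):

    + +
    import torch
    +import torchvision
    +
    +model = torchvision.models.quantization.resnet18(pretrained=True, quantize=True).eval()
    +scripted = torch.jit.script(model)  # TorchScript, ready for non-Python deployment
    +
    +scores = scripted(torch.rand(1, 3, 224, 224))
    +
    + +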

    Learn more about torchvision 0.5 here.

    + +

    torchaudio 0.4

    + +

    Improvements in torchaudio 0.4 focus on enhancing the currently available transformations, datasets, and backend support. Highlights include:

    + +
      +
    • SoX is now optional, and a new extensible backend dispatch mechanism exposes SoundFile as an alternative to SoX.
    • +
    • The interface for datasets has been unified. This enables the addition of two large datasets: LibriSpeech and Common Voice.
    • +
    • New filters such as biquad, data augmentation such as time and frequency masking, transforms such as MFCC, gain and dither, and new feature computation such as deltas, are now available.
    • +
    • Transformations now support batches and are jitable.
    • +
    • An interactive speech recognition demo with voice activity detection is available for experimentation.
    • +
    + +
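
    A small sketch of the new transforms; random noise stands in for real audio here:

    + +
    import torch
    +import torchaudio
    +
    +waveform = torch.randn(1, 16000)  # one second of fake audio at 16 kHz
    +
    +# MFCC features
    +mfcc = torchaudio.transforms.MFCC(sample_rate=16000)(waveform)
    +
    +# SpecAugment-style masking applied to a spectrogram
    +specgram = torchaudio.transforms.Spectrogram()(waveform)
    +masked = torchaudio.transforms.FrequencyMasking(freq_mask_param=30)(specgram)
    +masked = torchaudio.transforms.TimeMasking(time_mask_param=100)(masked)
    +
    + +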

    Learn more about torchaudio 0.4 here.

    + +

    torchtext 0.5

    + +

    torchtext 0.5 focuses mainly on improvements to the dataset loader APIs, including compatibility with core PyTorch APIs, but also adds support for unsupervised text tokenization. Highlights include:

    + +
      +
    • Added bindings for SentencePiece for unsupervised text tokenization.
    • +
    • Added a new unsupervised learning dataset - enwik9.
    • +
    • Made revisions to PennTreebank, WikiText103, WikiText2, IMDb to make them compatible with torch.utils.data. Those datasets are in an experimental folder and we welcome your feedback.
    • +
    + +

    Learn more about torchtext 0.5 here.

    + +

    We’d like to thank the entire PyTorch team and the community for all their contributions to this work.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1-dot-5-released-with-new-and-updated-apis/index.html b/blog/pytorch-1-dot-5-released-with-new-and-updated-apis/index.html new file mode 100644 index 000000000000..f2a628bcd3b2 --- /dev/null +++ b/blog/pytorch-1-dot-5-released-with-new-and-updated-apis/index.html @@ -0,0 +1,735 @@ + + + + + + + + + + + + + PyTorch 1.5 released, new and updated APIs including C++ frontend API parity with Python | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we’re announcing the availability of PyTorch 1.5, along with new and updated libraries. This release includes several major new API additions and improvements. PyTorch now includes a significant update to the C++ frontend, ‘channels last’ memory format for computer vision models, and a stable release of the distributed RPC framework used for model-parallel training. The release also has new APIs for autograd for hessians and jacobians, and an API that allows the creation of Custom C++ Classes that was inspired by pybind.

    + +

    You can find the detailed release notes here.

    + +

    C++ Frontend API (Stable)

    + +

    The C++ frontend API is now at parity with Python, and the features overall have been moved to ‘stable’ (previously tagged as experimental). Some of the major highlights include:

    + +
      +
    • Now with ~100% coverage and docs for C++ torch::nn module/functional, users can easily translate their model from Python API to C++ API, making the model authoring experience much smoother.
    • +
    • Optimizers in C++ had deviated from the Python equivalent: C++ optimizers can’t take parameter groups as input while the Python ones can. Additionally, step function implementations were not exactly the same. With the 1.5 release, C++ optimizers will always behave the same as the Python equivalent.
    • +
    • The lack of tensor multi-dim indexing API in C++ is a well-known issue and had resulted in many posts in PyTorch Github issue tracker and forum. The previous workaround was to use a combination of narrow / select / index_select / masked_select, which was clunky and error-prone compared to the Python API’s elegant tensor[:, 0, ..., mask] syntax. With the 1.5 release, users can use tensor.index({Slice(), 0, "...", mask}) to achieve the same purpose.
    • +
    + +

    ‘Channels last’ memory format for Computer Vision models (Experimental)

    + +

    ‘Channels last’ memory layout unlocks ability to use performance efficient convolution algorithms and hardware (NVIDIA’s Tensor Cores, FBGEMM, QNNPACK). Additionally, it is designed to automatically propagate through the operators, which allows easy switching between memory layouts.
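
    + +

    A short sketch of opting a tensor and a model into the channels last format (shapes are illustrative):

    + +
    import torch
    +import torch.nn as nn
    +
    +x = torch.randn(8, 3, 224, 224)              # logical NCHW shape
    +x = x.to(memory_format=torch.channels_last)  # strides now follow NHWC
    +print(x.is_contiguous(memory_format=torch.channels_last))  # True
    +
    +conv = nn.Conv2d(3, 16, kernel_size=3).to(memory_format=torch.channels_last)
    +y = conv(x)  # the output propagates the channels last format
    +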

    + +

    Learn more here on how to write memory format aware operators.

    + +

    Custom C++ Classes (Experimental)

    + +

    This release adds a new API, torch::class_, for binding custom C++ classes into TorchScript and Python simultaneously. This API is almost identical in syntax to pybind11. It allows users to expose their C++ class and its methods to the TorchScript type system and runtime system such that they can instantiate and manipulate arbitrary C++ objects from TorchScript and Python. An example C++ binding:

    + +
    template <class T>
    +struct MyStackClass : torch::CustomClassHolder {
    +  std::vector<T> stack_;
    +  MyStackClass(std::vector<T> init) : stack_(std::move(init)) {}
    +
    +  void push(T x) {
    +    stack_.push_back(x);
    +  }
    +  T pop() {
    +    auto val = stack_.back();
    +    stack_.pop_back();
    +    return val;
    +  }
    +};
    +
    +static auto testStack =
    +  torch::class_<MyStackClass<std::string>>("myclasses", "MyStackClass")
    +      .def(torch::init<std::vector<std::string>>())
    +      .def("push", &MyStackClass<std::string>::push)
    +      .def("pop", &MyStackClass<std::string>::pop)
    +      .def("size", [](const c10::intrusive_ptr<MyStackClass>& self) {
    +        return self->stack_.size();
    +      });
    +
    + +

    Which exposes a class you can use in Python and TorchScript like so:

    + +
    @torch.jit.script
    +def do_stacks(s : torch.classes.myclasses.MyStackClass):
    +    s2 = torch.classes.myclasses.MyStackClass(["hi", "mom"])
    +    print(s2.pop()) # "mom"
    +    s2.push("foobar")
    +    return s2 # ["hi", "foobar"]
    +
    + +

    You can try it out in the tutorial here.

    + +

    Distributed RPC framework APIs (Now Stable)

    + +

    The Distributed RPC framework was launched as experimental in the 1.4 release and the proposal is to mark Distributed RPC framework as stable and no longer experimental. This work involves a lot of enhancements and bug fixes to make the distributed RPC framework more reliable and robust overall, as well as adding a couple of new features, including profiling support, using TorchScript functions in RPC, and several enhancements for ease of use. Below is an overview of the various APIs within the framework:

    + +

    RPC API

    +

    The RPC API allows users to specify functions to run and objects to be instantiated on remote nodes. These functions are transparently recorded so that gradients can backpropagate through remote nodes using Distributed Autograd.

    + +

    Distributed Autograd

    +

    Distributed Autograd connects the autograd graph across several nodes and allows gradients to flow through during the backwards pass. Gradients are accumulated into a context (as opposed to the .grad field as with Autograd) and users must specify their model’s forward pass under a with dist_autograd.context() manager in order to ensure that all RPC communication is recorded properly. Currently, only FAST mode is implemented (see here for the difference between FAST and SMART modes).

    + +

    Distributed Optimizer

    +

    The distributed optimizer creates RRefs to optimizers on each worker with parameters that require gradients, and then uses the RPC API to run the optimizer remotely. The user must collect all remote parameters and wrap them in an RRef, as this is required input to the distributed optimizer. The user must also specify the distributed autograd context_id so that the optimizer knows in which context to look for gradients.
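
    + +

    Putting the three pieces together, a training step looks roughly like the sketch below; remote_param_rrefs and forward_pass are placeholders for the parameter RRefs collected from remote workers and for a forward pass that spans RPC calls.

    + +
    import torch.distributed.autograd as dist_autograd
    +from torch.distributed.optim import DistributedOptimizer
    +from torch.optim import SGD
    +
    +dist_optim = DistributedOptimizer(SGD, remote_param_rrefs, lr=0.05)
    +
    +with dist_autograd.context() as context_id:
    +    loss = forward_pass()                       # hypothetical forward pass over RPC
    +    dist_autograd.backward(context_id, [loss])  # gradients accumulate into this context
    +    dist_optim.step(context_id)                 # the optimizer runs remotely via RPC
    +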

    + +

    Learn more about distributed RPC framework APIs here.

    + +

    New High level autograd API (Experimental)

    + +

    PyTorch 1.5 brings new functions including jacobian, hessian, jvp, vjp, hvp and vhp to the torch.autograd.functional submodule. This feature builds on the current API and allows the user to easily perform these functions.
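
    + +

    For example, with an arbitrary scalar-valued function:

    + +
    import torch
    +from torch.autograd.functional import jacobian, hessian
    +
    +def f(x):
    +    return (x ** 3).sum()
    +
    +x = torch.randn(3)
    +print(jacobian(f, x))  # equals 3 * x**2
    +print(hessian(f, x))   # 3x3 matrix with 6 * x on the diagonal
    +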

    + +

    Detailed design discussion on GitHub can be found here.

    + +

    Python 2 no longer supported

    + +

    Starting with PyTorch 1.5.0, we will no longer support Python 2, specifically version 2.7. Going forward, support for Python will be limited to Python 3, specifically Python 3.5, 3.6, 3.7 and 3.8 (first enabled in PyTorch 1.4.0).

    + +

    We’d like to thank the entire PyTorch team and the community for all their contributions to this work.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1.10-new-library-releases/index.html b/blog/pytorch-1.10-new-library-releases/index.html new file mode 100644 index 000000000000..55bee430860b --- /dev/null +++ b/blog/pytorch-1.10-new-library-releases/index.html @@ -0,0 +1,845 @@ + + + + + + + + + + + + + New Library Releases in PyTorch 1.10, including TorchX, TorchAudio, TorchVision | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we are announcing a number of new features and improvements to PyTorch libraries, alongside the PyTorch 1.10 release.

    + +

    Some highlights include:

    + +
      +
    • TorchX - a new SDK for quickly building and deploying ML applications from research & development to production.
    • +
    • TorchAudio - Added text-to-speech pipeline, self-supervised model support, multi-channel support and MVDR beamforming module, RNN transducer (RNNT) loss function, and batch and filterbank support to lfilter function. See the TorchAudio release notes here.
    • +
    • TorchVision - Added new RegNet and EfficientNet models, FX based feature extraction added to utilities, two new Automatic Augmentation techniques: Rand Augment and Trivial Augment, and updated training recipes. See the TorchVision release notes here.
    • +
    + +

    Introducing TorchX

    +

    TorchX is a new SDK for quickly building and deploying ML applications from research & development to production. It offers various builtin components that encode MLOps best practices and make advanced features like distributed training and hyperparameter optimization accessible to all.

    + +

    Users can get started with TorchX 0.1 with no added setup cost since it supports popular ML schedulers and pipeline orchestrators that are already widely adopted and deployed in production. No two production environments are the same. To comply with various use cases, TorchX’s core APIs allow tons of customization at well-defined extension points so that even the most unique applications can be serviced without customizing the whole vertical stack.

    + +

    Read the documentation for more details and try out this feature using this quickstart tutorial.

    + +

    TorchAudio 0.10

    + +

    [Beta] Text-to-speech pipeline

    +

    TorchAudio now adds the Tacotron2 model and pretrained weights. It is now possible to build a text-to-speech pipeline with existing vocoder implementations like WaveRNN and Griffin-Lim. Building a TTS pipeline requires matching data processing and pretrained weights, which are often non-trivial to users. So TorchAudio introduces a bundle API so that constructing pipelines for specific pretrained weights is easy. The following example illustrates this.

    + +
    >>> import torchaudio
    +>>>
    +>>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
    +>>>
    +>>> # Build text processor, Tacotron2 and vocoder (WaveRNN) model
    +>>> processor = bundle.get_text_processor()
    +>>> tacotron2 = bundle.get_tacotron2()
    +Downloading:
    +100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s]
    +>>> vocoder = bundle.get_vocoder()
    +Downloading:
    +100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s]
    +>>>
    +>>> text = "Hello World!"
    +>>>
    +>>> # Encode text
    +>>> input, lengths = processor(text)
    +>>>
    +>>> # Generate (mel-scale) spectrogram
    +>>> specgram, lengths, _ = tacotron2.infer(input, lengths)
    +>>>
    +>>> # Convert spectrogram to waveform
    +>>> waveforms, lengths = vocoder(specgram, lengths)
    +>>>
    +>>> # Save audio
    +>>> torchaudio.save('hello-world.wav', waveforms, vocoder.sample_rate)
    +
    +
    + +

    For the details of this API please refer to the documentation. You can also try this from the tutorial.

    + +

    (Beta) Self-Supervised Model Support

    +

TorchAudio added the HuBERT model architecture and pre-trained weight support for wav2vec 2.0 and HuBERT. HuBERT and wav2vec 2.0 are novel approaches to audio representation learning and they yield high accuracy when fine-tuned on downstream tasks. These models can serve as baselines in future research; therefore, TorchAudio provides a simple way to run them. Similar to the TTS pipeline, the pretrained weights and associated information, such as expected sample rates and output class labels (for fine-tuned weights), are put together as a bundle, so that they can be used to build pipelines. The following example illustrates this.

    + +
    >>> import torchaudio
    +>>>
    +>>> bundle = torchaudio.pipelines.HUBERT_ASR_LARGE
    +>>>
    +>>> # Build the model and load pretrained weight.
    +>>> model = bundle.get_model()
    +Downloading:
    +100%|███████████████████████████████| 1.18G/1.18G [00:17<00:00, 73.8MB/s]
    +>>> # Check the corresponding labels of the output.
    +>>> labels = bundle.get_labels()
    +>>> print(labels)
    +('<s>', '<pad>', '</s>', '<unk>', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
    +>>>
    +>>> # Infer the label probability distribution
+>>> waveform, sample_rate = torchaudio.load('hello-world.wav')
    +>>>
    +>>> emissions, _ = model(waveform)
    +>>>
    +>>> # Pass emission to (hypothetical) decoder
    +>>> transcripts = ctc_decode(emissions, labels)
    +>>> print(transcripts[0])
    +HELLO WORLD
    +
    +
    + +

    Please refer to the documentation for more details and try out this feature using this tutorial.

    + +

    (Beta) Multi-channel support and MVDR beamforming

    +

Far-field speech recognition is a more challenging task compared to near-field recognition. Multi-channel methods such as beamforming help reduce noise and enhance the target speech.

    + +

TorchAudio now adds support for differentiable Minimum Variance Distortionless Response (MVDR) beamforming on multi-channel audio using Time-Frequency masks. Researchers can easily assemble it with any multi-channel ASR pipeline. Three solutions are supported (ref_channel, stv_evd, stv_power), and both single-channel and multi-channel masks can be used (multi-channel masks are averaged within the method). An online option recursively updates the parameters for streaming audio. We also provide a tutorial on how to apply MVDR beamforming to multi-channel audio in the example directory.

    + +
    >>> from torchaudio.transforms import MVDR, Spectrogram, InverseSpectrogram
    +>>>
    +>>> # Load the multi-channel noisy audio
    +>>> waveform_mix, sr = torchaudio.load('mix.wav')
    +>>> # Initialize the stft and istft modules
    +>>> stft = Spectrogram(n_fft=1024, hop_length=256, return_complex=True, power=None)
    +>>> istft = InverseSpectrogram(n_fft=1024, hop_length=256)
    +>>> # Get the noisy spectrogram
    +>>> specgram_mix = stft(waveform_mix)
    +>>> # Get the Time-Frequency mask via machine learning models
+>>> mask = model(waveform_mix)
+>>> # Initialize the MVDR module
+>>> mvdr = MVDR(ref_channel=0, solution="ref_channel", multi_mask=False)
+>>> # Apply MVDR beamforming
+>>> specgram_enhanced = mvdr(specgram_mix, mask)
+>>> # Get the enhanced waveform via iSTFT
+>>> waveform_enhanced = istft(specgram_enhanced, length=waveform_mix.shape[-1])
    +
    +

    Please refer to the documentation for more details and try out this feature using the MVDR tutorial.

    + +

    (Beta) RNN Transducer Loss

    +

    The RNN transducer (RNNT) loss is part of the RNN transducer pipeline, which is a popular architecture for speech recognition tasks. Recently it has gotten attention for being used in a streaming setting, and has also achieved state-of-the-art WER for the LibriSpeech benchmark.

    + +

TorchAudio’s loss function supports float16 and float32 logits, has autograd and torchscript support, and can be run on both CPU and GPU; the GPU path uses a custom CUDA kernel implementation for improved performance. The implementation is consistent with the original loss function in Sequence Transduction with Recurrent Neural Networks, but relies on code from Alignment Restricted Streaming Recurrent Neural Network Transducer. Special thanks to Jay Mahadeokar and Ching-Feng Yeh for their code contributions and guidance.

    + +
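As a minimal, hedged sketch of how the loss can be invoked (the shapes and the blank index below are made-up toy values; logits are laid out as batch x input frames x target length + 1 x classes):

import torch
import torchaudio

# Toy dimensions: batch of 2, 10 encoder frames, target length 5, 20 symbols (blank = 0).
logits = torch.randn(2, 10, 6, 20, dtype=torch.float32, requires_grad=True)
targets = torch.randint(1, 20, (2, 5), dtype=torch.int32)
logit_lengths = torch.tensor([10, 8], dtype=torch.int32)
target_lengths = torch.tensor([5, 3], dtype=torch.int32)

rnnt_loss = torchaudio.transforms.RNNTLoss(blank=0)
loss = rnnt_loss(logits, targets, logit_lengths, target_lengths)
loss.backward()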

    Please refer to the documentation for more details.

    + +

    (Beta) Batch support and filter bank support

    +

    torchaudio.functional.lfilter now supports batch processing and multiple filters.
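A small sketch of batched filtering with a single filter is shown below (the coefficients are arbitrary placeholders); the new filterbank support additionally accepts 2-D coefficient tensors with one row per filter, and the exact output layout for that case is best checked against the lfilter documentation.

import torch
import torchaudio.functional as F

# A batch of 4 two-channel waveforms, 8000 samples each.
waveform = torch.randn(4, 2, 8000)

# Simple first-order IIR filter coefficients (a: denominator, b: numerator).
a_coeffs = torch.tensor([1.0, -0.95])
b_coeffs = torch.tensor([0.05, 0.0])

# The filter is applied along the last (time) dimension for every
# element of the leading batch/channel dimensions.
filtered = F.lfilter(waveform, a_coeffs, b_coeffs)
print(filtered.shape)  # torch.Size([4, 2, 8000])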

    + +

    (Prototype) Emformer Module

    +

    Automatic speech recognition (ASR) research and productization have increasingly focused on on-device applications. Towards supporting such efforts, TorchAudio now includes Emformer, a memory-efficient transformer architecture that has achieved state-of-the-art results on LibriSpeech in low-latency streaming scenarios, as a prototype feature.

    + +

    Please refer to the documentation for more details.

    + +

    GPU Build

    +

    GPU builds that support custom CUDA kernels in TorchAudio, like the one being used for RNN transducer loss, have been added. Following this change, TorchAudio’s binary distribution now includes CPU-only versions and CUDA-enabled versions. To use CUDA-enabled binaries, PyTorch also needs to be compatible with CUDA.

    + +

    TorchVision 0.11

    + +

    (Stable) New Models

    +

    RegNet and EfficientNet are two popular architectures that can be scaled to different computational budgets. In this release we include 22 pre-trained weights for their classification variants. The models were trained on ImageNet and the accuracies of the pre-trained models obtained on ImageNet val can be found below (see #4403, #4530 and #4293 for more details).

    + +

    The models can be used as follows:

    + +
    import torch
    +from torchvision import models
    +
    +x = torch.rand(1, 3, 224, 224)
    +
    +regnet = models.regnet_y_400mf(pretrained=True)
    +regnet.eval()
    +predictions = regnet(x)
    +
    +efficientnet = models.efficientnet_b0(pretrained=True)
    +efficientnet.eval()
    +predictions = efficientnet(x)
    +
    +

    See the full list of new models on the torchvision.models documentation page.

    + +

    We would like to thank Ross Wightman and Luke Melas-Kyriazi for contributing the weights of the EfficientNet variants.

    + +

    (Beta) FX-based Feature Extraction

    +

    A new Feature Extraction method has been added to our utilities. It uses torch.fx and enables us to retrieve the outputs of intermediate layers of a network which is useful for feature extraction and visualization.

    + +

    Here is an example of how to use the new utility:

    + +
    import torch
    +from torchvision.models import resnet50
    +from torchvision.models.feature_extraction import create_feature_extractor
    +
    +
    +x = torch.rand(1, 3, 224, 224)
    +
    +model = resnet50()
    +
    +return_nodes = {
    +"layer4.2.relu_2": "layer4"
    +}
    +model2 = create_feature_extractor(model, return_nodes=return_nodes)
    +intermediate_outputs = model2(x)
    +
    +print(intermediate_outputs['layer4'].shape)
    +
    +

    We would like to thank Alexander Soare for developing this utility.

    + +

    (Stable) New Data Augmentations

    +

Two new Automatic Augmentation techniques were added: RandAugment and Trivial Augment. They apply a series of transformations on the original data to enhance them and to boost the performance of the models. The new techniques build on top of the previously added AutoAugment and focus on simplifying the approach, reducing the search space for the optimal policy, and improving the performance gain in terms of accuracy. These techniques enable users to reproduce recipes to achieve state-of-the-art performance on the offered models. Additionally, they enable users to apply these techniques in order to do transfer learning and achieve optimal accuracy on new datasets.

    + +

Both methods can be used as drop-in replacements for the AutoAugment technique as seen below:

    + +
    from torchvision import transforms
    +
    +t = transforms.RandAugment()
    +# t = transforms.TrivialAugmentWide()
    +transformed = t(image)
    +
    +transform = transforms.Compose([
    +transforms.Resize(256),
    +transforms.RandAugment(), # transforms.TrivialAugmentWide()
    +transforms.ToTensor()])
    +
    +

    Read the automatic augmentation transforms for more details.

    + +

    We would like to thank Samuel G. Müller for contributing to Trivial Augment and for his help on refactoring the AA package.

    + +

    Updated Training Recipes

    +

    We have updated our training reference scripts to add support for Exponential Moving Average, Label Smoothing, Learning-Rate Warmup, Mixup, Cutmix and other SOTA primitives. The above enabled us to improve the classification Acc@1 of some pre-trained models by over 4 points. A major update of the existing pre-trained weights is expected in the next release.

    + +
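The updated recipes themselves live in the reference scripts, but the building blocks are mostly core PyTorch primitives. Below is a rough, hedged sketch of how label smoothing, learning-rate warmup and a weight EMA could be wired up; all hyper-parameters and the tiny model are made-up placeholders, not the values used for the released weights.

import torch
import torch.nn as nn

model = nn.Linear(128, 100)
optimizer = torch.optim.SGD(model.parameters(), lr=0.5, momentum=0.9)

# Label smoothing is built into the core loss (PyTorch >= 1.10).
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Linear warmup for 5 epochs, then cosine decay.
warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=5)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=95)
scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, [warmup, cosine], milestones=[5])

# Exponential Moving Average of the weights via AveragedModel.
ema = torch.optim.swa_utils.AveragedModel(
    model, avg_fn=lambda avg, new, num: 0.999 * avg + 0.001 * new
)

for epoch in range(100):
    loss = criterion(model(torch.randn(32, 128)), torch.randint(0, 100, (32,)))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ema.update_parameters(model)
    scheduler.step()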

    Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube and LinkedIn.

    + +

Cheers!
Team PyTorch

    + +
diff --git a/blog/pytorch-1.10-released/index.html b/blog/pytorch-1.10-released/index.html
new file mode 100644
index 000000000000..3a6c8e025868
--- /dev/null
+++ b/blog/pytorch-1.10-released/index.html
@@ -0,0 +1,736 @@

PyTorch 1.10 Release, including CUDA Graphs APIs, Frontend and Compiler Improvements | PyTorch
by Team PyTorch

    We are excited to announce the release of PyTorch 1.10. This release is composed of over 3,400 commits since 1.9, made by 426 contributors. We want to sincerely thank our community for continuously improving PyTorch.

    + +

    PyTorch 1.10 updates are focused on improving training and performance of PyTorch, and developer usability. The full release notes are available here. Highlights include:

    +
      +
    1. CUDA Graphs APIs are integrated to reduce CPU overheads for CUDA workloads.
    2. +
    3. Several frontend APIs such as FX, torch.special, and nn.Module Parametrization, have moved from beta to stable.
    4. +
    5. Support for automatic fusion in JIT Compiler expands to CPUs in addition to GPUs.
    6. +
    7. Android NNAPI support is now available in beta.
    8. +
    + +

    Along with 1.10, we are also releasing major updates to the PyTorch libraries, which you can read about in this blog post.

    + +

    Frontend APIs

    + +

    (Stable) Python code transformations with FX

    + +

    FX provides a Pythonic platform for transforming and lowering PyTorch programs. It is a toolkit for pass writers to facilitate Python-to-Python transformation of functions and nn.Module instances. This toolkit aims to support a subset of Python language semantics—rather than the whole Python language—to facilitate ease of implementation of transforms. With 1.10, FX is moving to stable.

    + +
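As a small illustration of the Python-to-Python transform idea (this sketch is not taken from the release notes; the module and the rewrite rule are invented for the example):

import torch
import torch.fx


class M(torch.nn.Module):
    def forward(self, x):
        return torch.add(x, x).relu()


# Symbolically trace the module into a torch.fx.GraphModule.
gm = torch.fx.symbolic_trace(M())
print(gm.graph)  # inspect the captured ops

# A tiny "pass": rewrite every torch.add call into torch.mul.
for node in gm.graph.nodes:
    if node.op == "call_function" and node.target == torch.add:
        node.target = torch.mul
gm.recompile()

print(gm(torch.ones(2)))  # now computes (x * x).relu()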

    You can learn more about FX in the official documentation and GitHub examples of program transformations implemented using torch.fx.

    + +

    (Stable) torch.special

    +

    A torch.special module, analogous to SciPy’s special module, is now available in stable. The module has 30 operations, including gamma, Bessel, and (Gauss) error functions.

    + +
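A few of the operations can be exercised directly (the input values here are arbitrary):

import torch
from torch import special

x = torch.linspace(0.1, 2.0, steps=5)

print(special.erf(x))      # (Gauss) error function
print(special.gammaln(x))  # log of the absolute value of the gamma function
print(special.i0(x))       # zeroth-order modified Bessel function of the first kind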

    Refer to this documentation for more details.

    + +

    (Stable) nn.Module Parametrization

    +

nn.Module parametrization, a feature that allows users to parametrize any parameter or buffer of an nn.Module without modifying the nn.Module itself, is available in stable. This release adds weight normalization (weight_norm), orthogonal parametrization (matrix constraints and part of pruning), and more flexibility when creating your own parametrization.

    + +
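A minimal sketch of both flavors, a built-in parametrization and a custom one (the Symmetric module below is an invented example, not part of the release):

import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize
from torch.nn.utils.parametrizations import orthogonal

layer = nn.Linear(4, 4)

# Built-in parametrization: constrain the weight to stay orthogonal.
orthogonal(layer, "weight")

# Custom parametrization: force another module's weight to be symmetric.
class Symmetric(nn.Module):
    def forward(self, X):
        return X.triu() + X.triu(1).transpose(-1, -2)

other = nn.Linear(4, 4)
parametrize.register_parametrization(other, "weight", Symmetric())

print(torch.allclose(other.weight, other.weight.T))  # True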

    Refer to this tutorial and the general documentation for more details.

    + +

    (Beta) CUDA Graphs APIs Integration

    +

    PyTorch now integrates CUDA Graphs APIs to reduce CPU overheads for CUDA workloads.

    + +

CUDA Graphs greatly reduce the CPU overhead for CPU-bound CUDA workloads and thus improve performance by increasing GPU utilization. For distributed workloads, CUDA Graphs also reduce jitter, and since parallel workloads have to wait for the slowest worker, reducing jitter improves overall parallel efficiency.

    + +

    Integration allows seamless interop between the parts of the network captured by cuda graphs, and parts of the network that cannot be captured due to graph limitations.

    + +
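A hedged sketch of the capture-and-replay pattern for an inference forward pass is shown below (it requires a CUDA device; the tiny model and shapes are placeholders, and real workloads typically follow the warmup and static-tensor conventions described in the note linked next):

import torch

assert torch.cuda.is_available()
model = torch.nn.Linear(32, 32).cuda()
static_input = torch.randn(8, 32, device="cuda")

# Warm up in a side stream before capture.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(s)

# Capture one forward pass into a graph.
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g), torch.no_grad():
    static_output = model(static_input)

# Replay: copy new data into the static input tensor and launch the whole graph at once.
static_input.copy_(torch.randn(8, 32, device="cuda"))
g.replay()
print(static_output.sum())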

    Read the note for more details and examples, and refer to the general documentation for additional information.

    + +

    [Beta] Conjugate View

    +

PyTorch’s conjugation for complex tensors (torch.conj()) is now a constant-time operation, and returns a view of the input tensor with a conjugate bit set, as can be seen by calling torch.is_conj(). This has already been leveraged in various other PyTorch operations, like matrix multiplication and dot product, to fuse conjugation with the operation, leading to significant performance gains and memory savings on both CPU and CUDA.

    + +
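The behavior is easy to observe directly:

import torch

x = torch.randn(3, dtype=torch.cfloat)

y = torch.conj(x)        # constant time: a view with the conjugate bit set
print(torch.is_conj(y))  # True

# Downstream ops such as dot products consume the conjugate view without
# materializing a conjugated copy.
print(torch.dot(y, x))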

    Distributed Training

    + +

    Distributed Training Releases Now in Stable

    +

    In 1.10, there are a number of features that are moving from beta to stable in the distributed package:

    +
      +
    • (Stable) Remote Module: This feature allows users to operate a module on a remote worker like using a local module, where the RPCs are transparent to the user. Refer to this documentation for more details.
    • +
    • (Stable) DDP Communication Hook: This feature allows users to override how DDP synchronizes gradients across processes. Refer to this documentation for more details.
    • +
    • (Stable) ZeroRedundancyOptimizer: This feature can be used in conjunction with DistributedDataParallel to reduce the size of per-process optimizer states. With this stable release, it now can handle uneven inputs to different data-parallel workers. Check out this tutorial. We also improved the parameter partition algorithm to better balance memory and computation overhead across processes. Refer to this documentation and this tutorial to learn more.
    • +
    + +
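As a small, hedged sketch of the ZeroRedundancyOptimizer feature listed above (this only runs inside an already initialized distributed job, e.g. launched with torchrun; the model, sizes and learning rate are placeholders):

import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumes the default process group is already initialized, e.g.:
#   dist.init_process_group("nccl")
model = DDP(torch.nn.Linear(128, 10).cuda())

# Optimizer states are sharded across the data-parallel workers.
optimizer = ZeroRedundancyOptimizer(
    model.parameters(),
    optimizer_class=torch.optim.Adam,
    lr=1e-3,
)

inputs = torch.randn(32, 128, device="cuda")
loss = model(inputs).sum()
loss.backward()
optimizer.step()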

    Performance Optimization and Tooling

    + +

    [Beta] Profile-directed typing in TorchScript

    +

    TorchScript has a hard requirement for source code to have type annotations in order for compilation to be successful. For a long time, it was only possible to add missing or incorrect type annotations through trial and error (i.e., by fixing the type-checking errors generated by torch.jit.script one by one), which was inefficient and time consuming.

    + +

    Now, we have enabled profile directed typing for torch.jit.script by leveraging existing tools like MonkeyType, which makes the process much easier, faster, and more efficient. For more details, refer to the documentation.

    + +
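A rough sketch of the workflow, assuming the example_inputs argument added for this feature and that the MonkeyType package is installed (both assumptions should be checked against the documentation):

import torch

def fn(x, scale):
    # Without annotations, `scale` would default to Tensor under TorchScript.
    return x * scale

# Example inputs let the compiler profile the call and infer argument types
# (here: Tensor and float) instead of requiring manual annotations.
scripted = torch.jit.script(fn, example_inputs=[(torch.randn(3), 2.0)])
print(scripted(torch.randn(3), 0.5))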

    (Beta) CPU Fusion

    +

    In PyTorch 1.10, we’ve added an LLVM-based JIT compiler for CPUs that can fuse together sequences of torch library calls to improve performance. While we’ve had this capability for some time on GPUs, this release is the first time we’ve brought compilation to the CPU.
    +You can check out a few performance results for yourself in this Colab notebook.

    + +
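As a small, hedged sketch of the kind of code that benefits: a chain of pointwise ops in a scripted function is a natural fusion candidate, and the profiling executor specializes the graph after a few warm-up runs. Whether fusion actually kicks in depends on the build and fuser settings, so treat this only as an illustration.

import torch

@torch.jit.script
def gelu_ish(x):
    # A chain of pointwise ops: a candidate for fusion into a single kernel.
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * (x + 0.044715 * x * x * x)))

x = torch.randn(1024, 1024)
for _ in range(3):  # warm-up runs let the profiling executor specialize the graph
    gelu_ish(x)

# Inspect the optimized graph; fused regions show up as grouped nodes.
print(gelu_ish.graph_for(x))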

    (Beta) PyTorch Profiler

    +

    The objective of PyTorch Profiler is to target the execution steps that are the most costly in time and/or memory, and visualize the workload distribution between GPUs and CPUs. PyTorch 1.10 includes the following key features:

    + +
      +
    • Enhanced Memory View: This helps you understand your memory usage better. This tool will help you avoid Out of Memory errors by showing active memory allocations at various points of your program run.
    • +
    • Enhanced Automated Recommendations: This helps provide automated performance recommendations to help optimize your model. The tools recommend changes to batch size, TensorCore, memory reduction technologies, etc.
    • +
    • Enhanced Kernel View: Additional columns show grid and block sizes as well as shared memory usage and registers per thread.
    • +
    • Distributed Training: Gloo is now supported for distributed training jobs.
    • +
    • Correlate Operators in the Forward & Backward Pass: This helps map the operators found in the forward pass to the backward pass, and vice versa, in a trace view.
    • +
    • TensorCore: This tool shows the Tensor Core (TC) usage and provides recommendations for data scientists and framework developers.
    • +
    • NVTX: Support for NVTX markers was ported from the legacy autograd profiler.
    • +
    • Support for profiling on mobile devices: The PyTorch profiler now has better integration with TorchScript and mobile backends, enabling trace collection for mobile workloads.
    • +
    + +
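A minimal profiling loop looks roughly like the following (the model, schedule and output directory are placeholders; add ProfilerActivity.CUDA when profiling GPU work):

import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

model = torch.nn.Sequential(torch.nn.Linear(512, 512), torch.nn.ReLU())
inputs = torch.randn(64, 512)

with profile(
    activities=[ProfilerActivity.CPU],
    schedule=schedule(wait=1, warmup=1, active=3),
    on_trace_ready=tensorboard_trace_handler("./log/example"),
    profile_memory=True,   # feeds the enhanced memory view
    record_shapes=True,
) as prof:
    for _ in range(5):
        model(inputs)
        prof.step()

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))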

    Refer to this documentation for details. Check out this tutorial to learn how to get started with this feature.

    + +

    PyTorch Mobile

    + +

    (Beta) Android NNAPI Support in Beta

    +

    Last year we released prototype support for Android’s Neural Networks API (NNAPI). NNAPI allows Android apps to run computationally intensive neural networks on the most powerful and efficient parts of the chips that power mobile phones, including GPUs (Graphics Processing Units) and NPUs (specialized Neural Processing Units).

    + +

Since the prototype, we’ve added more op coverage, support for load-time flexible shapes, and the ability to run the model on the host for testing. Try out this feature using the tutorial.

    + +

    Additionally, Transfer Learning steps have been added to Object Detection examples. Check out this GitHub page to learn more. Please provide your feedback or ask questions on the forum. You can also check out this presentation to get an overview.

    + +

    Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

    + +

Cheers!
Team PyTorch

    + +
diff --git a/blog/pytorch-1.11-new-library-releases/index.html b/blog/pytorch-1.11-new-library-releases/index.html
new file mode 100644
index 000000000000..4064255291e8
--- /dev/null
+++ b/blog/pytorch-1.11-new-library-releases/index.html
@@ -0,0 +1,965 @@

Introducing TorchRec, and other domain library updates in PyTorch 1.11 | PyTorch
by Team PyTorch

    We are introducing the beta release of TorchRec and a number of improvements to the current PyTorch domain libraries, alongside the PyTorch 1.11 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. Highlights include:

    + +
      +
    • TorchRec, a PyTorch domain library for Recommendation Systems, is available in beta. View it on GitHub.
    • +
• TorchAudio - Added Emformer- and RNN-T-based models and recipes to support the full development lifecycle of a streaming ASR model. See the release notes here.
    • +
    • TorchText - Added beta support for RoBERTa and XLM-R models, byte-level BPE tokenizer, and text datasets backed by TorchData. See the release notes here.
    • +
    • TorchVision - Added 4 new model families and 14 new classification datasets such as CLEVR, GTSRB, FER2013. See the release notes here.
    • +
    + +

    TorchRec 0.1

    + +

    We announced TorchRec a few weeks ago and we are excited to release the beta version today. To recap, TorchRec is a PyTorch domain library for Recommendation Systems. This new library provides common sparsity and parallelism primitives, enabling researchers to build state-of-the-art personalization models and deploy them in production. TorchRec was used to train a 1.25 trillion parameter model, pushed to production in January 2022.

    + +

    In particular, the library includes:

    + +
      +
    • Modeling primitives, such as embedding bags and jagged tensors, that enable easy authoring of large, performant multi-device/multi-node models using hybrid data-parallelism and model-parallelism.
    • +
    • Optimized RecSys kernels powered by FBGEMM, including support for sparse and quantized operations.
    • +
    • A sharder which can partition embedding tables with a variety of different strategies including data-parallel, table-wise, row-wise, table-wise-row-wise, and column-wise sharding.
    • +
    • A planner which can automatically generate optimized sharding plans for models.
    • +
• Pipelining to overlap dataloading, device transfer (copy to GPU), inter-device communications (input_dist), and computation (forward, backward) for increased performance.
    • +
    • GPU inference support.
    • +
    • Common modules for RecSys, such as models and public datasets (Criteo & Movielens).
    • +
    + +
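As a rough sketch of the modeling primitives above (assuming the torchrec top-level exports; table names, sizes and feature names are invented placeholders):

import torch
from torchrec import EmbeddingBagCollection, EmbeddingBagConfig, KeyedJaggedTensor

# Two embedding tables, each serving one sparse feature.
ebc = EmbeddingBagCollection(
    tables=[
        EmbeddingBagConfig(name="t_user", embedding_dim=16, num_embeddings=1000,
                           feature_names=["user_id"]),
        EmbeddingBagConfig(name="t_item", embedding_dim=16, num_embeddings=1000,
                           feature_names=["item_id"]),
    ],
    device=torch.device("cpu"),
)

# A jagged batch: each sample can have a different number of ids per feature.
features = KeyedJaggedTensor.from_lengths_sync(
    keys=["user_id", "item_id"],
    values=torch.tensor([1, 2, 3, 4, 5, 6]),
    lengths=torch.tensor([1, 2, 1, 2]),  # two samples per feature
)

pooled = ebc(features)                       # pooled embeddings per feature
print(pooled.to_dict()["user_id"].shape)     # torch.Size([2, 16])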

    Please check the TorchRec announcement post here, video tutorial, install instructions here, test drive the feature through this tutorial here, and refer to the reference document here.

    + +

    TorchAudio 0.11

    + +

    TorchAudio: Building Blocks for Audio and Speech Processing

    + +

    We published a paper, TorchAudio: Building Blocks for Audio and Speech Processing, describing the overview of the TorchAudio library. If you find TorchAudio useful for your research, please help us share with the community by citing our paper.

    + +

    (Beta) RNN-T & (Prototype) Emformer Models and Recipes

    + +

    + +

    + +

    Emformer is an efficient memory-transformer-based streaming acoustic model that has demonstrated state-of-the-art streaming automatic speech recognition (ASR) performance in low-latency, resource-constrained scenarios, such as on-device applications (citation: https://arxiv.org/abs/2010.10759).

    + +

    The TorchAudio v0.11 release includes the following beta features:

    + +
      +
    • Implementation of Emformer (docs)
    • +
    • Recurrent neural network transducer (RNN-T) streaming ASR model that uses Emformer for its transcription network (docs)
    • +
    • RNN-T beam search decoder with TorchScript support (docs)
    • +
    • LibriSpeech Emformer RNN-T training recipe (GitHub) and corresponding pre-trained streaming ASR inference pipeline (docs)
    • +
    + +

    Also there are prototype features that are available from nightly builds or the main branch.

    + +
      +
    • Training recipes trained on MuST-C and TED-LIUM3 datasets. (GitHub)
    • +
    • Pre-trained pipelines corresponding to the recipes. (docs)
    • +
    • Tutorial that steps through performing online speech recognition with RNN-T Emformer model. (docs)
    • +
    + +

    Collectively, these features cover the full development lifecycle of a streaming ASR model, from definition through training and inference, and enable users to easily develop their own Emformer- and RNN-T-based models.

    + +

    Special thanks to Yangyang Shi, Jay Mahadeokar, and Gil Keren for their code contributions and guidance.

    + +

    (Beta) HuBERT Pretrain Model

    + +

The masked prediction training of the HuBERT model requires the masked logits, unmasked logits, and feature norm as the outputs. The logits are for cross-entropy losses and the feature norm is for penalty loss. The release adds HuBERTPretrainModel and corresponding factory functions (hubert_pretrain_base, hubert_pretrain_large, and hubert_pretrain_xlarge) to enable training from scratch.

    + +

    (Prototype) CTC Beam Search Decoder

    + +

    In recent releases, TorchAudio has added support for ASR models fine-tuned on CTC loss. The addition of an inference time CTC beam search decoder enables running end-to-end ASR evaluation using TorchAudio utils.

    + +

    The CTC decoder in TorchAudio supports customizable beam search decoding with lexicon constraint. It also has optional KenLM language model support.

    + +

    For more details, please check out the API tutorial. This prototype feature is available through nightly builds.

    + +

    (Prototype) Streaming API

    + +

    TorchAudio started as simple audio I/O APIs that supplement PyTorch. With the recent addition of ASR models and training recipes, the project has received requests to support high-level application development.

    + +

    Streaming API makes it easy to develop and test the model in online inference. It utilizes ffmpeg under the hood, and enables reading media from online services and hardware devices, decoding media in an incremental manner, and applying filters and preprocessing.

    + +

Please check out the API tutorial and the documentation. There are also the streaming ASR tutorial and the device streaming ASR tutorial. This feature is available from nightly releases. Please refer to pytorch.org for how to install nightly builds.

    + +

    TorchText 0.12

    + +

    (Beta) RoBERTa and XLM-R Models

    + +

TorchText has added support for pre-trained RoBERTa and XLM-R models. This allows users to train end-to-end Transformer-Encoder-based models on standard NLP tasks using TorchText.

    + +

    More specifically:

    + +
      +
    • The models are torchscriptable and hence can be employed for production use-cases.
    • +
• The model APIs let users easily attach custom task-specific heads with pre-trained encoders.
    • +
    • The API also comes equipped with data pre-processing transforms to match the pre-trained weights and model configuration.
    • +
    + +
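A rough sketch of the bundle pattern is shown below (it downloads the pre-trained weights; padding_value=1 follows the XLM-R pad index used in the tutorial, and the input strings are placeholders). Attaching a classification head is covered by the SST-2 tutorial mentioned next.

import torchtext
import torchtext.functional as F

xlmr = torchtext.models.XLMR_BASE_ENCODER

# Pre-trained encoder plus its matching text transform.
model = xlmr.get_model()
transform = xlmr.transform()

batch = ["Hello world", "PyTorch 1.11"]
model_input = F.to_tensor(transform(batch), padding_value=1)
features = model(model_input)
print(features.shape)  # (batch, sequence length, embedding dim)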

    We have added a tutorial to demonstrate SST-2 binary text classification task with pre-trained XLM-R base architecture.

    + +

    For additional details on model APIs and usage examples, please refer to the documentation.

    + +

    (Beta) byte-level BPE tokenizer

    + +

    TorchText has added support for a Byte-Level BPE tokenizer, as used in GPT-2. This tokenizer is also used for tokenizing inputs to the pre-trained RoBERTa models described previously. In addition to the RoBERTa vocab, users can also load their own custom BPE vocab to use the tokenizer. Furthermore, the tokenizer is fully torchscriptable and hence can be employed for production use-cases. For additional details on model APIs and usage examples, please refer to the documentation.

    + +
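A hedged sketch of constructing the tokenizer is shown below; the file paths are placeholders (the GPT-2 merges/encoder files must be obtained separately) and the keyword argument names follow my recollection of the constructor, so they should be checked against the documentation.

from torchtext.transforms import GPT2BPETokenizer

tokenizer = GPT2BPETokenizer(
    encoder_json_path="gpt2_bpe_encoder.json",  # placeholder path
    vocab_bpe_path="gpt2_bpe_vocab.bpe",        # placeholder path
)

print(tokenizer("TorchText now has a byte-level BPE tokenizer"))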

    (Beta) Text datasets backed by TorchData

    + +

    TorchText has modernized its datasets by migrating from older-style Iterable Datasets to TorchData’s DataPipes. TorchData is a library that provides modular/composable primitives, allowing users to load and transform data in performant data pipelines.

    + +

    These DataPipes work out-of-the-box with PyTorch DataLoader and would enable new functionalities like auto-sharding. Users can now easily do data manipulation and pre-processing using user-defined functions and transformations in a functional style programming. Datasets backed by DataPipes also enable standard flow-control like batching, collation, shuffling and bucketizing.

    + +

    Collectively, DataPipes provides a comprehensive experience for data preprocessing and tensorization needs in a pythonic and flexible way for model training. We have added a tutorial to demonstrate data-processing pipelining using the modernized dataset for binary text-classification.

    + +

    You can learn more about TorchData DataPipe APIs in its official documentation.

    + +

    TorchVision 0.12

    + +

    New Models

    + +

    Four new model families have been released in the latest version along with pre-trained weights for their variants.

    + +

    #1 Object Detection

    + +

    FCOS is a popular, fully convolutional, anchor-free model for object detection. In this release we include a community-contributed model implementation as well as pre-trained weights. The model was trained on COCO train2017 and can be used as follows:

    + +
    import torch
    +from torchvision import models
    +
    +x = [torch.rand(3, 224, 224)]
    +fcos = models.detection.fcos_resnet50_fpn(pretrained=True).eval()
    +predictions =  fcos(x)
    +
    + +

    The box AP of the pre-trained model on COCO val2017 is 39.2 (see #4961 for more details).

    + +

We would like to thank Hu Ye and Zhiqiang Wang for contributing to the model implementation and initial training. This was the first community-contributed model in a long while, and given its success, we decided to use the learnings from this process and create new model contribution guidelines.

    + +

    #2 Optical Flow support and RAFT model

    + +

    TorchVision now supports optical flow! Optical Flow models try to predict movement in a video: given two consecutive frames, the model predicts where each pixel of the first frame ends up in the second frame. Check out our new tutorial on Optical Flow!

    + +

    We implemented a torchscript-compatible RAFT model with pre-trained weights (both normal and “small” versions), and added support for training and evaluating optical flow models. Our training scripts support distributed training across processes and nodes, leading to much faster training time than the original implementation. We also added 5 new optical flow datasets: Flying Chairs, Flying Things, Sintel, Kitti, and HD1K.

    + +
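A hedged sketch of running the RAFT model on a pair of frames (the random frames are placeholders; the optical flow tutorial covers proper preprocessing, which maps images to the [-1, 1] range and uses heights and widths divisible by 8):

import torch
from torchvision.models.optical_flow import raft_large

img1 = torch.rand(1, 3, 520, 960) * 2 - 1
img2 = torch.rand(1, 3, 520, 960) * 2 - 1

model = raft_large(pretrained=True).eval()

with torch.no_grad():
    # RAFT is recurrent: it returns one flow estimate per iteration, the last is the best.
    flow_predictions = model(img1, img2)

flow = flow_predictions[-1]
print(flow.shape)  # (1, 2, 520, 960): per-pixel (dx, dy) displacement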

    + +

    + +

    #3. Image Classification

    + +

    Vision Transformer (ViT) and ConvNeXt are two popular architectures which can be used as image classifiers or as backbones for downstream vision tasks. In this release we include 8 pre-trained weights for their classification variants. The models were trained on ImageNet and can be used as follows:

    + +
    import torch
    +from torchvision import models
    +
    +x = torch.rand(1, 3, 224, 224)
    +vit = models.vit_b_16(pretrained=True).eval()
    +convnext = models.convnext_tiny(pretrained=True).eval()
    +predictions1 = vit(x)
    +predictions2 = convnext(x)
    +
    + +

    The accuracies of the pre-trained models obtained on ImageNet val are seen below:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Model | Acc@1 | Acc@5
vit_b_16 | 81.072 | 95.318
vit_b_32 | 75.912 | 92.466
vit_l_16 | 79.662 | 94.638
vit_l_32 | 76.972 | 93.07
convnext_tiny | 82.52 | 96.146
convnext_small | 83.616 | 96.65
convnext_base | 84.062 | 96.87
convnext_large | 84.414 | 96.976
    + +

The above models have been trained using an adjusted version of our new training recipe, and this allows us to offer models with accuracies significantly higher than those reported in the original papers.

    + +

    #4. GPU Video Decoding

    + +

    In this release, we add support for GPU video decoding in the video reading API. To use hardware-accelerated decoding, we just need to pass a cuda device to the video reading API as shown below:

    + +
    import torchvision
    +
    +reader = torchvision.io.VideoReader(file_name, device="cuda:0")
    +for frame in reader:
    +    print(frame)
    +
    + +

We also support seeking to any frame or a keyframe in the video before reading, as shown below:

    + +
    reader.seek(seek_time)
    +
    + +

    New Datasets

    + +

We have implemented 14 new classification datasets: CLEVR, GTSRB, FER2013, SUN397, Country211, Flowers102, FGVC Aircraft, OxfordIIITPet, DTD, Food 101, Rendered SST2, Stanford Cars, PCAM, and EuroSAT.

    + +

    As part of our work on Optical Flow support (see above for more details), we also added 5 new optical flow datasets: Flying Chairs, Flying Things, Sintel, Kitti, and HD1K.

    + +

    Other Updates

    + +
      +
    • New documentation layout: Each function / class is now documented in a separate page, clearing up some space in the per-module pages, and easing the discovery of the proposed APIs. Compare e.g. our previous docs vs the new ones. Please let us know if you have any feedback!
    • +
    • New model contribution guidelines have been published following the success of the FCOS model which was contributed by the community. These guidelines aim to be an overview of the model contribution process for anyone who would like to suggest, implement and train a new model.
    • +
    • Upcoming Prototype API - We are currently working on a prototype API which adds Multi-weight support on all of our model builder methods. This will enable us to offer multiple pre-trained weights, associated with their meta-data and inference transforms. The API is still under review and thus was not included in the release but you can read more about it on our blogpost and provide your feedback on the dedicated Github issue.
    • +
    • Changes in our deprecation policy - Up until now, torchvision would almost never remove deprecated APIs. In order to be more aligned and consistent with pytorch core, we are updating our deprecation policy. We are now following a 2-release deprecation cycle: deprecated APIs will raise a warning for 2 versions, and will be removed after that. To reflect these changes and to smooth the transition, we have decided to: +
        +
      • Remove all APIs that had been deprecated before or on v0.8, released 1.5 years ago.
      • +
      • Update the removal timeline of all other deprecated APIs to v0.14, to reflect the new 2-cycle policy starting now in v0.12.
      • +
      +
    • +
    + +

    Captum 0.5

    + +

    Captum is a PyTorch library for model interpretability. For this release, we expanded Captum with influential instances and added support for both similarity based influences and novel algorithms, TracIn and its variants. TracIn variants offer faster approximation of influence scores based on random projections for fully connected layers.

    + +

More specifically, the new influence subsection of Captum includes:

    + +
      +
• SimilarityInfluence computes similarity scores between test and training examples using default (cosine or euclidean) or custom user-defined metrics w.r.t. given input model layers.
    • +
    • TracInCP approximates the influential score of each training example on a given test example based on the dot-product similarity between loss gradients w.r.t. model parameters for test and training examples. Note that if we use training examples as test examples then we compute self influence. This method and its variants described below also return top-k proponents and opponents which are the top-k largest positive and negative influential examples respectively.
    • +
    • TracInCPFast is an approximation of TracInCP that avoids computing the gradients w.r.t. large parameter matrices. It approximates influence score based on the dot products between last fully connected layer activations and loss gradients w.r.t. that layer for training and test examples.
    • +
• TracInCPFastRandProj uses a nearest neighbor approximation library such as annoy to compute the dot product between the training and test quantities. In order to reduce the dimensionality of layer activations and corresponding gradients, this method additionally allows projecting those vectors into a lower-dimensional space using random projection matrices.
    • +
    + +

    More about the implementation of influential instances can be found on our GitHub page and tutorials.

    + +

Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
diff --git a/blog/pytorch-1.11-released/index.html b/blog/pytorch-1.11-released/index.html
new file mode 100644
index 000000000000..72df4944a654
--- /dev/null
+++ b/blog/pytorch-1.11-released/index.html
@@ -0,0 +1,701 @@

PyTorch 1.11, TorchData, and functorch are now available | PyTorch
by Team PyTorch

    We are excited to announce the release of PyTorch 1.11 (release notes). This release is composed of over 3,300 commits since 1.10, made by 434 contributors. Along with 1.11, we are releasing beta versions of TorchData and functorch.

    + +

    Summary:

    + +
      +
    • TorchData is a new library for common modular data loading primitives for easily constructing flexible and performant data pipelines. View it on GitHub.
    • +
    • functorch, a library that adds composable function transforms to PyTorch, is now available in beta. View it on GitHub.
    • +
    • Distributed Data Parallel (DDP) static graph optimizations available in stable.
    • +
    + +

    Introducing TorchData

    + +

We are delighted to present the Beta release of TorchData. This is a library of common modular data loading primitives for easily constructing flexible and performant data pipelines. Based on community feedback, we have found that the existing DataLoader bundles too many features together and can be difficult to extend. Moreover, different use cases often have to rewrite the same data loading utilities over and over again. The goal here is to enable composable data loading through Iterable-style and Map-style building blocks called “DataPipes” that work well out of the box with PyTorch’s DataLoader.

    + +

    A DataPipe takes in some access function over Python data structures, __iter__ for IterDataPipe and __getitem__ for MapDataPipe, and returns a new access function with a slight transformation applied. You can chain multiple DataPipes together to form a data pipeline that performs all the necessary data transformation.

    + +

    We have implemented over 50 DataPipes that provide different core functionalities, such as opening files, parsing texts, transforming samples, caching, shuffling, and batching. For users who are interested in connecting to cloud providers (such as Google Drive or AWS S3), the fsspec and iopath DataPipes will allow you to do so. The documentation provides detailed explanations and usage examples of each IterDataPipe and MapDataPipe.

    + +
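A minimal sketch of chaining DataPipes and feeding them to the existing DataLoader (the pipeline below is an invented toy example):

from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper

# Chain DataPipes functionally: each step wraps the previous one.
pipe = (
    IterableWrapper(range(100))
    .filter(lambda x: x % 2 == 0)
    .map(lambda x: x * 10)
    .shuffle()
    .batch(8)
)

# DataPipes plug directly into DataLoader; batching is already done by the pipe.
loader = DataLoader(pipe, batch_size=None)
for batch in loader:
    print(batch)
    break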

    In this release, some of the PyTorch domain libraries have migrated their datasets to use DataPipes. In TorchText, the popular datasets provided by the library are implemented using DataPipes and a section of its SST-2 binary text classification tutorial demonstrates how you can use DataPipes to preprocess data for your model. There also are other prototype implementations of datasets with DataPipes in TorchVision (available in nightly releases) and in TorchRec.

    + +

    The documentation for TorchData is now live. It contains a tutorial that covers how to use DataPipes, use them with DataLoader, and implement custom ones. FAQs and future plans related to DataLoader are described in our project’s README file.

    + +

    Introducing functorch

    + +

    We’re excited to announce the first beta release of functorch. Heavily inspired by Google JAX, functorch is a library that adds composable function transforms to PyTorch. It aims to provide composable vmap (vectorization) and autodiff transforms that work with PyTorch modules and PyTorch autograd with good eager-mode performance.

    + +

    Composable function transforms can help with a number of use cases that are tricky to do in PyTorch today:

    + +
      +
    • computing per-sample-gradients (or other per-sample quantities)
    • +
    • running ensembles of models on a single machine
    • +
    • efficiently batching together tasks in the inner-loop of MAML
    • +
    • efficiently computing Jacobians and Hessians as well as batched ones
    • +
    + +

    Composing vmap (vectorization), vjp (reverse-mode AD), and jvp (forward-mode AD) transforms allows us to effortlessly express the above without designing a separate library for each.

    + +
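A small sketch of the first two transforms (the toy loss and shapes are placeholders):

import torch
from functorch import grad, vmap

# grad: functional gradients of a scalar-valued function.
f = lambda x: (x.sin() ** 2).sum()
x = torch.randn(5)
print(grad(f)(x))

# vmap(grad(...)): per-sample gradients without a Python loop.
def loss(weights, sample, target):
    return ((sample @ weights - target) ** 2).sum()

weights = torch.randn(3)
samples = torch.randn(8, 3)
targets = torch.randn(8)

per_sample_grads = vmap(grad(loss), in_dims=(None, 0, 0))(weights, samples, targets)
print(per_sample_grads.shape)  # (8, 3): one gradient per sample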

    For more details, please see our documentation, tutorials, and installation instructions.

    + +

    Distributed Training

    + +

    (Stable) DDP static graph

    + +

DDP static graph assumes that your model employs the same set of used/unused parameters in every iteration, so that it can deterministically know states like which hooks will fire, how many times the hooks will fire, and the order in which gradients become ready after the first iteration. Static graph caches these states in the first iteration, and thus it can support features that DDP could not support in previous releases, e.g., support for multiple activation checkpoints on the same parameters regardless of whether there are unused parameters or not. The static graph feature also applies performance optimizations when there are unused parameters, e.g., it avoids traversing the graph to search for unused parameters every iteration, and enables dynamic bucketing order. These optimizations in the DDP static graph brought a 10% QPS gain for some recommendation models.

    + +

    To enable static graph, just simply set static_graph=True in the DDP API like this:

    + +
    ddp_model = DistributedDataParallel(model, static_graph=True)
    +
    + +

    For more details, please see our documentation and tutorials.

    + +

Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
diff --git a/blog/pytorch-1.12-new-library-releases/index.html b/blog/pytorch-1.12-new-library-releases/index.html
new file mode 100644
index 000000000000..ef705d97a90f
--- /dev/null
+++ b/blog/pytorch-1.12-new-library-releases/index.html
@@ -0,0 +1,1299 @@

New library updates in PyTorch 1.12 | PyTorch
    June 28, 2022

    +

New library updates in PyTorch 1.12

by Team PyTorch

    We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 1.12 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.

    + +

    Summary:

    +
      +
• TorchVision - Added multi-weight support API, new architectures, model variants, and pretrained weights. See the release notes here.
    • +
    • TorchAudio - Introduced beta features including a streaming API, a CTC beam search decoder, and new beamforming modules and methods. See the release notes here.
    • +
    • TorchText - Extended support for scriptable BERT tokenizer and added datasets for GLUE benchmark. See the release notes here.
    • +
    • TorchRec - Added EmbeddingModule benchmarks, examples for TwoTower Retrieval, inference and sequential embeddings, metrics, improved planner and demonstrated integration with production components. See the release notes here.
    • +
    • TorchX - Launch PyTorch trainers developed on local workspaces onto five different types of schedulers. See the release notes here.
    • +
• FBGEMM - Added and improved kernels for Recommendation Systems inference workloads, including table batched embedding bag, jagged tensor operations, and other special-case optimizations.
    • +
    + +

    TorchVision v0.13

    + +

    Multi-weight support API

    + +

    TorchVision v0.13 offers a new Multi-weight support API for loading different weights to the existing model builder methods:

    + +
    from torchvision.models import *
    +
    +# Old weights with accuracy 76.130%
    +resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
    +
    +# New weights with accuracy 80.858%
    +resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
    +
    +# Best available weights (currently alias for IMAGENET1K_V2)
    +# Note that these weights may change across versions
    +resnet50(weights=ResNet50_Weights.DEFAULT)
    +
    +# Strings are also supported
    +resnet50(weights="IMAGENET1K_V2")
    +
    +# No weights - random initialization
    +resnet50(weights=None)
    +
    + +

    The new API bundles along with the weights important details such as the preprocessing transforms and meta-data such as labels. Here is how to make the most out of it:

    + +
    from torchvision.io import read_image
    +from torchvision.models import resnet50, ResNet50_Weights
    +
    +img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
    +
    +# Step 1: Initialize model with the best available weights
    +weights = ResNet50_Weights.DEFAULT
    +model = resnet50(weights=weights)
    +model.eval()
    +
    +# Step 2: Initialize the inference transforms
    +preprocess = weights.transforms()
    +
    +# Step 3: Apply inference preprocessing transforms
    +batch = preprocess(img).unsqueeze(0)
    +
    +# Step 4: Use the model and print the predicted category
    +prediction = model(batch).squeeze(0).softmax(0)
    +class_id = prediction.argmax().item()
    +score = prediction[class_id].item()
    +category_name = weights.meta["categories"][class_id]
    +print(f"{category_name}: {100 * score:.1f}%")
    +
    + +

    You can read more about the new API in the docs. To provide your feedback, please use this dedicated Github issue.

    + +

    New architectures and model variants

    + +

    Classification

    + +

The Swin Transformer and EfficientNetV2 are two popular classification models which are often used for downstream vision tasks. This release includes 6 pre-trained weights for their classification variants. Here is how to use the new models:

    + +
    import torch
    +from torchvision.models import *
    +
    +image = torch.rand(1, 3, 224, 224)
    +model = swin_t(weights="DEFAULT").eval()
    +prediction = model(image)
    +
    +image = torch.rand(1, 3, 384, 384)
    +model = efficientnet_v2_s(weights="DEFAULT").eval()
    +prediction = model(image)
    +
    + +

    In addition to the above, we also provide new variants for existing architectures such as ShuffleNetV2, ResNeXt and MNASNet. The accuracies of all the new pre-trained models obtained on ImageNet-1K are seen below:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Model | Acc@1 | Acc@5
swin_t | 81.474 | 95.776
swin_s | 83.196 | 96.36
swin_b | 83.582 | 96.64
efficientnet_v2_s | 84.228 | 96.878
efficientnet_v2_m | 85.112 | 97.156
efficientnet_v2_l | 85.808 | 97.788
resnext101_64x4d | 83.246 | 96.454
resnext101_64x4d (quantized) | 82.898 | 96.326
shufflenet_v2_x1_5 | 72.996 | 91.086
shufflenet_v2_x1_5 (quantized) | 72.052 | 90.700
shufflenet_v2_x2_0 | 76.230 | 93.006
shufflenet_v2_x2_0 (quantized) | 75.354 | 92.488
mnasnet0_75 | 71.180 | 90.496
mnasnet1_3 | 76.506 | 93.522
    + +

We would like to thank Hu Ye for contributing the Swin Transformer implementation to TorchVision.

    + +

    (BETA) Object Detection and Instance Segmentation

    + +

    We have introduced 3 new model variants for RetinaNet, FasterRCNN and MaskRCNN that include several post-paper architectural optimizations and improved training recipes. All models can be used similarly:

    + +
    import torch
    +from torchvision.models.detection import *
    +
    +images = [torch.rand(3, 800, 600)]
    +model = retinanet_resnet50_fpn_v2(weights="DEFAULT")
    +# model = fasterrcnn_resnet50_fpn_v2(weights="DEFAULT")
    +# model = maskrcnn_resnet50_fpn_v2(weights="DEFAULT")
    +model.eval()
    +prediction = model(images)
    +
    + +

    Below we present the metrics of the new variants on COCO val2017. In parenthesis we denote the improvement over the old variants:

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
Model | Box mAP | Mask mAP
retinanet_resnet50_fpn_v2 | 41.5 (+5.1) | -
fasterrcnn_resnet50_fpn_v2 | 46.7 (+9.7) | -
maskrcnn_resnet50_fpn_v2 | 47.4 (+9.5) | 41.8 (+7.2)
    + +

    We would like to thank Ross Girshick, Piotr Dollar, Vaibhav Aggarwal, Francisco Massa and Hu Ye for their past research and contributions to this work.

    + +

    New pre-trained weights

    + +

    SWAG weights

    + +

The ViT and RegNet model variants offer new pre-trained SWAG (Supervised Weakly from hashtAGs) weights. One of the biggest of these models achieves a whopping 88.6% accuracy on ImageNet-1K. We currently offer two versions of the weights: 1) fine-tuned end-to-end weights on ImageNet-1K (highest accuracy) and 2) frozen trunk weights with a linear classifier fit on ImageNet-1K (great for transfer learning). Below we see the detailed accuracies of each model variant:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Model Weights | Acc@1 | Acc@5
RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_E2E_V1 | 86.012 | 98.054
RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 83.976 | 97.244
RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_E2E_V1 | 86.838 | 98.362
RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 84.622 | 97.48
RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_E2E_V1 | 88.228 | 98.682
RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 86.068 | 97.844
ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1 | 85.304 | 97.65
ViT_B_16_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 81.886 | 96.18
ViT_L_16_Weights.IMAGENET1K_SWAG_E2E_V1 | 88.064 | 98.512
ViT_L_16_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 85.146 | 97.422
ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1 | 88.552 | 98.694
ViT_H_14_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 85.708 | 97.73
    + +
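Loading one of these weights follows the multi-weight API described earlier; the enum names come straight from the table above:

from torchvision.models import ViT_B_16_Weights, vit_b_16

# End-to-end fine-tuned SWAG weights for ViT-B/16.
weights = ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1
model = vit_b_16(weights=weights).eval()

# The bundled transforms know the expected resolution and normalization.
preprocess = weights.transforms()
print(weights.meta["categories"][:3])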

    The SWAG weights are released under the Attribution-NonCommercial 4.0 International license. We would like to thank Laura Gustafson, Mannat Singh and Aaron Adcock for their work and support in making the weights available to TorchVision.

    + +

    Model Refresh

    + +

The release of the Multi-weight support API enabled us to refresh the most popular models and offer more accurate weights. We improved on average each model by ~3 points. The new recipe used was learned on top of ResNet50 and its details were covered in a previous blog post.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Model | Old weights | New weights
efficientnet_b1 | 78.642 | 79.838
mobilenet_v2 | 71.878 | 72.154
mobilenet_v3_large | 74.042 | 75.274
regnet_y_400mf | 74.046 | 75.804
regnet_y_800mf | 76.42 | 78.828
regnet_y_1_6gf | 77.95 | 80.876
regnet_y_3_2gf | 78.948 | 81.982
regnet_y_8gf | 80.032 | 82.828
regnet_y_16gf | 80.424 | 82.886
regnet_y_32gf | 80.878 | 83.368
regnet_x_400mf | 72.834 | 74.864
regnet_x_800mf | 75.212 | 77.522
regnet_x_1_6gf | 77.04 | 79.668
regnet_x_3_2gf | 78.364 | 81.196
regnet_x_8gf | 79.344 | 81.682
regnet_x_16gf | 80.058 | 82.716
regnet_x_32gf | 80.622 | 83.014
resnet50 | 76.13 | 80.858
resnet50 (quantized) | 75.92 | 80.282
resnet101 | 77.374 | 81.886
resnet152 | 78.312 | 82.284
resnext50_32x4d | 77.618 | 81.198
resnext101_32x8d | 79.312 | 82.834
resnext101_32x8d (quantized) | 78.986 | 82.574
wide_resnet50_2 | 78.468 | 81.602
wide_resnet101_2 | 78.848 | 82.51
    + +

    We would like to thank Piotr Dollar, Mannat Singh and Hugo Touvron for their past research and contributions to this work.

    + +

    New Augmentations, Layers and Losses

    + +

    This release brings a bunch of new primitives which can be used to produce SOTA models. Some highlights include the addition of AugMix data-augmentation method, the DropBlock layer, the cIoU/dIoU loss and many more. We would like to thank Aditya Oke, Abhijit Deo, Yassine Alouini and Hu Ye for contributing to the project and for helping us maintain TorchVision relevant and fresh.

    + +

    Documentation

    + +

    We completely revamped our models documentation to make it easier to browse, and added key information such as supported image sizes and the image pre-processing steps of pre-trained weights. We now have a main model page with summary tables of the available weights, and each model has a dedicated page. Each model builder is also documented in its own page, with more details about the available weights, including accuracy, minimal image size, a link to training recipes, and other valuable info. For comparison, our previous models docs are here. To provide feedback on the new documentation, please use the dedicated GitHub issue.

    + +

    TorchAudio v0.12

    + +

    (BETA) Streaming API

    + +

    + +

    + +

    StreamReader is TorchAudio’s new I/O API. It is backed by FFmpeg†, and allows users to:

    +
      +
    • Decode audio and video formats, including MP4 and AAC
    • +
    • Handle input forms, such as local files, network protocols, microphones, webcams, screen captures and file-like objects
    • +
    • Iterate over and decode chunk-by-chunk, while changing the sample rate or frame rate
    • +
    • Apply audio and video filters, such as low-pass filter and image scaling
    • +
    • Decode video with Nvidia’s hardware-based decoder (NVDEC)
    • +
    + +

    For usage details, please check out the documentation and tutorials:

    + + +

    † To use StreamReader, FFmpeg libraries are required. Please install FFmpeg. The coverage of codecs depends on how these libraries are configured. TorchAudio official binaries are compiled to work with FFmpeg 4 libraries; FFmpeg 5 can be used if TorchAudio is built from source.
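
    A minimal sketch of the streaming decode loop, assuming a local file named example.mp4 and an arbitrary 16 kHz chunking setup:

from torchaudio.io import StreamReader

streamer = StreamReader(src="example.mp4")  # placeholder path; URLs, devices and file-like objects also work
streamer.add_basic_audio_stream(frames_per_chunk=16000, sample_rate=16000)  # resample to 16 kHz on the fly

for (chunk,) in streamer.stream():
    # chunk is a tensor of shape (frames, channels) for an audio stream
    ...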

    + +

    (BETA) CTC Beam Search Decoder

    + +

    TorchAudio integrates the wav2letter CTC beam search decoder from Flashlight (GitHub). The addition of this inference time decoder enables running end-to-end CTC ASR evaluation using TorchAudio utils.

    + +

    Customizable lexicon and lexicon-free decoders are supported, and both can be used with or without a KenLM n-gram language model. TorchAudio additionally supports downloading token, lexicon, and pretrained KenLM files for the LibriSpeech dataset.
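
    A rough sketch of wiring the decoder to a bundled acoustic model, loosely following the ASR inference tutorial (the audio file path is a placeholder, 16 kHz audio is assumed, and the beam settings are arbitrary):

import torchaudio
from torchaudio.models.decoder import ctc_decoder, download_pretrained_files

files = download_pretrained_files("librispeech-4-gram")
decoder = ctc_decoder(
    lexicon=files.lexicon,
    tokens=files.tokens,
    lm=files.lm,
    nbest=3,
    beam_size=50,
)

# Emissions come from an acoustic model; a bundled Wav2Vec2 model is used for illustration.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
acoustic_model = bundle.get_model()
waveform, sample_rate = torchaudio.load("speech.wav")  # placeholder audio file
emissions, _ = acoustic_model(waveform)

hypotheses = decoder(emissions)
best_transcript = " ".join(hypotheses[0][0].words)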

    + +

    For usage details, please check out the documentation and ASR inference tutorial.

    + +

    (BETA) New Beamforming Modules and Methods

    + +

    To improve flexibility in usage, the release adds two new beamforming modules under torchaudio.transforms: SoudenMVDR and RTFMVDR. The main differences from the existing MVDR module are:

    +
      +
    • Use power spectral density (PSD) and relative transfer function (RTF) matrices as inputs instead of time-frequency masks. The module can be integrated with neural networks that directly predict complex-valued STFT coefficients of speech and noise
    • +
    • Add 'reference_channel' as an input argument in the forward method, to allow users to select the reference channel in model training or dynamically change the reference channel in inference
    • +
    + +

    Besides the two modules, new function-level beamforming methods are added under torchaudio.functional. These include:

    + + +

    For usage details, please check out the documentation at torchaudio.transforms and torchaudio.functional and the Speech Enhancement with MVDR Beamforming tutorial.
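
    As a rough sketch of how SoudenMVDR can be combined with the PSD transform (the shapes are illustrative, and the constant masks below stand in for masks that would normally be predicted by a separation network):

import torch
import torchaudio.transforms as T

n_fft, channels, frames = 1024, 6, 100
freq = n_fft // 2 + 1

# Multi-channel complex spectrogram: (..., channel, freq, time)
specgram = torch.randn(channels, freq, frames, dtype=torch.cfloat)

# Placeholder time-frequency masks.
mask_speech = torch.rand(freq, frames)
mask_noise = 1.0 - mask_speech

psd = T.PSD()
psd_speech = psd(specgram, mask_speech)
psd_noise = psd(specgram, mask_noise)

mvdr = T.SoudenMVDR()
enhanced = mvdr(specgram, psd_speech, psd_noise, reference_channel=0)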

    + +

    TorchText v0.13

    + +

    Glue Datasets

    + +

    We increased the number of datasets in TorchText from 22 to 30 by adding the remaining 8 datasets from the GLUE benchmark (SST-2 was already supported). The complete list of GLUE datasets is as follows:

    +
      +
    • CoLA (paper): Single sentence binary classification acceptability task
    • +
    • SST-2 (paper): Single sentence binary classification sentiment task
    • +
    • MRPC (paper): Dual sentence binary classification paraphrase task
    • +
    • QQP: Dual sentence binary classification paraphrase task
    • +
    • STS-B (paper): Single sentence to float regression sentence similarity task
    • +
    • MNLI (paper): Sentence ternary classification NLI task
    • +
    • QNLI (paper): Sentence binary classification QA and NLI tasks
    • +
    • RTE (paper): Dual sentence binary classification NLI task
    • +
    • WNLI (paper): Dual sentence binary classification coreference and NLI tasks
    • +
    + +
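
    For illustration, loading one of these datasets (SST-2 here) looks roughly as follows; the exact fields yielded per sample are best confirmed against the docs:

from torchtext.datasets import SST2

train_datapipe = SST2(split="train")
for sample in train_datapipe:
    print(sample)  # e.g. a (text, label) pair
    break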

    Scriptable BERT Tokenizer

    + +

    TorchText has extended support for scriptable tokenizers by adding the WordPiece tokenizer used in BERT. It is one of the most commonly used algorithms for splitting input text into sub-word units and was introduced in Japanese and Korean Voice Search (Schuster et al., 2012).

    + +

    TorchScriptability support allows users to embed the BERT text-preprocessing natively in C++ without needing a Python runtime. As TorchText now supports the CMAKE build system to natively link torchtext binaries with application code, users can easily integrate BERT tokenizers for deployment needs.
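
    A minimal sketch of the new tokenizer, assuming a WordPiece vocab file available at the (placeholder) location below:

from torchtext.transforms import BERTTokenizer

VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"  # placeholder vocab location
tokenizer = BERTTokenizer(vocab_path=VOCAB_FILE, do_lower_case=True, return_tokens=True)
print(tokenizer("Hello World, How are you!"))  # list of WordPiece sub-word tokens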

    + +

    For usage details, please refer to the corresponding documentation.

    + +

    TorchRec v0.2.0

    + +

    EmbeddingModule + DLRM benchmarks

    + +

    We added a set of benchmarking tests that show the performance characteristics of TorchRec’s base modules and of research models built out of TorchRec.

    + +

    TwoTower Retrieval Example, with FAISS

    + +

    We provide an example demonstrating training a distributed TwoTower (i.e. User-Item) Retrieval model that is sharded using TorchRec. The projected item embeddings are added to an IVFPQ FAISS index for candidate generation. The retrieval model and KNN lookup are bundled in a PyTorch model for efficient end-to-end retrieval.
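
    As a hedged sketch of the candidate-generation side only (plain FAISS rather than the TorchRec example itself; the dimensions and index parameters are arbitrary):

import faiss
import numpy as np

dim = 64
item_embeddings = np.random.rand(10000, dim).astype("float32")  # stand-in for projected item embeddings

index = faiss.index_factory(dim, "IVF256,PQ8")  # IVFPQ index for approximate nearest neighbors
index.train(item_embeddings)
index.add(item_embeddings)
index.nprobe = 16

user_embeddings = np.random.rand(4, dim).astype("float32")
distances, item_ids = index.search(user_embeddings, k=10)  # top-10 candidate items per user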

    + +

    Integrations

    + +

    We demonstrate that TorchRec works out of the box with many components commonly used alongside PyTorch models in production-like systems, such as:

    +
      +
    • Training a TorchRec model on Ray Clusters utilizing the Torchx Ray scheduler
    • +
    • Preprocessing and DataLoading with NVTabular on DLRM
    • +
    • Training a TorchRec model with on-the-fly preprocessing with TorchArrow showcasing RecSys domain UDFs
    • +
    + +

    Sequential Embeddings Example: Bert4Rec

    + +

    We provide an example, using TorchRec, that reimplements the BERT4REC paper, showcasing EmbeddingCollection for non-pooled embeddings. Using DistributedModelParallel we see a 35% QPS gain over conventional data parallelism.

    + +

    (Beta) Planner

    + +

    The TorchRec library includes a built-in planner that selects a near-optimal sharding plan for a given model. The planner attempts to identify the best sharding plan by evaluating a series of proposals which are statically analyzed and fed into an integer partitioner. The planner is able to automatically adjust plans for a wide range of hardware setups, allowing users to scale performance seamlessly from a local development environment to large-scale production hardware. See this notebook for a more detailed tutorial.

    + +

    (Beta) Inference

    + +

    TorchRec Inference is a C++ library that supports multi-GPU inference. The TorchRec library is used to shard models written and packaged in Python via torch.package (an alternative to TorchScript). The torch.deploy library is used to serve inference from C++ by launching multiple Python interpreters carrying the packaged model, thus sidestepping the GIL. Two models are provided as examples: DLRM multi-GPU (sharded via TorchRec) and DLRM single-GPU.

    + +

    (Beta) RecMetrics

    + +

    RecMetrics is a metrics library that collects common utilities and optimizations for Recommendation models. It extends torchmetrics.

    +
      +
    • A centralized metrics module that allows users to add new metrics
    • +
    • Commonly used metrics, including AUC, Calibration, CTR, MSE/RMSE, NE & Throughput
    • +
    • Optimization for metrics related operations to reduce the overhead of metric computation
    • +
    • Checkpointing
    • +
    + +

    (Prototype) Single process Batched + Fused Embeddings

    + +

    Previously TorchRec’s abstractions (EmbeddingBagCollection/EmbeddingCollection) over FBGEMM kernels, which provide benefits such as table batching, optimizer fusion, and UVM placement, could only be used in conjunction with DistributedModelParallel. We’ve decoupled these notions from sharding, and introduced the FusedEmbeddingBagCollection, which can be used as a standalone module, with all of the above features, and can also be sharded.

    + +

    TorchX v0.2.0

    + +

    TorchX is a job launcher that makes it easier to run PyTorch in distributed training clusters with many scheduler integrations including Kubernetes and Slurm. We’re excited to release TorchX 0.2.0 with a number of improvements. TorchX is currently being used in production in both on-premise and cloud environments.

    + +

    Check out the quickstart to start launching local and remote jobs.

    + +

    Workspaces

    + +

    TorchX now supports workspaces, which allow users to easily launch training jobs using their local workspace. TorchX can automatically build a patch with your local training code on top of a base image to minimize iteration time and time to training.

    + +

    .torchxconfig

    + +

    Specifying options in .torchxconfig saves you from having to type long CLI commands each time you launch a job. You can also define project level generic configs and drop a config file in your home directory for user-level overrides.

    + +

    Expanded Scheduler Support

    + +

    TorchX now supports AWS Batch and Ray (experimental) schedulers in addition to our existing integrations.

    + +

    Distributed Training On All Schedulers

    + +

    The TorchX dist.ddp component now works on all schedulers without any configuration. Distributed training workers will automatically discover each other when using torchelastic via the builtin dist.ddp component.

    + +

    Hyper Parameter Optimization

    + +

    TorchX integrates with Ax to let you scale hyper-parameter optimizations (HPO) by launching the search trials onto remote clusters.

    + +

    File and Device Mounts

    + +

    TorchX now supports remote filesystem mounts and custom devices. This enables your PyTorch jobs to efficiently access cloud storage such as NFS or Lustre. The device mounts enable the use of network accelerators like InfiniBand and custom inference/training accelerators.

    + +

    FBGEMM v0.2.0

    + +

    The FBGEMM library contains optimized kernels meant to improve the performance of PyTorch workloads. We’ve added a number of new features and optimizations over the last few months that we are excited to report.

    + +

    Inference Table Batched Embedding (TBE)

    + +

    The table batched embedding bag (TBE) operator is an important base operation for embedding lookup for recommendation system inference on GPU. We added the following enhancements for performance and flexibility:

    + +

    Alignment restriction removed

    +
      +
    • Previously, embedding dimension * data type size had to be a multiple of 4B; now the requirement is only 1B.
    • +
    + +

    Unified Virtual Memory (UVM) caching kernel optimizations

    +
      +
    • UVM caching kernels now scale linearly with the number of tables that use UVM caching. Previously, the overhead was similar to having all tables use UVM caching
    • +
    • UVM caching kernel overhead is much smaller than before
    • +
    + +

    Inference FP8 Table Batched Embedding (TBE)

    + +

    The table batched embedding bag (TBE) previously supported FP32, FP16, INT8, INT4, and INT2 embedding weight types. While these weight types work well in many models, we integrated FP8 weight types (in both GPU and CPU operations) to allow for numerical and performance evaluations of FP8 in our models. Compared to INT8, FP8 does not require the additional bias and scale storage and calculations. Additionally, the next-generation H100 GPUs have FP8 support on Tensor Cores (mainly matmul ops).

    + +

    Jagged Tensor Kernels

    + +

    We added optimized kernels to speed up TorchRec JaggedTensor. The purpose of JaggedTensor is to handle the case where one dimension of the input data is “jagged”, meaning that each consecutive row in a given dimension may be a different length, which is often the case with sparse feature inputs in recommendation systems. The internal representation is shown below:

    + +

    + +

    + +

    We added ops for converting jagged tensors from sparse to dense formats and back, performing matrix multiplications with jagged tensors, and elementwise ops.
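
    To make the layout concrete, here is a plain-PyTorch illustration of the jagged representation and of a jagged-to-padded-dense conversion; the FBGEMM kernels implement these conversions efficiently rather than with a Python loop:

import torch

# Three "rows" of lengths 2, 0 and 3 stored as a flat values tensor plus offsets.
values = torch.tensor([1., 2., 3., 4., 5.])
lengths = torch.tensor([2, 0, 3])
offsets = torch.cat([torch.zeros(1, dtype=torch.long), lengths.cumsum(0)])  # [0, 2, 2, 5]

max_len, padding = int(lengths.max()), 0.0
dense = torch.full((len(lengths), max_len), padding)
for row, (start, end) in enumerate(zip(offsets[:-1], offsets[1:])):
    dense[row, : end - start] = values[start:end]
# dense == [[1., 2., 0.], [0., 0., 0.], [3., 4., 5.]]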

    + +

    Optimized permute102-baddbmm-permute102

    + +

    It is difficult to fuse various matrix multiplications where the batch size is not the batch size of the model; switching the batch dimension is a quick workaround. We created the permute102_baddbmm_permute102 operation, which switches the first and the second dimension, performs the batched matrix multiplication and then switches back. Currently we only support the forward pass with the FP16 data type and will support the FP32 type and the backward pass in the future.
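
    In plain PyTorch terms, the fused operation computes something along these lines (an illustration of the semantics only, shown here in FP32 for clarity):

import torch

M, B, K, N = 8, 4, 16, 32          # M: model batch dim, B: the "other" batch dim
x = torch.randn(M, B, K)
weight = torch.randn(B, K, N)
bias = torch.randn(B, 1, N)

# permute (1, 0, 2), batched addmm, then permute back
out = torch.baddbmm(bias, x.permute(1, 0, 2), weight).permute(1, 0, 2)  # shape (M, B, N)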

    + +

    Optimized index_select for dim 0 index selection

    + +

    index_select is normally used as part of a sparse operation. While PyTorch supports a generic index_select for arbitrary-dimension index selection, its performance for the special case of dim 0 index selection is suboptimal. For this reason, we implemented a specialized index_select for dim 0. In some cases, we have observed a 1.4x performance gain from FBGEMM’s index_select compared to the one from PyTorch (using a uniform index distribution).
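
    For reference, the dim 0 special case corresponds to this common eager-mode pattern:

import torch

table = torch.randn(1000, 64)
indices = torch.randint(0, 1000, (4096,))
rows = torch.index_select(table, 0, indices)  # equivalent to table[indices]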

    + +

    More about these implementations can be found on our GitHub page and in the tutorials.

    + +

    Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    diff --git a/blog/pytorch-1.12-released/index.html b/blog/pytorch-1.12-released/index.html
    new file mode 100644
    index 000000000000..501fecdf804b
    --- /dev/null
    +++ b/blog/pytorch-1.12-released/index.html
    @@ -0,0 +1,865 @@
    + PyTorch 1.12: TorchArrow, Functional API for Modules and nvFuser, are now available | PyTorch
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch 1.12 (release note)! This release is composed of over 3124 commits from 433 contributors. Along with 1.12, we are releasing beta versions of AWS S3 Integration, PyTorch Vision Models on Channels Last on CPU, Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16, and the FSDP API. We want to sincerely thank our dedicated community for your contributions.

    + +

    Summary:

    +
      +
    • Functional APIs to functionally apply module computation with a given set of parameters
    • +
    • Complex32 and Complex Convolutions in PyTorch
    • +
    • DataPipes from TorchData fully backward compatible with DataLoader
    • +
    • functorch with improved coverage for APIs
    • +
    • nvFuser a deep learning compiler for PyTorch
    • +
    • Changes to float32 matrix multiplication precision on Ampere and later CUDA hardware
    • +
    • TorchArrow, a new beta library for machine learning preprocessing over batch data
    • +
    + +

    Frontend APIs

    + +

    Introducing TorchArrow

    + +

    We’ve got a new Beta release ready for you to try and use: TorchArrow. This is a library for machine learning preprocessing over batch data. It features a performant, Pandas-style, easy-to-use API to speed up your preprocessing workflows and development.

    + +

    Currently, it provides a Python DataFrame interface with the following features:

    +
      +
    • High-performance CPU backend, vectorized and extensible User-Defined Functions (UDFs) with Velox
    • +
    • Seamless handoff with PyTorch or other model authoring, such as Tensor collation and easily plugging into PyTorch DataLoader and DataPipes
    • +
    • Zero copy for external readers via Arrow in-memory columnar format
    • +
    + +

    For more details, please find our 10-min tutorial, installation instructions, API documentation, and a prototype for data preprocessing in TorchRec.
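
    A minimal sketch of the DataFrame interface (the column names and values below are made up):

import torcharrow as ta

df = ta.dataframe({"user_id": [1, 2, 3], "score": [0.5, 0.75, 0.25]})
doubled = df["score"] * 2  # columnar, vectorized operation on the Velox-backed CPU backend
print(df, doubled)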

    + +

    (Beta) Functional API for Modules

    + +

    PyTorch 1.12 introduces a new beta feature to functionally apply Module computation with a given set of parameters. Sometimes, the traditional PyTorch Module usage pattern that maintains a static set of parameters internally is too restrictive. This is often the case when implementing algorithms for meta-learning, where multiple sets of parameters may need to be maintained across optimizer steps.

    + +

    The new torch.nn.utils.stateless.functional_call() API allows for:

    +
      +
    • Module computation with full flexibility over the set of parameters used
    • +
    • No need to reimplement your module in a functional way
    • +
    • Any parameter or buffer present in the module can be swapped with an externally-defined value for use in the call. Naming for referencing parameters / buffers follows the fully-qualified form in the module’s state_dict()
    • +
    + +

    Example:

    +
    import torch
    +from torch import nn
    +from torch.nn.utils.stateless import functional_call
    +
    +class MyModule(nn.Module):
    +    def __init__(self):
    +        super().__init__()
    +        self.fc1 = nn.Linear(3, 3)
    +        self.bn = nn.BatchNorm1d(3)
    +        self.fc2 = nn.Linear(3, 3)
    +
    +    def forward(self, x):
    +        return self.fc2(self.bn(self.fc1(x)))
    +
    +m = MyModule()
    +
    +# Define parameter / buffer values to use during module computation.
    +my_weight = torch.randn(3, 3, requires_grad=True)
    +my_bias = torch.tensor([1., 2., 3.], requires_grad=True)
    +params_and_buffers = {
    +    'fc1.weight': my_weight,
    +    'fc1.bias': my_bias,
    +    # Custom buffer values can be used too.
    +    'bn.running_mean': torch.randn(3),
    +}
    +
    +# Apply module computation to the input with the specified parameters / buffers.
    +inp = torch.randn(5, 3)
    +output = functional_call(m, params_and_buffers, inp)
    +
    + +

    (Beta) Complex32 and Complex Convolutions in PyTorch

    + +

    PyTorch today natively supports complex numbers, complex autograd, complex modules, and numerous complex operations, including linear algebra and Fast Fourier Transform (FFT) operators. Many libraries, including torchaudio and ESPNet, already make use of complex numbers in PyTorch, and PyTorch 1.12 further extends complex functionality with complex convolutions and the experimental complex32 (“complex half”) data type that enables half precision FFT operations. Due to bugs in the CUDA 11.3 package, we recommend using the CUDA 11.6 package from our wheels if you are using complex numbers.
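
    As a hedged sketch of a complex convolution (assuming the module accepts a complex dtype via its factory keyword, as the new complex-convolution support suggests):

import torch

conv = torch.nn.Conv1d(2, 4, kernel_size=3, dtype=torch.complex64)  # complex-valued weights
x = torch.randn(1, 2, 16, dtype=torch.complex64)
y = conv(x)  # complex-valued output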

    + +

    (Beta) Forward-mode Automatic Differentiation

    + +

    Forward-mode AD allows the computation of directional derivatives (or equivalently, Jacobian-vector products) eagerly in the forward pass. PyTorch 1.12 significantly improves the operator coverage for forward-mode AD. See our tutorial for more information.
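
    A small self-contained example using torch.autograd.forward_ad:

import torch
import torch.autograd.forward_ad as fwAD

primal = torch.randn(3)
tangent = torch.randn(3)  # direction for the directional derivative

with fwAD.dual_level():
    dual = fwAD.make_dual(primal, tangent)
    out = torch.sin(dual)
    jvp = fwAD.unpack_dual(out).tangent  # equals cos(primal) * tangent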

    + +

    TorchData

    + +

    BC DataLoader + DataPipe

    + +

    `DataPipe` from TorchData becomes fully backward compatible with the existing `DataLoader` regarding shuffle determinism and dynamic sharding in both multiprocessing and distributed environments.
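
    A minimal sketch of handing a DataPipe graph to the existing DataLoader:

from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper

# sharding_filter() lets the DataPipe shard correctly across DataLoader workers.
datapipe = IterableWrapper(range(100)).shuffle().sharding_filter()

# DataLoader now controls shuffle determinism and dynamic sharding of the DataPipe.
loader = DataLoader(datapipe, batch_size=8, shuffle=True, num_workers=2)
for batch in loader:
    ...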

    + +

    (Beta) AWS S3 Integration

    + +

    DataPipes based on AWSSDK have been integrated into TorchData. It provides the following features backed by native AWSSDK:

    +
      +
    • Retrieve list of urls from each S3 bucket based on prefix +
        +
      • Support timeout to prevent hanging indefinitely
      • +
      • Support to specify S3 bucket region
      • +
      +
    • +
    • Load data from S3 urls +
        +
      • Support buffered and multi-part download
      • +
      • Support to specify S3 bucket region
      • +
      +
    • +
    + +

    AWS native DataPipes are still in the beta phase, and we will keep tuning them to improve their performance.
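
    A hedged sketch of the S3 DataPipes (the bucket, prefix and region below are placeholders, and valid AWS credentials plus the AWSSDK-enabled torchdata build are assumed):

from torchdata.datapipes.iter import IterableWrapper, S3FileLister, S3FileLoader

prefixes = IterableWrapper(["s3://my-bucket/my-dataset/train/"])  # hypothetical bucket and prefix
file_urls = S3FileLister(prefixes, region="us-west-2")
files = S3FileLoader(file_urls, region="us-west-2")

for url, stream in files:
    payload = stream.read()  # raw bytes of each object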

    + +

    (Prototype) DataLoader2

    + +

    DataLoader2 became available in prototype mode. We are introducing new ways to interact between DataPipes, the DataLoading API, and backends (aka ReadingServices). The feature is stable in terms of API, but not yet functionally complete. We welcome early adopters and feedback, as well as potential contributors.

    + +

    For more details, please check out the link.

    + +

    functorch

    + +

    Inspired by Google JAX, functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch. Examples of these include:

    + + +

    We’re excited to announce functorch 0.2.0 with a number of improvements and new experimental features.

    + +

    Significantly improved coverage

    + +

    We significantly improved coverage for functorch.jvp (our forward-mode autodiff API) and other APIs that rely on it (functorch.{jacfwd, hessian}).
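
    A small example of the forward-mode APIs:

import torch
from functorch import jvp, jacfwd

def f(x):
    return x.sin().sum()

x = torch.randn(5)

jacobian = jacfwd(f)(x)                               # built on forward-mode AD
_, directional = jvp(f, (x,), (torch.ones_like(x),))  # Jacobian-vector product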

    + +

    (Prototype) functorch.experimental.functionalize

    + +

    Given a function f, functionalize(f) returns a new function without mutations (with caveats). This is useful for constructing traces of PyTorch functions without in-place operations. For example, you can use make_fx(functionalize(f)) to construct a mutation-free trace of a pytorch function. To learn more, please see the documentation.
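
    A minimal sketch (the function f below is just a toy with an in-place add):

import torch
from functorch import make_fx
from functorch.experimental import functionalize

def f(x):
    out = torch.zeros_like(x)
    out.add_(x)   # in-place mutation
    return out * 2

graph = make_fx(functionalize(f))(torch.randn(3))
print(graph.code)  # the trace contains no in-place ops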

    + +

    For more details, please see our installation instructions, documentation, tutorials, and release notes.

    + +

    Performance Improvements

    + +

    Introducing nvFuser, a deep learning compiler for PyTorch

    + +

    In PyTorch 1.12, TorchScript is updating its default fuser (for Volta and later CUDA accelerators) to nvFuser, which supports a wider range of operations and is faster than NNC, the previous fuser for CUDA devices. A soon-to-be-published blog post will elaborate on nvFuser and show how it speeds up training on a variety of networks.

    + +

    See the nvFuser documentation for more details on usage and debugging.

    + +

    Changes to float32 matrix multiplication precision on Ampere and later CUDA hardware

    + +

    PyTorch supports a variety of “mixed precision” techniques, like the torch.amp (Automated Mixed Precision) module and performing float32 matrix multiplications using the TensorFloat32 datatype on Ampere and later CUDA hardware for faster internal computations. In PyTorch 1.12 we’re changing the default behavior of float32 matrix multiplications to always use full IEEE fp32 precision, which is more precise but slower than using the TensorFloat32 datatype for internal computation. For devices with a particularly high ratio of TensorFloat32 to float32 throughput such as A100, this change in defaults can result in a large slowdown.

    + +

    If you’ve been using TensorFloat32 matrix multiplications then you can continue to do so by setting torch.backends.cuda.matmul.allow_tf32 = True

    + +

    which has been supported since PyTorch 1.7. Starting in PyTorch 1.12, the new matmul precision API can be used, too: torch.set_float32_matmul_precision("highest" | "high" | "medium")

    + +

    To reiterate, PyTorch’s new default is “highest” precision for all device types. We think this provides better consistency across device types for matrix multiplications. Documentation for the new precision API can be found here. Setting the “high” or “medium” precision types will enable TensorFloat32 on Ampere and later CUDA devices. If you’re updating to PyTorch 1.12 then to preserve the current behavior and faster performance of matrix multiplications on Ampere devices, set precision to “high”.
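
    In code, the two options look like this:

import torch

# Pre-1.12 style: explicitly allow TensorFloat32 matmuls on Ampere and later GPUs.
torch.backends.cuda.matmul.allow_tf32 = True

# New in 1.12: the same trade-off via the matmul precision API.
torch.set_float32_matmul_precision("high")  # "highest" (default), "high", or "medium"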

    + +

    Using mixed precision techniques is essential for training many modern deep learning networks efficiently, and if you’re already using torch.amp this change is unlikely to affect you. If you’re not familiar with mixed precision training then see our soon-to-be-published “What Every User Should Know About Mixed Precision Training in PyTorch” blog post.

    + +

    (Beta) Accelerating PyTorch Vision Models with Channels Last on CPU

    + +

    Memory formats have a significant impact on performance when running vision models; generally, Channels Last is more favorable from a performance perspective due to better data locality. 1.12 includes the fundamental concepts of memory formats and demonstrates performance benefits using Channels Last on popular PyTorch vision models on Intel® Xeon® Scalable processors.

    +
      +
    • Enables Channels Last memory format support for the commonly used operators in CV domain on CPU, applicable for both inference and training
    • +
    • Provides native level optimization on Channels Last kernels from ATen, applicable for both AVX2 and AVX512
    • +
    • Delivers 1.3x to 1.8x inference performance gain over Channels First for TorchVision models on Intel® Xeon® Ice Lake (or newer) CPUs
    • +
    + +
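
    Opting a model and its inputs into Channels Last is a one-line change each; a minimal sketch using a TorchVision ResNet-50:

import torch
import torchvision

model = torchvision.models.resnet50().eval()
model = model.to(memory_format=torch.channels_last)

x = torch.randn(1, 3, 224, 224).to(memory_format=torch.channels_last)
with torch.no_grad():
    out = model(x)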

    (Beta) Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16

    + +

    Reduced-precision numeric formats like bfloat16 improve PyTorch performance across multiple deep learning training workloads. PyTorch 1.12 includes the latest software enhancements on bfloat16, which apply to a broader scope of user scenarios and showcase even higher performance gains. The main improvements include:

    +
      +
    • 2x hardware compute throughput vs. float32 with the new bfloat16 native instruction VDPBF16PS, introduced on Intel® Xeon® Cooper Lake CPUs
    • +
    • 1/2 memory footprint of float32, faster speed for memory bandwidth intensive operators
    • +
    • 1.4x to 2.2x inference performance gain over float32 for TorchVision models on Intel® Xeon® Cooper Lake (or newer) CPUs
    • +
    + +
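
    A minimal sketch of running inference under bfloat16 autocast on CPU (the model choice is arbitrary):

import torch
import torchvision

model = torchvision.models.resnet50().eval()
x = torch.randn(1, 3, 224, 224)

with torch.no_grad(), torch.cpu.amp.autocast(dtype=torch.bfloat16):
    out = model(x)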

    (Prototype) Introducing Accelerated PyTorch Training on Mac

    + +

    With the PyTorch 1.12 release, developers and researchers can now take advantage of Apple silicon GPUs for significantly faster model training. This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. Accelerated GPU training is enabled using Apple’s Metal Performance Shaders (MPS) as a backend. The benefits include performance speedup from accelerated GPU training and the ability to train larger networks or batch sizes locally. Learn more here.

    + +

    + +

    + +

    + Accelerated GPU training and evaluation speedups over CPU-only (times faster) +

    + +

    Alongside the new MPS device support, the M1 binaries for Core and Domain libraries that have been available for the last few releases are now an official prototype feature. These binaries can be used to run PyTorch natively on Apple Silicon.
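
    A minimal sketch of selecting the MPS device when it is available:

import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = torch.nn.Linear(128, 10).to(device)
x = torch.randn(32, 128, device=device)
logits = model(x)  # runs on the Apple silicon GPU when MPS is available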

    + +

    (Prototype) BetterTransformer: Fastpath execution for Transformer Encoder Inference

    + +

    PyTorch now supports CPU and GPU fastpath implementations (“BetterTransformer”) for several Transformer Encoder modules including TransformerEncoder, TransformerEncoderLayer, and MultiHeadAttention (MHA). The BetterTransformer fastpath is consistently faster – 2x for many common execution scenarios, depending on model and input characteristics. The new BetterTransformer-enabled modules are API compatible with previous releases of the PyTorch Transformer API and will accelerate existing models if they meet fastpath execution requirements, as well as read models trained with previous versions of PyTorch. PyTorch 1.12 includes:

    +
      +
    • BetterTransformer integration for Torchtext’s pretrained RoBERTa and XLM-R models
    • +
    • Torchtext which builds on the PyTorch Transformer API
    • +
    • Fastpath execution for improved performance by reducing execution overheads with fused kernels which combines multiple operators into a single kernel
    • +
    • Option to achieve additional speedups by taking advantage of data sparsity during the processing of padding tokens in natural-language processing (by setting enable_nested_tensor=True when creating a TransformerEncoder)
    • +
    • Diagnostics to help users understand why fastpath execution did not occur
    • +
    + +
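
    A minimal sketch of a fastpath-eligible encoder (batch_first layout, eval plus inference mode, and nested-tensor handling of padding):

import torch

encoder_layer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=6, enable_nested_tensor=True)
encoder.eval()

src = torch.rand(32, 20, 512)                         # (batch, sequence, features)
padding_mask = torch.zeros(32, 20, dtype=torch.bool)  # True marks padded positions
with torch.inference_mode():
    out = encoder(src, src_key_padding_mask=padding_mask)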

    + +

    + +

    Distributed

    + +

    (Beta) Fully Sharded Data Parallel (FSDP) API

    + +

    FSDP API helps easily scale large model training by sharding a model’s parameters, gradients and optimizer states across data parallel workers while maintaining the simplicity of data parallelism. The prototype version was released in PyTorch 1.11 with a minimum set of features that helped scaling tests of models with up to 1T parameters.

    + +

    In this beta release, the FSDP API added the following features to support various production workloads. Highlights of the newly added features in this beta release include:

    +
      1. Universal sharding strategy API - Users can easily change between sharding strategies with a single line change, and thus compare and use DDP (only data sharding), FSDP (full model and data sharding), or Zero2 (only sharding of optimizer and gradients) to optimize memory and performance for their specific training needs
      2. Fine grained mixed precision policies - Users can specify a mix of half and full data types (bfloat16, fp16 or fp32) for model parameters, gradient communication, and buffers via mixed precision policies. Models are automatically saved in fp32 to allow for maximum portability
      3. Transformer auto wrapping policy - allows for optimal wrapping of Transformer based models by registering the model’s layer class, and thus accelerated training performance
      4. Faster model initialization using device_id init - initialization is performed in a streaming fashion to avoid OOM issues and optimize init performance vs CPU init
      5. Rank0 streaming for full model saving of larger models - Fully sharded models can be saved by all GPUs streaming their shards to the rank 0 GPU, and the model is built in full state on the rank 0 CPU for saving
    + +

    For more details and example code, please check out the documentation and the tutorial.
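
    As a hedged sketch of the beta API surface described above (the model here is an arbitrary stand-in, and an already-initialized process group, e.g. via torchrun, is assumed):

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision, ShardingStrategy

# Assumes torch.distributed is initialized and this rank owns a CUDA device.
model = torch.nn.Transformer(d_model=512, nhead=8).cuda()  # stand-in for a real model

fsdp_model = FSDP(
    model,
    sharding_strategy=ShardingStrategy.FULL_SHARD,           # or SHARD_GRAD_OP / NO_SHARD
    mixed_precision=MixedPrecision(param_dtype=torch.bfloat16,
                                   reduce_dtype=torch.bfloat16,
                                   buffer_dtype=torch.bfloat16),
    device_id=torch.cuda.current_device(),                   # streaming init on this device
)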

    + +

    Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    diff --git a/blog/pytorch-1.2-and-domain-api-release/index.html b/blog/pytorch-1.2-and-domain-api-release/index.html
    new file mode 100644
    index 000000000000..aeaa86b8116a
    --- /dev/null
    +++ b/blog/pytorch-1.2-and-domain-api-release/index.html
    @@ -0,0 +1,832 @@
    + New Releases: PyTorch 1.2, torchtext 0.4, torchaudio 0.3, and torchvision 0.4 | PyTorch
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Since the release of PyTorch 1.0, we’ve seen the community expand to add new tools, contribute to a growing set of models available in the PyTorch Hub, and continually increase usage in both research and production.

    + +

    From a core perspective, PyTorch has continued to add features to support both research and production usage, including the ability to bridge these two worlds via TorchScript. Today, we are excited to announce that we have four new releases including PyTorch 1.2, torchvision 0.4, torchaudio 0.3, and torchtext 0.4. You can get started now with any of these releases at pytorch.org.

    + +

    PyTorch 1.2

    + +

    With PyTorch 1.2, the open source ML framework takes a major step forward for production usage with the addition of an improved and more polished TorchScript environment. These improvements make it even easier to ship production models, expand support for exporting ONNX formatted models, and enhance module level support for Transformers. In addition to these new features, TensorBoard is no longer experimental - you can simply type from torch.utils.tensorboard import SummaryWriter to get started.

    + +

    TorchScript Improvements

    + +

    Since its release in PyTorch 1.0, TorchScript has provided a path to production for eager PyTorch models. The TorchScript compiler converts PyTorch models to a statically typed graph representation, opening up opportunities for +optimization and execution in constrained environments where Python is not available. You can incrementally convert your model to TorchScript, mixing compiled code seamlessly with Python.

    + +

    PyTorch 1.2 significantly expands TorchScript’s support for the subset of Python used in PyTorch models and delivers a new, easier-to-use API for compiling your models to TorchScript. See the migration guide for details. Below is an example usage of the new API:

    + +
    import torch
    +
    +class MyModule(torch.nn.Module):
    +    def __init__(self, N, M):
    +        super(MyModule, self).__init__()
    +        self.weight = torch.nn.Parameter(torch.rand(N, M))
    +
    +    def forward(self, input):
    +        if input.sum() > 0:
    +          output = self.weight.mv(input)
    +        else:
    +          output = self.weight + input
    +        return output
    +
    +# Compile the model code to a static representation
    +my_script_module = torch.jit.script(MyModule(3, 4))
    +
    +# Save the compiled code and model data so it can be loaded elsewhere
    +my_script_module.save("my_script_module.pt")
    +
    + +

    To learn more, see our Introduction to TorchScript and Loading a +PyTorch Model in C++ tutorials.

    + +

    Expanded ONNX Export

    + +

    The ONNX community continues to grow with an open governance structure and additional steering committee members, special interest groups (SIGs), and working groups (WGs). In collaboration with Microsoft, we’ve added full support to export ONNX Opset versions 7 (v1.2), 8 (v1.3), 9 (v1.4) and 10 (v1.5). We have also enhanced the constant folding pass to support Opset 10, the latest available version of ONNX. ScriptModule has also been improved, including support for multiple outputs, tensor factories, and tuples as inputs and outputs. Additionally, users are now able to register their own symbolic to export custom ops, and specify the dynamic dimensions of inputs during export. Here is a summary of all of the major improvements:

    + +
      +
    • Support for multiple Opsets including the ability to export dropout, slice, flip, and interpolate in Opset 10.
    • +
    • Improvements to ScriptModule including support for multiple outputs, tensor factories, and tuples as inputs and outputs.
    • +
    • More than a dozen additional PyTorch operators supported including the ability to export a custom operator.
    • +
    • Many bug fixes and test infra improvements.
    • +
    + +

    You can try out the latest tutorial here, contributed by @lara-hdr at Microsoft. A big thank you to the entire Microsoft team for all of their hard work to make this release happen!

    + +

    nn.Transformer

    + +

    In PyTorch 1.2, we now include a standard nn.Transformer module, based on the paper “Attention is All You Need”. The nn.Transformer module relies entirely on an attention mechanism to draw global dependencies between input and output. The individual components of the nn.Transformer module are designed so they can be adopted independently. For example, the nn.TransformerEncoder can be used by itself, without the larger nn.Transformer. The new APIs include:

    + +
      +
    • nn.Transformer
    • +
    • nn.TransformerEncoder and nn.TransformerEncoderLayer
    • +
    • nn.TransformerDecoder and nn.TransformerDecoderLayer
    • +
    + +
    + +
    + +

    See the Transformer Layers documentation for more information. See here for the full PyTorch 1.2 release notes.
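
    A minimal usage example of the new module:

import torch
import torch.nn as nn

transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
src = torch.rand(10, 32, 512)   # (sequence length, batch size, embedding dim)
tgt = torch.rand(20, 32, 512)
out = transformer_model(src, tgt)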

    + +

    Domain API Library Updates

    + +

    PyTorch domain libraries like torchvision, torchtext, and torchaudio provide convenient access to common datasets, models, and transforms that can be used to quickly create a state-of-the-art baseline. Moreover, they also provide common abstractions to reduce boilerplate code that users might have to otherwise repeatedly write. Since research domains have distinct requirements, an ecosystem of specialized libraries called domain APIs (DAPI) has emerged around PyTorch to simplify the development of new and existing algorithms in a number of fields. We’re excited to release three updated DAPI libraries for text, audio, and vision that complement the PyTorch 1.2 core release.

    + +

    Torchaudio 0.3 with Kaldi Compatibility, New Transforms

    + +
    + +
    + +

    Torchaudio specializes in machine understanding of audio waveforms. It is an ML library that provides relevant signal processing functionality (but is not a general signal processing library). It leverages PyTorch’s GPU support to provide many tools and transformations for waveforms to make data loading and standardization easier and more readable. For example, it offers data loaders for waveforms using sox, and transformations such as spectrograms, resampling, and mu-law encoding and decoding.

    + +

    We are happy to announce the availability of torchaudio 0.3.0, with a focus on standardization and complex numbers, a transformation (resample) and two new functionals (phase_vocoder, ISTFT), Kaldi compatibility, and a new tutorial. Torchaudio was redesigned to be an extension of PyTorch and a part of the domain APIs (DAPI) ecosystem.

    + +

    Standardization

    + +

    Significant effort in solving machine learning problems goes into data preparation. In this new release, we’ve updated torchaudio’s interfaces for its transformations to standardize around the following vocabulary and conventions.

    + +

    Tensors are assumed to have channel as the first dimension and time as the last dimension (when applicable). This makes it consistent with PyTorch’s dimensions. For size names, the prefix n_ is used (e.g. “a tensor of size (n_freq, n_mel)”) whereas dimension names do not have this prefix (e.g. “a tensor of dimension (channel, time)”). The input of all transforms and functions now assumes channel first. This is done to be consistent with PyTorch, which has channel followed by the number of samples. The channel parameter of all transforms and functions is now deprecated.

    + +

    The output of STFT is (channel, frequency, time, 2), meaning for each channel, the columns are the Fourier transform of a certain window, so as we travel horizontally we can see each column (the Fourier transformed waveform) change over time. This matches the output of librosa so we no longer need to transpose in our test comparisons with Spectrogram, MelScale, MelSpectrogram, and MFCC. Moreover, because of these new conventions, we deprecated LC2CL and BLC2CBL which were used to transfer from one shape of signal to another.

    + +

    As part of this release, we’re also introducing support for complex numbers via tensors of dimension (…, 2), and providing magphase to convert such a tensor into its magnitude and phase, and similarly complex_norm and angle.

    + +

    The details of the standardization are provided in the README.

    + +

    Functionals, Transformations, and Kaldi Compatibility

    + +

    Prior to the standardization, we separated state and computation into torchaudio.transforms and torchaudio.functional.

    + +

    As part of the transforms, we’re adding a new transformation in 0.3.0: Resample. Resample can upsample or downsample a waveform to a different frequency.

    + +

    As part of the functionals, we’re introducing: phase_vocoder, a phase vocoder to change the speed of a waveform without changing its pitch, and ISTFT, the inverse STFT implemented to be compatible with STFT provided by PyTorch. This separation allows us to make functionals weak scriptable and to utilize JIT in 0.3.0. We thus have JIT and CUDA support for the following transformations: Spectrogram, AmplitudeToDB (previously named SpectrogramToDB), MelScale, +MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding (previously named MuLawExpanding).

    + +

    We now also provide a compatibility interface with Kaldi to ease onboarding and reduce a user’s code dependency on Kaldi. We now have an interface for spectrogram, fbank, and resample_waveform.
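
    A hedged sketch of the Kaldi-compatible interface (the audio file path is a placeholder):

import torchaudio
import torchaudio.compliance.kaldi as kaldi

waveform, sample_rate = torchaudio.load("speech.wav")   # placeholder audio file
spec = kaldi.spectrogram(waveform, sample_frequency=sample_rate)
fbank_feats = kaldi.fbank(waveform, num_mel_bins=40, sample_frequency=sample_rate)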

    + +

    New Tutorial

    + +

    To showcase the new conventions and transformations, we have a new tutorial demonstrating how to preprocess waveforms using torchaudio. This tutorial walks through an example of loading a waveform and applying some of the available transformations to it.

    + +

    We are excited to see an active community around torchaudio and eager to further grow and support it. We encourage you to go ahead and experiment for yourself with this tutorial and the two datasets that are available: VCTK and YESNO! They have an interface to download the datasets and preprocess them in a convenient format. You can find the details in the release notes here.

    + +

    Torchtext 0.4 with supervised learning datasets

    + +

    A key focus area of torchtext is to provide the fundamental elements to help accelerate NLP research. This includes easy access to commonly used datasets and basic preprocessing pipelines for working on raw text based data. The torchtext 0.4.0 release includes several popular supervised learning baselines with “one-command” data loading. A tutorial is included to show how to use the new datasets for text classification analysis. We also added and improved on a few functions such as get_tokenizer and build_vocab_from_iterator to make it easier to implement future datasets. Additional examples can be found here.

    + +

    Text classification is an important task in Natural Language Processing with many applications, such as sentiment analysis. The new release includes several popular text classification datasets for supervised learning including:

    + +
      +
    • AG_NEWS
    • +
    • SogouNews
    • +
    • DBpedia
    • +
    • YelpReviewPolarity
    • +
    • YelpReviewFull
    • +
    • YahooAnswers
    • +
    • AmazonReviewPolarity
    • +
    • AmazonReviewFull
    • +
    + +

    Each dataset comes with two parts (train vs. test), and can be easily loaded with a single command. The datasets also support an ngrams feature to capture the partial information about the local word order. Take a look at the tutorial here to learn more about how to use the new datasets for supervised problems such as text classification analysis.

    + +
    from torchtext.datasets.text_classification import DATASETS
    +train_dataset, test_dataset = DATASETS['AG_NEWS'](ngrams=2)
    +
    + +

    In addition to the domain library, PyTorch provides many tools to make data loading easy. Users now can load and preprocess the text classification datasets with some well supported tools, like torch.utils.data.DataLoader and torch.utils.data.IterableDataset. Here are a few lines to wrap the data with DataLoader. More examples can be found here.

    + +
    from torch.utils.data import DataLoader
    +data = DataLoader(train_dataset, collate_fn=generate_batch)
    +
    + +

    Check out the release notes here to learn more and try out the tutorial here.

    + +

    Torchvision 0.4 with Support for Video

    + +

    Video is now a first-class citizen in torchvision, with support for data loading, datasets, pre-trained models, and transforms. The 0.4 release of torchvision includes:

    + +
      +
    • Efficient IO primitives for reading/writing video files (including audio), with support for arbitrary encodings and formats.
    • +
    • Standard video datasets, compatible with torch.utils.data.Dataset and torch.utils.data.DataLoader.
    • +
    • Pre-trained models built on the Kinetics-400 dataset for action classification on videos (including the training scripts).
    • +
    • Reference training scripts for training your own video models.
    • +
    + +

    We wanted working with video data in PyTorch to be as straightforward as possible, without compromising too much on performance. +As such, we avoid the steps that would require re-encoding the videos beforehand, as it would involve:

    + +
      +
    • A preprocessing step which duplicates the dataset in order to re-encode it.
    • +
    • An overhead in time and space because this re-encoding is time-consuming.
    • +
    • Generally, the need for an external script to perform the re-encoding.
    • +
    + +

    Additionally, we provide APIs such as the utility class, VideoClips, that simplifies the task of enumerating all possible clips of fixed size in a list of video files by creating an index of all clips in a set of videos. It also allows you to specify a fixed frame-rate for the videos. An example of the API is provided below:

    + +
    from torchvision.datasets.video_utils import VideoClips
    +
    +class MyVideoDataset(object):
    +    def __init__(self, video_paths):
    +        self.video_clips = VideoClips(video_paths,
    +                                      clip_length_in_frames=16,
    +                                      frames_between_clips=1,
    +                                      frame_rate=15)
    +
    +    def __getitem__(self, idx):
    +        video, audio, info, video_idx = self.video_clips.get_clip(idx)
    +        return video, audio
    +
    +    def __len__(self):
    +        return self.video_clips.num_clips()
    +
    + +

    Most of the user-facing API is in Python, similar to PyTorch, which makes it easily extensible. Plus, the underlying implementation is fast — torchvision decodes as little as possible from the video on-the-fly in order to return a clip from the video.

    + +

    Check out the torchvision 0.4 release notes here for more details.

    + +

    We look forward to continuing our collaboration with the community and hearing your feedback as we further improve and expand the PyTorch deep learning platform.

    + +

    We’d like to thank the entire PyTorch team and the community for all of the contributions to this work!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    diff --git a/blog/pytorch-1.6-now-includes-stochastic-weight-averaging/index.html b/blog/pytorch-1.6-now-includes-stochastic-weight-averaging/index.html
    new file mode 100644
    index 000000000000..c1c65d7b7d0f
    --- /dev/null
    +++ b/blog/pytorch-1.6-now-includes-stochastic-weight-averaging/index.html
    @@ -0,0 +1,946 @@
    + PyTorch 1.6 now includes Stochastic Weight Averaging | PyTorch
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Pavel Izmailov, Andrew Gordon Wilson and Vincent Quenneville-Belair + +

    +

    Do you use stochastic gradient descent (SGD) or Adam? Regardless of the procedure you use to train your neural network, you can likely achieve significantly better generalization at virtually no additional cost with a simple new technique now natively supported in PyTorch 1.6, Stochastic Weight Averaging (SWA) [1]. Even if you have already trained your model, it’s easy to realize the benefits of SWA by running SWA for a small number of epochs starting with a pre-trained model. Again and again, researchers are discovering that SWA improves the performance of well-tuned models in a wide array of practical applications with little cost or effort!

    + +

    SWA has a wide range of applications and features:

    +
      +
    • SWA significantly improves performance compared to standard training techniques in computer vision (e.g., VGG, ResNets, Wide ResNets and DenseNets on ImageNet and CIFAR benchmarks [1, 2]).
    • +
    • SWA provides state-of-the-art performance on key benchmarks in semi-supervised learning and domain adaptation [2].
    • +
    • SWA was shown to improve performance in language modeling (e.g., AWD-LSTM on WikiText-2 [4]) and policy-gradient methods in deep reinforcement learning [3].
    • +
    • SWAG, an extension of SWA, can approximate Bayesian model averaging in Bayesian deep learning and achieves state-of-the-art uncertainty calibration results in various settings. Moreover, its recent generalization MultiSWAG provides significant additional performance gains and mitigates double-descent [4, 10]. Another approach, Subspace Inference, approximates the Bayesian posterior in a small subspace of the parameter space around the SWA solution [5].
    • +
    • SWA for low precision training, SWALP, can match the performance of full-precision SGD training, even with all numbers quantized down to 8 bits, including gradient accumulators [6].
    • +
    • SWA in parallel, SWAP, was shown to greatly speed up the training of neural networks by using large batch sizes and, in particular, set a record by training a neural network to 94% accuracy on CIFAR-10 in 27 seconds [11].
    • +
    + +
    + +
    + +

    Figure 1. Illustrations of SWA and SGD with a Preactivation ResNet-164 on CIFAR-100 [1]. Left: test error surface for three FGE samples and the corresponding SWA solution (averaging in weight space). Middle and Right: test error and train loss surfaces showing the weights proposed by SGD (at convergence) and SWA, starting from the same initialization of SGD after 125 training epochs. Please see [1] for details on how these figures were constructed.

    + +

    In short, SWA performs an equal average of the weights traversed by SGD (or any stochastic optimizer) with a modified learning rate schedule (see the left panel of Figure 1.). SWA solutions end up in the center of a wide flat region of loss, while SGD tends to converge to the boundary of the low-loss region, making it susceptible to the shift between train and test error surfaces (see the middle and right panels of Figure 1). We emphasize that SWA can be used with any optimizer, such as Adam, and is not specific to SGD.

    + +

    Previously, SWA was in PyTorch contrib. In PyTorch 1.6, we provide a new convenient implementation of SWA in torch.optim.swa_utils.

    + +

    Is this just Averaged SGD?

    + +

    At a high level, averaging SGD iterates dates back several decades in convex optimization [7, 8], where it is sometimes referred to as Polyak-Ruppert averaging, or averaged SGD. But the details matter. Averaged SGD is often used in conjunction with a decaying learning rate, and an exponential moving average (EMA), typically for convex optimization. In convex optimization, the focus has been on improved rates of convergence. In deep learning, this form of averaged SGD smooths the trajectory of SGD iterates but does not perform very differently.

    + +

    By contrast, SWA uses an equal average of SGD iterates with a modified cyclical or high constant learning rate and exploits the flatness of training objectives [8] specific to deep learning for improved generalization.

    + +

    How does Stochastic Weight Averaging Work?

    + +

    There are two important ingredients that make SWA work. First, SWA uses a modified learning rate schedule so that SGD (or other optimizers such as Adam) continues to bounce around the optimum and explore diverse models instead of simply converging to a single solution. For example, we can use the standard decaying learning rate strategy for the first 75% of training time and then set the learning rate to a reasonably high constant value for the remaining 25% of the time (see Figure 2 below). The second ingredient is to take an average of the weights (typically an equal average) of the networks traversed by SGD. For example, we can maintain a running average of the weights obtained at the end of every epoch within the last 25% of training time (see Figure 2). After training is complete, we then set the weights of the network to the computed SWA averages.

    + +
    + +
    + +

    Figure 2. Illustration of the learning rate schedule adopted by SWA. Standard decaying schedule is used for the first 75% of the training and then a high constant value is used for the remaining 25%. The SWA averages are formed during the last 25% of training.

    + +

    One important detail is the batch normalization. Batch normalization layers compute running statistics of activations during training. Note that the SWA averages of the weights are never used to make predictions during training. So the batch normalization layers do not have the activation statistics computed at the end of training. We can compute these statistics by doing a single forward pass on the train data with the SWA model.

    + +

    While we focus on SGD for simplicity in the description above, SWA can be combined with any optimizer. You can also use cyclical learning rates instead of a high constant value (see e.g., [2]).

    + +

    How to use SWA in PyTorch?

    + +

    In torch.optim.swa_utils we implement all the SWA ingredients to make it convenient to use SWA with any model. In particular, we implement AveragedModel class for SWA models, SWALR learning rate scheduler, and update_bn utility function to update SWA batch normalization statistics at the end of training.

    + +

    In the example below, swa_model is the SWA model that accumulates the averages of the weights. We train the model for a total of 300 epochs, and we switch to the SWA learning rate schedule and start to collect SWA averages of the parameters at epoch 160.

    + +
    from torch.optim.swa_utils import AveragedModel, SWALR
    +from torch.optim.lr_scheduler import CosineAnnealingLR
    +
    +loader, optimizer, model, loss_fn = ...
    +swa_model = AveragedModel(model)
    +scheduler = CosineAnnealingLR(optimizer, T_max=300)
    +swa_start = 160
    +swa_scheduler = SWALR(optimizer, swa_lr=0.05)
    +
    +for epoch in range(100):
    +      for input, target in loader:
    +          optimizer.zero_grad()
    +          loss_fn(model(input), target).backward()
    +          optimizer.step()
    +      if epoch > swa_start:
    +          swa_model.update_parameters(model)
    +          swa_scheduler.step()
    +      else:
    +          scheduler.step()
    +
    +# Update bn statistics for the swa_model at the end
    +torch.optim.swa_utils.update_bn(loader, swa_model)
    +# Use swa_model to make predictions on test data 
    +preds = swa_model(test_input)
    +
    + +

    Next, we explain each component of torch.optim.swa_utils in detail.

    + +

    The AveragedModel class serves to compute the weights of the SWA model. You can create an averaged model by running swa_model = AveragedModel(model). You can then update the parameters of the averaged model by calling swa_model.update_parameters(model). By default, AveragedModel computes a running equal average of the parameters that you provide, but you can also use custom averaging functions with the avg_fn parameter. In the following example, ema_model computes an exponential moving average.

    + +
    ema_avg = lambda averaged_model_parameter, model_parameter, num_averaged:\
    +    0.1 * averaged_model_parameter + 0.9 * model_parameter
    +ema_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=ema_avg)
    +
    + +

    In practice, we find an equal average with the modified learning rate schedule in Figure 2 provides the best performance.

    + +

    SWALR is a learning rate scheduler that anneals the learning rate to a fixed value, and then keeps it constant. For example, the following code creates a scheduler that linearly anneals the learning rate from its initial value to 0.05 in 5 epochs within each parameter group.

    + +
    swa_scheduler = torch.optim.swa_utils.SWALR(optimizer,
    +    anneal_strategy="linear", anneal_epochs=5, swa_lr=0.05)
    +
    +
    +

    We also implement cosine annealing to a fixed value (anneal_strategy="cos"). In practice, we typically switch to SWALR at epoch swa_start (e.g. after 75% of the training epochs), and simultaneously start to compute the running averages of the weights:

    + +
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
    +swa_start = 75
    +for epoch in range(100):
    +    # <train epoch>
    +    if epoch > swa_start:
    +        swa_model.update_parameters(model)
    +        swa_scheduler.step()
    +    else:
    +        scheduler.step()
    +
    + +

    Finally, update_bn is a utility function that computes the batchnorm statistics for the SWA model on a given dataloader loader:

    +
    torch.optim.swa_utils.update_bn(loader, swa_model) 
    +
    +

    update_bn applies the swa_model to every element in the dataloader and computes the activation statistics for each batch normalization layer in the model.

    + +

    Once you have computed the SWA averages and updated the batch normalization layers, you can apply swa_model to make predictions on test data.

    + +

    Why does it work?

    + +

    The loss surfaces of deep neural networks contain large flat regions [9]. In Figure 3 below, we show a visualization of the loss surface in a subspace of the parameter space containing a path connecting two independently trained SGD solutions, such that the loss is similarly low at every point along the path. SGD converges near the boundary of these regions because there isn’t much gradient signal to move inside, as the points in the region all have similarly low values of loss. By increasing the learning rate, SWA bounces around this flat region, and then by averaging the iterates, it moves towards the center of the flat region.

    + +
    + +
    + +

    Figure 3. Visualization of mode connectivity for ResNet-20 with no skip connections on the CIFAR-10 dataset. The visualization is created in collaboration with Javier Ideami (https://losslandscape.com/). For more details, see this blogpost.

    + +

    We expect solutions that are centered in the flat region of the loss to generalize better than those near the boundary. Indeed, train and test error surfaces are not perfectly aligned in the weight space. Solutions that are centered in the flat region are not as susceptible to the shifts between train and test error surfaces as those near the boundary. In Figure 4 below, we show the train loss and test error surfaces along the direction connecting the SWA and SGD solutions. As you can see, while the SWA solution has a higher train loss compared to the SGD solution, it is centered in a region of low loss and has a substantially better test error.

    + +
    + +
    + +

    Figure 4. Train loss and test error along the line connecting the SWA solution (circle) and SGD solution (square). The SWA solution is centered in a wide region of low train loss, while the SGD solution lies near the boundary. Because of the shift between train loss and test error surfaces, the SWA solution leads to much better generalization.

    + +

    What are the results achieved with SWA?

    + +

    We release a GitHub repo with examples using the PyTorch implementation of SWA for training DNNs. These examples can be used to achieve the following results on CIFAR-100:

                        VGG-16        ResNet-164    WideResNet-28x10
    SGD                 72.8 ± 0.3    78.4 ± 0.3    81.0 ± 0.3
    SWA                 74.4 ± 0.3    79.8 ± 0.4    82.5 ± 0.2
    + +

    Semi-Supervised Learning

    + +

    In a follow-up paper SWA was applied to semi-supervised learning, where it improved the best reported results in multiple settings [2]. For example, with SWA you can get 95% accuracy on CIFAR-10 if you only have the training labels for 4k training data points (the previous best reported result on this problem was 93.7%). This paper also explores averaging multiple times within epochs, which can accelerate convergence and find still flatter solutions in a given time.

    + +
    + +
    +

    Figure 5. Performance of fast-SWA on semi-supervised learning with CIFAR-10. fast-SWA achieves record results in every setting considered.

    + +

    Reinforcement Learning

    + +

    In another follow-up paper SWA was shown to improve the performance of policy gradient methods A2C and DDPG on several Atari games and MuJoCo environments [3]. This application is also an instance where SWA is used with Adam. Recall that SWA is not specific to SGD and can benefit essentially any optimizer.

    Environment Name    A2C               A2C + SWA
    Breakout            522 ± 34          703 ± 60
    Qbert               18777 ± 778       21272 ± 655
    SpaceInvaders       7727 ± 1121       21676 ± 8897
    Seaquest            1779 ± 4          1795 ± 4
    BeamRider           9999 ± 402        11321 ± 1065
    CrazyClimber        147030 ± 10239    139752 ± 11618
    + +

    Low Precision Training

    + +

    We can filter through quantization noise by combining weights that have been rounded down with weights that have been rounded up. Moreover, by averaging weights to find a flat region of the loss surface, large perturbations of the weights will not affect the quality of the solution (Figures 9 and 10). Recent work shows that by adapting SWA to the low precision setting, in a method called SWALP, one can match the performance of full-precision SGD even with all training in 8 bits [6]. This is a practically important result, given that (1) SGD training in 8 bits performs notably worse than full precision SGD, and (2) low precision training is significantly harder than making predictions in low precision after training (the usual setting). For example, a ResNet-164 trained on CIFAR-100 with float (16-bit) SGD achieves 22.2% error, while 8-bit SGD achieves 24.0% error. By contrast, SWALP with 8 bit training achieves 21.8% error.

    + +
    + +
    +

    Figure 9. Quantizing a solution leads to a perturbation of the weights which has a greater effect on the quality of the sharp solution (left) compared to the wide solution (right).

    + +
    + +
    +

    Figure 10. The difference between standard low precision training and SWALP.

    + +

    Another work, SQWA, presents an approach for quantization and fine-tuning of neural networks in low precision [12]. In particular, SQWA achieved state-of-the-art results for DNNs quantized to 2 bits on CIFAR-100 and ImageNet.

    + +

    Calibration and Uncertainty Estimates

    + +

    By finding a solution centered in a flat region of the loss, SWA can also improve calibration and uncertainty representation. Indeed, SWA can be viewed as an approximation to an ensemble, resembling a Bayesian model average, but with a single model [1].

    + +

    SWA can be viewed as taking the first moment of SGD iterates with a modified learning rate schedule. We can directly generalize SWA by also taking the second moment of iterates to form a Gaussian approximate posterior over the weights, further characterizing the loss geometry with SGD iterates. This approach, SWA-Gaussian (SWAG), is a simple, scalable and convenient approach to uncertainty estimation and calibration in Bayesian deep learning [4]. The SWAG distribution approximates the shape of the true posterior: Figure 6 below shows the SWAG distribution and the posterior log-density for ResNet-20 on CIFAR-10.
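
    + +

    To make the idea concrete, below is a minimal, illustrative sketch of the diagonal part of SWAG, maintaining a running second moment of the flattened weights alongside the SWA mean. This is not the official SWAG implementation (see the code linked below), and the class and method names here are our own, chosen for illustration.

    + +
    import torch
    +
    +class DiagonalSWAG:
    +    """Illustrative sketch: track first and second moments of SGD iterates."""
    +
    +    def __init__(self):
    +        self.n, self.mean, self.sq_mean = 0, None, None
    +
    +    def collect(self, model):
    +        # update running first and second moments of the flattened weights
    +        w = torch.cat([p.detach().reshape(-1) for p in model.parameters()])
    +        if self.mean is None:
    +            self.mean, self.sq_mean = w.clone(), w * w
    +        else:
    +            self.mean = (self.mean * self.n + w) / (self.n + 1)
    +            self.sq_mean = (self.sq_mean * self.n + w * w) / (self.n + 1)
    +        self.n += 1
    +
    +    def sample(self):
    +        # draw a weight vector from N(mean, diag(sq_mean - mean^2))
    +        var = torch.clamp(self.sq_mean - self.mean ** 2, min=1e-30)
    +        return self.mean + var.sqrt() * torch.randn_like(self.mean)
    +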

    + +
    + +
    +

    Figure 6. SWAG posterior approximation and the loss surface for a ResNet-20 without skip-connections trained on CIFAR-10, in the subspace formed by the two largest eigenvalues of the SWAG covariance matrix. The shape of the SWAG distribution is aligned with the posterior: the peaks of the two distributions coincide, and both distributions are wider in one direction than in the orthogonal direction. Visualization created in collaboration with Javier Ideami.

    + +

    Empirically, SWAG performs on par with or better than popular alternatives including MC dropout, KFAC Laplace, and temperature scaling on uncertainty quantification, out-of-distribution detection, calibration and transfer learning in computer vision tasks. Code for SWAG is available here.

    + +
    + +
    +

    Figure 7. MultiSWAG generalizes SWAG and deep ensembles to perform Bayesian model averaging over multiple basins of attraction, leading to significantly improved performance. By contrast, as shown here, deep ensembles select different modes, while standard variational inference (VI) marginalizes (model averages) within a single basin.

    + +

    MultiSWAG [10] uses multiple independent SWAG models to form a mixture of Gaussians as an approximate posterior distribution. Different basins of attraction contain highly complementary explanations of the data. Accordingly, marginalizing over these multiple basins provides a significant boost in accuracy and uncertainty representation. MultiSWAG can be viewed as a generalization of deep ensembles, but with performance improvements.

    + +

    Indeed, we see in Figure 8 that MultiSWAG entirely mitigates double descent – more flexible models have monotonically improving performance – and provides significantly improved generalization over SGD. For example, when the ResNet-18 has layers of width 20, Multi-SWAG achieves under 30% error whereas SGD achieves over 45%, more than a 15% gap!

    + +
    + +
    +

    Figure 8. SGD, SWAG, and Multi-SWAG on CIFAR-100 for a ResNet-18 with varying widths. We see Multi-SWAG in particular mitigates double descent and provides significant accuracy improvements over SGD.

    + +

    Reference [10] also considers Multi-SWA, which uses multiple independently trained SWA solutions in an ensemble, providing performance improvements over deep ensembles without any additional computational cost. Code for MultiSWA and MultiSWAG is available here.

    + +

    Another method, Subspace Inference, constructs a low-dimensional subspace around the SWA solution and marginalizes the weights in this subspace to approximate the Bayesian model average [5]. Subspace Inference uses the statistics from the SGD iterates to construct both the SWA solution and the subspace. The method achieves strong performance in terms of prediction accuracy and uncertainty calibration both in classification and regression problems. Code is available here.

    + +

    Try it Out!

    + +

    One of the greatest open questions in deep learning is why SGD manages to find good solutions, given that the training objectives are highly multimodal, and there are many settings of parameters that achieve no training loss but poor generalization. By understanding geometric features such as flatness, which relate to generalization, we can begin to resolve these questions and build optimizers that provide even better generalization, and many other useful features, such as uncertainty representation. We have presented SWA, a simple drop-in replacement for standard optimizers such as SGD and Adam, which can, in principle, benefit anyone training a deep neural network. SWA has been demonstrated to have strong performance in several areas, including computer vision, semi-supervised learning, reinforcement learning, uncertainty representation, calibration, Bayesian model averaging, and low precision training.

    + +

    We encourage you to try out SWA! SWA is now as easy as any standard training in PyTorch. And even if you have already trained your model, you can use SWA to significantly improve performance by running it for a small number of epochs from a pre-trained model.

    + +

    [1] Averaging Weights Leads to Wider Optima and Better Generalization; Pavel Izmailov, Dmitry Podoprikhin, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson; Uncertainty in Artificial Intelligence (UAI), 2018.

    + +

    [2] There Are Many Consistent Explanations of Unlabeled Data: Why You Should Average; Ben Athiwaratkun, Marc Finzi, Pavel Izmailov, Andrew Gordon Wilson; International Conference on Learning Representations (ICLR), 2019.

    + +

    [3] Improving Stability in Deep Reinforcement Learning with Weight Averaging; Evgenii Nikishin, Pavel Izmailov, Ben Athiwaratkun, Dmitrii Podoprikhin, Timur Garipov, Pavel Shvechikov, Dmitry Vetrov, Andrew Gordon Wilson; UAI 2018 Workshop: Uncertainty in Deep Learning, 2018.

    + +

    [4] A Simple Baseline for Bayesian Uncertainty in Deep Learning; Wesley Maddox, Timur Garipov, Pavel Izmailov, Andrew Gordon Wilson; Neural Information Processing Systems (NeurIPS), 2019.

    + +

    [5] Subspace Inference for Bayesian Deep Learning; Pavel Izmailov, Wesley Maddox, Polina Kirichenko, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson; Uncertainty in Artificial Intelligence (UAI), 2019.

    + +

    [6] SWALP: Stochastic Weight Averaging in Low Precision Training; Guandao Yang, Tianyi Zhang, Polina Kirichenko, Junwen Bai, Andrew Gordon Wilson, Christopher De Sa; International Conference on Machine Learning (ICML), 2019.

    + +

    [7] Efficient Estimations from a Slowly Convergent Robbins-Monro Process; David Ruppert; Technical report, Cornell University Operations Research and Industrial Engineering, 1988.

    + +

    [8] Acceleration of Stochastic Approximation by Averaging; Boris T. Polyak and Anatoli B. Juditsky; SIAM Journal on Control and Optimization, 30(4):838–855, 1992.

    + +

    [9] Loss Surfaces, Mode Connectivity, and Fast Ensembling of DNNs; Timur Garipov, Pavel Izmailov, Dmitrii Podoprikhin, Dmitry Vetrov, Andrew Gordon Wilson; Neural Information Processing Systems (NeurIPS), 2018.

    + +

    [10] Bayesian Deep Learning and a Probabilistic Perspective of Generalization; Andrew Gordon Wilson, Pavel Izmailov; arXiv preprint, 2020.

    + +

    [11] Stochastic Weight Averaging in Parallel: Large-Batch Training That Generalizes Well; Vipul Gupta, Santiago Akle Serrano, Dennis DeCoste; International Conference on Learning Representations (ICLR), 2019.

    + +

    [12] SQWA: Stochastic Quantized Weight Averaging for Improving the Generalization Capability of Low-Precision Deep Neural Networks; Sungho Shin, Yoonho Boo, Wonyong Sung; arXiv preprint, 2020.

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1.6-released/index.html b/blog/pytorch-1.6-released/index.html new file mode 100644 index 000000000000..8a42771a7d92 --- /dev/null +++ b/blog/pytorch-1.6-released/index.html @@ -0,0 +1,880 @@ + + + + + + + + + + + + + PyTorch 1.6 released w/ Native AMP Support, Microsoft joins as maintainers for Windows | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we’re announcing the availability of PyTorch 1.6, along with updated domain libraries. We are also excited to announce the team at Microsoft is now maintaining Windows builds and binaries and will also be supporting the community on GitHub as well as the PyTorch Windows discussion forums.

    + +

    The PyTorch 1.6 release includes a number of new APIs, tools for performance improvement and profiling, as well as major updates to both distributed data parallel (DDP) and remote procedure call (RPC) based distributed training. +A few of the highlights include:

    + +
      +
    1. Automatic mixed precision (AMP) training is now natively supported and a stable feature (See here for more details) - thanks to NVIDIA’s contributions;
    2. +
    3. Native TensorPipe support now added for tensor-aware, point-to-point communication primitives built specifically for machine learning;
    4. +
    5. Added support for complex tensors to the frontend API surface;
    6. +
    7. New profiling tools providing tensor-level memory consumption information;
    8. +
    9. Numerous improvements and new features for both distributed data parallel (DDP) training and the remote procedure call (RPC) packages.
    10. +
    + +

    Additionally, from this release onward, features will be classified as Stable, Beta and Prototype. Prototype features are not included as part of the binary distribution and are instead available through either building from source, using nightlies or via compiler flag. You can learn more about what this change means in the post here. You can also find the full release notes here.

    + +

    Performance & Profiling

    + +

    [Stable] Automatic Mixed Precision (AMP) Training

    + +

    AMP allows users to easily enable automatic mixed precision training enabling higher performance and memory savings of up to 50% on Tensor Core GPUs. Using the natively supported torch.cuda.amp API, AMP provides convenience methods for mixed precision, where some operations use the torch.float32 (float) datatype and other operations use torch.float16 (half). Some ops, like linear layers and convolutions, are much faster in float16. Other ops, like reductions, often require the dynamic range of float32. Mixed precision tries to match each op to its appropriate datatype.
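
    + +

    As a quick illustration, here is a minimal sketch of the typical torch.cuda.amp training pattern; model, optimizer, data and loss_fn are assumed to be defined elsewhere.

    + +
    import torch
    +
    +scaler = torch.cuda.amp.GradScaler()
    +
    +for input, target in data:
    +    optimizer.zero_grad()
    +    # run the forward pass under autocast so eligible ops use float16
    +    with torch.cuda.amp.autocast():
    +        output = model(input)
    +        loss = loss_fn(output, target)
    +    # scale the loss to reduce the risk of gradient underflow in float16
    +    scaler.scale(loss).backward()
    +    scaler.step(optimizer)
    +    scaler.update()
    +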

    + +
      +
    • Design doc (Link)
    • +
    • Documentation (Link)
    • +
    • Usage examples (Link)
    • +
    + +

    [Beta] Fork/Join Parallelism

    + +

    This release adds support for a language-level construct as well as runtime support for coarse-grained parallelism in TorchScript code. This support is useful for situations such as running models in an ensemble in parallel, or running bidirectional components of recurrent nets in parallel, and unlocks the computational power of parallel architectures (e.g. many-core CPUs) for task-level parallelism.

    + +

    Parallel execution of TorchScript programs is enabled through two primitives: torch.jit.fork and torch.jit.wait. In the below example, we parallelize execution of foo:

    + +
    import torch
    +from typing import List
    +
    +def foo(x):
    +    return torch.neg(x)
    +
    +@torch.jit.script
    +def example(x):
    +    futures = [torch.jit.fork(foo, x) for _ in range(100)]
    +    results = [torch.jit.wait(future) for future in futures]
    +    return torch.sum(torch.stack(results))
    +
    +print(example(torch.ones([])))
    +
    + +
      +
    • Documentation (Link)
    • +
    + +

    [Beta] Memory Profiler

    + +

    The torch.autograd.profiler API now includes a memory profiler that lets you inspect the tensor memory cost of different operators inside your CPU and GPU models.

    + +

    Here is an example usage of the API:

    + +
    import torch
    +import torchvision.models as models
    +import torch.autograd.profiler as profiler
    +
    +model = models.resnet18()
    +inputs = torch.randn(5, 3, 224, 224)
    +with profiler.profile(profile_memory=True, record_shapes=True) as prof:
    +    model(inputs)
    +
    +# NOTE: some columns were removed for brevity
    +print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
    +# ---------------------------  ---------------  ---------------  ---------------
    +# Name                         CPU Mem          Self CPU Mem     Number of Calls
    +# ---------------------------  ---------------  ---------------  ---------------
    +# empty                        94.79 Mb         94.79 Mb         123
    +# resize_                      11.48 Mb         11.48 Mb         2
    +# addmm                        19.53 Kb         19.53 Kb         1
    +# empty_strided                4 b              4 b              1
    +# conv2d                       47.37 Mb         0 b              20
    +# ---------------------------  ---------------  ---------------  ---------------
    +
    + + + +

    Distributed Training & RPC

    + +

    [Beta] TensorPipe backend for RPC

    + +

    PyTorch 1.6 introduces a new backend for the RPC module which leverages the TensorPipe library, a tensor-aware point-to-point communication primitive targeted at machine learning, intended to complement the current primitives for distributed training in PyTorch (Gloo, MPI, …) which are collective and blocking. The pairwise and asynchronous nature of TensorPipe lends itself to new networking paradigms that go beyond data parallel: client-server approaches (e.g., parameter server for embeddings, actor-learner separation in Impala-style RL, …) and model and pipeline parallel training (think GPipe), gossip SGD, etc.

    + +
    # One-line change needed to opt in
    +torch.distributed.rpc.init_rpc(
    +    ...
    +    backend=torch.distributed.rpc.BackendType.TENSORPIPE,
    +)
    +
    +# No changes to the rest of the RPC API
    +torch.distributed.rpc.rpc_sync(...)
    +
    + +
      +
    • Design doc (Link)
    • +
    • Documentation (Link)
    • +
    + +

    [Beta] DDP+RPC

    + +

    PyTorch Distributed supports two powerful paradigms: DDP for full sync data parallel training of models and the RPC framework which allows for distributed model parallelism. Previously, these two features worked independently and users couldn’t mix and match these to try out hybrid parallelism paradigms.

    + +

    Starting in PyTorch 1.6, we’ve enabled DDP and RPC to work together seamlessly so that users can combine these two techniques to achieve both data parallelism and model parallelism. An example is where users would like to place large embedding tables on parameter servers and use the RPC framework for embedding lookups, but store smaller dense parameters on trainers and use DDP to synchronize the dense parameters. Below is a simple code snippet.

    + +
    # On each trainer
    +
    +remote_emb = create_emb(on="ps", ...)
    +ddp_model = DDP(dense_model)
    +
    +for data in batch:
    +   with torch.distributed.autograd.context():
    +      res = remote_emb(data)
    +      loss = ddp_model(res)
    +      torch.distributed.autograd.backward([loss])
    +
    + +
      +
    • DDP+RPC Tutorial (Link)
    • +
    • Documentation (Link)
    • +
    • Usage Examples (Link)
    • +
    + +

    [Beta] RPC - Asynchronous User Functions

    + +

    RPC Asynchronous User Functions supports the ability to yield and resume on the server side when executing a user-defined function. Prior to this feature, when a callee processes a request, one RPC thread waits until the user function returns. If the user function contains IO (e.g., nested RPC) or signaling (e.g., waiting for another request to unblock), the corresponding RPC thread would sit idle waiting for these events. As a result, some applications have to use a very large number of threads and send additional RPC requests, which can potentially lead to performance degradation. To make a user function yield on such events, applications need to: 1) Decorate the function with the @rpc.functions.async_execution decorator; and 2) Let the function return a torch.futures.Future and install the resume logic as callbacks on the Future object. See below for an example:

    + +
    @rpc.functions.async_execution
    +def async_add_chained(to, x, y, z):
    +    return rpc.rpc_async(to, torch.add, args=(x, y)).then(
    +        lambda fut: fut.wait() + z
    +    )
    +
    +ret = rpc.rpc_sync(
    +    "worker1", 
    +    async_add_chained, 
    +    args=("worker2", torch.ones(2), 1, 1)
    +)
    +        
    +print(ret)  # prints tensor([3., 3.])
    +
    + +
      +
    • Tutorial for performant batch RPC using Asynchronous User Functions
    • +
    • Documentation (Link)
    • +
    • Usage examples (Link)
    • +
    + +

    Frontend API Updates

    + +

    [Beta] Complex Numbers

    + +

    The PyTorch 1.6 release brings beta level support for complex tensors including torch.complex64 and torch.complex128 dtypes. A complex number is a number that can be expressed in the form a + bj, where a and b are real numbers, and j is a solution of the equation x^2 = −1. Complex numbers frequently occur in mathematics and engineering, especially in signal processing, and complex neural networks are an active area of research. The beta release of complex tensors will support common PyTorch and complex tensor functionality, plus functions needed by Torchaudio, ESPnet and others. While this is an early version of this feature, and we expect it to improve over time, the overall goal is to provide a NumPy compatible user experience that leverages PyTorch’s ability to run on accelerators and work with autograd to better support the scientific community.
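
    + +

    As a small, illustrative example of the new dtypes (the exact set of supported operations is expected to grow over time):

    + +
    import torch
    +
    +z = torch.tensor([1 + 2j, 3 - 1j], dtype=torch.complex64)
    +print(z.dtype)                      # torch.complex64
    +print(z.real, z.imag)               # real and imaginary parts as float tensors
    +print(z.conj())                     # complex conjugate
    +print(torch.view_as_real(z).shape)  # torch.Size([2, 2])
    +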

    + +

    Mobile Updates

    + +

    PyTorch 1.6 brings increased performance and general stability for mobile on-device inference. We squashed a few bugs, continued maintenance and added a few new features, while improving fp32 and int8 performance for a large variety of ML model inference on the CPU backend.

    + +

    [Beta] Mobile Features and Performance

    + +
      +
    • Stateless and stateful XNNPACK Conv and Linear operators
    • +
    • Stateless MaxPool2d + JIT optimization passes
    • +
    • JIT pass optimizations: Conv + BatchNorm fusion, graph rewrite to replace conv2d/linear with xnnpack ops, relu/hardtanh fusion, dropout removal
    • +
    • QNNPACK integration removes requantization scale constraint
    • +
    • Per-channel quantization for conv, linear and dynamic linear
    • +
    • Disable tracing for mobile client to save ~600 KB on full-jit builds
    • +
    + +

    Updated Domain Libraries

    + +

    torchvision 0.7

    + +

    torchvision 0.7 introduces two new pretrained semantic segmentation models, FCN ResNet50 and DeepLabV3 ResNet50, both trained on COCO and using smaller memory footprints than the ResNet101 backbone. We also introduced support for AMP (Automatic Mixed Precision) autocasting for torchvision models and operators, which automatically selects the floating point precision for different GPU operations to improve performance while maintaining accuracy.
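
    + +

    For instance, loading one of the new pretrained segmentation models is a one-liner; a minimal sketch (the model returns a dict whose "out" entry holds the per-pixel class scores):

    + +
    import torch
    +import torchvision
    +
    +# FCN with a ResNet50 backbone, pretrained on COCO
    +model = torchvision.models.segmentation.fcn_resnet50(pretrained=True).eval()
    +
    +x = torch.rand(1, 3, 520, 520)
    +with torch.no_grad():
    +    out = model(x)["out"]  # per-pixel class scores, shape (1, 21, 520, 520)
    +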

    + +
      +
    • Release notes (Link)
    • +
    + +

    torchaudio 0.6

    + +

    torchaudio now officially supports Windows. This release also introduces a new model module (with wav2letter included), new functionals (contrast, cvm, dcshift, overdrive, vad, phaser, flanger, biquad), datasets (GTZAN, CMU), and a new optional sox backend with support for TorchScript.
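
    + +

    As a rough sketch of the new model module (we instantiate the model with its default constructor arguments; see the torchaudio 0.6 documentation for the exact signature and options):

    + +
    import torchaudio
    +
    +# wav2letter acoustic model from the new torchaudio.models module;
    +# its forward pass expects a (batch, channel, time) waveform tensor
    +model = torchaudio.models.Wav2Letter()
    +print(model)
    +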

    + +
      +
    • Release notes (Link)
    • +
    + +

    Additional updates

    + +

    HACKATHON

    + +

    The Global PyTorch Summer Hackathon is back! This year, teams can compete in three categories virtually:

    + +
      +
    1. PyTorch Developer Tools: Tools or libraries designed to improve productivity and efficiency of PyTorch for researchers and developers
    2. +
    3. Web/Mobile Applications powered by PyTorch: Applications with web/mobile interfaces and/or embedded devices powered by PyTorch
    4. +
    5. PyTorch Responsible AI Development Tools: Tools, libraries, or web/mobile apps for responsible AI development
    6. +
    + +

    This is a great opportunity to connect with the community and practice your machine learning skills.

    + + + +

    LPCV Challenge

    + +

    The submission deadline for the 2020 CVPR Low-Power Vision Challenge (LPCV) - Online Track for UAV video is coming up shortly. You have until July 31, 2020 to build a system that can discover and recognize characters in video captured by an unmanned aerial vehicle (UAV) accurately using PyTorch and a Raspberry Pi 3B+.

    + +

    Prototype Features

    + +

    To reiterate, Prototype features in PyTorch are early features that we are looking to gather feedback on, gauge the usefulness of and improve ahead of graduating them to Beta or Stable. The following features are not part of the PyTorch 1.6 release and instead are available in nightlies with separate docs/tutorials to help facilitate early usage and feedback.

    + +

    Distributed RPC/Profiler

    +

    Allow users to profile training jobs that use torch.distributed.rpc using the autograd profiler, and remotely invoke the profiler in order to collect profiling information across different nodes. The RFC can be found here and a short recipe on how to use this feature can be found here.

    + +

    TorchScript Module Freezing

    +

    Module Freezing is the process of inlining module parameter and attribute values into the TorchScript internal representation. Parameter and attribute values are treated as final values and cannot be modified in the frozen module. The PR for this feature can be found here and a short tutorial on how to use this feature can be found here.
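
    + +

    As a small sketch of what freezing looks like, assuming torch.jit.freeze as the entry point (this is how the API is exposed in more recent releases; the prototype interface at the time of this release may have differed):

    + +
    import torch
    +
    +class MyModule(torch.nn.Module):
    +    def __init__(self):
    +        super().__init__()
    +        self.weight = torch.nn.Parameter(torch.rand(3, 3))
    +
    +    def forward(self, x):
    +        return x @ self.weight
    +
    +# freezing inlines parameters and attributes as constants in the graph;
    +# the module must be in eval mode
    +scripted = torch.jit.script(MyModule().eval())
    +frozen = torch.jit.freeze(scripted)
    +print(frozen.graph)
    +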

    + +

    Graph Mode Quantization

    +

    Eager mode quantization requires users to make changes to their model, including explicitly quantizing activations, performing module fusion, and rewriting uses of torch ops with Functional Modules, and quantization of functionals is not supported. If we can trace or script the model, then the quantization can be done automatically with graph mode quantization without any of the complexities in eager mode, and it is configurable through a qconfig_dict. A tutorial on how to use this feature can be found here.

    + +

    Quantization Numerical Suite

    +

    Quantization is good when it works, but it’s difficult to know what’s wrong when it doesn’t satisfy the expected accuracy. A prototype is now available for a Numerical Suite that measures comparison statistics between quantized modules and float modules. This is available to test using eager mode and on CPU only with more support coming. A tutorial on how to use this feature can be found here.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1.7-released/index.html b/blog/pytorch-1.7-released/index.html new file mode 100644 index 000000000000..2300cf04c296 --- /dev/null +++ b/blog/pytorch-1.7-released/index.html @@ -0,0 +1,964 @@ + + + + + + + + + + + + + PyTorch 1.7 released w/ CUDA 11, New APIs for FFTs, Windows support for Distributed training and more | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we’re announcing the availability of PyTorch 1.7, along with updated domain libraries. The PyTorch 1.7 release includes a number of new APIs including support for NumPy-Compatible FFT operations, profiling tools and major updates to both distributed data parallel (DDP) and remote procedure call (RPC) based distributed training. In addition, several features moved to stable including custom C++ Classes, the memory profiler, extensions via custom tensor-like objects, user async functions in RPC and a number of other features in torch.distributed such as Per-RPC timeout, DDP dynamic bucketing and RRef helper.

    + +

    A few of the highlights include:

    +
      +
    • CUDA 11 is now officially supported with binaries available at PyTorch.org
    • +
    • Updates and additions to profiling and performance for RPC, TorchScript and Stack traces in the autograd profiler
    • +
    • (Beta) Support for NumPy compatible Fast Fourier transforms (FFT) via torch.fft
    • +
    • (Prototype) Support for Nvidia A100 generation GPUs and native TF32 format
    • +
    • (Prototype) Distributed training on Windows now supported
    • +
    • torchvision +
        +
      • (Stable) Transforms now support Tensor inputs, batch computation, GPU, and TorchScript
      • +
      • (Stable) Native image I/O for JPEG and PNG formats
      • +
      • (Beta) New Video Reader API
      • +
      +
    • +
    • torchaudio +
        +
      • (Stable) Added support for speech rec (wav2letter), text to speech (WaveRNN) and source separation (ConvTasNet)
      • +
      +
    • +
    + +

    To reiterate, starting PyTorch 1.6, features are now classified as stable, beta and prototype. You can see the detailed announcement here. Note that the prototype features listed in this blog are available as part of this release.

    + +

    Find the full release notes here.

    + +

    Front End APIs

    +

    [Beta] NumPy Compatible torch.fft module

    +

    FFT-related functionality is commonly used in a variety of scientific fields like signal processing. While PyTorch has historically supported a few FFT-related functions, the 1.7 release adds a new torch.fft module that implements FFT-related functions with the same API as NumPy.

    + +

    This new module must be imported to be used in the 1.7 release, since its name conflicts with the historic (and now deprecated) torch.fft function.

    + +

    Example usage:

    +
    >>> import torch.fft
    +>>> t = torch.arange(4)
    +>>> t
    +tensor([0, 1, 2, 3])
    +
    +>>> torch.fft.fft(t)
    +tensor([ 6.+0.j, -2.+2.j, -2.+0.j, -2.-2.j])
    +
    +>>> t = torch.tensor([0.+1.j, 2.+3.j, 4.+5.j, 6.+7.j])
    +>>> torch.fft.fft(t)
    +tensor([12.+16.j, -8.+0.j, -4.-4.j,  0.-8.j])
    +
    + + + +

    [Beta] C++ Support for Transformer NN Modules

    +

    Since PyTorch 1.5, we’ve continued to maintain parity between the python and C++ frontend APIs. This update allows developers to use the nn.transformer module abstraction from the C++ Frontend. Moreover, developers no longer need to save a module from python/JIT and load it into C++, as it can now be used in C++ directly.

    + + +

    [Beta] torch.set_deterministic

    +

    Reproducibility (bit-for-bit determinism) may help identify errors when debugging or testing a program. To facilitate reproducibility, PyTorch 1.7 adds the torch.set_deterministic(bool) function that can direct PyTorch operators to select deterministic algorithms when available, and to throw a runtime error if an operation may result in nondeterministic behavior. By default, the flag this function controls is false and there is no change in behavior, meaning PyTorch may implement its operations nondeterministically by default.

    + +

    More precisely, when this flag is true:

    +
      +
    • Operations known to not have a deterministic implementation throw a runtime error;
    • +
    • Operations with deterministic variants use those variants (usually with a performance penalty versus the non-deterministic version); and
    • +
    • torch.backends.cudnn.deterministic = True is set.
    • +
    + +

    Note that this is necessary, but not sufficient, for determinism within a single run of a PyTorch program. Other sources of randomness like random number generators, unknown operations, or asynchronous or distributed computation may still cause nondeterministic behavior.

    + +

    See the documentation for torch.set_deterministic(bool) for the list of affected operations.
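
    + +

    A minimal sketch of opting in to the behavior described above:

    + +
    import torch
    +
    +# request deterministic algorithms for the rest of the program
    +torch.set_deterministic(True)
    +
    +x = torch.randn(8, 8)
    +y = x @ x  # ops with deterministic implementations run as usual
    +# an op without a deterministic implementation would raise a RuntimeError here
    +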

    + + +

    Performance & Profiling

    +

    [Beta] Stack traces added to profiler

    +

    Users can now see not only operator name/inputs in the profiler output table but also where the operator is in the code. The workflow requires very little change to take advantage of this capability. The user uses the autograd profiler as before but with optional new parameters: with_stack and group_by_stack_n. Caution: regular profiling runs should not use this feature as it adds significant overhead.
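
    + +

    For example, a minimal sketch using the two new parameters mentioned above:

    + +
    import torch
    +import torchvision.models as models
    +import torch.autograd.profiler as profiler
    +
    +model = models.resnet18()
    +inputs = torch.randn(5, 3, 224, 224)
    +
    +# with_stack records the source location of each operator call
    +with profiler.profile(with_stack=True) as prof:
    +    model(inputs)
    +
    +# group the averages by the top 5 stack frames
    +print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total", row_limit=5))
    +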

    + + +

    Distributed Training & RPC

    +

    [Stable] TorchElastic now bundled into PyTorch docker image

    +

    Torchelastic offers a strict superset of the current torch.distributed.launch CLI with added features for fault-tolerance and elasticity. If the user is not interested in fault-tolerance, they can get exact functionality/behavior parity by setting max_restarts=0, with the added convenience of auto-assigned RANK and MASTER_ADDR|PORT (versus manually specified in torch.distributed.launch).

    + +

    By bundling torchelastic in the same docker image as PyTorch, users can start experimenting with TorchElastic right-away without having to separately install torchelastic. In addition to convenience, this work is a nice-to-have when adding support for elastic parameters in the existing Kubeflow’s distributed PyTorch operators.

    + + +

    [Beta] Support for uneven dataset inputs in DDP

    +

    PyTorch 1.7 introduces a new context manager to be used in conjunction with models trained using torch.nn.parallel.DistributedDataParallel to enable training with uneven dataset size across different processes. This feature enables greater flexibility when using DDP and prevents the user from having to manually ensure dataset sizes are the same across different processes. With this context manager, DDP will handle uneven dataset sizes automatically, which can prevent errors or hangs at the end of training.
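
    + +

    A rough sketch of how this is used; we assume the context manager is the join() method on the DDP-wrapped module, and that the process group, model, rank, loader, loss_fn and optimizer are already set up.

    + +
    from torch.nn.parallel import DistributedDataParallel as DDP
    +
    +ddp_model = DDP(model, device_ids=[rank])
    +
    +# each rank may have a different number of batches in its loader
    +with ddp_model.join():
    +    for input, target in loader:
    +        optimizer.zero_grad()
    +        loss = loss_fn(ddp_model(input), target)
    +        loss.backward()
    +        optimizer.step()
    +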

    + + +

    [Beta] NCCL Reliability - Async Error/Timeout Handling

    +

    In the past, NCCL training runs would hang indefinitely due to stuck collectives, leading to a very unpleasant experience for users. This feature will abort stuck collectives and throw an exception/crash the process if a potential hang is detected. When used with something like torchelastic (which can recover the training process from the last checkpoint), users can have much greater reliability for distributed training. This feature is completely opt-in and sits behind an environment variable that needs to be explicitly set in order to enable this functionality (otherwise users will see the same behavior as before).

    + + +

    [Beta] TorchScript rpc_remote and rpc_sync

    +

    torch.distributed.rpc.rpc_async has been available in TorchScript in prior releases. For PyTorch 1.7, this functionality is extended to the remaining two core RPC APIs, torch.distributed.rpc.rpc_sync and torch.distributed.rpc.remote. This completes the major RPC APIs targeted for support in TorchScript; it allows users to use the existing python RPC APIs within TorchScript (in a script function or script method, which releases the python Global Interpreter Lock) and could possibly improve application performance in multithreaded environments.

    + + +

    [Beta] Distributed optimizer with TorchScript support

    +

    PyTorch provides a broad set of optimizers for training algorithms, and these have been used repeatedly as part of the python API. However, users often want to use multithreaded training instead of multiprocess training, as it provides better resource utilization and efficiency in the context of large scale distributed training (e.g. Distributed Model Parallel) or any RPC-based training application. Users couldn’t do this with the distributed optimizer before because we needed to remove the python Global Interpreter Lock (GIL) limitation to achieve this.

    + +

    In PyTorch 1.7, we are enabling TorchScript support in the distributed optimizer to remove the GIL and make it possible to run the optimizer in multithreaded applications. The new distributed optimizer has the exact same interface as before, but it automatically converts optimizers within each worker into TorchScript to make each one GIL-free. This is done by leveraging a functional optimizer concept and allowing the distributed optimizer to convert the computational portion of the optimizer into TorchScript. This will help use cases like distributed model parallel training and improve performance using multithreading.

    + +

    Currently, the only optimizer that supports automatic conversion with TorchScript is Adagrad; all other optimizers will still work as before without TorchScript support. We are working on expanding the coverage to all PyTorch optimizers and expect more to come in future releases. The usage to enable TorchScript support is automatic and exactly the same as the existing python APIs; here is an example of how to use this:

    + +
    import torch.distributed.autograd as dist_autograd
    +import torch.distributed.rpc as rpc
    +from torch import optim
    +from torch.distributed.optim import DistributedOptimizer
    +
    +with dist_autograd.context() as context_id:
    +  # Forward pass.
    +  rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))
    +  rref2 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1))
    +  loss = rref1.to_here() + rref2.to_here()
    +
    +  # Backward pass.
    +  dist_autograd.backward(context_id, [loss.sum()])
    +
    +  # Optimizer, pass in optim.Adagrad, DistributedOptimizer will
    +  # automatically convert/compile it to TorchScript (GIL-free)
    +  dist_optim = DistributedOptimizer(
    +     optim.Adagrad,
    +     [rref1, rref2],
    +     lr=0.05,
    +  )
    +  dist_optim.step(context_id)
    +
    + + +

    [Beta] Enhancements to RPC-based Profiling

    +

    Support for using the PyTorch profiler in conjunction with the RPC framework was first introduced in PyTorch 1.6. In PyTorch 1.7, the following enhancements have been made:

    +
      +
    • Implemented better support for profiling TorchScript functions over RPC
    • +
    • Achieved parity in terms of profiler features that work with RPC
    • +
    • Added support for asynchronous RPC functions on the server-side (functions decorated with rpc.functions.async_execution).
    • +
    + +

    Users are now able to use familiar profiling tools such as with torch.autograd.profiler.profile() and with torch.autograd.profiler.record_function, and this works transparently with the RPC framework with full feature support, profiling asynchronous functions and TorchScript functions.

    + + +

    [Prototype] Windows support for Distributed Training

    +

    PyTorch 1.7 brings prototype support for DistributedDataParallel and collective communications on the Windows platform. In this release, the support only covers Gloo-based ProcessGroup and FileStore.

    + +

    To use this feature across multiple machines, please provide a file from a shared file system in init_process_group.

    + +
    # initialize the process group
    +dist.init_process_group(
    +    "gloo",
    +    # multi-machine example:
    +    # init_method = "file://////{machine}/{share_folder}/file"
    +    init_method="file:///{your local file path}",
    +    rank=rank,
    +    world_size=world_size
    +)
    +
    +model = DistributedDataParallel(local_model, device_ids=[rank])
    +
    + + +

    Mobile

    +

    PyTorch Mobile supports both iOS and Android with binary packages available in Cocoapods and JCenter respectively. You can learn more about PyTorch Mobile here.

    + +

    [Beta] PyTorch Mobile Caching allocator for performance improvements

    +

    On some mobile platforms, such as Pixel, we observed that memory is returned to the system more aggressively. This results in frequent page faults, as PyTorch, being a functional framework, does not maintain state for the operators. Thus outputs are allocated dynamically on each execution of the op, for most ops. To ameliorate the performance penalties due to this, PyTorch 1.7 provides a simple caching allocator for CPU. The allocator caches allocations by tensor sizes and is currently available only via the PyTorch C++ API. The caching allocator itself is owned by the client and thus the lifetime of the allocator is also maintained by client code. Such a client owned caching allocator can then be used with a scoped guard, c10::WithCPUCachingAllocatorGuard, to enable the use of cached allocation within that scope. Example usage:

    + +
    #include <c10/mobile/CPUCachingAllocator.h>
    +.....
    +c10::CPUCachingAllocator caching_allocator;
    +  // Owned by client code. Can be a member of some client class so as to tie the
    +  // the lifetime of caching allocator to that of the class.
    +.....
    +{
    +  c10::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
    +  if (FLAGS_use_caching_allocator) {
    +    caching_allocator_guard.emplace(&caching_allocator);
    +  }
    +  ....
    +  model.forward(..);
    +}
    +...
    +
    +

    NOTE: Caching allocator is only available on mobile builds, thus the use of caching allocator outside of mobile builds won’t be effective.

    + + +

    torchvision

    +

    [Stable] Transforms now support Tensor inputs, batch computation, GPU, and TorchScript

    +

    torchvision transforms now inherit from nn.Module and can be torchscripted and applied on torch Tensor inputs as well as on PIL images. They also support Tensors with batch dimensions and work seamlessly on CPU/GPU devices:

    +
    import torch
    +import torchvision.transforms as T
    +
    +# to fix random seed, use torch.manual_seed
    +# instead of random.seed
    +torch.manual_seed(12)
    +
    +transforms = torch.nn.Sequential(
    +    T.RandomCrop(224),
    +    T.RandomHorizontalFlip(p=0.3),
    +    T.ConvertImageDtype(torch.float),
    +    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    +)
    +scripted_transforms = torch.jit.script(transforms)
    +# Note: we can similarly use T.Compose to define transforms
    +# transforms = T.Compose([...]) and 
    +# scripted_transforms = torch.jit.script(torch.nn.Sequential(*transforms.transforms))
    +
    +tensor_image = torch.randint(0, 256, size=(3, 256, 256), dtype=torch.uint8)
    +# works directly on Tensors
    +out_image1 = transforms(tensor_image)
    +# on the GPU
    +out_image1_cuda = transforms(tensor_image.cuda())
    +# with batches
    +batched_image = torch.randint(0, 256, size=(4, 3, 256, 256), dtype=torch.uint8)
    +out_image_batched = transforms(batched_image)
    +# and has torchscript support
    +out_image2 = scripted_transforms(tensor_image)
    +
    +

    These improvements enable the following new features:

    +
      +
    • support for GPU acceleration
    • +
    • batched transformations e.g. as needed for videos
    • +
    • transform multi-band torch tensor images (with more than 3-4 channels)
    • +
    • torchscript transforms together with your model for deployment +Note: Exceptions for TorchScript support includes Compose, RandomChoice, RandomOrder, Lambda and those applied on PIL images, such as ToPILImage.
    • +
    + +

    [Stable] Native image IO for JPEG and PNG formats

    +

    torchvision 0.8.0 introduces native image reading and writing operations for JPEG and PNG formats. Those operators support TorchScript and return CxHxW tensors in uint8 format, and can thus now be part of your model for deployment in C++ environments.

    +
    from torchvision.io import read_image
    +
    +# tensor_image is a CxHxW uint8 Tensor
    +tensor_image = read_image('path_to_image.jpeg')
    +
    +# or equivalently
    +from torchvision.io import read_file, decode_image
    +# raw_data is a 1d uint8 Tensor with the raw bytes
    +raw_data = read_file('path_to_image.jpeg')
    +tensor_image = decode_image(raw_data)
    +
    +# all operators are torchscriptable and can be
    +# serialized together with your model torchscript code
    +scripted_read_image = torch.jit.script(read_image)
    +
    +

    [Stable] RetinaNet detection model

    +

    This release adds pretrained models for RetinaNet with a ResNet50 backbone from Focal Loss for Dense Object Detection.
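
    + +

    A minimal sketch of running the new detector (the model returns one dict of boxes, labels and scores per input image):

    + +
    import torch
    +import torchvision
    +
    +model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True).eval()
    +
    +images = [torch.rand(3, 480, 640)]
    +with torch.no_grad():
    +    detections = model(images)
    +print(detections[0]["boxes"].shape, detections[0]["scores"].shape)
    +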

    + +

    [Beta] New Video Reader API

    +

    This release introduces a new video reading abstraction, which gives more fine-grained control of iteration over videos. It supports image and audio, and implements an iterator interface so that it is interoperable with other python libraries such as itertools.

    +
    from torchvision.io import VideoReader
    +
    +# stream indicates if reading from audio or video
    +reader = VideoReader('path_to_video.mp4', stream='video')
    +# can change the stream after construction
    +# via reader.set_current_stream
    +
    +# to read all frames in a video starting at 2 seconds
    +for frame in reader.seek(2):
    +    # frame is a dict with "data" and "pts" metadata
    +    print(frame["data"], frame["pts"])
    +
    +# because reader is an iterator you can combine it with
    +# itertools
    +from itertools import takewhile, islice
    +# read 10 frames starting from 2 seconds
    +for frame in islice(reader.seek(2), 10):
    +    pass
    +    
    +# or to return all frames between 2 and 5 seconds
    +for frame in takewhile(lambda x: x["pts"] < 5, reader):
    +    pass
    +
    +

    Notes:

    +
      +
    • In order to use the Video Reader API beta, you must compile torchvision from source and have ffmpeg installed in your system.
    • +
    • The VideoReader API is currently released as beta and its API may change following user feedback.
    • +
    + +

    torchaudio

    +

    With this release, torchaudio is expanding its support for models and end-to-end applications, adding a wav2letter training pipeline and end-to-end text-to-speech and source separation pipelines. Please file an issue on github to provide feedback on them.

    + +

    [Stable] Speech Recognition

    +

    Building on the addition of the wav2letter model for speech recognition in the last release, we’ve now added an example wav2letter training pipeline with the LibriSpeech dataset.

    + +

    [Stable] Text-to-speech

    +

    With the goal of supporting text-to-speech applications, we added a vocoder based on the WaveRNN model, based on the implementation from this repository. The original implementation was introduced in “Efficient Neural Audio Synthesis”. We also provide an example WaveRNN training pipeline that uses the LibriTTS dataset added to torchaudio in this release.

    + +

    [Stable] Source Separation

    +

    With the addition of the ConvTasNet model, based on the paper “Conv-TasNet: Surpassing Ideal Time-Frequency Magnitude Masking for Speech Separation,” torchaudio now also supports source separation. An example ConvTasNet training pipeline is provided with the wsj-mix dataset.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1.8-new-library-releases/index.html b/blog/pytorch-1.8-new-library-releases/index.html new file mode 100644 index 000000000000..795ee46f659b --- /dev/null +++ b/blog/pytorch-1.8-new-library-releases/index.html @@ -0,0 +1,797 @@ + + + + + + + + + + + + + New PyTorch library releases including TorchVision Mobile, TorchAudio I/O, and more | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we are announcing updates to a number of PyTorch libraries, alongside the PyTorch 1.8 release. The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio, as well as a new version of TorchCSPRNG. These releases include a number of new features and improvements and, along with the PyTorch 1.8 release, provide a broad set of updates for the PyTorch community to build on and leverage.

    + +

    Some highlights include:

    +
      +
    • TorchVision - Added support for PyTorch Mobile including Detectron2Go (D2Go), auto-augmentation of data during training, on the fly type conversion, and AMP autocasting.
    • +
    • TorchAudio - Major improvements to I/O, including defaulting to sox_io backend and file-like object support. Added Kaldi Pitch feature and support for CMake based build allowing TorchAudio to better support no-Python environments.
    • +
    • TorchText - Updated the dataset loading API to be compatible with standard PyTorch data loading utilities.
    • +
    • TorchCSPRNG - Support for cryptographically secure pseudorandom number generators for PyTorch is now stable with new APIs for AES128 ECB/CTR and CUDA support on Windows.
    • +
    + +

    Please note that, starting in PyTorch 1.6, features are classified as Stable, Beta, and Prototype. Prototype features are not included as part of the binary distribution and are instead available through either building from source, using nightlies or via compiler flag. You can see the detailed announcement here.

    + +

    TorchVision 0.9.0

    +

    [Stable] TorchVision Mobile: Operators, Android Binaries, and Tutorial

    +

    We are excited to announce the first on-device support and binaries for a PyTorch domain library. We have seen significant appetite in both research and industry for on-device vision support to allow low latency, privacy friendly, and resource efficient mobile vision experiences. You can follow this new tutorial to build your own Android object detection app using TorchVision operators, D2Go, or your own custom operators and model.

    + +
    + +
    + +

    [Stable] New Mobile models for Classification, Object Detection and Semantic Segmentation

    +

    We have added support for the MobileNetV3 architecture and provided pre-trained weights for Classification, Object Detection and Segmentation. It is easy to get up and running with these models, just import and load them as you would any torchvision model:

    +
    import torch
    +import torchvision
    +
    +# Classification
    +x = torch.rand(1, 3, 224, 224)
    +m_classifier = torchvision.models.mobilenet_v3_large(pretrained=True)
    +m_classifier.eval()
    +predictions = m_classifier(x)
    +
    +# Quantized Classification
    +x = torch.rand(1, 3, 224, 224)
    +m_classifier = torchvision.models.quantization.mobilenet_v3_large(pretrained=True)
    +m_classifier.eval()
    +predictions = m_classifier(x)
    +
    +# Object Detection: Highly Accurate High Resolution Mobile Model
    +x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
    +m_detector = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
    +m_detector.eval()
    +predictions = m_detector(x)
    +
    +# Semantic Segmentation: Highly Accurate Mobile Model
    +x = torch.rand(1, 3, 520, 520)
    +m_segmenter = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True)
    +m_segmenter.eval()
    +predictions = m_segmenter(x)
    +
    +

    These models are highly competitive with TorchVision’s existing models on resource efficiency, speed, and accuracy. See our release notes for detailed performance metrics.

    + +

    [Stable] AutoAugment

    +

AutoAugment is a common data augmentation technique that can increase the accuracy of scene classification models. Though data augmentation policies are directly linked to the dataset they were trained on, empirical studies show that ImageNet policies provide significant improvements when applied to other datasets. We’ve implemented 3 policies learned on the following datasets: ImageNet, CIFAR10 and SVHN. These can be used standalone or mixed and matched with existing transforms:

    +
    from torchvision import transforms
    +
    +t = transforms.AutoAugment()
    +transformed = t(image)
    +
    +
    +transform=transforms.Compose([
    +   transforms.Resize(256),
    +   transforms.AutoAugment(),
    +   transforms.ToTensor()])
    +
    +

    Other New Features for TorchVision

    +
      +
    • [Stable] All read and decode methods in the io.image package now support: +
        +
• Palette, Grayscale Alpha and RGB Alpha image types during PNG decoding
      • +
      • On-the-fly conversion of image from one type to the other during read
      • +
      +
    • +
    • [Stable] WiderFace dataset
    • +
    • [Stable] Improved FasterRCNN speed and accuracy by introducing a score threshold on RPN
    • +
    • [Stable] Modulation input for DeformConv2D
    • +
    • [Stable] Option to write audio to a video file
    • +
    • [Stable] Utility to draw bounding boxes
    • +
• [Beta] Autocast support in all Operators. Find the full TorchVision release notes here.
    • +
    + +

    TorchAudio 0.8.0

    +

    I/O Improvements

    +

    We have continued our work from the previous release to improve TorchAudio’s I/O support, including:

    +
      +
• [Stable] Changing the default backend to “sox_io” (for Linux/macOS), and updating the “soundfile” backend’s interface to align with that of “sox_io”. The legacy backend and interface are still accessible, though using them is strongly discouraged.
    • +
• [Stable] File-like object support in the “sox_io” backend, the “soundfile” backend, and sox_effects.
    • +
    • [Stable] New options to change the format, encoding, and bits_per_sample when saving.
    • +
    • [Stable] Added GSM, HTK, AMB, AMR-NB and AMR-WB format support to the “sox_io” backend.
    • +
• [Beta] A new functional.apply_codec function which can degrade audio data by applying audio codecs supported by the “sox_io” backend in an in-memory fashion. Here are some examples of features that landed in this release:
    • +
    + +
import io
+import requests
+import boto3
+import torchaudio
+
+# Load audio over HTTP
    +with requests.get(URL, stream=True) as response:
    +    waveform, sample_rate = torchaudio.load(response.raw)
    + 
+# Saving to a bytes buffer as 16-bit signed-integer PCM
    +buffer_ = io.BytesIO()
    +torchaudio.save(
    +    buffer_, waveform, sample_rate,
    +    format="wav", encoding="PCM_S", bits_per_sample=16)
    + 
    +# Apply effects while loading audio from S3
    +client = boto3.client('s3')
    +response = client.get_object(Bucket=S3_BUCKET, Key=S3_KEY)
+waveform, sample_rate = torchaudio.sox_effects.apply_effects_file(
    +    response['Body'],
    +    [["lowpass", "-1", "300"], ["rate", "8000"]])
    + 
    +# Apply GSM codec to Tensor
    +encoded = torchaudio.functional.apply_codec(
    +    waveform, sample_rate, format="gsm")
    +
    + +

    Check out the revamped audio preprocessing tutorial, Audio Manipulation with TorchAudio.

    + +

    [Stable] Switch to CMake-based build

    +

In previous versions, TorchAudio used CMake only to build its third-party dependencies. Starting in 0.8.0, TorchAudio uses CMake to build its C++ extension as well. This opens the door to integrating TorchAudio in non-Python environments (such as C++ applications and mobile). We will continue working on adding example applications and mobile integrations.

    + +

    [Beta] Improved and New Audio Transforms

    +

    We have added two widely requested operators in this release: the SpectralCentroid transform and the Kaldi Pitch feature extraction (detailed in “A pitch extraction algorithm tuned for automatic speech recognition”). We’ve also exposed a normalization method to Mel transforms, and additional STFT arguments to Spectrogram. We would like to ask our community to continue to raise feature requests for core audio processing features like these!

    + +

    Community Contributions

    +

    We had more contributions from the open source community in this release than ever before, including several completely new features. We would like to extend our sincere thanks to the community. Please check out the newly added CONTRIBUTING.md for ways to contribute code, and remember that reporting bugs and requesting features are just as valuable. We will continue posting well-scoped work items as issues labeled “help-wanted” and “contributions-welcome” for anyone who would like to contribute code, and are happy to coach new contributors through the contribution process.

    + +

    Find the full TorchAudio release notes here.

    + +

    TorchText 0.9.0

    +

    [Beta] Dataset API Updates

    +

    In this release, we are updating TorchText’s dataset API to be compatible with PyTorch data utilities, such as DataLoader, and are deprecating TorchText’s custom data abstractions such as Field. The updated datasets are simple string-by-string iterators over the data. For guidance about migrating from the legacy abstractions to use modern PyTorch data utilities, please refer to our migration guide.

    + +

    The text datasets listed below have been updated as part of this work. For examples of how to use these datasets, please refer to our end-to-end text classification tutorial.

    +
      +
    • Language modeling: WikiText2, WikiText103, PennTreebank, EnWik9
    • +
    • Text classification: AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, YelpReviewFull, YahooAnswers, AmazonReviewPolarity, AmazonReviewFull, IMDB
    • +
    • Sequence tagging: UDPOS, CoNLL2000Chunking
    • +
    • Translation: IWSLT2016, IWSLT2017
    • +
    • Question answer: SQuAD1, SQuAD2
    • +
    + +

    Find the full TorchText release notes here.

    + +

    [Stable] TorchCSPRNG 0.2.0

    +

    We released TorchCSPRNG in August 2020, a PyTorch C++/CUDA extension that provides cryptographically secure pseudorandom number generators for PyTorch. Today, we are releasing the 0.2.0 version and designating the library as stable. This release includes a new API for encrypt/decrypt with AES128 ECB/CTR as well as CUDA 11 and Windows CUDA support.

    + +

    Find the full TorchCSPRNG release notes here.

    + +

    Thanks for reading, and if you are excited about these updates and want to participate in the future of PyTorch, we encourage you to join the discussion forums and open GitHub issues.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1.8-released/index.html b/blog/pytorch-1.8-released/index.html new file mode 100644 index 000000000000..c5a0cdaf182d --- /dev/null +++ b/blog/pytorch-1.8-released/index.html @@ -0,0 +1,817 @@ + + + + + + + + + + + + + PyTorch 1.8 Release, including Compiler and Distributed Training updates, and New Mobile Tutorials | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the availability of PyTorch 1.8. This release is composed of more than 3,000 commits since 1.7. It includes major updates and new features for compilation, code optimization, frontend APIs for scientific computing, and AMD ROCm support through binaries that are available via pytorch.org. It also provides improved features for large-scale training for pipeline and model parallelism, and gradient compression. A few of the highlights include:

    +
      +
1. Support for Python-to-Python functional transformations via torch.fx;
    2. +
3. Added or stabilized APIs to support FFTs (torch.fft) and linear algebra functions (torch.linalg), added autograd support for complex tensors, and updates to improve performance when calculating Hessians and Jacobians; and
    4. +
5. Significant updates and improvements to distributed training, including: improved NCCL reliability; pipeline parallelism support; RPC profiling; and support for communication hooks enabling gradient compression. See the full release notes here.
    6. +
    + +

    Along with 1.8, we are also releasing major updates to PyTorch libraries including TorchCSPRNG, TorchVision, TorchText and TorchAudio. For more on the library releases, see the post here. As previously noted, features in PyTorch releases are classified as Stable, Beta and Prototype. You can learn more about the definitions in the post here.

    + +

    New and Updated APIs

    +

The PyTorch 1.8 release brings a host of new and updated API surfaces, ranging from additional APIs for NumPy compatibility to new ways to improve and scale your code for performance at both inference and training time. Here is a brief summary of the major features coming in this release:

    + +

    [Stable] Torch.fft support for high performance NumPy style FFTs

    +

    As part of PyTorch’s goal to support scientific computing, we have invested in improving our FFT support and with PyTorch 1.8, we are releasing the torch.fft module. This module implements the same functions as NumPy’s np.fft module, but with support for hardware acceleration and autograd.
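
As a quick illustration, here is a minimal sketch (not taken from the release notes) of an FFT with autograd; the signal and its length are arbitrary:

import torch

signal = torch.randn(1024, requires_grad=True)
spectrum = torch.fft.fft(signal)        # complex-valued output, mirrors np.fft.fft
power = spectrum.abs().pow(2).sum()     # real-valued scalar
power.backward()                        # gradients flow back through the FFT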

    + + +

    [Beta] Support for NumPy style linear algebra functions via torch.linalg

    +

    The torch.linalg module, modeled after NumPy’s np.linalg module, brings NumPy-style support for common linear algebra operations including Cholesky decompositions, determinants, eigenvalues and many others.

    + + +

    [Beta] Python code Transformations with FX

    +

    FX allows you to write transformations of the form transform(input_module : nn.Module) -> nn.Module, where you can feed in a Module instance and get a transformed Module instance out of it.
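
To make this concrete, here is a minimal sketch of such a transform; the replace-relu-with-sigmoid rule and the toy module are illustrative assumptions, not from the release notes:

import torch
import torch.fx

class MyModule(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1.0

def transform(m: torch.nn.Module) -> torch.nn.Module:
    graph = torch.fx.symbolic_trace(m).graph
    for node in graph.nodes:
        # Rewrite every call to torch.relu into a call to torch.sigmoid
        if node.op == "call_function" and node.target is torch.relu:
            node.target = torch.sigmoid
    graph.lint()
    return torch.fx.GraphModule(m, graph)

print(transform(MyModule()).code)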

    + +

This kind of functionality is applicable in many scenarios. For example, the FX-based Graph Mode Quantization product is being released as a prototype contemporaneously with FX. Graph Mode Quantization automates the process of quantizing a neural net by leveraging FX’s program capture, analysis and transformation facilities. We are also developing many other transformation products with FX and we are excited to share this powerful toolkit with the community.

    + +

    Because FX transforms consume and produce nn.Module instances, they can be used within many existing PyTorch workflows. This includes workflows that, for example, train in Python then deploy via TorchScript.

    + +

    You can read more about FX in the official documentation. You can also find several examples of program transformations implemented using torch.fx here. We are constantly improving FX and invite you to share any feedback you have about the toolkit on the forums or issue tracker.

    + +

    We’d like to acknowledge TorchScript tracing, Apache MXNet hybridize, and more recently JAX as influences for program acquisition via tracing. We’d also like to acknowledge Caffe2, JAX, and TensorFlow as inspiration for the value of simple, directed dataflow graph program representations and transformations over those representations.

    + +

    Distributed Training

    +

The PyTorch 1.8 release added a number of new features as well as improvements to reliability and usability. Concretely: stable-level async error/timeout handling was added to improve NCCL reliability, and RPC-based profiling is now stable. Additionally, we have added support for pipeline parallelism as well as gradient compression through the use of communication hooks in DDP. Details are below:

    + +

    [Beta] Pipeline Parallelism

    +

    As machine learning models continue to grow in size, traditional Distributed DataParallel (DDP) training no longer scales as these models don’t fit on a single GPU device. The new pipeline parallelism feature provides an easy to use PyTorch API to leverage pipeline parallelism as part of your training loop.
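
A minimal single-host sketch is below; it assumes two visible GPUs and uses the torch.distributed.pipeline.sync.Pipe API, which requires the RPC framework to be initialized even in the single-process case:

import os
import torch
import torch.nn as nn
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "29500"
rpc.init_rpc("worker", rank=0, world_size=1)

# Place consecutive stages on different devices and wrap them in Pipe
fc1 = nn.Linear(16, 8).cuda(0)
fc2 = nn.Linear(8, 4).cuda(1)
model = Pipe(nn.Sequential(fc1, fc2), chunks=4)  # splits each batch into micro-batches

output_rref = model(torch.randn(32, 16).cuda(0))  # forward returns an RRef
output = output_rref.local_value()
rpc.shutdown()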

    + + +

    [Beta] DDP Communication Hook

    +

    The DDP communication hook is a generic interface to control how to communicate gradients across workers by overriding the vanilla allreduce in DistributedDataParallel. A few built-in communication hooks are provided including PowerSGD, and users can easily apply any of these hooks to optimize communication. Additionally, the communication hook interface can also support user-defined communication strategies for more advanced use cases.
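
A minimal sketch of registering one of the built-in hooks is shown below; it assumes the process group has already been initialized by a launcher and that each process drives one GPU:

import torch
import torch.distributed as dist
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from torch.nn.parallel import DistributedDataParallel as DDP

local_rank = dist.get_rank() % torch.cuda.device_count()
model = DDP(torch.nn.Linear(10, 10).cuda(local_rank), device_ids=[local_rank])

# Compress gradients to FP16 before the allreduce to cut communication volume
model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)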

    + + +

    Additional Prototype Features for Distributed Training

    +

    In addition to the major stable and beta distributed training features in this release, we also have a number of prototype features available in our nightlies to try out and provide feedback. We have linked in the draft docs below for reference:

    +
      +
    • (Prototype) ZeroRedundancyOptimizer - Based on and in partnership with the Microsoft DeepSpeed team, this feature helps reduce per-process memory footprint by sharding optimizer states across all participating processes in the ProcessGroup gang. Refer to this documentation for more details.
    • +
    • (Prototype) Process Group NCCL Send/Recv - The NCCL send/recv API was introduced in v2.7 and this feature adds support for it in NCCL process groups. This feature will provide an option for users to implement collective operations at Python layer instead of C++ layer. Refer to this documentation and code examples to learn more.
    • +
    • (Prototype) CUDA-support in RPC using TensorPipe - This feature should bring consequent speed improvements for users of PyTorch RPC with multiple-GPU machines, as TensorPipe will automatically leverage NVLink when available, and avoid costly copies to and from host memory when exchanging GPU tensors between processes. When not on the same machine, TensorPipe will fall back to copying the tensor to host memory and sending it as a regular CPU tensor. This will also improve the user experience as users will be able to treat GPU tensors like regular CPU tensors in their code. Refer to this documentation for more details.
    • +
    • (Prototype) Remote Module - This feature allows users to operate a module on a remote worker like using a local module, where the RPCs are transparent to the user. In the past, this functionality was implemented in an ad-hoc way and overall this feature will improve the usability of model parallelism on PyTorch. Refer to this documentation for more details.
    • +
    + +

    PyTorch Mobile

    +

    Support for PyTorch Mobile is expanding with a new set of tutorials to help new users launch models on-device quicker and give existing users a tool to get more out of our framework. These include:

    + + +

    Our new demo apps also include examples of image segmentation, object detection, neural machine translation, question answering, and vision transformers. They are available on both iOS and Android:

    + + +

    In addition to performance improvements on CPU for MobileNetV3 and other models, we also revamped our Android GPU backend prototype for broader models coverage and faster inferencing:

    + + +

    Lastly, we are launching the PyTorch Mobile Lite Interpreter as a prototype feature in this release. The Lite Interpreter allows users to reduce the runtime binary size. Please try these out and send us your feedback on the PyTorch Forums. All our latest updates can be found on the PyTorch Mobile page

    + +

    [Prototype] PyTorch Mobile Lite Interpreter

    +

PyTorch Lite Interpreter is a streamlined version of the PyTorch runtime that can execute PyTorch programs on resource-constrained devices with a reduced binary size footprint. This prototype feature reduces binary sizes by up to 70% compared to the existing on-device runtime.

    + + +

    Performance Optimization

    +

In 1.8, we are releasing support for benchmark utils to enable users to better monitor performance. We are also opening up a new automated quantization API. See the details below:

    + +

    (Beta) Benchmark utils

    +

Benchmark utils allows users to take accurate performance measurements and provides composable tools to help with both benchmark formulation and post-processing. This is expected to help PyTorch contributors quickly understand how their changes impact PyTorch performance.

    + +

    Example:

    +
    from torch.utils.benchmark import Timer
    +
    +results = []
    +for num_threads in [1, 2, 4]:
    +    timer = Timer(
    +        stmt="torch.add(x, y, out=out)",
    +        setup="""
    +            n = 1024
    +            x = torch.ones((n, n))
    +            y = torch.ones((n, 1))
    +            out = torch.empty((n, n))
    +        """,
    +        num_threads=num_threads,
    +    )
    +    results.append(timer.blocked_autorange(min_run_time=5))
    +    print(
    +        f"{num_threads} thread{'s' if num_threads > 1 else ' ':<4}"
    +        f"{results[-1].median * 1e6:>4.0f} us   " +
    +        (f"({results[0].median / results[-1].median:.1f}x)" if num_threads > 1 else '')
    +    )
    +
    +1 thread     376 us   
    +2 threads    189 us   (2.0x)
    +4 threads     99 us   (3.8x)
    +
    + + +

    (Prototype) FX Graph Mode Quantization

    +

    FX Graph Mode Quantization is the new automated quantization API in PyTorch. It improves upon Eager Mode Quantization by adding support for functionals and automating the quantization process, although people might need to refactor the model to make the model compatible with FX Graph Mode Quantization (symbolically traceable with torch.fx).
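
A minimal post-training static quantization sketch is shown below; the toy model and calibration loop are illustrative assumptions, not part of the release notes:

import torch
from torch.quantization import get_default_qconfig
from torch.quantization.quantize_fx import prepare_fx, convert_fx

float_model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).eval()
qconfig_dict = {"": get_default_qconfig("fbgemm")}

prepared = prepare_fx(float_model, qconfig_dict)   # inserts observers
for _ in range(8):                                 # calibrate on representative data
    prepared(torch.randn(4, 16))
quantized = convert_fx(prepared)                   # produces the quantized model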

    + + +

    Hardware Support

    + +

    [Beta] Ability to Extend the PyTorch Dispatcher for a new backend in C++

    +

    In PyTorch 1.8, you can now create new out-of-tree devices that live outside the pytorch/pytorch repo. The tutorial linked below shows how to register your device and keep it in sync with native PyTorch devices.

    + + +

    [Beta] AMD GPU Binaries Now Available

    +

    Starting in PyTorch 1.8, we have added support for ROCm wheels providing an easy onboarding to using AMD GPUs. You can simply go to the standard PyTorch installation selector and choose ROCm as an installation option and execute the provided command.

    + +

    Thanks for reading, and if you are excited about these updates and want to participate in the future of PyTorch, we encourage you to join the discussion forums and open GitHub issues.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1.9-new-library-releases/index.html b/blog/pytorch-1.9-new-library-releases/index.html new file mode 100644 index 000000000000..893bb5264596 --- /dev/null +++ b/blog/pytorch-1.9-new-library-releases/index.html @@ -0,0 +1,857 @@ + + + + + + + + + + + + + New PyTorch Library Releases in PyTorch 1.9, including TorchVision, TorchAudio, and more | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Today, we are announcing updates to a number of PyTorch libraries, alongside the PyTorch 1.9 release. The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio. These releases, along with the PyTorch 1.9 release, include a number of new features and improvements that will provide a broad set of updates for the PyTorch community.

    + +

    Some highlights include:

    + +
      +
    • TorchVision - Added new SSD and SSDLite models, quantized kernels for object detection, GPU Jpeg decoding, and iOS support. See release notes here.
    • +
• TorchAudio - Added a wav2vec 2.0 model deployable in non-Python environments (including C++, Android, and iOS). Many performance improvements in lfilter, spectral operations, and resampling. Added options for quality control in resampling (i.e. Kaiser window support). Initiated the migration of complex tensor operations. Improved autograd support. See release notes here.
    • +
    • TorchText - Added a new high-performance Vocab module that provides common functional APIs for NLP workflows. See release notes here.
    • +
    + +

    We’d like to thank the community for their support and work on this latest release.

    + +

    Features in PyTorch releases are classified as Stable, Beta, and Prototype. You can learn more about the definitions in this blog post.

    + +

    TorchVision 0.10

    + +

    (Stable) Quantized kernels for object detection

    +

    The forward pass of the nms and roi_align operators now support tensors with a quantized dtype, which can help lower the memory footprint of object detection models, particularly on mobile environments. For more details, refer to the documentation.

    + +

    (Stable) Speed optimizations for Tensor transforms

    +

The resize and flip transforms have been optimized, and their runtime has improved by up to 5x on the CPU.

    + +

    (Stable) Documentation improvements

    +

Significant improvements were made to the documentation. In particular, a new gallery of examples is available. These examples visually illustrate how each transform acts on an image, and the gallery also properly documents and illustrates the output of the segmentation models.

    + +

    The example gallery will be extended in the future to provide more comprehensive examples and serve as a reference for common torchvision tasks. For more details, refer to the documentation.

    + +

    (Beta) New models for detection

    +

    SSD and SSDlite are two popular object detection architectures that are efficient in terms of speed and provide good results for low resolution pictures. In this release, we provide implementations for the original SSD model with VGG16 backbone and for its mobile-friendly variant SSDlite with MobileNetV3-Large backbone.

    + +

    The models were pre-trained on COCO train2017 and can be used as follows:

    + +
    import torch
    +import torchvision
    +
    +# Original SSD variant
    +x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)]
    +m_detector = torchvision.models.detection.ssd300_vgg16(pretrained=True)
    +m_detector.eval()
    +predictions = m_detector(x)
    +
    +# Mobile-friendly SSDlite variant
    +x = [torch.rand(3, 320, 320), torch.rand(3, 500, 400)]
    +m_detector = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
    +m_detector.eval()
    +predictions = m_detector(x)
    +
    + +

    The following accuracies can be obtained on COCO val2017 (full results available in #3403 and #3757):

    + + + + + + + + + + + + + + + + + + + + + + + + +
    ModelmAPmAP@50mAP@75
    SSD300 VGG1625.141.526.2
    SSDlite320 MobileNetV3-Large21.334.322.1
    + +

    For more details, refer to the documentation.

    + +

    (Beta) JPEG decoding on the GPU

    +

Decoding JPEGs is now possible on GPUs with the use of nvjpeg, which should be readily available in your CUDA setup. The decoding time of a single image should be about 2 to 3 times faster than with libjpeg on CPU. While the resulting tensor will be stored on the GPU device, the input raw tensor still needs to reside on the host (CPU), because the first stages of the decoding process take place on the host:

    + +
from torchvision.io.image import read_file, decode_jpeg
+
+data = read_file('path_to_image.jpg')  # raw data is on CPU
+img = decode_jpeg(data, device='cuda')  # decoded image is on GPU
    +
    +

    For more details, see the documentation.

    + +

    (Beta) iOS support

    +

    TorchVision 0.10 now provides pre-compiled iOS binaries for its C++ operators, which means you can run Faster R-CNN and Mask R-CNN on iOS. An example app on how to build a program leveraging those ops can be found here.

    + +

    TorchAudio 0.9.0

    + +

    (Stable) Complex Tensor Migration

    +

TorchAudio has functions that handle complex-valued tensors. These functions follow a convention of using an extra dimension to represent real and imaginary parts. In PyTorch 1.6, the native complex type was introduced. As its API stabilizes, torchaudio has started to migrate to the native complex type.

    + +

In this release, we added support for native complex tensors, and you can opt in to use them. Using the native complex types, we have verified that the affected functions continue to support autograd and TorchScript; moreover, switching to native complex types improves their performance. For more details, refer to pytorch/audio#1337.

    + +

    (Stable) Filtering Improvement

    +

In release 0.8, we added a C++ implementation of the core part of lfilter for CPU, which improved performance. In this release, we optimized some internal operations of the CPU implementation for further performance improvement. We also added autograd support to both CPU and GPU. Now lfilter and all the biquad filters (biquad, band_biquad, bass_biquad, treble_biquad, allpass_biquad, lowpass_biquad, highpass_biquad, bandpass_biquad, equalizer_biquad and bandreject_biquad) benefit from the performance improvement and support autograd. We also moved the implementation of overdrive to C++ for performance improvement. For more details, refer to the documentation.

    + +

    (Stable) Improved Autograd Support

    +

    Along with the work of Complex Tensor Migration and Filtering Improvement, we also added autograd tests to transforms. lfilter, biquad and its variants, and most transforms are now guaranteed to support autograd. For more details, refer to the release note.

    + +

    (Stable) Improved Windows Support

    +

Torchaudio implements some operations in C++ for reasons such as performance and integration with third-party libraries. These C++ components were previously only available on Linux and macOS. In this release, we have added support for Windows. With this, the efficient filtering implementation mentioned above is also available on Windows.

    + +

    However, please note that not all the C++ components are available for Windows. “sox_io” backend and torchaudio.functional.compute_kaldi_pitch are not supported.

    + +

    (Stable) I/O Functions Migration

    +

    Since the 0.6 release, we have continuously improved I/O functionality. Specifically, in 0.8 we changed the default backend from “sox” to “sox_io” and applied the same switch to API of the “soundfile” backend. The 0.9 release concludes this migration by removing the deprecated backends. For more details, please refer to #903.

    + +

    (Beta) Wav2Vec2.0 Model

    +

We have added the model architectures from Wav2Vec2.0. You can import fine-tuned model parameters published on fairseq and Hugging Face Hub. Our model definition supports TorchScript, and it is possible to deploy the model to non-Python environments, such as C++, Android and iOS.

    + +

The following code snippet illustrates such a use case. Please check out our C++ example directory for the complete example. Currently, it is designed for running inference. If you would like more support for training, please file a feature request.

    + +
    # Import fine-tuned model from Hugging Face Hub
+from transformers import Wav2Vec2ForCTC
    +from torchaudio.models.wav2vec2.utils import import_huggingface_model
    +
    +original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    +imported = import_huggingface_model(original)
    +
    + +
    # Import fine-tuned model from fairseq
    +import fairseq
    +from torchaudio.models.wav2vec2.utils import import_fairseq_model
    +
    +original, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
    +    ["wav2vec_small_960h.pt"], arg_overrides={'data': "<data_dir>"})
    +imported = import_fairseq_model(original[0].w2v_encoder)
    +
    + +
# Build uninitialized model and load state dict
+import torch
+from torch.utils.mobile_optimizer import optimize_for_mobile
+from torchaudio.models import wav2vec2_base
    +
    +model = wav2vec2_base(num_out=32)
    +model.load_state_dict(imported.state_dict())
    +
    +# Quantize / script / optimize for mobile
    +quantized_model = torch.quantization.quantize_dynamic(
    +    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
    +scripted_model = torch.jit.script(quantized_model)
    +optimized_model = optimize_for_mobile(scripted_model)
    +optimized_model.save("model_for_deployment.pt")
    +
    + +

    For more details, see the documentation.

    + +

    (Beta) Resampling Improvement

    +

    In release 0.8, we vectorized the operation in torchaudio.compliance.kaldi.resample_waveform, which improved the performance of resample_waveform and torchaudio.transforms.Resample. In this release, we have further revised the way the resampling algorithm is implemented.

    + +

    We have:

    +
      +
    • Added Kaiser Window support for a wider range of resampling quality.
    • +
    • Added rolloff parameter for anti-aliasing control.
    • +
    • Added the mechanism to precompute the kernel and cache it in torchaudio.transforms.Resample for even faster operation.
    • +
    • Moved the implementation from torchaudio.compliance.kaldi.resample_waveform to torchaudio.functional.resample and deprecated torchaudio.compliance.kaldi.resample_waveform.
    • +
    + +

    For more details, see the documentation.
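
A minimal sketch of the revised API is below; the 48 kHz to 16 kHz conversion and the keyword values are illustrative assumptions:

import torch
import torchaudio

waveform = torch.randn(1, 48000)  # one second of audio at 48 kHz

# One-off functional call using the new Kaiser-window option and rolloff control
resampled = torchaudio.functional.resample(
    waveform, orig_freq=48000, new_freq=16000,
    resampling_method="kaiser_window", rolloff=0.95)

# Transform that precomputes and caches the kernel for repeated use
resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000)
resampled = resampler(waveform)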

    + +

    (Prototype) RNN Transducer Loss

    +

The RNN transducer loss is used to train RNN transducer models, a popular architecture for speech recognition tasks. The prototype loss in torchaudio currently supports autograd, TorchScript, float16 and float32, and can be run on both CPU and CUDA. For more details, please refer to the documentation.

    + +

    TorchText 0.10.0

    + +

    (Beta) New Vocab Module

    +

In this release, we introduce a new Vocab module that replaces the current Vocab class. The new Vocab provides common functional APIs for NLP workflows. This module is backed by an efficient C++ implementation that reduces batch look-up time by up to ~85% (refer to the summaries of #1248 and #1290 for further information on benchmarks), and provides support for TorchScript. We provide accompanying factory functions that can be used to build the Vocab object either through a Python ordered dictionary or an iterator that yields lists of tokens.

    + +
    #creating Vocab from text file
    +import io
    +from torchtext.vocab import build_vocab_from_iterator
    +#generator that yield list of tokens
    +def yield_tokens(file_path):
    +   with io.open(file_path, encoding = 'utf-8') as f:
    +      for line in f:
    +          yield line.strip().split()
    +#get Vocab object
    +vocab_obj = build_vocab_from_iterator(yield_tokens(file_path), specials=["<unk>"])
    +
    +#creating Vocab through ordered dict
    +from torchtext.vocab import vocab
    +from collections import Counter, OrderedDict
    +counter = Counter(["a", "a", "b", "b", "b"])
    +sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    +ordered_dict = OrderedDict(sorted_by_freq_tuples)
    +vocab_obj = vocab(ordered_dict)
    +
    +#common API usage
    +
    +#look-up index
    +vocab_obj["a"]
    +
    +#batch look-up indices
+vocab_obj.lookup_indices(["a","b"])
    +#support forward API of PyTorch nn Modules
    +vocab_obj(["a","b"])
    +
    +#batch look-up tokens
    +vocab_obj.lookup_tokens([0,1])
    +
    +#set default index to return when token not found
    +vocab_obj.set_default_index(0)
    +vocab_obj["out_of_vocabulary"] #prints 0
    +
    + +

    For more details, refer to the documentation.

    + +

    Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Facebook, Twitter, Medium, YouTube or LinkedIn.

    + +

    Cheers!

    + +

    -Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-1.9-released/index.html b/blog/pytorch-1.9-released/index.html new file mode 100644 index 000000000000..c50d5832aa5c --- /dev/null +++ b/blog/pytorch-1.9-released/index.html @@ -0,0 +1,847 @@ + + + + + + + + + + + + + PyTorch 1.9 Release, including torch.linalg and Mobile Interpreter | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch 1.9. The release is composed of more than 3,400 commits since 1.8, made by 398 contributors. The release notes are available here. Highlights include:

    +
      +
    1. Major improvements to support scientific computing, including torch.linalg, torch.special, and Complex Autograd
    2. +
    3. Major improvements in on-device binary size with Mobile Interpreter
    4. +
5. Native support for elastic, fault-tolerant training through the upstreaming of TorchElastic into PyTorch Core
    6. +
    7. Major updates to the PyTorch RPC framework to support large scale distributed training with GPU support
    8. +
    9. New APIs to optimize performance and packaging for model inference deployment
    10. +
    11. Support for Distributed training, GPU utilization and SM efficiency in the PyTorch Profiler
    12. +
    + +

    Along with 1.9, we are also releasing major updates to the PyTorch libraries, which you can read about in this blog post.

    + +

    We’d like to thank the community for their support and work on this latest release. We’d especially like to thank Quansight and Microsoft for their contributions.

    + +

    Features in PyTorch releases are classified as Stable, Beta, and Prototype. You can learn more about the definitions in this blog post.

    + +

    Frontend APIs

    + +

    (Stable) torch.linalg

    + +

    In 1.9, the torch.linalg module is moving to a stable release. Linear algebra is essential to deep learning and scientific computing, and the torch.linalg module extends PyTorch’s support for it with implementations of every function from NumPy’s linear algebra module (now with support for accelerators and autograd) and more, like torch.linalg.matrix_norm and torch.linalg.householder_product. This makes the module immediately familiar to users who have worked with NumPy. Refer to the documentation here.
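
For example, a few NumPy-style calls (a minimal sketch with arbitrary inputs):

import torch

A = torch.randn(3, 3, dtype=torch.float64)
b = torch.randn(3, dtype=torch.float64)

x = torch.linalg.solve(A, b)        # solve Ax = b
fro = torch.linalg.matrix_norm(A)   # Frobenius norm by default
evals = torch.linalg.eigvals(A)     # complex-valued eigenvalues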

    + +

    We plan to publish another blog post with more details on the torch.linalg module next week!

    + +

    (Stable) Complex Autograd

    + +

The Complex Autograd feature, released as a beta in PyTorch 1.8, is now stable. Since the beta release, we have extended support for Complex Autograd to over 98% of operators in PyTorch 1.9, improved testing for complex operators by adding more OpInfos, and added greater validation through TorchAudio’s migration to native complex tensors (refer to this issue).

    + +

    This feature provides users the functionality to calculate complex gradients and optimize real valued loss functions with complex variables. This is a required feature for multiple current and downstream prospective users of complex numbers in PyTorch like TorchAudio, ESPNet, Asteroid, and FastMRI. Refer to the documentation for more details.
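
A minimal sketch of computing the gradient of a real-valued loss with respect to a complex variable (the toy loss is an arbitrary example):

import torch

z = torch.randn(4, dtype=torch.cfloat, requires_grad=True)
loss = (z * z.conj()).real.sum()   # real-valued loss: sum of |z|^2
loss.backward()
print(z.grad)                      # complex-valued gradient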

    + +

    (Stable) torch.use_deterministic_algorithms()

    + +

To help with debugging and writing reproducible programs, PyTorch 1.9 includes a torch.use_deterministic_algorithms option. When this setting is enabled, operations will behave deterministically, if possible, or throw a runtime error if they might behave nondeterministically. Here are a couple of examples:

    + +
    >>> a = torch.randn(100, 100, 100, device='cuda').to_sparse()
    +>>> b = torch.randn(100, 100, 100, device='cuda')
    +
    +# Sparse-dense CUDA bmm is usually nondeterministic
    +>>> torch.bmm(a, b).eq(torch.bmm(a, b)).all().item()
    +False
    +
    +>>> torch.use_deterministic_algorithms(True)
    +
    +# Now torch.bmm gives the same result each time, but with reduced performance
    +>>> torch.bmm(a, b).eq(torch.bmm(a, b)).all().item()
    +True
    +
    +# CUDA kthvalue has no deterministic algorithm, so it throws a runtime error
    +>>> torch.zeros(10000, device='cuda').kthvalue(1)
    +RuntimeError: kthvalue CUDA does not have a deterministic implementation...
    +
    + +

PyTorch 1.9 adds deterministic implementations for a number of indexing operations, too, including index_add, index_copy, and index_put with accumulate=False. For more details, refer to the documentation and reproducibility note.

    + +

    (Beta) torch.special

    + +

    A torch.special module, analogous to SciPy’s special module, is now available in beta. This module contains many functions useful for scientific computing and working with distributions such as iv, ive, erfcx, logerfc, and logerfcx. Refer to the documentation for more details.
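
A minimal sketch using a couple of the functions mentioned above (inputs are arbitrary):

import torch

x = torch.linspace(0.1, 5.0, 5)
print(torch.special.expit(x))   # logistic sigmoid
print(torch.special.erfcx(x))   # scaled complementary error function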

    + +

    (Beta) nn.Module parameterization

    + +

    nn.Module parameterization allows users to parametrize any parameter or buffer of an nn.Module without modifying the nn.Module itself. It allows you to constrain the space in which your parameters live without the need for special optimization methods.

    + +

    This also contains a new implementation of the spectral_norm parametrization for PyTorch 1.9. More parametrization will be added to this feature (weight_norm, matrix constraints and part of pruning) for the feature to become stable in 1.10. For more details, refer to the documentation and tutorial.
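
A minimal sketch, following the pattern from the tutorial, that constrains a Linear layer’s weight to be symmetric:

import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize

class Symmetric(nn.Module):
    def forward(self, X):
        # Build a symmetric matrix from the upper triangle
        return X.triu() + X.triu(1).transpose(-1, -2)

layer = nn.Linear(4, 4)
parametrize.register_parametrization(layer, "weight", Symmetric())

W = layer.weight            # recomputed through the parametrization on access
assert torch.allclose(W, W.T)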

    + +

    PyTorch Mobile

    + +

    (Beta) Mobile Interpreter

    + +

    We are releasing Mobile Interpreter, a streamlined version of the PyTorch runtime, in beta. The Interpreter will execute PyTorch programs in edge devices, with reduced binary size footprint.

    + +

Mobile Interpreter is one of the top requested features for PyTorch Mobile. This new release will significantly reduce binary size compared with the current on-device runtime. To get the binary size improvements with our interpreter (which can reduce the binary size by up to ~75% for a typical application), follow these instructions. As an example, using Mobile Interpreter, we can reach 2.6 MB compressed with MobileNetV2 in arm64-v7a Android. With this latest release we are making it much simpler to integrate the interpreter by providing pre-built libraries for iOS and Android.
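
A minimal export sketch is below; the toy model is an assumption, and the resulting .ptl file is what the pre-built mobile libraries load on device:

import torch
from torch.utils.mobile_optimizer import optimize_for_mobile

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
scripted = torch.jit.script(model)
optimized = optimize_for_mobile(scripted)
optimized._save_for_lite_interpreter("model.ptl")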

    + +

    TorchVision Library

    + +

Starting from 1.9, users can use the TorchVision library in their iOS/Android apps. The TorchVision library contains the C++ TorchVision ops and needs to be linked together with the main PyTorch library for iOS; for Android it can be added as a Gradle dependency. This allows using TorchVision’s prebuilt Mask R-CNN operators for object detection and segmentation. To learn more about the library, please refer to our tutorials and demo apps.

    + +

    Demo apps

    + +

We are releasing a new video app based on the PyTorch Video library and an updated speech recognition app based on the latest torchaudio wav2vec 2.0 model. Both are available on iOS and Android. In addition, we have updated the seven Computer Vision and three Natural Language Processing demo apps, including the HuggingFace DistilBERT and DeiT vision transformer models, with PyTorch Mobile v1.9. With the addition of these two apps, we now offer a full suite of demo apps covering image, text, audio, and video. To get started check out our iOS demo apps and Android demo apps.

    + +
    + +
    + +

    Distributed Training

    + +

    (Beta) TorchElastic is now part of core

    + +

    TorchElastic, which was open sourced over a year ago in the pytorch/elastic github repository, is a runner and coordinator for PyTorch worker processes. Since then, it has been adopted by various distributed torch use-cases: 1) deepspeech.pytorch 2) pytorch-lightning 3) Kubernetes CRD. Now, it is part of PyTorch core.

    + +

As its name suggests, the core function of TorchElastic is to gracefully handle scaling events. A notable corollary of elasticity is that peer discovery and rank assignment are built into TorchElastic, enabling users to run distributed training on preemptible instances without requiring a gang scheduler. As a side note, etcd used to be a hard dependency of TorchElastic. With the upstreaming, this is no longer the case, since we have added a “standalone” rendezvous based on c10d::Store. For more details, refer to the documentation.

    + +

    (Beta) Distributed Training Updates

    + +

    In addition to TorchElastic, there are a number of beta features available in the distributed package:

    + +
      +
    • +

      (Beta) CUDA support is available in RPC: Compared to CPU RPC and general-purpose RPC frameworks, CUDA RPC is a much more efficient way for P2P Tensor communication. It is built on top of TensorPipe which can automatically choose a communication channel for each Tensor based on Tensor device type and channel availability on both the caller and the callee. Existing TensorPipe channels cover NVLink, InfiniBand, SHM, CMA, TCP, etc. See this recipe for how CUDA RPC helps to attain 34x speedup compared to CPU RPC.

      +
    • +
    • +

      (Beta) ZeroRedundancyOptimizer: ZeroRedundancyOptimizer can be used in conjunction with DistributedDataParallel to reduce the size of per-process optimizer states. The idea of ZeroRedundancyOptimizer comes from DeepSpeed/ZeRO project and Marian, where the optimizer in each process owns a shard of model parameters and their corresponding optimizer states. When running step(), each optimizer only updates its own parameters, and then uses collective communication to synchronize updated parameters across all processes. Refer to this documentation and this tutorial to learn more.

      +
    • +
    • +

      (Beta) Support for profiling distributed collectives: PyTorch’s profiler tools, torch.profiler and torch.autograd.profiler, are able to profile distributed collectives and point to point communication primitives including allreduce, alltoall, allgather, send/recv, etc. This is enabled for all backends supported natively by PyTorch: gloo, mpi, and nccl. This can be used to debug performance issues, analyze traces that contain distributed communication, and gain insight into performance of applications that use distributed training. To learn more, refer to this documentation.

      +
    • +
    + +

    Performance Optimization and Tooling

    + +

    (Stable) Freezing API

    + +

Module Freezing is the process of inlining module parameters and attribute values as constants into the TorchScript internal representation. This allows further optimization and specialization of your program, both for TorchScript optimizations and for lowering to other backends. It is used by the optimize_for_mobile API, ONNX, and others.

    + +

    Freezing is recommended for model deployment. It helps TorchScript JIT optimizations optimize away overhead and bookkeeping that is necessary for training, tuning, or debugging PyTorch models. It enables graph fusions that are not semantically valid on non-frozen graphs - such as fusing Conv-BN. For more details, refer to the documentation.
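
A minimal sketch (the Conv-BN toy model is illustrative, not from the release notes):

import torch

model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8), torch.nn.ReLU()).eval()

frozen = torch.jit.freeze(torch.jit.script(model))   # parameters and attributes inlined as constants
out = frozen(torch.randn(1, 3, 32, 32))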

    + +

    (Beta) PyTorch Profiler

    + +
    + +
    + +

    The new PyTorch Profiler graduates to beta and leverages Kineto for GPU profiling, TensorBoard for visualization and is now the standard across our tutorials and documentation.

    + +

PyTorch 1.9 extends support for the new torch.profiler API to more builds, including Windows and Mac, and is recommended in most cases instead of the previous torch.autograd.profiler API. The new API supports existing profiler features, integrates with the CUPTI library (Linux-only) to trace on-device CUDA kernels, and provides support for long-running jobs, e.g.:

    + +
import torch
+from torch.profiler import profile, ProfilerActivity
+
+def trace_handler(p):
    +    output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
    +    print(output)
    +    p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")
    +
    +with profile(
    +    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    +    # schedule argument specifies the iterations on which the profiler is active
    +    schedule=torch.profiler.schedule(
    +        wait=1,
    +        warmup=1,
    +        active=2),
    +    # on_trace_ready argument specifies the handler for the traces
    +    on_trace_ready=trace_handler
    +) as p:
    +    for idx in range(8):
    +        model(inputs)
    +        # profiler will trace iterations 2 and 3, and then 6 and 7 (counting from zero)
    +        p.step()
    +
    + +

    More usage examples can be found on the profiler recipe page.

    + +

    The PyTorch Profiler Tensorboard plugin has new features for:

    +
      +
    • Distributed Training summary view with communications overview for NCCL
    • +
    • GPU Utilization and SM Efficiency in Trace view and GPU operators view
    • +
    • Memory Profiling view
    • +
    • Jump to source when launched from Microsoft VSCode
    • +
• Ability to load traces from cloud object storage systems
    • +
    + +

    (Beta) Inference Mode API

    + +

    Inference Mode API allows significant speed-up for inference workloads while remaining safe and ensuring no incorrect gradients can ever be computed. It offers the best possible performance when no autograd is required. For more details, refer to the documentation for inference mode itself and the documentation explaining when to use it and the difference with no_grad mode.
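
A minimal sketch (the toy model is an assumption):

import torch

model = torch.nn.Linear(16, 4).eval()

with torch.inference_mode():
    out = model(torch.randn(8, 16))

print(out.requires_grad)  # False: inference-mode tensors skip autograd bookkeeping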

    + +

    (Beta) torch.package

    + +

    torch.package is a new way to package PyTorch models in a self-contained, stable format. A package will include both the model’s data (e.g. parameters, buffers) and its code (model architecture). Packaging a model with its full set of Python dependencies, combined with a description of a conda environment with pinned versions, can be used to easily reproduce training. Representing a model in a self-contained artifact will also allow it to be published and transferred throughout a production ML pipeline while retaining the flexibility of a pure-Python representation. For more details, refer to the documentation.

    + +

    (Prototype) prepare_for_inference

    + +

    prepare_for_inference is a new prototype feature that takes in a module and performs graph-level optimizations to improve inference performance, depending on the device. It is meant to be a PyTorch-native option that requires minimal changes to user’s workflows. For more details, see the documentation for the Torchscript version here or the FX version here.

    + +

    (Prototype) Profile-directed typing in TorchScript

    + +

    TorchScript has a hard requirement for source code to have type annotations in order for compilation to be successful. For a long time, it was only possible to add missing or incorrect type annotations through trial and error (i.e., by fixing the type-checking errors generated by torch.jit.script one by one), which was inefficient and time consuming. Now, we have enabled profile directed typing for torch.jit.script by leveraging existing tools like MonkeyType, which makes the process much easier, faster, and more efficient. For more details, refer to the documentation.

    + +

    Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Facebook, Twitter, Medium, YouTube, or LinkedIn.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-2-1/index.html b/blog/pytorch-2-1/index.html new file mode 100644 index 000000000000..98267c55bb9b --- /dev/null +++ b/blog/pytorch-2-1/index.html @@ -0,0 +1,819 @@ + + + + + + + + + + + + + PyTorch 2.1: automatic dynamic shape compilation, distributed checkpointing | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch® 2.1 (release note)! PyTorch 2.1 offers automatic dynamic shape support in torch.compile, torch.distributed.checkpoint for saving/loading distributed training jobs on multiple ranks in parallel, and torch.compile support for the NumPy API.

    + +

    In addition, this release offers numerous performance improvements (e.g. CPU inductor improvements, AVX512 support, scaled-dot-product-attention support) as well as a prototype release of torch.export, a sound full-graph capture mechanism, and torch.export-based quantization.

    + +

    Along with 2.1, we are also releasing a series of updates to the PyTorch domain libraries. More details can be found in the library updates blog. 

    + +

    This release is composed of 6,682 commits and 784 contributors since 2.0. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.1.  More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

    + +

    Summary: 

    +
      +
    • torch.compile now includes automatic support for detecting and minimizing recompilations due to tensor shape changes using automatic dynamic shapes.
    • +
    • torch.distributed.checkpoint enables saving and loading models from multiple ranks in parallel, as well as resharding due to changes in cluster topology.
    • +
    • torch.compile can now compile NumPy operations via translating them into PyTorch-equivalent operations.
    • +
    • torch.compile now includes improved support for Python 3.11.
    • +
    • New CPU performance features include inductor improvements (e.g. bfloat16 support and dynamic shapes), AVX512 kernel support, and scaled-dot-product-attention kernels.
    • +
    • torch.export, a sound full-graph capture mechanism is introduced as a prototype feature, as well as torch.export-based quantization.
    • +
    • torch.sparse now includes prototype support for semi-structured (2:4) sparsity on NVIDIA® GPUs.
    • +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    StableBetaPrototypePerformance Improvements
     Automatic Dynamic Shapestorch.export()AVX512 kernel support
     torch.distributed.checkpointTorch.export-based QuantizationCPU optimizations for scaled-dot-product-attention (SPDA)
     torch.compile + NumPysemi-structed (2:4) sparsityCPU optimizations for bfloat16
     torch.compile + Python 3.11cpp_wrapper for torchinductor 
     torch.compile + autograd.Function  
     third-party device integration: PrivateUse1  
    + +

    *To see a full list of public 2.1, 2.0, and 1.13 feature submissions click here.

    + +

    Beta Features

    + +

    (Beta) Automatic Dynamic Shapes

    + +

    Dynamic shapes is functionality built into torch.compile that can minimize recompilations by tracking and generating code based on the symbolic shape of a tensor rather than the static shape (e.g. [B, 128, 4] rather than [64, 128, 4]). This allows torch.compile to generate a single kernel that can work for many sizes, at only a modest cost to efficiency. Dynamic shapes has been greatly stabilized in PyTorch 2.1, and is now automatically enabled if torch.compile notices recompilation due to varying input shapes. You can disable automatic dynamic by passing dynamic=False to torch.compile, or by setting torch._dynamo.config.automatic_dynamic_shapes = False.
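
A minimal sketch of the behavior described above (the function and shapes are arbitrary):

import torch

@torch.compile
def fn(x):
    return torch.nn.functional.relu(x) * 2

fn(torch.randn(64, 128))   # first call: compiled and specialized on shape (64, 128)
fn(torch.randn(32, 128))   # shape change: recompiled once with a symbolic batch size
fn(torch.randn(16, 128))   # reuses the dynamic kernel, no further recompilation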

    + +

    In PyTorch 2.1, we have shown good performance with dynamic shapes enabled on a variety of model types, including large language models, on both CUDA and CPU.

    + +

    For more information on dynamic shapes, see this documentation.

    + +

    [Beta] torch.distributed.checkpoint

    + +

    torch.distributed.checkpoint enables saving and loading models from multiple ranks in parallel. In addition, checkpointing automatically handles fully-qualified-name (FQN) mappings across models and optimizers, enabling load-time resharding across differing cluster topologies.
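
A minimal sketch is below; it assumes the process group is already initialized by a launcher, and the toy model stands in for a sharded or DDP-wrapped one:

import torch
import torch.distributed.checkpoint as dist_cp

model = torch.nn.Linear(8, 8)          # stands in for a distributed model
state_dict = {"model": model.state_dict()}

# Each rank writes its part of the checkpoint in parallel
dist_cp.save_state_dict(
    state_dict=state_dict,
    storage_writer=dist_cp.FileSystemWriter("/tmp/checkpoint"))

# Later, possibly with a different number of ranks, load (and reshard) in place
dist_cp.load_state_dict(
    state_dict=state_dict,
    storage_reader=dist_cp.FileSystemReader("/tmp/checkpoint"))
model.load_state_dict(state_dict["model"])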

    + +

    For more information, see torch.distributed.checkpoint documentation and tutorial.

    + +

    [Beta] torch.compile + NumPy

    + +

    torch.compile now understands how to compile NumPy operations via translating them into PyTorch-equivalent operations.  Because this integration operates in a device-agnostic manner, you can now GPU-accelerate NumPy programs – or even mixed NumPy/PyTorch programs – just by using torch.compile.
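    For example, a function written against NumPy can be compiled directly (a minimal sketch):

    import numpy as np
    import torch

    @torch.compile
    def numpy_fn(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        # The NumPy calls are traced and translated into PyTorch ops under the hood.
        return np.sum(x * y, axis=0)

    out = numpy_fn(np.random.randn(8, 4), np.random.randn(8, 4))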

    + +

    Please see this section in the torch.compile FAQ for more information about torch.compile + NumPy interaction, and follow the PyTorch Blog for a forthcoming blog about this feature.

    + +

    [Beta] torch.compile + Python 3.11

    + +

    torch.compile previously only supported Python versions 3.8-3.10. Users can now optimize models with torch.compile in Python 3.11.

    + +

    [Beta] torch.compile + autograd.Function

    + +

    torch.compile can now trace and optimize the backward function of user-defined autograd Functions, which unlocks training optimizations for models that make heavier use of extension mechanisms.
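    A minimal sketch of the kind of user-defined autograd Function whose backward can now participate in compilation (the class and function names here are illustrative):

    import torch

    class MySquare(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)
            return x * x

        @staticmethod
        def backward(ctx, grad_out):
            (x,) = ctx.saved_tensors
            return 2 * x * grad_out

    @torch.compile
    def f(x):
        return MySquare.apply(x).sum()

    x = torch.randn(4, requires_grad=True)
    f(x).backward()  # the custom backward can now be traced and optimized too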

    + +

    [Beta] Improved third-party device support: PrivateUse1

    + +

    Third-party device types can now be registered to PyTorch using the privateuse1 dispatch key.  This allows device extensions to register new kernels to PyTorch and to associate them with the new key, allowing user code to work equivalently to built-in device types.  For example, to register “my_hardware_device”, one can do the following:

    + +
    import torch

    torch.utils.rename_privateuse1_backend("my_hardware_device")
    torch.utils.generate_methods_for_privateuse1_backend()
    x = torch.randn((2, 3), device='my_hardware_device')
    y = x + x  # run add kernel on 'my_hardware_device'
    + +

    To validate this feature, the OSS team from Ascend NPU has successfully integrated torch_npu into PyTorch as a plug-in through the PrivateUse1 functionality.

    + +

    For more information, please see the PrivateUse1 tutorial here.

    + +

    Prototype Features

    + +

    [Prototype] torch.export()

    + +

    torch.export() provides a sound tracing mechanism to capture a full graph from a PyTorch program based on new technologies provided by PT2.0.

    + +

    Users can extract a clean representation (Export IR) of a PyTorch program in the form of a dataflow graph, consisting mostly of straight-line calls to PyTorch operators. Export IR can then be transformed, serialized, saved to a file, transferred, and loaded back for execution in an environment with or without Python.
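    A minimal sketch of capturing and inspecting a program (the exact API surface may shift while torch.export is a prototype; the module here is illustrative):

    import torch
    from torch.export import export

    class M(torch.nn.Module):
        def forward(self, x):
            return torch.relu(x) + 1.0

    exported = export(M(), (torch.randn(3, 4),))  # sound full-graph capture
    print(exported.graph_module.graph)            # inspect the Export IR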

    + +

    For more information, please see the tutorial here.

    + +

    [Prototype] torch.export-based Quantization

    + +

    torch.ao.quantization now supports quantization on PyTorch 2 torch.export-based flows.  This includes support for the built-in XNNPACK and X86Inductor Quantizers, as well as the ability to specify one’s own Quantizer.

    + +

    For an explanation of post-training static quantization with torch.export, see this tutorial; for quantization-aware training for static quantization with torch.export, see this tutorial.

    + +

    For an explanation on how to write one’s own Quantizer, see this tutorial.
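    A rough sketch of the intended flow, assuming the XNNPACKQuantizer and the prepare_pt2e/convert_pt2e entry points described in the prototype tutorials (the module paths and the capture step were still evolving at the time and may differ across versions):

    import torch
    from torch._export import capture_pre_autograd_graph  # capture step used by the prototype tutorials (assumption)
    from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
    from torch.ao.quantization.quantizer.xnnpack_quantizer import (
        XNNPACKQuantizer,
        get_symmetric_quantization_config,
    )

    model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).eval()
    example_inputs = (torch.randn(1, 16),)

    captured = capture_pre_autograd_graph(model, example_inputs)

    quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
    prepared = prepare_pt2e(captured, quantizer)
    prepared(*example_inputs)             # calibrate with representative data
    quantized = convert_pt2e(prepared)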

    + +

    [Prototype] semi-structured (2:4) sparsity for NVIDIA® GPUs

    + +

    torch.sparse now supports creating and accelerating compute over semi-structured sparse (2:4) tensors.  For more information on the format, see this blog from NVIDIA. A minimal example introducing semi-structured sparsity is as follows:

    + +
    import torch
    from torch import nn
    from torch.sparse import to_sparse_semi_structured

    x = torch.rand(64, 64).half().cuda()
    mask = torch.tensor([0, 0, 1, 1]).tile((64, 16)).cuda().bool()
    linear = nn.Linear(64, 64).half().cuda()

    linear.weight = nn.Parameter(to_sparse_semi_structured(linear.weight.masked_fill(~mask, 0)))
    linear(x)
    + +

    To learn more, please see the documentation and accompanying tutorial.

    + +

    [Prototype] cpp_wrapper for torchinductor

    + +

    cpp_wrapper can reduce the Python overhead for invoking kernels in torchinductor by generating the kernel wrapper code in C++. This feature is still in the prototype phase; it does not support all programs that successfully compile in PT2 today. Please file issues if you discover limitations for your use case to help us prioritize.

    + +

    The API to turn this feature on is:

    +
    import torch
    import torch._inductor.config as config

    config.cpp_wrapper = True
    + +

    For more information, please see the tutorial.

    + +

    Performance Improvements

    + +

    AVX512 kernel support

    + +

    In PyTorch 2.0, AVX2 kernels would be used even if the CPU supported AVX512 instructions.  Now, PyTorch defaults to using AVX512 CPU kernels if the CPU supports those instructions, equivalent to setting ATEN_CPU_CAPABILITY=avx512 in previous releases.  The previous behavior can be enabled by setting ATEN_CPU_CAPABILITY=avx2.

    + +

    CPU optimizations for scaled-dot-product-attention (SDPA)

    + +

    Previous versions of PyTorch provided optimized CUDA implementations for transformer primitives via torch.nn.functional.scaled_dot_product_attention.  PyTorch 2.1 includes optimized FlashAttention-based CPU routines.
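    For instance, the same call now hits an optimized CPU path with no code changes (a minimal sketch; shapes are illustrative):

    import torch
    import torch.nn.functional as F

    q = torch.randn(2, 8, 128, 64)   # CPU tensors; no CUDA device required
    k = torch.randn(2, 8, 128, 64)
    v = torch.randn(2, 8, 128, 64)
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)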

    + +

    See the documentation here.

    + +

    CPU optimizations for bfloat16

    + +

    PyTorch 2.1 includes CPU optimizations for bfloat16, including improved vectorization support and torchinductor codegen.
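    A minimal sketch of combining CPU bfloat16 autocast with torch.compile to exercise these paths (the model here is illustrative):

    import torch

    model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())
    compiled = torch.compile(model)

    x = torch.randn(8, 64)
    with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        out = compiled(x)  # runs through the bfloat16 CPU kernels and inductor codegen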

    + +
diff --git a/blog/pytorch-2-7-intel-gpus/index.html b/blog/pytorch-2-7-intel-gpus/index.html
new file mode 100644
index 000000000000..f8908107fab1
--- /dev/null
+++ b/blog/pytorch-2-7-intel-gpus/index.html
+ Accelerate PyTorch 2.7 on Intel® GPUs | PyTorch

    April 25, 2025

    +

    + Accelerate PyTorch 2.7 on Intel® GPUs +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + the Intel PyTorch Team + +

    +

    PyTorch 2.7 continues to deliver significant functionality and performance enhancements on Intel® GPU architectures to streamline AI workflows. Application developers and researchers seeking to fine-tune, run inference on, and develop PyTorch models on Intel GPUs will now have a consistent user experience across various operating systems, including Windows, Linux and Windows Subsystem for Linux (WSL2). This is made possible through improved installation, eager mode script debugging, a performance profiler, and graph mode (torch.compile) deployment. As a result, developers have greater options with a unified GPU programming paradigm for both front-end and back-end development.

    + +

    Incremental improvements of Intel GPU support in PyTorch

    + +

    Since PyTorch 2.4, we’ve made steady improvements to Intel GPU support with each release. With PyTorch 2.7, we are excited to share that we have established a solid foundation for Intel GPUs to work in both graph mode (torch.compile) and eager mode on Windows and Linux. This covers a wide range of Intel GPU products, many of which you may already have access to. We hope these enhancements will unlock more ubiquitous hardware for your AI research and development.

    + + + +

    Check out the detailed advancements in these related release blogs: PyTorch 2.4, PyTorch 2.5, and PyTorch 2.6.

    + +

    What’s New in PyTorch 2.7

    + +

    These are the features in PyTorch 2.7 that were added to help accelerate performance on Intel GPUs.

    + +
      +
    • Improve scaled dot-product attention (SDPA) inference performance with bfloat16 and float16 to accelerate attention-based models on Intel GPUs.
    +With the new SDPA optimization for Intel GPUs in PyTorch 2.7, Stable Diffusion float16 inference achieved up to a 3x gain over the PyTorch 2.6 release on Intel® Arc™ B580 Graphics and Intel® Core™ Ultra 7 Processor 258V with Intel® Arc™ Graphics 140V in eager mode. See Figure 1 below.
    • +
    + +

    chart

    + +

    Figure 1. PyTorch 2.7 Stable Diffusion Performance Gains Over PyTorch 2.6

    + +
      +
    • Enable torch.compile on Windows 11 for Intel GPUs, delivering the performance advantages over eager mode as on Linux. With this, Intel GPUs became the first accelerator to support torch.compile on Windows. Refer to Windows tutorial for details.
    +Graph mode (torch.compile) is enabled on Windows 11 for Intel GPUs for the first time, delivering the same performance advantages over eager mode as on Linux with PyTorch 2.7. The latest performance data was measured on the PyTorch Dynamo Benchmarking Suite using Intel® Arc™ B580 Graphics on Windows, showcasing the torch.compile speedup ratio over eager mode as shown in Figure 2. Both training and inference achieved similar significant improvements.
    • +
    + +

    chart

    + +

    Figure 2. Torch.compile Performance Gains Over Eager Mode on Windows

    + +
      +
    • Optimize the performance of PyTorch 2 Export Post Training Quantization (PT2E) on Intel GPU to provide full graph mode quantization pipelines with enhanced computational efficiency. Refer to PT2E tutorial for details.
    • +
    • Enable AOTInductor and torch.export on Linux to simplify deployment workflows. Refer to AOTInductor tutorial for details.
    • +
    • Enable profiler on both Windows and Linux to facilitate model performance analysis. Refer to the PyTorch profiler tutorial for details.
    • +
    + +

    Review the Getting Started on Intel GPU Guide for a tour of the environment setup and a quick start on Intel GPUs.

    + +

    Future Work

    + +

    Looking ahead, we will continue the Intel GPU upstream efforts in future PyTorch releases to:

    + +
      +
    • Attain state-of-the-art PyTorch-native performance to showcase competitive GEMM computational efficiency for torch.compile, and enhance performance for LLMs through FlexAttention and lower-precision data types.
    • +
    • Broaden feature compatibility by delivering distributed XCCL backend support for Intel® Data Center GPU Max Series.
    • +
    • Expand accelerator support across core PyTorch ecosystem components including torchao, torchtune, and torchtitan.
    • +
    + +

    Follow along in the PyTorch Dev Discussion to learn more about Intel GPU & CPU enabling status and features. As we get further along, we will create tickets on GitHub to document our progress.

    + +

    Summary

    + +

    In this blog, we reviewed the Intel GPU upstream progress starting in PyTorch 2.4 and highlighted the new features of PyTorch 2.7 that accelerate AI workload performance across various Intel GPUs. These new features, especially SDPA on Windows, achieved up to a 3x inference gain (Stable Diffusion, float16) over the PyTorch 2.6 release on Intel Arc B580 Graphics and Intel Core Ultra 7 Processor 258V with Intel Arc Graphics 140V. Also, torch.compile on Windows delivers similar performance advantages over eager mode on Dynamo benchmarks as on Linux.

    + +

    Acknowledgments

    + +

    We want to thank the following PyTorch maintainers for their technical discussions and insights: Nikita Shulga, Jason Ansel, Andrey Talman, Alban Desmaison, and Bin Bao.

    + +

    We also thank collaborators from PyTorch for their professional support and guidance.

    + +

    Product and Performance Information

    + +

    Measurement on Intel Core Ultra 7 258V: 2200 MHz, 8 Core(s), 8 Logical Processor(s) with Intel Arc 140V GPU (16GB), GPU memory 18.0 GB, using Intel Graphics Driver 32.0.101.6647 (WHQL Certified), Windows 11 Pro - 24H2. And Intel Core Ultra 5 245KF: 4200 MHz, 14 Core(s), 14 Logical Processor(s), Intel Arc B580 Graphics, dedicated GPU memory 12.0 GB, shared GPU memory 15.8 GB, using Intel Graphics Driver 32.0.101.6647 (WHQL Certified), Windows 11 Enterprise LTSC - 24H2. Test by Intel on Apr 8th, 2025.

    + +

    Notices and Disclaimers

    + +

    Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

    + +

    Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

    + +

    AI Disclaimer

    + +

    AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at www.intel.com/AIPC. Results may vary.

    + +
diff --git a/blog/pytorch-2-7/index.html b/blog/pytorch-2-7/index.html
new file mode 100644
index 000000000000..c99f25779361
--- /dev/null
+++ b/blog/pytorch-2-7/index.html
+ PyTorch 2.7 Release | PyTorch

    April 23, 2025

    +

    + PyTorch 2.7 Release +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch® 2.7 (release notes)! This release features:

    + +
      +
    • support for the NVIDIA Blackwell GPU architecture and pre-built wheels for CUDA 12.8 across Linux x86 and arm64 architectures.
    • +
    • torch.compile support for Torch Function Modes, which enables users to override any torch.* operation to implement custom user-defined behavior.
    • +
    • Mega Cache, which allows users to have end-to-end portable caching for torch.
    • +
    • new features for FlexAttention - LLM first token processing, LLM throughput mode optimization and Flex Attention for Inference.
    • +
    + +

    This release is composed of 3262 commits from 457 contributors since PyTorch 2.6. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.7. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

    Beta | Prototype
    Torch.Compile support for Torch Function Modes | NVIDIA Blackwell Architecture Support
    Mega Cache | PyTorch Native Context Parallel
     | Enhancing Intel GPU Acceleration
     | FlexAttention LLM first token processing on x86 CPUs
     | FlexAttention LLM throughput mode optimization on x86 CPUs
     | Foreach Map
     | Flex Attention for Inference
     | Prologue Fusion Support in Inductor

    *To see a full list of public feature submissions click here.

    + +

    BETA FEATURES

    + +

    [Beta] Torch.Compile support for Torch Function Modes

    + +

    This feature enables users to override any torch.* operation to implement custom user-defined behavior. For example, ops can be rewritten to accommodate a specific backend. This is used in FlexAttention to rewrite indexing ops.
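    A toy sketch of the kind of override this enables (the mode and function names here are made up for illustration):

    import torch
    from torch.overrides import TorchFunctionMode

    class FlipAddToMul(TorchFunctionMode):
        # Illustrative only: rewrite torch.add calls into torch.mul.
        def __torch_function__(self, func, types, args=(), kwargs=None):
            kwargs = kwargs or {}
            if func is torch.add:
                return torch.mul(*args, **kwargs)
            return func(*args, **kwargs)

    @torch.compile
    def f(x, y):
        return torch.add(x, y)

    with FlipAddToMul():
        out = f(torch.full((3,), 2.0), torch.full((3,), 4.0))  # behaves like x * y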

    + +

    See the tutorial for more information.

    + +

    [Beta] Mega Cache

    + +

    Mega Cache allows users to have end-to-end portable caching for torch. The intended use case is after compiling and executing a model, the user calls torch.compiler.save_cache_artifacts() which will return the compiler artifacts in a portable form. Later, potentially on a different machine, the user may call torch.compiler.load_cache_artifacts() with these artifacts to pre-populate the torch.compile caches in order to jump-start their cache.
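    A minimal sketch of the round trip described above (the exact return value of save_cache_artifacts is an assumption here; see the torch.compiler docs for the precise signature):

    import torch

    model = torch.compile(torch.nn.Linear(8, 8))
    model(torch.randn(2, 8))  # compile and run once so the caches are populated

    # Export the compiler artifacts in a portable form.
    artifact_bytes, cache_info = torch.compiler.save_cache_artifacts()  # assumed return shape

    # Later, potentially on a different machine, pre-populate the caches.
    torch.compiler.load_cache_artifacts(artifact_bytes)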

    + +

    See the tutorial for more information.

    + +

    PROTOTYPE FEATURES

    + +

    [Prototype] NVIDIA Blackwell Architecture Support

    + +

    PyTorch 2.7 introduces support for NVIDIA’s new Blackwell GPU architecture and ships pre-built wheels for CUDA 12.8. For more details on CUDA 12.8 see CUDA Toolkit Release.

    + +
      +
    • Core components and libraries including cuDNN, NCCL, and CUTLASS have been upgraded to ensure compatibility with Blackwell platforms.
    • +
    • PyTorch 2.7 includes Triton 3.3, which adds support for the Blackwell architecture with torch.compile compatibility.
    • +
    • To utilize these new features, install PyTorch with CUDA 12.8 using: pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cu128
    • +
    + +

    More context can also be found here.

    + +

    [Prototype] PyTorch Native Context Parallel

    + +

    The PyTorch Context Parallel API allows users to create a Python context so that every torch.nn.functional.scaled_dot_product_attention() call within it will run with context parallelism. Currently, PyTorch Context Parallel supports 3 attention backends: 1. Flash attention; 2. Efficient attention; and 3. cuDNN attention.

    + +

    As an example, this is used within TorchTitan as the Context Parallel solution for LLM training.

    + +

    See tutorial here.

    + +

    [Prototype] Enhancing Intel GPU Acceleration

    + +

    This latest release introduces enhanced performance optimizations for Intel GPU architectures. These improvements accelerate workloads across various Intel GPUs through the following key enhancements:

    + +
      +
    • Enable torch.compile on Windows 11 for Intel GPUs, delivering the performance advantages over eager mode as on Linux.
    • +
    • Optimize the performance of PyTorch 2 Export Post Training Quantization (PT2E) on Intel GPU to provide full graph mode quantization pipelines with enhanced computational efficiency.
    • +
    • Improve Scaled Dot-Product Attention (SDPA) inference performance with bfloat16 and float16 to accelerate attention-based models on Intel GPUs.
    • +
    • Enable AOTInductor and torch.export on Linux to simplify deployment workflows.
    • +
    • Implement more ATen operators to enhance the continuity of operator execution on Intel GPUs and increase eager-mode performance.
    • +
    • Enable profiler on both Windows and Linux to facilitate model performance analysis.
    • +
    • Expand the Intel GPUs support to Intel® Core™ Ultra Series 2 with Intel® Arc™ Graphics, and Intel® Arc™ B-Series graphics on both Windows and Linux.
    • +
    + +

    For more information regarding Intel GPU support, please refer to Getting Started Guide.

    + +

    See also the tutorials here and here.

    + +

    [Prototype] FlexAttention LLM first token processing on x86 CPUs

    + +

    FlexAttention x86 CPU support was first introduced in PyTorch 2.6, offering optimized implementations, such as PageAttention, which is critical for LLM inference, via the TorchInductor C++ backend. In PyTorch 2.7, more attention variants for first token processing of LLMs are supported. With this feature, users can have a smoother experience running FlexAttention on x86 CPUs, replacing specific scaled_dot_product_attention operators with a unified FlexAttention API, and benefiting from general support and good performance when using torch.compile.
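    A minimal sketch of the unified FlexAttention API mentioned above, using a causal score_mod and torch.compile (shapes and the score_mod are illustrative):

    import torch
    from torch.nn.attention.flex_attention import flex_attention

    def causal(score, b, h, q_idx, kv_idx):
        # Mask out future positions; the modification is fused into the generated kernel.
        return torch.where(q_idx >= kv_idx, score, -float("inf"))

    q = torch.randn(1, 8, 128, 64)
    k = torch.randn(1, 8, 128, 64)
    v = torch.randn(1, 8, 128, 64)

    compiled_flex = torch.compile(flex_attention)
    out = compiled_flex(q, k, v, score_mod=causal)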

    + +

    [Prototype] FlexAttention LLM throughput mode optimization

    + +

    The performance of FlexAttention on x86 CPUs for LLM inference throughput scenarios has been further improved by adopting the new C++ micro-GEMM template capability. This addresses the performance bottlenecks for large batch size scenarios present in PyTorch 2.6. With this enhancement, users can transparently benefit from better performance and a smoother experience when using FlexAttention APIs and torch.compile for LLM throughput serving on x86 CPUs.

    + +

    [Prototype] Foreach Map

    + +

    This feature uses torch.compile to allow users to apply any pointwise or user-defined function (e.g. torch.add) to lists of tensors, akin to the existing torch._foreach_* ops. The main advantage over the existing torch._foreach_* ops is that any mix of scalars or lists of tensors can be supplied as arguments, and even user-defined Python functions can be lifted to apply to lists of tensors. torch.compile will automatically generate a horizontally fused kernel for optimal performance.

    + +

    See tutorial here.

    + +

    [Prototype] Flex Attention for Inference

    + +

    In release 2.5.0, FlexAttention (torch.nn.attention.flex_attention) was introduced for ML researchers who’d like to customize their attention kernels without writing kernel code. This update introduces a decoding backend optimized for inference, supporting GQA and PagedAttention, along with feature updates including nested jagged tensor support, performance tuning guides and trainable biases support.

    + +

    [Prototype] Prologue Fusion Support in Inductor

    + +

    Prologue fusion optimizes matrix multiplication (matmul) operations by fusing operations that come before the matmul into the matmul kernel itself, improving performance by reducing global memory bandwidth.

    + +
diff --git a/blog/pytorch-2-paper-tutorial/index.html b/blog/pytorch-2-paper-tutorial/index.html
new file mode 100644
index 000000000000..f6b890fd96f9
--- /dev/null
+++ b/blog/pytorch-2-paper-tutorial/index.html
+ PyTorch 2 paper and tutorial @ ASPLOS 2024 | PyTorch

    February 06, 2024

    +

    + PyTorch 2 paper and tutorial @ ASPLOS 2024 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    The PyTorch team is excited to share that our paper on PyTorch 2 has been accepted for presentation at the ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), scheduled to take place from April 27 to May 1, 2024, in San Diego, CA, USA.

    + +

    The paper delves into the implementation of torch.compile and highlights the key technologies driving it, including TorchDynamo (graph capture), TorchInductor (backend compiler), and Dynamic Shape support.

    + +

    During the ASPLOS conference, we’ll be conducting a tutorial on Saturday, April 27, focusing on the inner workings of PyTorch 2 and how systems researchers can leverage and build upon it. Stay tuned for more details as the event approaches – we look forward to your participation!

    + +

    A preview of the paper is attached below:

    + +

    Title: PyTorch 2: Faster Machine Learning Through Dynamic Python Bytecode Transformation and Graph Compilation. Full Paper PDF

    + +

    Abstract

    +

    This paper introduces two extensions to the popular PyTorch machine learning framework, TorchDynamo and TorchInductor, which implement the torch.compile feature released in PyTorch 2. TorchDynamo is a Python-level just-in-time (JIT) compiler that enables graph compilation in PyTorch programs without sacrificing the flexibility of Python. It achieves this by dynamically modifying Python bytecode before execution and extracting sequences of PyTorch operations into an FX graph, which is then JIT compiled using one of many extensible backends. TorchInductor is the default compiler backend for TorchDynamo, which translates PyTorch programs into OpenAI’s Triton for GPUs and C++ for CPUs. Results show that TorchDynamo is able to capture graphs more robustly than prior approaches while adding minimal overhead, and TorchInductor is able to provide a 2.27x inference and 1.41x training geometric mean speedup on an NVIDIA A100 GPU across 180+ real-world models, which outperforms six other compilers. These extensions provide a new way to apply optimizations through compilers in eager mode frameworks like PyTorch.

    + +

    Authors

    + +

    Jason Ansel (Meta); Edward Yang (Meta); Horace He (Meta); Natalia Gimelshein (OpenAI); Animesh Jain (Meta); Michael Voznesensky (Meta); Bin Bao (Meta); Peter Bell (Quansight); David Berard (Meta); Evgeni Burovski Quansight; Geeta Chauhan (Meta); Anjali Chourdia (Meta); Will Constable (Meta); Alban Desmaison (Meta); Zachary DeVito (Meta); Elias Ellison (Meta); Will Feng (Meta); Jiong Gong (Intel); Michael Gschwind (Meta); Brian Hirsh (Meta); Sherlock Huang (Meta); Kshiteej Kalambarkar (Quansight); Laurent Kirsch (Meta); Michael Lazos (Meta); Mario Lezcano (Quansight); Yanbo Liang (Meta); Jason Liang (Meta); Yinghai Lu (Meta); CK Luk (Meta); Bert Maher (Meta); Yunjie Pan (University of Michigan); Christian Puhrsch (Meta); Matthias Reso (Meta); Mark Saroufim (Meta); Marcos Yukio Siraichi (Quansight); Helen Suk (Meta); Michael Suo (Meta); Phil Tillet (OpenAI); Eikan Wang (Intel); Xiaodong Wang (Meta); William Wen (Meta); Shunting Zhang (Meta); Xu Zhao (Meta); Keren Zhou (OpenAI & George Mason University); Richard Zou (Meta); Ajit Mathews (Meta); Gregory Chanan (Meta); Peng Wu (Meta); Soumith Chintala (Meta)

    + +

    ASPLOS’24 - Full Day Tutorial Schedule

    + +

    The full schedule for the ASPLOS’24 PyTorch 2 Tutorial on Saturday, April 27th is available here

    + +
diff --git a/blog/pytorch-2.0-release/index.html b/blog/pytorch-2.0-release/index.html
new file mode 100644
index 000000000000..db84b184157f
--- /dev/null
+++ b/blog/pytorch-2.0-release/index.html
+ PyTorch 2.0: Our next generation release that is faster, more Pythonic and Dynamic as ever | PyTorch

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch® 2.0 which we highlighted during the PyTorch Conference on 12/2/22! PyTorch 2.0 offers the same eager-mode development and user experience, while fundamentally changing and supercharging how PyTorch operates at compiler level under the hood with faster performance and support for Dynamic Shapes and Distributed.

    + +

    This next-generation release includes a Stable version of Accelerated Transformers (formerly called Better Transformers); Beta includes torch.compile as the main API for PyTorch 2.0, the scaled_dot_product_attention function as part of torch.nn.functional, the MPS backend, functorch APIs in the torch.func module; and other Beta/Prototype improvements across various inferences, performance and training optimization features on GPUs and CPUs. For a comprehensive introduction and technical overview of torch.compile, please visit the 2.0 Get Started page.

    + +

    Along with 2.0, we are also releasing a series of beta updates to the PyTorch domain libraries, including those that are in-tree, and separate libraries including TorchAudio, TorchVision, and TorchText. An update for TorchX is also being released as it moves to community supported mode. More details can be found in this library blog.

    + +

    This release is composed of over 4,541 commits and 428 contributors since 1.13.1. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.0 and the overall 2-series this year.

    + +

    Summary:

    +
      +
    • torch.compile is the main API for PyTorch 2.0, which wraps your model and returns a compiled model. It is a fully additive (and optional) feature and hence 2.0 is 100% backward compatible by definition.
    • +
    • As an underpinning technology of torch.compile, TorchInductor with Nvidia and AMD GPUs will rely on OpenAI Triton deep learning compiler to generate performant code and hide low level hardware details. OpenAI Triton-generated kernels achieve performance that’s on par with hand-written kernels and specialized cuda libraries such as cublas.
    • +
    • Accelerated Transformers introduce high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SDPA). The API is integrated with torch.compile() and model developers may also use the scaled dot product attention kernels directly by calling the new scaled_dot_product_attention() operator.
    • +
    • Metal Performance Shaders (MPS) backend provides GPU accelerated PyTorch training on Mac platforms with added support for Top 60 most used ops, bringing coverage to over 300 operators.
    • +
    • Amazon AWS optimizes the PyTorch CPU inference on AWS Graviton3 based C7g instances. PyTorch 2.0 improves inference performance on Graviton compared to the previous releases, including improvements for Resnet50 and Bert.
    • +
    • New prototype features and technologies across TensorParallel, DTensor, 2D parallel, TorchDynamo, AOTAutograd, PrimTorch and TorchInductor.
    • +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +Stable + Beta + Prototype + Performance Improvements +
    + +Accelerated PT 2 Transformers + + +torch.compile + + +DTensor + + +CUDA support for 11.7 & 11.8 (deprecating CUDA 11.6) +
    + + +PyTorch MPS Backend + + +TensorParallel + + +Python 3.8 (deprecating Python 3.7) +
    + + +Scaled dot product attention + + +2D Parallel + + +AWS Graviton3 +
    + + +functorch + + +Torch.compile (dynamic=True) + +
    + Dispatchable Collectives + +
    + Torch.set_default & torch.device + + +
    + + +X86 quantization backend + + +
    + + +GNN inference and training performance + + +
    + +

    *To see a full list of public 2.0, 1.13 and 1.12 feature submissions click here.

    + +

    Stable Features

    + +

    [Stable] Accelerated PyTorch 2 Transformers

    + +

    The PyTorch 2.0 release includes a new high-performance implementation of the PyTorch Transformer API. In releasing Accelerated PT2 Transformers, our goal is to make training and deployment of state-of-the-art Transformer models affordable across the industry. This release introduces high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SDPA), extending the inference “fastpath” architecture, previously known as “Better Transformer.”

    + +

    Similar to the “fastpath” architecture, custom kernels are fully integrated into the PyTorch Transformer API – thus, using the native Transformer and MultiHeadAttention API will enable users to:

    + +
      +
    • transparently see significant speed improvements;
    • +
    • support many more use cases including models using Cross-Attention, Transformer Decoders, and for training models; and
    • +
    • continue to use fastpath inference for fixed and variable sequence length Transformer Encoder and Self Attention use cases.
    • +
    + +

    To take full advantage of different hardware models and Transformer use cases, multiple SDPA custom kernels are supported (see below), with custom kernel selection logic that will pick the highest-performance kernel for a given model and hardware type. In addition to the existing Transformer API, model developers may also use the scaled dot product attention kernels directly by calling the new scaled_dot_product_attention() operator. Accelerated PyTorch 2 Transformers are integrated with torch.compile() . To use your model while benefiting from the additional acceleration of PT2-compilation (for inference or training), pre-process the model with model = torch.compile(model).

    + +

    We have achieved major speedups for training transformer models and in particular large language models with Accelerated PyTorch 2 Transformers using a combination of custom kernels and torch.compile().

    + +

    Figure: Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for nanoGPT shown here.

    + +

    Beta Features

    + +

    [Beta] torch.compile

    + +

    torch.compile is the main API for PyTorch 2.0, which wraps your model and returns a compiled model. It is a fully additive (and optional) feature and hence 2.0 is 100% backward compatible by definition.
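    A minimal usage sketch (the model here is illustrative):

    import torch

    model = torch.nn.Sequential(
        torch.nn.Linear(128, 256), torch.nn.GELU(), torch.nn.Linear(256, 10)
    )
    compiled_model = torch.compile(model)  # wraps the model; eager code stays unchanged

    x = torch.randn(32, 128)
    out = compiled_model(x)  # the first call compiles; subsequent calls reuse the compiled code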

    + +

    Underpinning torch.compile are new technologies – TorchDynamo, AOTAutograd, PrimTorch and TorchInductor:

    +
      +
    • TorchDynamo captures PyTorch programs safely using Python Frame Evaluation Hooks and is a significant innovation that was a result of 5 years of our R&D into safe graph capture.
    • +
    • AOTAutograd overloads PyTorch’s autograd engine as a tracing autodiff for generating ahead-of-time backward traces.
    • +
    • PrimTorch canonicalizes ~2000+ PyTorch operators down to a closed set of ~250 primitive operators that developers can target to build a complete PyTorch backend. This substantially lowers the barrier of writing a PyTorch feature or backend.
    • +
    • TorchInductor is a deep learning compiler that generates fast code for multiple accelerators and backends. For NVIDIA and AMD GPUs, it uses OpenAI Triton as a key building block. For Intel CPUs, we generate C++ code using multithreading, vectorized instructions and offloading appropriate operations to mkldnn when possible.
    • +
    + +

    With all the new technologies, torch.compile is able to work 93% of the time across 165 open-source models and runs 20% faster on average at float32 precision and 36% faster on average at AMP precision.

    + +

    For more information, please refer to https://pytorch.org/get-started/pytorch-2.0/ and for TorchInductor CPU with Intel here.

    + +

    [Beta] PyTorch MPS Backend

    + +

    MPS backend provides GPU-accelerated PyTorch training on Mac platforms. This release brings improved correctness, stability, and operator coverage.

    + +

    MPS backend now includes support for the Top 60 most used ops, along with the most frequently requested operations by the community, bringing coverage to over 300 operators. The major focus of the release was to enable full OpInfo-based forward and gradient mode testing to address silent correctness issues. These changes have resulted in wider adoption of MPS backend by 3rd party networks such as Stable Diffusion, YoloV5, WhisperAI, along with increased coverage for Torchbench networks and Basic tutorials. We encourage developers to update to the latest macOS release to see the best performance and stability on the MPS backend.
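    A minimal sketch of opting into the MPS backend when it is available:

    import torch

    if torch.backends.mps.is_available():
        device = torch.device("mps")
        model = torch.nn.Linear(16, 4).to(device)
        x = torch.randn(8, 16, device=device)
        out = model(x)  # runs on the Apple GPU via the MPS backend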

    + +

    Links

    + +
      +
    1. MPS Backend
    2. +
    3. Developer information
    4. +
    5. Accelerated PyTorch training on Mac
    6. +
    7. Metal, Metal Performance Shaders & Metal Performance Shaders Graph
    8. +
    + +

    [Beta] Scaled dot product attention 2.0

    + +

    We are thrilled to announce the release of PyTorch 2.0, which introduces a powerful scaled dot product attention function as part of torch.nn.functional. This function includes multiple implementations that can be seamlessly applied depending on the input and hardware in use.

    + +

    In previous versions of PyTorch, you had to rely on third-party implementations and install separate packages to take advantage of memory-optimized algorithms like FlashAttention. With PyTorch 2.0, all these implementations are readily available by default.

    + +

    These implementations include FlashAttention from HazyResearch, Memory-Efficient Attention from the xFormers project, and a native C++ implementation that is ideal for non-CUDA devices or when high-precision is required.

    + +

    PyTorch 2.0 will automatically select the optimal implementation for your use case, but you can also toggle them individually for finer-grained control. Additionally, the scaled dot product attention function can be used to build common transformer architecture components.
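    A minimal sketch of the default selection and of restricting it to a single kernel, assuming a CUDA device and the 2.0-era torch.backends.cuda.sdp_kernel context manager:

    import torch
    import torch.nn.functional as F

    q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
    k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
    v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

    # Default: the best available implementation is chosen automatically.
    out = F.scaled_dot_product_attention(q, k, v)

    # Finer-grained control: only allow the FlashAttention kernel.
    with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False,
                                        enable_mem_efficient=False):
        out = F.scaled_dot_product_attention(q, k, v)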

    + +

    Learn more with the documentation and this tutorial.

    + +

    [Beta] functorch -> torch.func

    + +

    Inspired by Google JAX, functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch. Examples include:

    + + +

    We’re excited to announce that, as the final step of upstreaming and integrating functorch into PyTorch, the functorch APIs are now available in the torch.func module. Our function transform APIs are identical to before, but we have changed how the interaction with NN modules work. Please see the docs and the migration guide for more details.
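    For example, per-sample gradients can be expressed by composing grad and vmap (a minimal sketch; the loss function is illustrative):

    import torch
    from torch.func import grad, vmap

    def loss(weights, x):
        return (x @ weights).sin().sum()

    weights = torch.randn(3)
    batch = torch.randn(8, 3)

    # grad differentiates w.r.t. weights; vmap vectorizes over the batch dimension of x.
    per_sample_grads = vmap(grad(loss), in_dims=(None, 0))(weights, batch)
    print(per_sample_grads.shape)  # torch.Size([8, 3])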

    + +

    Furthermore, we have added support for torch.autograd.Function: one is now able to apply function transformations (e.g. vmap, grad, jvp) over torch.autograd.Function.

    + +

    [Beta] Dispatchable Collectives

    + +

    Dispatchable collectives is an improvement to the existing init_process_group() API which changes backend to an optional argument. For users, the main advantage of this feature is that it will allow them to write code that can run on both GPU and CPU machines without having to change the backend specification. The dispatchability feature will also make it easier for users to support both GPU and CPU collectives, as they will no longer need to specify the backend manually (e.g. “NCCL” or “GLOO”). Existing backend specifications by users will be honored and will not require change.

    + +

    Usage example:

    +
    import torch.distributed as dist
    ...
    # old
    dist.init_process_group(backend="nccl", ...)
    dist.all_reduce(...) # with CUDA tensors works
    dist.all_reduce(...) # with CPU tensors does not work

    # new
    dist.init_process_group(...) # backend is optional
    dist.all_reduce(...) # with CUDA tensors works
    dist.all_reduce(...) # with CPU tensors works
    + +

    Learn more here.

    + +

    [Beta] torch.set_default_device and torch.device as context manager

    + +

    torch.set_default_device allows users to change the default device that factory functions in PyTorch allocate on. For example, if you call torch.set_default_device('cuda'), a call to torch.empty(2) will allocate on CUDA (rather than on CPU). You can also use torch.device as a context manager to change the default device on a local basis. This resolves a long-standing feature request, dating back to PyTorch’s initial release, for a way to do this.
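    A minimal sketch, assuming a CUDA build:

    import torch

    torch.set_default_device("cuda")
    a = torch.empty(2)             # allocated on CUDA

    with torch.device("cpu"):
        b = torch.empty(2)         # allocated on CPU only inside this block

    torch.set_default_device("cpu")  # restore the usual default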

    + +

    Learn more here.

    + +

    [Beta] “X86” as the new default quantization backend for x86 CPU

    + +

    The new X86 quantization backend, which utilizes FBGEMM and oneDNN kernel libraries, replaces FBGEMM as the default quantization backend for x86 CPU platforms. It offers improved int8 inference performance compared to the original FBGEMM backend by leveraging the strengths of both libraries, with a 1.3X – 2X inference performance speedup measured on 40+ deep learning models. The new backend is functionally compatible with the original FBGEMM backend.

    + +

    Table: Geomean Speedup of X86 Quantization Backend vs. FBGEMM Backend

    + + + + + + + + + + + + + + + + +
    + 1 core/instance + 2 cores/instance + 4 cores/instance + 1 socket (32 cores)/instance +
    Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz + 1.76X + 1.80X + 2.04X + 1.34X +
    + +

    By default, users on x86 platforms will utilize the x86 quantization backend and their PyTorch programs will remain unchanged when using the default backend. Alternatively, users have the option to specify “X86” as the quantization backend explicitly. Example code is shown below:

    + +
    import torch
    from torch.ao.quantization import get_default_qconfig_mapping
    from torch.quantization.quantize_fx import prepare_fx, convert_fx

    # get default configuration
    qconfig_mapping = get_default_qconfig_mapping()

    # or explicitly specify the backend
    # qengine = 'x86'
    # torch.backends.quantized.engine = qengine
    # qconfig_mapping = get_default_qconfig_mapping(qengine)

    # construct fp32 model
    model_fp32 = ...

    # prepare
    prepared_model = prepare_fx(model_fp32, qconfig_mapping, example_inputs=x)

    # calibrate
    ...

    # convert
    quantized_model = convert_fx(prepared_model)
    + +

    Find more information: https://github.com/pytorch/pytorch/issues/83888 and https://www.intel.com/content/www/us/en/developer/articles/technical/accelerate-pytorch-int8-inf-with-new-x86-backend.html.

    + +

    [Beta] GNN inference and training optimization on CPU

    + +

    PyTorch 2.0 includes several critical optimizations to improve GNN inference and training performance on CPU. Before 2.0, GNN models from PyG suffered from low efficiency on CPU due to the lack of performance tuning for several critical kernels (scatter/gather, etc.) and the lack of GNN-related sparse matrix multiplication ops. To be specific, the optimizations include:

    +
      +
    • scatter_reduce: performance hotspot in Message Passing when the edge index is stored in Coordinate format (COO).
    • +
    • gather: backward of scatter_reduce, specially tuned for the GNN compute when the index is an expanded tensor.
    • +
    • torch.sparse.mm with reduce flag: performance hotspot in Message Passing when the edge index is stored in Compressed Sparse Row (CSR). Supported reduce flag of: sum, mean, amax, amin.
    • +
    + +

    On PyG benchmarks/examples, OGB benchmarks, a 1.12x - 4.07x performance speedup is measured (1.13.1 compared with 2.0) for single node inference and training.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Model-Dataset + Option + Speedup Ratio +
    GCN-Reddit (inference) + 512-2-64-dense + 1.22x +
    1024-3-128-dense + 1.25x +
    512-2-64-sparse + 1.31x +
    1024-3-128-sparse + 1.68x +
    512-2-64-dense + 1.22x +
    +GraphSage-ogbn-products (inference) + 1024-3-128-dense + 1.15x +
    512-2-64-sparse + 1.20x +
    1024-3-128-sparse + 1.33x +
    full-batch-sparse + 4.07x +
    GCN-PROTEINS (training) + 3-32 + 1.67x +
    GCN-REDDIT-BINARY (training) + 3-32 + 1.67x +
    GCN-Reddit (training) + 512-2-64-dense + 1.20x +
    1024-3-128-dense + 1.12x +
    + +

    Learn more: PyG CPU Performance Optimization.

    + +

    [Beta] Accelerating inference on CPU with PyTorch by leveraging oneDNN Graph

    + +

    oneDNN Graph API extends oneDNN with a flexible graph API to maximize the optimization opportunity for generating efficient code on AI hardware.

    +
      +
    • It automatically identifies the graph partitions to be accelerated via fusion.
    • +
    • The fusion patterns focus on fusing compute-intensive operations such as convolution, matmul and their neighbor operations for both inference and training use cases.
    • +
    • Although work is ongoing to integrate oneDNN Graph with TorchDynamo as well, its integration with the PyTorch JIT Fuser attained beta status in PyTorch 2.0 for Float32 & BFloat16 inference (on machines that support AVX512_BF16 ISA).
    • +
    + +

    From a developer’s/researcher’s perspective, the usage is quite simple & intuitive, with the only change in code being an API invocation:

    +
      +
    • To leverage oneDNN Graph with JIT tracing, a model is profiled with an example input.
    • +
    • The context manager with torch.jit.fuser(“fuser3”): can also be used instead of invoking torch.jit.enable_onednn_fusion(True).
    • +
    • For accelerating BFloat16 inference, we rely on eager-mode AMP (Automatic Mixed Precision) support in PyTorch & disable JIT mode’s AMP, as both of them are currently divergent:
    • +
    + +
    import torch

    # Assuming we have a model of the name 'model'
    example_input = torch.rand(1, 3, 224, 224)

    # enable oneDNN Graph
    torch.jit.enable_onednn_fusion(True)
    # Disable AMP for JIT
    torch._C._jit_set_autocast_mode(False)
    with torch.no_grad(), torch.cpu.amp.autocast():
        model = torch.jit.trace(model, (example_input))
        model = torch.jit.freeze(model)
        # 2 warm-ups (2 for tracing/scripting with an example, 3 without an example)
        model(example_input)
        model(example_input)

        # speedup would be observed in subsequent runs.
        model(example_input)
    + +

    Learn more here.

    + +

    Prototype Features

    + +

    Distributed API

    + +

    [Prototype] DTensor

    + +

    PyTorch DistributedTensor (DTensor) is a prototyping effort with distributed tensor primitives to allow easier distributed computation authoring in the SPMD (Single Program Multiple Devices) paradigm. The primitives are simple but powerful when used to express tensor distributions with both sharded and replicated parallelism strategies. PyTorch DTensor empowered PyTorch Tensor Parallelism along with other advanced parallelism explorations. In addition, it also offers a uniform way to save/load state_dict for distributed checkpointing purposes, even when there’re complex tensor distribution strategies such as combining tensor parallelism with parameter sharding in FSDP. More details can be found in this RFC and the DTensor examples notebook.

    + +

    [Prototype] TensorParallel

    + +

    We now support DTensor-based Tensor Parallel, which lets users distribute their model parameters across different GPU devices. We also support Pairwise Parallel, which shards two concatenated linear layers column-wise and row-wise respectively, so that only one collective (all-reduce/reduce-scatter) is needed in the end.

    + +

    [Prototype] 2D Parallel

    + +

    We implemented the integration of the aforementioned TP with FullyShardedDataParallel(FSDP) as 2D parallel to further scale large model training. More details can be found in this slide.

    + +

    [Prototype] torch.compile(dynamic=True)

    + +

    Experimental support for PT2 compilation with dynamic shapes is available in this release. Inference compilation with inductor for simple models is supported, but there are a lot of limitations:

    + +
      +
    • Training available in a future release (This is partially fixed in nightlies!)
    • +
    • Minifier available in a future release.
    • +
    • It is easy to end up in a situation where the dimension you wanted to be dynamic gets specialized anyway. Some of these issues are fixed in nightlies, others are not.
    • +
    • We do not appropriately propagate Inductor guards to the top-level, this is tracked at #96296.
    • +
    • Data-dependent operations like nonzero still require a graph break.
    • +
    • Dynamic does not work with non-standard modes like reduce-overhead or max-autotune.
    • +
    • There are many bugs in Inductor compilation. To track known bugs, check the dynamic shapes label on the PyTorch issue tracker.
    • +
    + +

    For the latest and greatest news about dynamic shapes support on master, check out our status reports.

    + +

    Highlights/Performance Improvements

    + +

    Deprecation of CUDA 11.6 and Python 3.7 support for PyTorch 2.0

    + +

    If you are still using or depending on CUDA 11.6 or Python 3.7 builds, we strongly recommend moving to at least CUDA 11.7 and Python 3.8, as it would be the minimum versions required for PyTorch 2.0. For more detail, please refer to the Release Compatibility Matrix for PyTorch releases.

    + +

    Python 3.11 support on Anaconda Platform

    + +

    Due to the lack of Python 3.11 support for packages that PyTorch depends on (including NumPy, SciPy, SymPy, Pillow and others) on the Anaconda platform, we will not be releasing Conda binaries compiled with Python 3.11 for PyTorch Release 2.0. The Pip packages with Python 3.11 support will be released, so if you intend to use PyTorch 2.0 with Python 3.11 please use our Pip packages. Please note: Conda packages with Python 3.11 support will be made available on our nightly channel. Also, we are planning on releasing Conda Python 3.11 binaries as part of a future release once Anaconda provides these key dependencies. More information and instructions on how to download the Pip packages can be found here.

    + +

    Optimized PyTorch Inference with AWS Graviton processors

    + +

    The optimizations focused on several key areas: GEMM kernels, bfloat16 support, primitive caching and the memory allocator. For aarch64 platforms, PyTorch supports Arm Compute Library (ACL) GEMM kernels via the Mkldnn (OneDNN) backend. The ACL library provides Neon/SVE GEMM kernels for fp32 and bfloat16 formats. The bfloat16 support on c7g allows efficient deployment of bfloat16-trained, AMP (Automatic Mixed Precision)-trained, or even standard fp32-trained models. The standard fp32 models leverage bfloat16 kernels via OneDNN fast math mode, without any model quantization. Next, we implemented primitive caching for conv, matmul and inner product operators. More information on the updated PyTorch user guide with the upcoming 2.0 release improvements and TorchBench benchmark details can be found here.

    + +
diff --git a/blog/pytorch-2.0-xla-path-forward/index.html b/blog/pytorch-2.0-xla-path-forward/index.html
new file mode 100644
index 000000000000..f33aa310416e
--- /dev/null
+++ b/blog/pytorch-2.0-xla-path-forward/index.html
+ PyTorch & OpenXLA: The Path Forward | PyTorch

    April 03, 2023

    +

    + PyTorch & OpenXLA: The Path Forward +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Milad Mohammadi, Jack Cao, Shauheen Zahirazami, Joe Spisak, and Jiewen Tan + +

    +

    As we celebrate the release of OpenXLA, PyTorch 2.0, and PyTorch/XLA 2.0, it’s worth taking a step back and sharing where we see it all going in the short to medium term. With PyTorch adoption leading in the AI space and XLA supporting best-in-class compiler features, PyTorch/XLA is well positioned to provide a cutting edge development stack for both model training and inference. To achieve this, we see investments in three main areas:

    + +
      +
    • Training Large Models - Large language models (LLM) and diffusion models have quickly risen in popularity and many cutting edge applications today are built on them. Further to this, training these models requires scale and more specifically the ability to train across thousands of accelerators. To achieve this we are investing in features such as AMP for mixed precision training, PjRt for increased runtime performance, SPMD / FSDP for efficient model sharding, Dynamic Shapes to enable new research approaches, faster data loading through Ray and tf.data, and a toolchain that packages all of these features together into a seamless workflow. Some of these features are already available in experimental or beta stages, and others are coming up this year with many heavily leveraging the underlying OpenXLA compiler stack.
    • +
    • Model Inference - With large models continuing to grow in size and computational cost, deployment becomes the next challenge as these models continue to find their way into applications. With the introduction of Dynamo in the PyTorch 2.0 release, PyTorch/XLA delivers performance-competitive inference. We are, however, incorporating additional inference-oriented features, including model serving support, Dynamo for sharded large models, and quantization via Torch.Export and StableHLO.
    • +
    • Ecosystem integration - We are expanding integration with Hugging Face and PyTorch Lightning so users can take advantage of upcoming PyTorch/XLA cutting edge features (e.g. FSDP support in Hugging Face) and the downstream OpenXLA features (e.g. Quantization) through familiar APIs.
    • +
    + +

    Additionally, PyTorch/XLA is set to migrate to the open source OpenXLA as its default downstream compiler; allowing the PyTorch community to gain access to a leading, framework-agnostic compiler stack that enjoys industry-wide contribution and innovation. To achieve this, we will begin supporting StableHLO. As a result, OpenXLA will replace the existing TF:XLA dependency, overall streamlining the dependencies and creating leverage from the broader compiler ecosystem. PyTorch/XLA will also sunset the XRT runtime after migration. You can see the resulting high level stack below with the TensorFlow dependency stricken out:

    + +

    the upcoming PyTorch/XLA features and integrations

    + +

    Figure: the upcoming PyTorch/XLA features and integrations are illustrated here

    + +

    We cannot be more excited about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source so please file issues, submit pull requests, and send RFCs to GitHub such that we can openly collaborate. You can also try out PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs.

    + +

    Cheers,
    +The PyTorch/XLA Team at Google

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-2.0-xla/index.html b/blog/pytorch-2.0-xla/index.html new file mode 100644 index 000000000000..6c58557f7595 --- /dev/null +++ b/blog/pytorch-2.0-xla/index.html @@ -0,0 +1,837 @@ + + + + + + + + + + + + + PyTorch 2.0 & XLA—The Latest Cutting Edge Features | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Jack Cao, Milad Mohammadi, Alex Wertheim, Yeounoh Chung, Joe Spisak, Will Cromar, Shauheen Zahirazami + +

    +

    Today, we are excited to share our latest work for PyTorch/XLA 2.0. The release of PyTorch 2.0 is yet another major milestone for this storied community and we are excited to continue to be part of it. When the PyTorch/XLA project started in 2018 between Google and Meta, the focus was on bringing cutting edge Cloud TPUs to help support the PyTorch community. Along the way, others in the community such as Amazon joined the project and very quickly the community expanded. We are excited about XLA’s direction and the benefits this project continues to bring to the PyTorch community. In this blog we’d like to showcase some key features that have been in development, show code snippets, and illustrate the benefit through some benchmarks.

    + +

    TorchDynamo / torch.compile (Experimental)

    + +

    TorchDynamo (Dynamo) is a Python-level JIT compiler designed to make unmodified PyTorch programs faster. It provides a clean API for compiler backends to hook in; its biggest feature is to dynamically modify Python bytecode just before execution. In the PyTorch/XLA 2.0 release, an experimental backend for Dynamo is provided for both inference and training.

    + +

    Dynamo provides a Torch FX (FX) graph when it recognizes a model pattern and PyTorch/XLA uses a Lazy Tensor approach to compile the FX graph and return the compiled function. To get more insight regarding the technical details about PyTorch/XLA’s dynamo implementation, check out this dev-discuss post and dynamo doc.

    + +

    Here is a small code example of running ResNet18 with torch.compile:

    + +
    import torch
    +import torchvision
    +import torch_xla.core.xla_model as xm
    +
    +def eval_model(loader):
    +  device = xm.xla_device()
    +  xla_resnet18 = torchvision.models.resnet18().to(device)
    +  xla_resnet18.eval()
    +  dynamo_resnet18 = torch.compile(
    +      xla_resnet18, backend='torchxla_trace_once')
    +  for data, _ in loader:
    +    output = dynamo_resnet18(data)
    +
    + +

With torch.compile, PyTorch/XLA traces the ResNet18 model only once at init time and executes the compiled binary every time dynamo_resnet18 is invoked, instead of tracing the model at every step. To illustrate the benefits of Dynamo+XLA, below is an inference speedup analysis comparing Dynamo and LazyTensor (without Dynamo) using TorchBench on a Cloud TPU v4-8, where the y-axis is the speedup multiplier.

    + +

    Inference Speedup - PyTorch/XLA Dynamo on TPU

    + +

Dynamo for training is under development, with its implementation at an earlier stage than inference. Developers are welcome to test this early feature; however, in the 2.0 release, PyTorch/XLA supports the forward and backward pass graphs but not the optimizer graph; the optimizer graph is available in the nightly builds and will land in the PyTorch/XLA 2.1 release. Below is an example of what training looks like using the ResNet18 example with torch.compile:

    + +
    import torch
    +import torchvision
    +import torch_xla.core.xla_model as xm
    +
    +def train_model(model, data, target):
    +  loss_fn = torch.nn.CrossEntropyLoss()
    +  pred = model(data)
    +  loss = loss_fn(pred, target)
    +  loss.backward()
    +  return pred
    +
    +def train_model_main(loader):
    +  device = xm.xla_device()
    +  xla_resnet18 = torchvision.models.resnet18().to(device)
    +  xla_resnet18.train()
    +  dynamo_train_model = torch.compile(
    +        train_model, backend='aot_torchxla_trace_once')
    +  for data, target in loader:
    +    output = dynamo_train_model(xla_resnet18, data, target)
    +
    + +

Note that the backend for training is aot_torchxla_trace_once (the API will be updated for the stable release) whereas the inference backend is torchxla_trace_once (name subject to change). With Dynamo we expect to extract and execute three graphs per training step, instead of the one graph per training step that Lazy Tensor uses. Below is a training speedup analysis comparing Dynamo and Lazy Tensor using TorchBench on a Cloud TPU v4-8.

    + +

    Training Speedup - PyTorch/XLA Dynamo on TPU

    + +

    PJRT Runtime (Beta)

    + +

PyTorch/XLA is migrating from XRT to the new PJRT runtime. PJRT is a better-maintained stack with demonstrated performance advantages, including, on average, a 35% performance improvement for training on TorchBench 2.0 models. It also supports a richer set of features, enabling technologies like SPMD. In the PyTorch/XLA 2.0 release, PJRT is the default runtime for TPU and CPU; GPU support is in an experimental state. The PJRT features included in the PyTorch/XLA 2.0 release are:

    + +
      +
    • TPU runtime implementation in libtpu using the PJRT Plugin API improves performance by up to 30%
    • +
    • torch.distributed support for TPU v2 and v3, including pjrt:// init_method (Experimental)
    • +
    • Single-host GPU support. Multi-host support coming soon. (Experimental)
    • +
    + +

    Switching to PJRT requires no change (or minimal change for GPUs) to user code (see pjrt.md for more details). Runtime configuration is as simple as setting the PJRT_DEVICE environment variable to the local device type (i.e. TPU, GPU, CPU). Below are examples of using PJRT runtimes on different devices.

    + +
    # TPU Device
    +PJRT_DEVICE=TPU python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=256 --num_epochs=1
    +
    + +
    # TPU Pod Device
    +gcloud alpha compute tpus tpu-vm ssh $USER-pjrt --zone=us-central2-b --project=$PROJECT --worker=all --command="git clone --depth=1 --branch r2.0 https://github.com/pytorch/xla.git"
    +
    +gcloud alpha compute tpus tpu-vm ssh $USER-pjrt --zone=us-central2-b --project=$PROJECT --worker=all --command="PJRT_DEVICE=TPU python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=256 --num_epochs=1"
    +
    + +
    # GPU Device (Experimental)
    +PJRT_DEVICE=GPU GPU_NUM_DEVICES=4 python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=128 --num_epochs=1
    +
    + +

    Below is a performance comparison between XRT and PJRT by task on TorchBench 2.0 on v4-8 TPU. To learn more about PJRT vs. XRT please review the documentation.

    + +

    TorchBench Training Time

    + +

    Parallelization

    + +

    GSPMD (Experimental)

    + +

    We are delighted to introduce General and Scalable Parallelization for ML Computation Graphs (GSPMD) in PyTorch as a new experimental data & model sharding solution. GSPMD provides automatic parallelization for common ML workloads, allowing developers to write PyTorch programs as if on a single large device and without custom sharded computation ops and/or collective communication ops. The XLA compiler transforms the single device program into a partitioned one with proper collectives, based on the user provided sharding hints. The API (RFC) will be available in the PyTorch/XLA 2.0 release as an experimental feature on a single TPU VM host.

    + +

    Next Steps for GSPMD

    + +

    GSPMD is experimental in 2.0 release. To bring it to Stable status, we plan to address a number of feature gaps and known issues in the following releases, including multi-host support, DTensor integration, partial replication sharding, asynchronous data loading, and checkpointing.

    + +

    FSDP (Beta)

    + +

PyTorch/XLA introduced fully sharded data parallel (FSDP) experimental support in version 1.12. This feature is a parallel representation of PyTorch FSDP, and there are subtle differences in how XLA and upstream CUDA kernels are set up. auto_wrap_policy is a new argument that enables developers to automatically specify conditions for propagating partitioning specifications to neural network submodules. An auto_wrap_policy may simply be passed in as an argument when wrapping a model with FSDP. Two auto_wrap_policy callables worth noting are size_based_auto_wrap_policy and transformer_auto_wrap_policy.

    + +

    size_based_auto_wrap_policy enables users to wrap submodules with a minimum number of parameters. The example below wraps model submodules having at least 10M parameters.

    + +
    auto_wrap_policy = partial(size_based_auto_wrap_policy, min_num_params=1e7)
    +
    + +

transformer_auto_wrap_policy enables users to wrap all submodules that match a specific layer type. The example below wraps all model submodules of type torch.nn.Conv2d. To learn more, review this ResNet example by Ronghang Hu.

    + +
    auto_wrap_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={torch.nn.Conv2d})
    +
    + +
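To show how these policies plug into FSDP itself, below is a minimal sketch of wrapping a model with PyTorch/XLA FSDP, assuming the XlaFullyShardedDataParallel class and wrap utilities under torch_xla.distributed.fsdp (import paths may differ slightly by release); the toy model and hyperparameters are purely illustrative:

from functools import partial

import torch
import torch_xla.core.xla_model as xm
from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
from torch_xla.distributed.fsdp.wrap import size_based_auto_wrap_policy

# Toy model standing in for a real network.
model = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096),
    torch.nn.ReLU(),
    torch.nn.Linear(4096, 1024),
).to(xm.xla_device())

# Submodules with at least 10M parameters are wrapped (and sharded) automatically.
auto_wrap_policy = partial(size_based_auto_wrap_policy, min_num_params=1e7)
fsdp_model = FSDP(model, auto_wrap_policy=auto_wrap_policy)

optimizer = torch.optim.SGD(fsdp_model.parameters(), lr=1e-3)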

PyTorch/XLA FSDP is now integrated in the Hugging Face Trainer class (PR), enabling users to train much larger models on PyTorch/XLA (official Hugging Face documentation). A 16B-parameter GPT2 model trained on a Cloud TPU v4-64 with this FSDP configuration achieved 39% hardware utilization.

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    TPU Accelerator - Num Devices + v4-64 +
    GPT2 Parameter Count + 16B +
    Layers Wrapped with FSDP + GPT2Block +
    TFLOPs / Chip + 275 +
    PFLOPs / Step + 50 +
    Hardware Utilization + 39% +
    + +

    Differences Between FSDP & GSPMD

    + +

FSDP is a data parallelism technique that reduces device memory footprint by sharding model parameters, optimizer states, and gradients across devices. Note that the actual computation is still local to the device and requires all-gathering the sharded model parameters for both forward and backward passes, hence the name “data parallel”. FSDP is one of the newest additions to PyTorch/XLA to scale large model training.

    + +

    GSPMD on the other hand, is a general parallelization system that enables various types of parallelisms, including both data and model parallelisms. PyTorch/XLA provides a sharding annotation API and XLAShardedTensor abstraction, so a user can annotate any tensor with sharding specs in the PyTorch program. Developers don’t need to manually implement sharded computations or inject collective communications ops to get it right. The XLA compiler does the work so that each computation can run in a distributed manner on multiple devices.

    + +

    Examples & Preliminary Results

    + +

To learn more about the PyTorch/XLA sharding API, visit our RFC and see the Sample Code references. Below is a simple example of enabling data and model parallelism.

    + +
    model = SimpleLinear().to(xm.xla_device())
    +# Sharding annotate the linear layer weights.
    +xs.mark_sharding(model.fc1.weight, mesh, partition_spec)
    +# Training loop
    +model.train()
    +for step, (data, target) in enumerate(loader):
    +  optimizer.zero_grad()
    +  data = data.to(xm.xla_device())
    +  target = target.to(xm.xla_device())
    +  # Sharding annotate input data, we can shard any input
+  # dimensions. Sharding the batch dimension enables
    +  # data parallelism, sharding the feature dimension enables
    +  # spatial partitioning.
    +  xs.mark_sharding(data, mesh, partition_spec)
+  output = model(data)
+  loss = loss_fn(output, target)
+  loss.backward()
+  optimizer.step()
    +  xm.mark_step()
    +
    + +
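The snippet above assumes that mesh, partition_spec, loader, optimizer, and loss_fn are already defined. As a minimal sketch, here is one way the mesh and partition spec could be set up with the experimental sharding API described in the RFC; the module path torch_xla.experimental.xla_sharding and the axis names are assumptions that may differ across releases:

import numpy as np
import torch_xla.core.xla_model as xm
import torch_xla.experimental.xla_sharding as xs
from torch_xla.experimental.xla_sharding import Mesh

# Arrange all addressable devices (e.g. the 4 chips of a TPU v4-8 host) into a 2D mesh.
num_devices = len(xm.get_xla_supported_devices())
device_ids = np.arange(num_devices)
mesh_shape = (num_devices, 1)
mesh = Mesh(device_ids, mesh_shape, ('data', 'model'))

# Map tensor dimensions to mesh axes: dim 0 is sharded over mesh axis 0,
# dim 1 over mesh axis 1 (size 1 here, so effectively replicated).
partition_spec = (0, 1)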

    The following graph highlights the memory efficiency benefits of PyTorch/XLA FSDP and SPMD on Cloud TPU v4-8 running ResNet50.

    + +

    Batch Size Scaling with Spatial Partitioning

    + +

    Closing Thoughts…

    + +

We are excited to bring these features to the PyTorch community, and this is really just the beginning. Areas like dynamic shapes, deeper support for OpenXLA, and many others are in development, and we plan to put out more blogs to dive into the details. PyTorch/XLA is developed fully in open source, and we invite you to join the community of developers by filing issues, submitting pull requests, and sending RFCs on GitHub. You can try PyTorch/XLA on a variety of XLA devices, including TPUs and GPUs. Here is how to get started.

    + +

    Congratulations again to the PyTorch community on this milestone!

    + +

    Cheers,

    + +

    The PyTorch Team at Google

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-adds-new-dev-tools/index.html b/blog/pytorch-adds-new-dev-tools/index.html new file mode 100644 index 000000000000..038d3dcb0045 --- /dev/null +++ b/blog/pytorch-adds-new-dev-tools/index.html @@ -0,0 +1,717 @@ + + + + + + + + + + + + + PyTorch adds new dev tools as it hits production scale | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + The PyTorch Team + +

    +

This is a partial re-post of the original blog post on the Facebook AI Blog. The full post can be viewed here.

    + +

    Since its release just a few months ago, PyTorch 1.0 has been rapidly adopted as a powerful, flexible deep learning platform that enables engineers and researchers to move quickly from research to production. We are highlighting some of the ways the AI engineering and research community is using PyTorch 1.0. We’re also sharing new details about the latest release, PyTorch 1.1, and showcasing some of the new development tools created by the community.

    + +

    Building on the initial launch of PyTorch in 2017, we partnered with the AI community to ship the stable release of PyTorch 1.0 last December. Along with enhanced production-oriented capabilities and deep integration with leading cloud platforms, PyTorch 1.0 expands on the open source library’s core features, with the addition of PyTorch JIT (Just in time compilation) that seamlessly transitions between eager mode and graph mode to provide both flexibility and speed.

    + +

Leading businesses across industries are beginning to use PyTorch to both facilitate their research and deploy at large scale for applications such as translation, computer vision, conversational interfaces, pharmaceutical research, factory optimization, and automated driving research. Community adoption of PyTorch has also continued to expand. Stanford, UC Berkeley, Caltech, and other universities are using PyTorch as a fundamental tool for their machine learning (ML) courses; new ecosystem projects have launched to support development on PyTorch; and major cloud platforms have expanded their integration with PyTorch.

    + +

    Using PyTorch across industries

    + +

    Many leading businesses are moving to PyTorch 1.0 to accelerate development and deployment of new AI systems. Here are some examples:

    + +
      +
    • Airbnb leveraged PyTorch’s rich libraries and APIs for conversational AI and deployed a Smart Reply to help the company’s service agents respond more effectively to customers.
    • +
    • ATOM is building a platform to generate and optimize new drug candidates significantly faster and with greater success than conventional processes. Using machine learning frameworks such as PyTorch, ATOM was able to design a variational autoencoder for representing diverse chemical structures and designing new drug candidates.
    • +
    • Genentech is utilizing PyTorch’s flexible control structures and dynamic graphs to train deep learning models that will aid in the development of individualized cancer therapy.
    • +
    • Microsoft is using PyTorch across its organization to develop ML models at scale and deploy them via the ONNX Runtime. Using PyTorch, Microsoft Cognition has built distributed language models that scale to billions of words and are now in production in offerings such as Cognitive Services.
    • +
    • Toyota Research Institute (TRI) is developing a two-pronged approach toward automated driving with Toyota Guardian and Toyota Chauffeur technologies. The Machine Learning Team at TRI is creating new deep learning algorithms to leverage Toyota’s 10 million sales per year data advantage. The flexibility of PyTorch has vastly accelerated their pace of exploration and its new production features will enable faster deployment towards their safety critical applications.
    • +
    + +

    Following the release of PyTorch 1.0 in December 2018, we’re now announcing the availability of v1.1, which improves performance, adds new model understanding and visualization tools to improve usability, and provides new APIs.

    + +

    Key features of PyTorch v1.1 include:

    + +
      +
• TensorBoard: First-class and native support for visualization and model debugging with TensorBoard, a web application suite for inspecting and understanding training runs and graphs. PyTorch now natively supports TensorBoard with a simple “from torch.utils.tensorboard import SummaryWriter” command (see the sketch after this list).
    • +
    • JIT compiler: Improvements to just-in-time (JIT) compilation. These include various bug fixes as well as expanded capabilities in TorchScript, such as support for dictionaries, user classes, and attributes.
    • +
    • New APIs: Support for Boolean tensors and better support for custom recurrent neural networks.
    • +
    • Distributed Training: Improved performance for common models such as CNNs, added support for multi device modules including the ability to split models across GPUs while still using Distributed Data Parallel (DDP) and support for modules where not all parameters are used in every iteration (e.g. control flow, like adaptive softmax, etc). See the latest tutorials here.
    • +
    + +
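As a quick illustration of the TensorBoard support mentioned in the list above, here is a minimal sketch; the log directory, tag name, and logged values are hypothetical:

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/demo")  # hypothetical log directory

# Log one scalar per step; inspect with `tensorboard --logdir runs`.
for step in range(100):
    dummy_loss = 1.0 / (step + 1)
    writer.add_scalar("train/loss", dummy_loss, global_step=step)

writer.close()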

We’ve also continued to partner with the community to foster projects and tools aimed at supporting ML engineers for needs ranging from improved model understanding to auto-tuning using AutoML methods. With the release of Ax and BoTorch (below), we will be sharing some of our core algorithms, including meta-learning for efficiently optimizing hyperparameters based on historical tasks. We are excited to see this work open-sourced for the community to build on.

    + +

    This ecosystem includes open source projects and tools that have been deployed at production scale, as well as products and services from our partnership with industry leaders who share our vision of an open and collaborative AI community. Here are a few of the latest tools:

    + +
      +
    • BoTorch: BoTorch is a research framework built on top of PyTorch to provide Bayesian optimization, a sample-efficient technique for sequential optimization of costly-to-evaluate black-box functions.
    • +
    • Ax: Ax is an ML platform for managing adaptive experiments. It enables researchers and engineers to systematically explore large configuration spaces in order to optimize machine learning models, infrastructure, and products.
    • +
    • PyTorch-BigGraph: PBG is a distributed system for creating embeddings of very large graphs with billions of entities and trillions of edges. It includes support for sharding and negative sampling and it offers sample use cases based on Wikidata embeddings.
    • +
    • Google AI Platform Notebooks: AI Platform Notebooks is a new, hosted JupyterLab service from Google Cloud Platform. Data scientists can quickly create virtual machines running JupyterLab with the latest version of PyTorch preinstalled. It is also tightly integrated with GCP services such as BigQuery, Cloud Dataproc, Cloud Dataflow, and AI Factory, making it easy to execute the full ML cycle without ever leaving JupyterLab.
    • +
    + +

    We’re also excited to see many interesting new projects from the broader PyTorch community. Highlights include:

    + +
      +
• BigGAN-PyTorch: This is a full PyTorch reimplementation that uses gradient accumulation to provide the benefits of big batches on as few as four GPUs.
    • +
    • GeomLoss: A Python API that defines PyTorch layers for geometric loss functions between sampled measures, images, and volumes. It includes MMD, Wasserstein, Sinkhorn, and more.
    • +
    + +
    + +
    + +
      +
    • PyTorch Geometric: A deep learning extension library for PyTorch that offers several methods for deep learning on graphs and other irregular structures (also known as geometric deep learning) from a variety of published papers.
    • +
    • Curve-GCN: A real-time, interactive image annotation approach that uses an end-to-end-trained graph convolutional network (GCN). It supports object annotation by either polygons or splines, facilitating labeling efficiency for both line-based and curved objects. Curve-GCN runs 10x faster than traditional methods, such as Polygon-RNN++.
    • +
    + +

    Udacity, fast.ai, and others develop new PyTorch resources

    + +

    PyTorch is ideal for teaching ML development because it enables rapid experimentation through its flexible, dynamic programming environment and user-friendly Pythonic interface. In addition, Google Colab now offers an interactive Jupyter Notebook environment that natively supports PyTorch, allowing developers to run any PyTorch tutorial immediately with free CPU and GPU resources.

    + +

    University-level classes — including Stanford NLP, UC Berkeley Computer Vision, and Caltech Robotics courses — are now being taught on PyTorch. In addition, massive open online courses (MOOCs) are training thousands of new PyTorch developers.

    + +

    Today, we’re announcing a new Udacity course, building upon the Intro to Deep Learning course launched last year. This new course, led by Andrew Trask of Oxford University and OpenMined, covers important concepts around privacy in AI, including methods such as differential privacy and federated learning. Facebook will also be providing scholarships to support students as they continue their ML education in Udacity’s full Nanodegree programs.

    + +

The fast.ai community is also continuing to invest energy and resources in PyTorch. In June, fast.ai will launch a new course called Deep Learning from the Foundations, which will show developers how to go all the way from writing matrix multiplication from scratch to training and implementing a state-of-the-art ImageNet model. The course will include deep dives into the underlying implementation of methods in the PyTorch and fast.ai libraries, and will use the code to explain and illustrate the academic papers that underlie these methods.

    + +

    As part of the course, fast.ai will also release new software modules, including fastai.audio, which brings the power of fast.ai’s deep abstractions and curated algorithms to the new PyTorch.audio module, and show how fastai.vision can be used to create stunning high-resolution videos from material such as old classic movies, and from cutting-edge microscopy sequences through a collaboration with the Salk Institute. In addition, fast.ai is contributing its new X-ResNet module, including a suite of models pretrained on ImageNet.

    + +

    Getting started with PyTorch

    + +

    Everyone in the AI community — including those new to ML development as well as researchers and engineers looking for ways to accelerate their end-to-end workflows — can experiment with PyTorch instantly by visiting pytorch.org and launching a tutorial in Colab. There are also many easy ways to get started both locally and on popular cloud platforms.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community/index.html b/blog/pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community/index.html new file mode 100644 index 000000000000..538709e16e69 --- /dev/null +++ b/blog/pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community/index.html @@ -0,0 +1,710 @@ + + + + + + + + + + + + + PyTorch adds new tools and libraries, welcomes Preferred Networks to its community | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    PyTorch continues to be used for the latest state-of-the-art research on display at the NeurIPS conference next week, making up nearly 70% of papers that cite a framework. In addition, we’re excited to welcome Preferred Networks, the maintainers of the Chainer framework, to the PyTorch community. Their teams are moving fully over to PyTorch for developing their ML capabilities and services.

    + +

    This growth underpins PyTorch’s focus on building for the needs of the research community, and increasingly, supporting the full workflow from research to production deployment. To further support researchers and developers, we’re launching a number of new tools and libraries for large scale computer vision and elastic fault tolerant training. Learn more on GitHub and at our NeurIPS booth.

    + +

    Preferred Networks joins the PyTorch community

    + +

    Preferred Networks, Inc. (PFN) announced plans to move its deep learning framework from Chainer to PyTorch. As part of this change, PFN will collaborate with the PyTorch community and contributors, including people from Facebook, Microsoft, CMU, and NYU, to participate in the development of PyTorch.

    + +

    PFN developed Chainer, a deep learning framework that introduced the concept of define-by-run (also referred to as eager execution), to support and speed up its deep learning development. Chainer has been used at PFN since 2015 to rapidly solve real-world problems with the latest, cutting-edge technology. Chainer was also one of the inspirations for PyTorch’s initial design, as outlined in the PyTorch NeurIPS paper.

    + +

    PFN has driven innovative work with CuPy, ImageNet in 15 minutes, Optuna, and other projects that have pushed the boundaries of design and engineering. As part of the PyTorch community, PFN brings with them creative engineering capabilities and experience to help take the framework forward. In addition, PFN’s migration to PyTorch will allow it to efficiently incorporate the latest research results to accelerate its R&D activities, given PyTorch’s broad adoption with researchers, and to collaborate with the community to add support for PyTorch on MN-Core, a deep learning processor currently in development.

    + +

We are excited to welcome PFN to the PyTorch community, and to jointly work towards the common goal of furthering advances in deep learning technology. Learn more about PFN’s migration to PyTorch here.

    + +

    Tools for elastic training and large scale computer vision

    + +

    PyTorch Elastic (Experimental)

    + +

    Large scale model training is becoming commonplace with architectures like BERT and the growth of model parameter counts into the billions or even tens of billions. To achieve convergence at this scale in a reasonable amount of time, the use of distributed training is needed.

    + +

The current PyTorch Distributed Data Parallel (DDP) module enables data parallel training where each process trains the same model but on different shards of data. It enables bulk synchronous, multi-host, multi-GPU/CPU execution of ML training. However, DDP has several shortcomings: jobs cannot start without acquiring all the requested nodes; jobs cannot continue after a node fails due to an error or transient issue; jobs cannot incorporate a node that joins later; and lastly, progress cannot be made in the presence of a slow or stuck node.

    + +

    The focus of PyTorch Elastic, which uses Elastic Distributed Data Parallelism, is to address these issues and build a generic framework/APIs for PyTorch to enable reliable and elastic execution of these data parallel training workloads. It will provide better programmability, higher resilience to failures of all kinds, higher-efficiency and larger-scale training compared with pure DDP.

    + +

    Elasticity, in this case, means both: 1) the ability for a job to continue after node failure (by running with fewer nodes and/or by incorporating a new host and transferring state to it); and 2) the ability to add/remove nodes dynamically due to resource availability changes or bottlenecks.

    + +

    While this feature is still experimental, you can try it out on AWS EC2, with the instructions here. Additionally, the PyTorch distributed team is working closely with teams across AWS to support PyTorch Elastic training within services such as Amazon Sagemaker and Elastic Kubernetes Service (EKS). Look for additional updates in the near future.

    + +

    New Classification Framework

    + +

    Image and video classification are at the core of content understanding. To that end, you can now leverage a new end-to-end framework for large-scale training of state-of-the-art image and video classification models. It allows researchers to quickly prototype and iterate on large distributed training jobs at the scale of billions of images. Advantages include:

    + +
      +
    • Ease of use - This framework features a modular, flexible design that allows anyone to train machine learning models on top of PyTorch using very simple abstractions. The system also has out-of-the-box integration with AWS on PyTorch Elastic, facilitating research at scale and making it simple to move between research and production.
    • +
    • High performance - Researchers can use the framework to train models such as Resnet50 on ImageNet in as little as 15 minutes.
    • +
    + +

    You can learn more at the NeurIPS Expo workshop on Multi-Modal research to production or get started with the PyTorch Elastic Imagenet example here.

    + +

    Come see us at NeurIPS

    + +

    The PyTorch team will be hosting workshops at NeurIPS during the industry expo on 12/8. Join the sessions below to learn more, and visit the team at the PyTorch booth on the show floor and during the Poster Session. At the booth, we’ll be walking through an interactive demo of PyTorch running fast neural style transfer on a Cloud TPU - here’s a sneak peek.

    + +

    We’re also publishing a paper that details the principles that drove the implementation of PyTorch and how they’re reflected in its architecture.

    + +

    Multi-modal Research to Production - This workshop will dive into a number of modalities such as computer vision (large scale image classification and instance segmentation) and Translation and Speech (seq-to-seq Transformers) from the lens of taking cutting edge research to production. Lastly, we will also walk through how to use the latest APIs in PyTorch to take eager mode developed models into graph mode via Torchscript and quantize them for scale production deployment on servers or mobile devices. Libraries used include:

    + +
      +
    • Classification Framework - a newly open sourced PyTorch framework developed by Facebook AI for research on large-scale image and video classification. It allows researchers to quickly prototype and iterate on large distributed training jobs. Models built on the framework can be seamlessly deployed to production.
    • +
    • Detectron2 - the recently released object detection library built by the Facebook AI Research computer vision team. We will articulate the improvements over the previous version including: 1) Support for latest models and new tasks; 2) Increased flexibility, to enable new computer vision research; 3) Maintainable and scalable, to support production use cases.
    • +
    • Fairseq - general purpose sequence-to-sequence library, can be used in many applications, including (unsupervised) translation, summarization, dialog and speech recognition.
    • +
    + +

    Responsible and Reproducible AI - This workshop on Responsible and Reproducible AI will dive into important areas that are shaping the future of how we interpret, reproduce research, and build AI with privacy in mind. We will cover major challenges, walk through solutions, and finish each talk with a hands-on tutorial.

    + +
      +
• Reproducibility: As the number of research papers submitted to arXiv and conferences skyrockets, scaling reproducibility becomes difficult. We must address the following challenges: aid extensibility by standardizing code bases, democratize paper implementation by writing hardware-agnostic code, and facilitate results validation by documenting the “tricks” authors use to make their complex systems function. To offer solutions, we will dive into tools like PyTorch Hub and PyTorch Lightning, which are used by some of the top researchers in the world to reproduce the state of the art.
    • +
• Interpretability: With the increase in model complexity and the resulting lack of transparency, model interpretability methods have become increasingly important. Model understanding is both an active area of research as well as an area of focus for practical applications across industries using machine learning. To get hands on, we will use the recently released Captum library that provides state-of-the-art algorithms to provide researchers and developers with an easy way to understand the importance of neurons/layers and the predictions made by our models.
    • +
    • Private AI: Practical applications of ML via cloud-based or machine-learning-as-a-service platforms pose a range of security and privacy challenges. There are a number of technical approaches being studied including: homomorphic encryption, secure multi-party computation, trusted execution environments, on-device computation, and differential privacy. To provide an immersive understanding of how some of these technologies are applied, we will use the CrypTen project which provides a community based research platform to take the field of Private AI forward.
    • +
    + +

    We’d like to thank the entire PyTorch team and the community for all their contributions to this work.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-at-gtc/index.html b/blog/pytorch-at-gtc/index.html new file mode 100644 index 000000000000..a332dbeb7bcb --- /dev/null +++ b/blog/pytorch-at-gtc/index.html @@ -0,0 +1,740 @@ + + + + + + + + + + + + + PyTorch at GTC 2025 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    March 16, 2025

    +

    + PyTorch at GTC 2025 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch at NVIDIA + +

    +

    GTC is coming back to San Jose on March 17–21, 2025. Join PyTorch Foundation members Arm, AWS, Google Cloud, IBM, Lightning AI, Meta, Microsoft Azure, Snowflake, and thousands of developers as we celebrate PyTorch. Together learn how AI & accelerated computing are helping humanity solve our most complex challenges.

    + +

    Join in person with discounted GTC registration for PyTorch Foundation or watch online with free registration.

    + +


    + +

    Scaling Open Source AI: From Foundation Models to Ecosystem Success

    + +

Hear from PyTorch Foundation’s Executive Director Matt White & panelists from UC Berkeley, Meta, NVIDIA, & Sequoia Capital on how open source is transforming AI development, bringing together experts from industry, academia, and venture capital to discuss the technical and business aspects of collaborative open source AI development. They’ll examine how open source projects like PyTorch, vLLM, Ray, and NVIDIA’s NeMo are accelerating AI innovation while creating new opportunities for businesses and researchers. They’ll share real-world experiences from PyTorch’s development, Berkeley’s research initiatives, and successful AI startups. Take away valuable insights into the technical and business aspects of open source AI. – Monday, Mar 17 10:00 AM - 11:00 AM PDT

    + +

    PyTorch @ GTC

    + +

    The Performance of CUDA with the Flexibility of PyTorch
    +Mark Saroufim, Software Engineer, Meta Platforms

    + +

    This talk explores how PyTorch users are also becoming CUDA developers. We’ll start with motivating examples from eager, the launch of torch.compile and the more recent trend of kernel zoos. We will share details on how we went about integrating low bit matmuls in torchao and the torch.compile CUTLASS backend. We’ll also discuss details on how you can define, build and package your own custom ops in PyTorch so you get the raw performance of CUDA while maintaining the flexibility of PyTorch.

    + +

    Make My PyTorch Model Fast, and Show Me How You Did It
    +Thomas Viehmann, Principal Research Engineer, Lightning AI
    +Luca Antiga, CTO, Lightning AI

    + +

PyTorch is popular in deep learning and LLMs for its richness and ease of expression. To make the most of compute resources, PyTorch models benefit from nontrivial optimizations, but this means losing some of their ease and understandability. Learn how with Thunder, a PyTorch-to-Python compiler focused on usability, understandability, and extensibility, you can optimize and transform (i.e., distribute across many machines) models while: leaving the PyTorch code unchanged; targeting a variety of models without needing to adapt to each of them; understanding each transformation step, because the results are presented as simple Python code; and accessing powerful extension code for your own optimizations with just one or a few lines of code. We’ll show how the combination of Thunder transforms and the NVIDIA stack (NVFuser, cuDNN, Apex) delivers optimized performance in training and inference on a variety of models.

    + +

    FlexAttention: The Flexibility of PyTorch With the Performance of FlashAttention
    +Driss Guessous, Machine Learning Engineer, Meta Platforms

    + +

    Introducing FlexAttention: a novel PyTorch API that enables custom, user-defined attention mechanisms with performance comparable to state-of-the-art solutions. By leveraging the PyTorch compiler stack, FlexAttention supports dynamic modifications to attention scores within SDPA, achieving both runtime and memory efficiency through kernel fusion with the FlashAttention algorithm. Our benchmarks on A100 GPUs show FlexAttention achieves 90% of FlashAttention2’s performance in forward passes and 85% in backward passes. On H100 GPUs, FlexAttention’s forward performance averages 85% of FlashAttention3 and is ~25% faster than FlashAttention2, while backward performance averages 76% of FlashAttention3 and is ~3% faster than FlashAttention2. Explore how FlexAttention balances near-state-of-the-art performance with unparalleled flexibility, empowering researchers to rapidly iterate on attention mechanisms without sacrificing efficiency.
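As a rough sketch of the kind of score-mod API the session describes, the snippet below biases attention scores by relative distance using the public flex_attention entry point available in recent PyTorch releases; the bias itself and the tensor shapes are illustrative assumptions:

import torch
from torch.nn.attention.flex_attention import flex_attention

def relative_bias(score, batch, head, q_idx, kv_idx):
    # Penalize attention between distant positions; any user-defined
    # modification of the score can be expressed the same way.
    return score - 0.1 * (q_idx - kv_idx).abs()

# (batch, heads, sequence, head_dim)
q, k, v = (torch.randn(1, 8, 128, 64) for _ in range(3))
out = flex_attention(q, k, v, score_mod=relative_bias)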

    + +

    Keep Your GPUs Going Brrr : Crushing Whitespace in Model Training
    +Syed Ahmed, Senior Software Engineer, NVIDIA
    +Alban Desmaison, Research Engineer, Meta
    +Aidyn Aitzhan, Senior Software Engineer, NVIDIA

    + +

    Substantial progress has recently been made on the compute-intensive portions of model training, such as high-performing attention variants. While invaluable, this progress exposes previously hidden bottlenecks in model training, such as redundant copies during collectives and data loading time. We’ll present recent improvements in PyTorch achieved through Meta/NVIDIA collaboration to tackle these newly exposed bottlenecks and how practitioners can leverage them.

    + +

    Accelerated Python: The Community and Ecosystem
    +Andy Terrel, CUDA Python Product Lead, NVIDIA
    +Jeremy Tanner, Open Source Programs, NVIDIA
    +Anshuman Bhat, CUDA Product Management, NVIDIA

    + +

    Python is everywhere. Simulation, data science, and Gen AI all depend on it. Unfortunately, the dizzying array of tools leaves a newcomer baffled at where to start. We’ll take you on a guided tour of the vibrant community and ecosystem surrounding accelerated Python programming. Explore a variety of tools, libraries, and frameworks that enable efficient computation and performance optimization in Python, including CUDA Python, RAPIDS, Warp, and Legate. We’ll also discuss integration points with PyData, PyTorch, and JAX communities. Learn about collaborative efforts within the community, including open source projects and contributions that drive innovation in accelerated computing. We’ll discuss best practices for leveraging these frameworks to enhance productivity in developing AI-driven applications and conducting large-scale data analyses.

    + +

    Supercharge large scale AI with Google Cloud AI hypercomputer (Presented by Google Cloud)
    +Deepak Patil, Product Manager, Google Cloud
    +Rajesh Anantharaman, Product Management Lead, ML Software, Google Cloud

    + +

    Unlock the potential of your large-scale AI workloads with Google Cloud AI Hypercomputer – a supercomputing architecture designed for maximum performance and efficiency. In this session, we will deep dive into PyTorch and JAX stacks on Google Cloud on NVIDIA GPUs, and showcase capabilities for high performance foundation model building on Google Cloud.

    + +

    Peering Into the Future: What AI and Graph Networks Can Mean for the Future of Financial Analysis
    +Siddharth Samsi, Sr. Solutions Architect, NVIDIA
    +Sudeep Kesh, Chief Innovation Officer, S&P Global

    + +

    Artificial Intelligence, agentic systems, and graph neural networks (GNNs) are providing the new frontier to assess, monitor, and estimate opportunities and risks across work portfolios within financial services. Although many of these technologies are still developing, organizations are eager to understand their potential. See how S&P Global and NVIDIA are working together to find practical ways to learn and integrate such capabilities, ranging from forecasting corporate debt issuance to understanding capital markets at a deeper level. We’ll show a graph representation of market data using the PyTorch-Geometric library and a dataset of issuances spanning three decades and across financial and non-financial industries. Technical developments include generation of a bipartite graph and link-prediction GNN forecasting. We’ll address data preprocessing, pipelines, model training, and how these technologies can broaden capabilities in an increasingly complex world.

    + +

    Unlock Deep Learning Performance on Blackwell With cuDNN
    +Yang Xu (Enterprise Products), DL Software Engineering Manager, NVIDIA

    + +

    Since its launch, cuDNN, a library for GPU-accelerating deep learning (DL) primitives, has been powering many AI applications in domains such as conversational AI, recommender systems, and speech recognition, among others. CuDNN remains a core library for DL primitives in popular frameworks such as PyTorch, JAX, Tensorflow, and many more while covering training, fine-tuning, and inference use cases. Even in the rapidly evolving space of Gen AI — be it Llama, Gemma, or mixture-of-experts variants requiring complex DL primitives such as flash attention variants — cuDNN is powering them all. Learn about new/updated APIs of cuDNN pertaining to Blackwell’s microscaling format, and how to program against those APIs. We’ll deep dive into leveraging its graph APIs to build some fusion patterns, such as matmul fusion patterns and fused flash attention from state-of-the-art models. Understand how new CUDA graph support in cuDNN, not to be mistaken with the cuDNN graph API, could be exploited to avoid rebuilding CUDA graphs, offering an alternative to CUDA graph capture with real-world framework usage.

    + +

    Train and Serve AI Systems Fast With the Lightning AI Open-Source Stack (Presented by Lightning AI)
    +Luca Antiga, CTO, Lightning AI

    + +

    See how the Lightning stack can cover the full life cycle, from data preparation to deployment, with practical examples and particular focus on distributed training and high-performance inference. We’ll show examples that focus on new features like support for multi-dimensional parallelism through DTensors, as well as quantization through torchao.

    + +

    Connect With Experts (Interactive Sessions)

    + +

    Meet the Experts From Deep Learning Framework Teams
    +Eddie Yan, Technical Lead of PyTorch, NVIDIA
    +Masaki Kozuki, Senior Software Engineer in PyTorch, NVIDIA
    +Patrick Wang (Enterprise Products), Software Engineer in PyTorch, NVIDIA
    +Mike Ruberry, Distinguished Engineer in Deep Learning Frameworks, NVIDIA
    +Rishi Puri, Sr. Deep Learning Engineer and Lead for PyTorch Geometric, NVIDIA

    + +

    Training Labs

    + +

    Kernel Optimization for AI and Beyond: Unlocking the Power of Nsight Compute
    +Felix Schmitt, Sr. System Software Engineer, NVIDIA
    +Peter Labus, Senior System Software Engineer, NVIDIA

    + +

    Learn how to unlock the full potential of NVIDIA GPUs with the powerful profiling and analysis capabilities of Nsight Compute. AI workloads are rapidly increasing the demand for GPU computing, and ensuring that they efficiently utilize all available GPU resources is essential. Nsight Compute is the most powerful tool for understanding kernel execution behavior and performance. Learn how to configure and launch profiles customized for your needs, including advice on profiling accelerated Python applications, AI frameworks like PyTorch, and optimizing Tensor Core utilization essential to modern AI performance. Learn how to debug your kernel and use the expert system built into Nsight Compute, known as “Guided Analysis,” that automatically detects common issues and directs you to the most relevant performance data all the way down to the source code level.

    + +

    Make Retrieval Better: Fine-Tuning an Embedding Model for Domain-Specific RAG
    +Gabriel Moreira, Sr. Research Scientist, NVIDIA
    +Ronay Ak, Sr. Data Scientist, NVIDIA

    + +

    LLMs power AI applications like conversational chatbots and content generators, but are constrained by their training data. This might lead to hallucinations in content generation, which requires up-to-date or domain-specific information. Retrieval augmented generation (RAG) addresses this issue by enabling LLMs to access external context without modifying model parameters. Embedding or dense retrieval models are a key component of a RAG pipeline for retrieving relevant context to the LLM. However, an embedding model’s effectiveness to capture the unique characteristics of the custom data hinges on the quality and domain relevance of its training data. Fine-tuning embedding models is gaining interest to provide more accurate and relevant responses tailored to users’ specific domain.

    + +

    In this lab, you’ll learn to generate a synthetic dataset with question-context pairs from a domain-specific corpus, and process the data for fine-tuning. Then, fine-tune a text embedding model using synthetic data and evaluate it.

    + +

    Poster Presentations

    + +

    Single-View X-Ray 3D Reconstruction Using Neural Back Projection and Frustum Resampling
    +Tran Minh Quan, Developer Technologist, NVIDIA

    + +

    Enable Novel Applications in the New AI Area in Medicine: Accelerated Feature Computation for Pathology Slides
    +Nils Bruenggel, Principal Software Engineer, Roche Diagnostics Int. AG

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-compile-to-speed-up-inference/index.html b/blog/pytorch-compile-to-speed-up-inference/index.html new file mode 100644 index 000000000000..ec5f7a030775 --- /dev/null +++ b/blog/pytorch-compile-to-speed-up-inference/index.html @@ -0,0 +1,803 @@ + + + + + + + + + + + + + PyTorch compile to speed up inference on Llama 2 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + IBM Research: Antoni Viros i Martin, Brian Vaughan, Davis Wertheimer, Joshua Rosenkranz, Mudhakar Srivatsa, Nelson Mimura Gonzalez, Raghu Ganti, Supriyo Chakraborty, Zhuoran Liu Meta: Geeta Chauhan, Hamid Shojanazeri + +

    +

    In this blog, we discuss how to improve the inference latencies of the Llama 2 family of models using PyTorch native optimizations such as native fast kernels, compile transformations from torch compile, and tensor parallel for distributed inference. Our approach results in 29ms/token latency for single user requests on the 70B LLaMa model (as measured on 8 A100 GPUs). We are excited to share our findings with the community and make our code available here.

    + +

    Background

    + +

    We are amid a generative AI revolution with large language models of tens of billions of parameters becoming commoditized and available for use. However, it is well recognized in the community that deploying these large models in a cost-efficient manner remains a key challenge. Many different approaches have been attempted with varying degrees of success and offering different trade-offs. Hardware-specific optimizations (e.g., Faster Transformer from NVIDIA) are restricted to specific target hardware whereas approaches that rely on layers of abstraction (e.g., ONNX) enable arbitrary models but suffer from loss of efficiency. With the introduction of PyTorch compile last year, IBM and the PyTorch team started exploring the use of model compilation for inference optimizations with the goal of reducing the latency per token for generative models.

    + +

    Model Choice

    + +

    We choose to benchmark on the Llama 2 family of models, given their popularity. The models that we are interested in, and their hyper parameters relevant for this blog are given in the below table:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Model size + Hidden dimension + Num heads + Num layers + Attention type +
    7B + 4096 + 32 + 32 + MHA +
    13B + 5120 + 40 + 40 + MHA +
    70B + 8192 + 64 + 80 + GQA +
    + +

These models are decoder-only, which means tokens are generated in a serialized manner; this is typically sped up using KV caching. We take a similar approach in our latency and throughput measurements.
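To make the KV-caching point concrete, here is a schematic greedy decode loop (not the code used for these measurements); the model interface that returns and accepts a kv_cache is an illustrative assumption:

import torch

@torch.no_grad()
def greedy_decode(model, input_ids, max_new_tokens=50):
    # Prefill: process the whole prompt once and build the key/value cache.
    logits, kv_cache = model(input_ids, kv_cache=None)
    next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
    generated = [next_token]
    # Decode: each step feeds only the newest token and reuses the cache,
    # so attention cost per step stays proportional to the current length.
    for _ in range(max_new_tokens - 1):
        logits, kv_cache = model(next_token, kv_cache=kv_cache)
        next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
        generated.append(next_token)
    return torch.cat([input_ids] + generated, dim=-1)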

    + +

    Inference Approach

    + +

    Our goal for inference is to provide a path for achieving the best possible latencies rapidly, to keep up with the velocity with which new model architectures are emerging in the community. A PyTorch native approach is appealing as it allows for the maximum flexibility in terms of “coverage” of models. We note that there are four orthogonal techniques that provide acceleration in inference: (a) Kernel fusion using compile, (b) Faster kernels, (c) Tensor parallel for larger models, and (d) Quantization. In our approach, we use the first three of these four levers - compile natively working with faster kernels from SDPA and a custom tensor parallel implementation that all work hand-in-glove to achieve inference latencies of 29ms/token on a 70B model as measured on 8 NVIDIA A100 GPUs with single user.

    + +

    Compile all the way!

    + +

PyTorch Compile leverages tracing and graph capture to reduce the CPU overhead and, in an ideal scenario, results in a single graph execution/instruction from CPU to GPU. However, compile often introduces graph breaks due to model architecture and ops unsupported by compile. For example, complex operations such as einops are not supported by compile today. Similarly, tensor parallel inference can introduce graph breaks at each layer, since compile requires the tensor parallel implementation to use traceable communication collectives. If these graph breaks are not removed, the performance of the compiled artifacts is hampered and could even be lower than eager mode execution. To get the full benefit of the compiled artifacts, the graph breaks need to be removed.
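Before graph breaks can be removed they have to be found. One simple way to surface them, not specific to the Llama work described here, is to compile with fullgraph=True, which raises an error at the first break instead of silently splitting the program; the toy function below is purely illustrative:

import torch

def toy_fn(x):
    y = torch.sin(x) + torch.cos(x)
    # A Python-side print (and .item()) forces a graph break on purpose.
    print("intermediate sum:", y.sum().item())
    return y * 2

# fullgraph=True makes the compiler fail loudly on any graph break.
compiled = torch.compile(toy_fn, fullgraph=True)
try:
    compiled(torch.randn(8))
except Exception as err:
    print("graph break detected:", err)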

    + +

    Below, we describe how we went about doing this for the 70b Llama 2 model and the challenges we had to overcome to get compile to work all the way through.

    + +

Our first attempt was to try using torch.compile to compile the out-of-box Llama 2 model, but it failed because complex ops were not supported. Using TORCH_COMPILE_DEBUG=1, we identified that the RoPE positional encoding was using complex number functions, resulting in graph breaks and significant slowdowns. We rewrote the RoPE function to bypass torch.einsum (the original implementation uses torch.polar, which also conflicts with compile) and use torch.cos and torch.sin instead.

    + +
    self.cached_freqs[dev_idx][alpha] = torch.stack(
    +            [
    +                torch.cos(freqs),
    +                -torch.sin(freqs),
    +                torch.sin(freqs),
    +                torch.cos(freqs),
    +            ],
    +            dim=2,
    +        ).view(*freqs.shape, 2, 2)
    +
    + +

    Our implementation of the frequencies computation

    +

    +
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
t = t / self.scaling_factor

freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
    + +

    Hugging Face implementation of the frequencies computation

    + +

    Once RoPE was fixed, we were able to get 7B and 13B models to compile without ANY graph breaks on a single A100 GPU.

    + +

We used SDPA, PyTorch’s native implementation of efficient attention computation, with tracing enabled (for compile). Forcing a single algorithm choice with a Python context manager, the recommended way, introduced graph breaks, so we had to use the torch.backends.cuda.enable_*_sdp functions instead.
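For reference, a minimal sketch of the global backend toggles we are referring to (the exact combination to enable depends on hardware, dtype, and mask support):

import torch

# Prefer the fused flash and memory-efficient SDPA kernels and disable the
# math fallback globally, rather than wrapping the call in a Python context
# manager, which introduced graph breaks under torch.compile in our setup.
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cuda.enable_math_sdp(False)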

    + +
attn = torch.nn.functional.scaled_dot_product_attention(
    queries,
    keys_e,
    values_e,
    attn_mask=attn_mask,
    dropout_p=self.p_dropout if self.training else 0.0,
    is_causal=is_causal_mask,
)
    + +

    Attention computation using SDPA

    + +

Next, we ran the same steps for the larger 70B model and found that, even at half precision, the model does not fit on a single GPU and requires tensor parallel inference. Using torch.compile for the 70B model resulted in 162 graph breaks due to two all-reduces per layer, one all-gather for the forward embedding, and one all-gather for the reverse embedding. As a result, we saw no significant improvement in inference latencies. We could not use PyTorch’s distributed tensor implementation at the time of writing this blog as it did not support compile. We therefore rewrote the tensor parallel code from scratch so that it depends only on traceable collectives, making it work with compile. After this last change, the PyTorch compiler did not introduce any graph breaks and we saw a significant speedup in inference latencies. Specifically, we measured latencies for the Llama 70B model at 29ms/token when using 8 A100 GPUs, a 2.4x improvement over unoptimized inference.
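To illustrate the kind of rewrite involved, the sketch below shows a row-parallel linear layer whose all-reduce goes through the functional (traceable) collectives API instead of the in-place torch.distributed.all_reduce. This is a simplified assumption of the pattern, not our actual tensor parallel implementation, and it presumes torch.distributed has already been initialized.

import torch
import torch.distributed as dist
import torch.distributed._functional_collectives as funcol

class RowParallelLinear(torch.nn.Module):
    """Each rank holds a shard of the input features; partial outputs are summed."""

    def __init__(self, in_features_per_rank, out_features):
        super().__init__()
        self.proj = torch.nn.Linear(in_features_per_rank, out_features, bias=False)

    def forward(self, x_shard):
        partial = self.proj(x_shard)
        # The functional all_reduce returns a new tensor and can be traced by
        # torch.compile, unlike the in-place c10d collective.
        return funcol.all_reduce(partial, "sum", group=dist.group.WORLD)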

    + +

    Serving aspects

    + +

Finally, a point to note here is that simply compiling a model is not sufficient to serve it in a production setting. To realize the above performance with high throughput, we need to support dynamic batching and nested tensors, as well as have a warm-up phase where we pre-compile for bucketized sequence lengths. We are working on these aspects to realize such performance in a production setting.
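As a rough illustration of the warm-up idea, one can run the compiled model once per sequence-length bucket before serving traffic so that compilation cost is paid up front; the bucket sizes, vocabulary size, and model interface below are hypothetical.

import torch

def warm_up(compiled_model, buckets=(512, 1024, 2048, 4096), batch_size=1, vocab_size=32000):
    # Trigger compilation for each padded sequence length we expect to serve,
    # so steady-state requests hit an already-compiled artifact.
    for seq_len in buckets:
        dummy_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
        with torch.no_grad():
            compiled_model(dummy_ids)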

    + +

    Experiments and Measurements

    + +

We use nodes with 8 NVIDIA A100 GPUs with 80GB of memory for all our measurements, in two different environments (IBM Cloud and AWS, both running OpenShift). First, we compare the various techniques: eager mode, SDPA Flash kernel, compile, and compile with SDPA. For the 70B model, we run it in tensor parallel mode with compile and SDPA. For this experiment, we use an input length of 512 tokens and generate 50 tokens. For the 7B and 13B models, we use a single A100 to measure latencies, whereas we use 8 A100s for the 70B model. In addition, for the 70B model we use the reduce-overhead option in PyTorch compile, which uses CUDA graphs to reduce CPU-to-GPU kernel launch overheads; the use of CUDA graphs in the 7B and 13B models did not show any benefit (and is thus not reported here). We observe from Figure 1 that compile and SDPA provide very low latencies, with the 70B Llama 2 model at 29ms/token.
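For reference, the reduce-overhead option is passed directly to torch.compile; a minimal sketch on a toy module:

import torch

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).cuda()

# mode="reduce-overhead" uses CUDA graphs under the hood to amortize
# CPU-side kernel launch overhead across iterations.
compiled_model = torch.compile(model, mode="reduce-overhead")
out = compiled_model(torch.randn(8, 64, device="cuda"))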

    + +

    Figure 1. Median latency across different techniques with sequence length 512 (measured on IBM Cloud A100 servers)

    + +


    + +

    Next, we examine the impact of sequence length, where we increase it from 1024 to 4096 and observe that the median latency per token increases sub-linearly, demonstrating that when we increase context to large documents, we do not sacrifice response times.

    + +

    Figure 2. Median latency for compile+SDPA with different sequence lengths (Measured on A100s on AWS)

    + +


    + +

Finally, with increasing batch sizes, we observe that response latencies increase sub-linearly. For the 13B model, at batch size 8, we encounter an out-of-memory (OOM) error. For the 70B model, which runs on 8 GPUs with tensor parallelism, we do not see any such OOM issues.

    + +

    Figure 3. Median latency for compile+SDPA with different batch sizes and sequence length fixed at 4096 (Measured on A100s on AWS)

    + +


    + +

    Final Thoughts

    + +

We have demonstrated how a PyTorch-native compile pathway delivers ultra-low latencies for 70B model inference. The next steps are to enable dynamic batching and nested tensors on top of the above levers.

    + +

Special thanks to Edward Yang, Elias Ellison, Driss Guessous, Will Feng, Will Constable, Horace He, Less Wright, and Andrew Gu from Team PyTorch, whose PR reviews and code contributions made it possible for us to realize these latencies using a PyTorch-native approach. We thank the broader Team PyTorch, which has been tirelessly working to make PyTorch better, with special shout-outs to the SDPA team for enabling tracing and compile on fast kernels, and the compile team, which has been closely guiding us on how to work around as well as fix issues (including identifying and raising NVIDIA driver bugs in CUDA graphs).

    + +

    Inference latency has been one of the roadblocks for LLM adoption in critical enterprise workflows, but another major one is the need for safety, trustworthiness and governance. IBM’s guide for AI safety and LLM risk can be found here and Meta’s responsible user guide for LLaMa can be found here.

    + +

    References

    + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-conference-2023/index.html b/blog/pytorch-conference-2023/index.html new file mode 100644 index 000000000000..78a1658b0766 --- /dev/null +++ b/blog/pytorch-conference-2023/index.html @@ -0,0 +1,688 @@ + + + + + + + + + + + + + PyTorch Conference 2023: Join us in San Francisco October 16-17 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    PyTorch Conference 2023

    + +

    We’re thrilled to announce the upcoming PyTorch Conference 2023! On October 16-17, the conference will showcase PyTorch 2.1, the next-generation release of the popular machine learning framework. As part of the Linux Foundation, the PyTorch Foundation Conference continues the tradition of bringing together leading researchers, developers, and academic communities to advance the education and development of end-to-end machine learning.

    + +

The conference agenda features an exciting lineup of events, including an opening reception, engaging community and partner discussions, informative panels, poster sessions, enlightening use cases and community stories, as well as discussions on the latest trends in machine learning and deep learning development and deployment.

    + +

    Call for Proposals

    + +

    We are now accepting speaker proposals for the conference until July 21. The program committee will carefully review all submissions, and selected speakers will be notified by August 8. We strongly encourage both experienced and first-time speakers to submit their proposals. This conference provides an excellent opportunity to connect with the PyTorch community, share your ideas, and showcase your work.

    + +

    When preparing your proposal, please consider the following guidelines:

    + +
      +
    • What are you hoping to get from your presentation?
    • +
    • What do you expect the audience to gain from your presentation?
    • +
    • How will your presentation help better the open source ecosystem?
    • +
    + +

    To help you shape your proposal, here are some suggested topics for the conference:

    + +
      +
    • Deployments on AWS, Azure
    • +
    • Use cases and real-world applications
    • +
    • Foundational models
    • +
    • AI practices
    • +
    • Production considerations
    • +
    • PyTorch 2.X features and updates
    • +
    • Training techniques and best practices
    • +
    • Inference methodologies
    • +
    • Hardware advancements and optimizations
    • +
    • Edge computing applications
    • +
    • Scalability solutions
    • +
    • Latest research breakthroughs
    • +
    • Optimization strategies
    • +
    • Extending PyTorch through customizations and plugins
    • +
    + +

    We kindly request that you refrain from submitting sales or marketing pitches and avoid discussing unlicensed or closed-source technologies. Such talks tend to detract from the integrity of our events and are not well-received by conference attendees.

    + +

    Register Today

    + +

    Registration is now open! Get your ticket today and secure your spot: https://events.linuxfoundation.org/pytorch-conference/register/

    + +

    Thank you for your interest, and we look forward to a successful PyTorch Conference 2023!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-conference-2024-recap/index.html b/blog/pytorch-conference-2024-recap/index.html new file mode 100644 index 000000000000..c3f3f86f4e4f --- /dev/null +++ b/blog/pytorch-conference-2024-recap/index.html @@ -0,0 +1,1174 @@ + + + + + + + + + + + + + PyTorch Conference 2024 Recap: On Fire 🔥 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 02, 2024

    +

    + PyTorch Conference 2024 Recap: On Fire 🔥 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    women dancing with fire

    + +

    The 2024 PyTorch Conference in San Francisco gathered nearly 1,500 AI researchers, developers, and enthusiasts. Over two days, the event featured engaging discussions, insightful keynotes, and hands-on sessions focused on artificial intelligence (AI) and advancements in PyTorch, the leading open-source machine learning framework. Attendees delved into the future of generative AI, Large Language Models (LLMs), and the crucial role open-source technology plays in driving AI innovation. Here’s a recap of the key themes, highlights, and major takeaways from this year’s conference.

    + +

    Key Themes of the PyTorch Conference 2024

    + +

    Three core themes emerged throughout the conference:

    + +
      +
    1. Generative AI and LLMs: Many sessions focused on how PyTorch continues to evolve as a primary framework for Large Language Models and Generative AI applications. From scaling these models to optimizing their performance on various hardware platforms, the conference showcased the ongoing advancements and challenges in LLM architecture.
    2. +
    3. Democratizing AI Through Open Source: One of the recurring themes was the importance of open source tools and communities in shaping the future of AI. PyTorch is committed to inclusivity, ease of use, and accessibility to developers of all levels, with a focus on bringing AI to an even larger global audience.
    4. +
    5. Distributed and Edge Computing: Distributed computing and edge deployment appeared in many discussions, highlighting how PyTorch is being used to drive AI to the edge. The focus on edge accelerators, scalable training, and inference showcased how PyTorch enables the deployment of powerful models across diverse environments, from the cloud to on-device applications.
    6. +
    + +

    panel of people on a conference stage

    + +

    Watch the Sessions from PyTorch Conference

    + +

    The PyTorch Conference featured keynote sessions from top AI leaders and interesting lightning talks. You can view all of the conference sessions on our YouTube channel.

    + +
    + +
    + + + +

    PyTorch Conference Startup Showcase

    + +

    man speaking at a conference

    + +

    New this year, the Startup Showcase was an exciting addition to the PyTorch Conference. Featuring early-stage founders pitching their AI startups to a panel of top venture capitalists, this event showcased the next generation of AI-driven innovation. The finalists for the inaugural PyTorch Conference Startup Showcase included Remix Inc., Cartesia, OpenBabylon, Remyx AI, A2 Labs, Inc., QuicSnap, Iso AI, CTGT, and Creao.ai, representing some of the most innovative AI/ML startups in the industry. Attendees got a front-row seat to see cutting-edge AI startups in action, while top VCs from the AI industry evaluated the pitches.

    + +

    Congratulations to the PyTorch Conference Startup Showcase winner, CTGT! Deep learning can be opaque and biased, which limits its potential in crucial areas like healthcare and finance. CTGT is changing the game by enhancing data lineage in LLMs and cutting hallucinations. They’re empowering companies to create customized models using 500x less compute.

    + +

    View the Startup Showcase

    + +

    Mini-Summits

    + +

    The DL Compiler Mini-Summit offered attendees a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads.

    + +

    View the DL Compiler Mini-Summit

    + +

    People watching an event

    + +

    The Fine-Tuning Mini-Summit brought together a thriving community of researchers, developers, practitioners and hobbyists which focuses on topics ranging from memory efficiency, parameter-efficient fine-tuning and quantization to performance at scale and reproducible evaluations.

    + +

    View the Fine-Tuning Mini-Summit

    + +

    Major Takeaways from the PyTorch Conference 2024

    + +

    Matt giving his keynote

    + +
      +
1. LLMs are Here to Stay: Large Language Models were a focal point of the event, reaffirming their pivotal role in the future of AI. As these models continue to scale, PyTorch remains the preferred framework for developing, training, and deploying them across various platforms and industries.
    2. +
    3. Open Source Drives Innovation: A key takeaway from the conference was that open-source tools like PyTorch are vital for democratizing AI. This community-driven approach accelerates innovation, enabling researchers and developers globally to collaborate and contribute to faster advancements and more accessible AI technologies.
    4. +
    5. Ethics and Sustainability Matter: The focus on ethical AI development was a significant takeaway. Talks on the inclusivity of computer vision models, the environmental impacts of AI infrastructure, and the need for transparent, unbiased AI models highlighted the growing importance of ethical considerations in the future of AI.
    6. +
    7. PyTorch Expands Beyond the Cloud: With several sessions dedicated to edge AI and distributed computing, the conference showcased how PyTorch is expanding beyond cloud-based applications into edge devices and diverse computing environments. This shift is crucial as AI advances into areas like autonomous vehicles, mobile applications, and IoT devices.
    8. +
    + +

    Thank You to Our Sponsors

    + +

    A crowd of people at a conference

    + +

    Sponsor logos

    + +

    We would like to thank each of the sponsors that made the PyTorch Conference 2024 possible. These include:

    + +

    Diamond Sponsors:

    + +
      +
    • AMD
    • +
    • Cloud Native Computing Foundation
    • +
    • IBM
    • +
    • Intel – PyTorch
    • +
    • Lightning.ai
    • +
    • Meta – PyTorch
    • +
    + +

    Platinum Sponsors:

    + +
      +
    • Arm
    • +
    • Google
    • +
    • Lambda Labs
    • +
    • Nvidia
    • +
    + +

    Silver Sponsors:

    + +
      +
    • Anyscale – PyTorch
    • +
    • Baseten
    • +
    • Chainguard
    • +
    • Databricks
    • +
    • Fal
    • +
    • FuriosaAi
    • +
    • HPE
    • +
    • Jane Street
    • +
    • Microsoft – PyTorch
    • +
    • MinIO
    • +
    • Outerbounds
    • +
    • Together.AI
    • +
    + +

    Bronze Sponsors:

    + +
      +
    • d-Matrix
    • +
    • MemVerge
    • +
    • Perforated AI
    • +
    • Quansight
    • +
    • Rotational Labs
    • +
    • ScaleGenAI
    • +
    + +

    Special Event Sponsors:

    + +
      +
    • PyTorch Flare Party: Hugging Face
    • +
    • Startup Showcase: Mayfield
    • +
    • Diversity Scholarship: AWS
    • +
    • Women and Non-Binary in PyTorch Lunch: Google
    • +
    • Happy Hour Reception: Lightning.AI
    • +
    + +

    Thank you for your continued support in advancing the PyTorch ecosystem and helping to shape the future of AI!

    + +

    Save the Date

    + +

    See you next year for the PyTorch Conference in San Francisco at the Palace of Fine Arts from October 22-23, 2025.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-developer-day-2020/index.html b/blog/pytorch-developer-day-2020/index.html new file mode 100644 index 000000000000..fdfee3379641 --- /dev/null +++ b/blog/pytorch-developer-day-2020/index.html @@ -0,0 +1,664 @@ + + + + + + + + + + + + + Announcing PyTorch Developer Day 2020 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    November 01, 2020

    +

    + Announcing PyTorch Developer Day 2020 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Starting this year, we plan to host two separate events for PyTorch: one for developers and users to discuss core technical development, ideas and roadmaps called “Developer Day”, and another for the PyTorch ecosystem and industry communities to showcase their work and discover opportunities to collaborate called “Ecosystem Day” (scheduled for early 2021).

    + +
    + +
    + +

    The PyTorch Developer Day (#PTD2) is kicking off on November 12, 2020, 8AM PST with a full day of technical talks on a variety of topics, including updates to the core framework, new tools and libraries to support development across a variety of domains. You’ll also see talks covering the latest research around systems and tooling in ML.

    + +

For Developer Day, we have an online networking event limited to PyTorch maintainers and contributors, long-time stakeholders, and experts in areas relevant to PyTorch’s future. Conversations from the networking event will strongly shape the future of PyTorch. Hence, invitations are required to attend the networking event.

    + +

    All talks will be livestreamed and available to the public.

    + + +

    Visit the event website to learn more. We look forward to welcoming you to PyTorch Developer Day on November 12th!

    + +

    Thank you,

    + +

    The PyTorch team

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-developer-day-2021/index.html b/blog/pytorch-developer-day-2021/index.html new file mode 100644 index 000000000000..960157ea6d74 --- /dev/null +++ b/blog/pytorch-developer-day-2021/index.html @@ -0,0 +1,666 @@ + + + + + + + + + + + + + Announcing PyTorch Developer Day 2021 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    August 23, 2021

    +

    + Announcing PyTorch Developer Day 2021 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

We are excited to announce PyTorch Developer Day (#PTD2), taking place virtually on December 1 and 2, 2021. Developer Day is designed for developers and users to discuss core technical developments, ideas, and roadmaps.

    + +
    + +
    + +

    Event Details

    +

    Technical Talks Live Stream - December 1, 2021

    + +

    Join us for technical talks on a variety of topics, including updates to the core framework, new tools and libraries to support development across a variety of domains, responsible AI and industry use cases. All talks will take place on December 1 and will be live streamed on PyTorch channels.

    + +

    Stay up to date by following us on our social channels: Twitter, Facebook, or LinkedIn.

    + +

    Poster Exhibition & Networking - December 2, 2021

    + +

On the second day, we’ll be hosting an online poster exhibition on Gather.Town. There will be opportunities to meet the authors, learn more about their PyTorch projects, and network with the community. This poster and networking event is limited to PyTorch maintainers and contributors, long-time stakeholders, and experts in areas relevant to PyTorch’s future. Conversations from the networking event will strongly shape the future of PyTorch. As such, invitations are required to attend the networking event.

    + +

    Call for Content Now Open

    + +

    Submit your poster abstracts today! Please send us the title and brief summary of your project, tools and libraries that could benefit PyTorch researchers in academia and industry, application developers, and ML engineers for consideration. The focus must be on academic papers, machine learning research, or open-source projects related to PyTorch development, Responsible AI or Mobile. Please no sales pitches. Deadline for submission is September 24, 2021.

    + +

    Visit the event website for more information and we look forward to having you at PyTorch Developer Day.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-docathon-h2-2023-wrap/index.html b/blog/pytorch-docathon-h2-2023-wrap/index.html new file mode 100644 index 000000000000..48969e8399e5 --- /dev/null +++ b/blog/pytorch-docathon-h2-2023-wrap/index.html @@ -0,0 +1,666 @@ + + + + + + + + + + + + + 🎉 PyTorch Docathon H2 2023 Wrap-up 🎉 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    November 16, 2023

    +

    + 🎉 PyTorch Docathon H2 2023 Wrap-up 🎉 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

We are thrilled to announce the successful completion of the Fall 2023 PyTorch Docathon! The event was a resounding success, and we want to extend our heartfelt gratitude to all the participants who made it possible. The dedication, expertise, and tireless efforts of our open-source contributors have once again helped us improve the PyTorch documentation.

    + +

This Docathon ran from November 1 through November 15 with more than 170 registrants. The energy and enthusiasm were palpable, and entrants were judged on the difficulty of their submissions, which resulted in over TBA merged pull requests. We fixed PyTorch docstrings to make them compatible with the PEP 257 Python Docstring Conventions, and we also fixed multiple bugs in the pytorch/tutorials repo.
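For context, here is a minimal example of the docstring style targeted during the event, with a PEP 257-compliant summary line plus the Google-style sections used across the PyTorch codebase (the function itself is illustrative):

def clamp_probability(value: float) -> float:
    """Clamp a value to the closed interval [0.0, 1.0].

    Args:
        value: The number to clamp.

    Returns:
        float: The clamped value.
    """
    return max(0.0, min(1.0, value))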

    + +

    We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide.

    + +

    Meet the top contributors:

    + + + +

    You can see the full docathon leaderboard published here.

    + +

    As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch documentation and code, and pushing the boundaries of what’s possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the PyTorch community.

    + +

    Thank you again for your participation and support. We look forward to seeing what you will achieve next!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-docathon-h2-2024-wrap-up/index.html b/blog/pytorch-docathon-h2-2024-wrap-up/index.html new file mode 100644 index 000000000000..af6ffaf327e5 --- /dev/null +++ b/blog/pytorch-docathon-h2-2024-wrap-up/index.html @@ -0,0 +1,666 @@ + + + + + + + + + + + + + 🎉 PyTorch Docathon H1 2024 Wrap-up 🎉 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

We are thrilled to announce the successful completion of the H1 2024 PyTorch Docathon! The event was a resounding success, and we want to extend our heartfelt gratitude to all the participants who made it possible. The dedication, expertise, and tireless efforts of our open-source contributors have once again helped us improve the PyTorch documentation.

    + +

This Docathon ran from June 4 through June 20 with more than 176 registrants. The energy and enthusiasm were palpable, and entrants were judged on the difficulty of their submissions, which resulted in over 50 merged pull requests.

    + +

    We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide.

    + +

    Meet the top contributors

    + + + +

    For the full list of participants, see here.

    + +

    As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch documentation and code, and pushing the boundaries of what’s possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the PyTorch community.

    + +

    Thank you again for your participation and support. We look forward to seeing what you will achieve next!

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-documentary/index.html b/blog/pytorch-documentary/index.html new file mode 100644 index 000000000000..95dcbd61e586 --- /dev/null +++ b/blog/pytorch-documentary/index.html @@ -0,0 +1,689 @@ + + + + + + + + + + + + + Powering the AI Revolution: The PyTorch Documentary | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + The PyTorch Foundation + +

    +

    Now live: The official PyTorch Documentary! This film unveils the authentic narrative of PyTorch’s inception, attributing its existence to a dedicated group of unsung heroes driving technological innovation.

    + +

    The documentary shares the strength of the PyTorch community, resonating with our communities across the globe. We hope this story of PyTorch inspires greater contributions, attracts more contributors to the project, and fosters widespread recognition of PyTorch’s significance in the open source community.

    + + + +

    We couldn’t have produced this without the support of our PyTorch Foundation members and sponsors:

    + +

    company logos

    + +

    AMD

    + +

    “PyTorch’s growth and adoption in the AI community is a testament to open collaboration. The collective efforts of all the contributors have helped propel PyTorch as one of the most widely adopted AI frameworks in the industry. AMD is proud to be a part of this movement - making sure that the future of AI is open - and we are excited to continue contributing to this vibrant ecosystem.”

    + +

    – Niles Burbank, AMD

    + +

    AWS

    + +

    “The release of the PyTorch Documentary showcases the innovation and real-world impact of one of the most widely adopted open source machine learning frameworks. By supporting and contributing to the PyTorch community, AWS helps enable cutting-edge machine learning research that drives advancements in AI capabilities. We are excited about the documentary as it highlights the power of collaboration in propelling PyTorch to the forefront of machine learning and empowering developers and data scientists to create groundbreaking models. At AWS, we celebrate frameworks like PyTorch that foster environments where open source machine learning technologies can grow and benefit the community at-large, as well as our customers.”

    + +

    – Brian Granger, AWS

    + +

    Google Cloud

    + +

    “Google recognizes the impact of PyTorch on the AI community, providing researchers and developers with powerful, flexible tools for innovation. This documentary not only celebrates the remarkable achievements of the PyTorch community but also highlights the collaborative spirit driving advancements in AI. We look forward to continuing our support for PyTorch and fostering an open ecosystem that accelerates machine learning research and application.”

    + +

    – Dwarak Rajagopal, Google

    + +

    Meta

    + +

    “We have been so impressed with the growth and collaboration that PyTorch has created over the years. From very humble beginnings at Meta to a cornerstone in AI research and development, the documentary showcases the dedication of our contributors since the start. It’s an honor to be a part of something so impactful, and now it’s been documented for our community to take part in.”

    + +

    – Soumith Chintala, Meta

    + +

    Microsoft Azure

    + +

    “We’re truly excited about the premiere of the PyTorch Documentary. At Microsoft, PyTorch has been our default deep learning framework for building AI solutions including Microsoft Copilot. Additionally, we have made significant investments to create an optimized environment for our customers to develop, train, fine-tune and deploy their PyTorch workloads on Azure and Windows, furthering our commitment to democratize AI.”

    + +

    – Eric Boyd, Microsoft

    + +

    PyTorch Foundation

    + +

    “The release of the PyTorch documentary marks a significant milestone for our community, showcasing the incredible journey and rapid evolution of PyTorch. We are excited to share these stories and achievements with the world, and we look forward to continuing to foster innovation and growth of the PyTorch community and PyTorch’s evolving ecosystem.”

    + +

    – Matt White, PyTorch Foundation

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-ecosystem/index.html b/blog/pytorch-ecosystem/index.html new file mode 100644 index 000000000000..cd029035fe8a --- /dev/null +++ b/blog/pytorch-ecosystem/index.html @@ -0,0 +1,745 @@ + + + + + + + + + + + + + PyTorch Adds New Ecosystem Projects for Encrypted AI and Quantum Computing, Expands PyTorch Hub | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    The PyTorch ecosystem includes projects, tools, models and libraries from a broad community of researchers in academia and industry, application developers, and ML engineers. The goal of this ecosystem is to support, accelerate, and aid in your exploration with PyTorch and help you push the state of the art, no matter what field you are exploring. Similarly, we are expanding the recently launched PyTorch Hub to further help you discover and reproduce the latest research.

    + +

    In this post, we’ll highlight some of the projects that have been added to the PyTorch ecosystem this year and provide some context on the criteria we use to evaluate community projects. We’ll also provide an update on the fast-growing PyTorch Hub and share details on our upcoming PyTorch Summer Hackathon.

    + +
    + +
    + +

    Recently added ecosystem projects

    + +

    From private AI to quantum computing, we’ve seen the community continue to expand into new and interesting areas. The latest projects include:

    + +
      +
    • +

      Advertorch: A Python toolbox for adversarial robustness research. The primary functionalities are implemented in PyTorch. Specifically, AdverTorch contains modules for generating adversarial perturbations and defending against adversarial examples, as well as scripts for adversarial training.

      +
    • +
    • +

      botorch: A modular and easily extensible interface for composing Bayesian optimization primitives, including probabilistic models, acquisition functions, and optimizers.

      +
    • +
    • +

      Skorch: A high-level library for PyTorch that provides full scikit-learn compatibility.

      +
    • +
    • +

      PyTorch Geometric: A library for deep learning on irregular input data such as graphs, point clouds, and manifolds.

      +
    • +
    • +

      PySyft: A Python library for encrypted, privacy preserving deep learning.

      +
    • +
    • +

      PennyLane: A library for quantum ML, automatic differentiation, and optimization of hybrid quantum-classical computations.

      +
    • +
    • +

      Flair: A very simple framework for state-of-the-art natural language processing (NLP).

      +
    • +
    + +

    What makes a great project?

    + +

    When we review project submissions for the PyTorch ecosystem, we take into account a number of factors that we feel are important and that we would want in the projects we use ourselves. Some of these criteria include:

    + +
      +
    1. Well-tested: Users should be confident that ecosystem projects will work well with PyTorch, and include support for CI to ensure that testing is occurring on a continuous basis and the project can run on the latest version of PyTorch.
    2. +
    3. Clear utility: Users should understand where each project fits within the PyTorch ecosystem and the value it brings.
    4. +
5. Permissive licensing: Users must be able to utilize ecosystem projects without licensing concerns (e.g., BSD-3, Apache-2, and MIT licenses).
    6. +
    7. Easy onboarding: Projects need to have support for binary installation options (pip/Conda), clear documentation and a rich set of tutorials (ideally built into Jupyter notebooks).
    8. +
    9. Ongoing maintenance: Project authors need to be committed to supporting and maintaining their projects.
    10. +
    11. Community: Projects should have (or be on track to building) an active, broad-based community.
    12. +
    + +

    If you would like to have your project included in the PyTorch ecosystem and featured on pytorch.org/ecosystem, please complete the form here. If you’ve previously submitted a project for consideration and haven’t heard back, we promise to get back to you as soon as we can - we’ve received a lot of submissions!

    + +

    PyTorch Hub for reproducible research | New models

    + +

    Since launching the PyTorch Hub in beta, we’ve received a lot of interest from the community including the contribution of many new models. Some of the latest include U-Net for Brain MRI contributed by researchers at Duke University, Single Shot Detection from NVIDIA and Transformer-XL from HuggingFace.
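Models on the Hub can generally be discovered and loaded in a couple of lines through the torch.hub API; below is a minimal sketch using the torchvision ResNet-18 entry point as an illustrative example (the first call downloads and caches the weights).

import torch

# List the entry points a repository exposes via its hubconf.py.
print(torch.hub.list("pytorch/vision"))

# Load a pretrained model directly from GitHub.
model = torch.hub.load("pytorch/vision", "resnet18", pretrained=True)
model.eval()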

    + +

    We’ve seen organic integration of the PyTorch Hub by folks like paperswithcode, making it even easier for you to try out the state of the art in AI research. In addition, companies like Seldon provide production-level support for PyTorch Hub models on top of Kubernetes.

    + +

    What are the benefits of contributing a model in the PyTorch Hub?

    + +
      +
    • +

      Compatibility: PyTorch Hub models are prioritized first for testing by the TorchScript and Cloud TPU teams, and used as baselines for researchers across a number of fields.

      +
    • +
    • +

      Visibility: Models in the Hub will be promoted on pytorch.org as well as on paperswithcode.

      +
    • +
    • +

      Ease of testing and reproducibility: Each model comes with code, clear preprocessing requirements, and methods/dependencies to run. There is also tight integration with Google Colab, making it a true single click to get started.

      +
    • +
    + +

    PyTorch Hub contributions welcome!

    + +

    We are actively looking to grow the PyTorch Hub and welcome contributions. You don’t need to be an original paper author to contribute, and we’d love to see the number of domains and fields broaden. So what types of contributions are we looking for?

    + +
      +
    • +

      Artifacts of a published or an arXiv paper (or something of a similar nature that serves a different audience — such as ULMFit) that a large audience would need.

      + +

      AND

      +
    • +
    • +

      Reproduces the published results (or better)

      +
    • +
    + +

    Overall these models are aimed at researchers either trying to reproduce a baseline, or trying to build downstream research on top of the model (such as feature-extraction or fine-tuning) as well as researchers looking for a demo of the paper for subjective evaluation. Please keep this audience in mind when contributing.
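Mechanically, a contribution is exposed through a hubconf.py file at the top level of the repository, which declares pip dependencies and one callable per entry point; the sketch below is a hypothetical example (package, entry-point name, and weights URL are placeholders).

# hubconf.py
import torch

dependencies = ["torch"]  # pip packages required to load the entry points

def my_model(pretrained=False, **kwargs):
    """Callable via torch.hub.load('<user>/<repo>', 'my_model')."""
    from my_package.models import MyModel  # hypothetical package
    model = MyModel(**kwargs)
    if pretrained:
        url = "https://example.com/my_model_weights.pth"  # placeholder URL
        model.load_state_dict(torch.hub.load_state_dict_from_url(url, progress=True))
    return model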

    + +

If you are short on inspiration or would just like to find out what the SOTA is in any given field or domain, check out the Paperswithcode state-of-the-art gallery.

    + +

    PyTorch Summer Hackathon

    + +

    We’ll be hosting the first PyTorch Summer Hackathon next month. We invite you to apply to participate in the in-person hackathon on August 8th to 9th at Facebook’s Menlo Park campus. We’ll be bringing the community together to work on innovative ML projects that can solve a broad range of complex challenges.

    + +

    Applications will be reviewed and accepted on a rolling basis until spaces are filled. For those who cannot join this Hackathon in person, we’ll be following up soon with other ways to participate.

    + +

    Please visit this link to apply.

    + +

    Thank you for being part of the PyTorch community!

    + +

    -Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-edge/index.html b/blog/pytorch-edge/index.html new file mode 100644 index 000000000000..3fb263519075 --- /dev/null +++ b/blog/pytorch-edge/index.html @@ -0,0 +1,675 @@ + + + + + + + + + + + + + PyTorch Edge: Enabling On-Device Inference Across Mobile and Edge Devices with ExecuTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + the PyTorch Edge Team + +

    +

    We are excited to announce ExecuTorch, our all-new solution for enabling on-device inference capabilities across mobile and edge devices with the backing of industry leaders like Arm, Apple, and Qualcomm Innovation Center.

    + +

    As part of PyTorch Edge’s vision for the future of the on-device AI stack and ecosystem, ExecuTorch addresses the fragmentation in the on-device AI ecosystem. It offers a design that provides extension points for seamless third-party integration to accelerate ML models on specialized hardware. Our partners have contributed custom delegate implementations to optimize model inference execution on their respective hardware platforms.

    + +

    We have created extensive documentation that provides more details about ExecuTorch’s architecture, its high-level components, example ML models running on ExecuTorch, and end-to-end tutorials for exporting and running a model on various hardware devices. We are excited to see all of the innovative use cases of ExecuTorch built by the community.

    + +

    Key Components of ExecuTorch

    + +

    ExecuTorch offers a compact runtime with a lightweight operator registry to cover the PyTorch ecosystem of models, and a streamlined path to execute PyTorch programs on edge devices. These devices range from mobile phones to embedded hardware powered by specific delegates built by our partners. In addition, ExecuTorch ships with a Software Developer Kit (SDK) and toolchain that provide an ergonomic UX for ML Developers to go from model authoring to training and device delegation in a single PyTorch workflow. This suite of tools enables ML developers to perform on-device model profiling and better ways of debugging the original PyTorch model.

    + +

    ExecuTorch is architected from the ground up in a composable manner to allow ML developers to make decisions on what components to leverage as well as entry points to extend them if needed. This design provides the following benefits to the ML community:

    + +
      +
    • Portability: Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers.
    • +
    • Productivity: Enabling developers to use the same toolchains and SDK from PyTorch model authoring and conversion, to debugging and deployment to a wide variety of platforms, resulting in productivity gains.
    • +
    • Performance: Providing end users with a seamless and high-performance experience due to a lightweight runtime as well as its ability to utilize full hardware capabilities, including general purpose CPUs and specialized purpose microprocessors such as NPUs and DSPs.
    • +
    + +

    PyTorch Edge: from PyTorch Mobile to ExecuTorch

    + +

    Bringing research and production environments closer together is a fundamental goal of PyTorch. ML engineers increasingly use PyTorch to author and deploy machine learning models in highly dynamic and ever-evolving environments, from servers to edge devices such as mobile phones and embedded hardware.

    + +

    With the increasing adoption of AI in Augmented Reality (AR), Virtual Reality (VR), Mixed Reality (MR), Mobile, IoT and other domains, there is a growing need for an end-to-end on-device solution that is extensible, modular, and aligned with the PyTorch stack.

    + +

    PyTorch Edge builds on the same fundamental principle of improving research to production by enabling the deployment of various ML models (spanning vision, speech, NLP, translation, ranking, integrity and content creation tasks) to edge devices via a low-friction development and deployment process. It provides a framework stack that spans the universe of on-device use-cases that the PyTorch community cares about.

    + +

PyTorch Edge provides the portability of core components required to reach a wide spectrum of devices characterized by differing hardware configurations, performance, and efficiency. Such portability is achieved by allowing optimizations that are custom-developed for the target use cases, and by enabling developer productivity via well-defined entry points, representations, and tools that tie all this together into a thriving ecosystem.

    + +

    PyTorch Edge is the future of the on-device AI stack and ecosystem for PyTorch. We are excited to see what the community builds with ExecuTorch’s on-device inference capabilities across mobile and edge devices backed by our industry partner delegates.

    + +

    Learn more about PyTorch Edge and ExecuTorch.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-enterprise-support-update/index.html b/blog/pytorch-enterprise-support-update/index.html new file mode 100644 index 000000000000..fcfc324e7cb0 --- /dev/null +++ b/blog/pytorch-enterprise-support-update/index.html @@ -0,0 +1,655 @@ + + + + + + + + + + + + + PyTorch Enterprise Support Program Update | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    November 10, 2022

    +

    + PyTorch Enterprise Support Program Update +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    On May 25, 2021, we announced the PyTorch Enterprise Support Program (ESP) that enabled providers to develop and offer tailored enterprise-grade support to their customers.

    + +

The program enabled Program-certified service providers to develop and offer tailored enterprise-grade support to their customers through the contribution of hotfixes and other improvements requested by PyTorch enterprise users who were developing models in production at scale for mission-critical applications. However, as we evaluated community feedback, we found that ongoing ESP support was not necessary at this time, and we will immediately divert these resources to other areas to improve the user experience for the entire community.

    + +

Today, we are removing the PyTorch long-term support (LTS 1.8.2) download link from the “Start Locally” download option on the “Get Started” page in order to simplify the user experience. PyTorch v1.8.2 can still be downloaded from the previous versions page. Please note that it is only supported for Python while it is being deprecated. If there are any updates to ESP/LTS, we will cover them in future blogs.

    + +

    + +

    + +

    Please reach out to marketing@pytorch.org with any questions.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-feature-classification-changes/index.html b/blog/pytorch-feature-classification-changes/index.html new file mode 100644 index 000000000000..034e138eae36 --- /dev/null +++ b/blog/pytorch-feature-classification-changes/index.html @@ -0,0 +1,690 @@ + + + + + + + + + + + + + PyTorch feature classification changes | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    July 28, 2020

    +

    + PyTorch feature classification changes +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

Traditionally, features in PyTorch were classified as either stable or experimental, with an implicit third option of testing bleeding-edge features by building master or by installing nightly builds (available via prebuilt whls). This has, in a few cases, caused some confusion around the level of readiness, commitment to the feature, and backward compatibility that can be expected from a user perspective. Moving forward, we’d like to better classify the 3 types of features as well as define explicitly here what each means from a user perspective.

    + +

    New Feature Designations

    + +

    We will continue to have three designations for features but, as mentioned, with a few changes: Stable, Beta (previously Experimental) and Prototype (previously Nightlies). Below is a brief description of each and a comment on the backward compatibility expected:

    + +

    Stable

    +

    Nothing changes here. A stable feature means that the user value-add is or has been proven, the API isn’t expected to change, the feature is performant and all documentation exists to support end user adoption.

    + +

    Level of commitment: We expect to maintain these features long term and generally there should be no major performance limitations, gaps in documentation and we also expect to maintain backwards compatibility (although breaking changes can happen and notice will be given one release ahead of time).

    + +

    Beta

    +

    We previously called these features ‘Experimental’ and we found that this created confusion amongst some of the users. In the case of a Beta level features, the value add, similar to a Stable feature, has been proven (e.g. pruning is a commonly used technique for reducing the number of parameters in NN models, independent of the implementation details of our particular choices) and the feature generally works and is documented. This feature is tagged as Beta because the API may change based on user feedback, because the performance needs to improve or because coverage across operators is not yet complete.

    + +

    Level of commitment: We are committing to seeing the feature through to the Stable classification. We are however not committing to Backwards Compatibility. Users can depend on us providing a solution for problems in this area going forward, but the APIs and performance characteristics of this feature may change.

    + +
    + +
    + +

    Prototype

    +

    Previously these were features that were known about by developers who paid close attention to RFCs and to features that land in master. These features are part of the release and are available as part of binary distributions like PyPI or Conda. We would like to get high bandwidth partner feedback ahead of a real release in order to gauge utility and any changes we need to make to the UX. For each prototype feature, a pointer to draft docs or other instructions will be provided.

    + +

    Level of commitment: We are committing to gathering high bandwidth feedback only. Based on this feedback and potential further engagement between community members, we as a community will decide if we want to upgrade the level of commitment or to fail fast. Additionally, while some of these features might be more speculative (e.g. new Frontend APIs), others have obvious utility (e.g. model optimization) but may be in a state where gathering feedback outside of high bandwidth channels is not practical, e.g. the feature may be in an earlier state, may be moving fast (PRs are landing too quickly to catch a major release) and/or generally active development is underway.

    + +

    What changes for current features?

    + +

    First and foremost, you can find these designations on pytorch.org/docs. We will also be linking any early stage features here for clarity.

    + +

    Additionally, the following features will be reclassified under this new rubric:

    + +
1. High Level Autograd APIs: Beta (was Experimental)
2. Eager Mode Quantization: Beta (was Experimental)
3. Named Tensors: Prototype (was Experimental)
4. TorchScript/RPC: Prototype (was Experimental)
5. Channels Last Memory Layout: Beta (was Experimental)
6. Custom C++ Classes: Beta (was Experimental)
7. PyTorch Mobile: Beta (was Experimental)
8. Java Bindings: Beta (was Experimental)
9. Torch.Sparse: Beta (was Experimental)
    + +

    Cheers,

    + +

    Joe, Greg, Woo & Jessica

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/index.html b/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/index.html new file mode 100644 index 000000000000..408b7a184097 --- /dev/null +++ b/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/index.html @@ -0,0 +1,715 @@ + + + + + + + + + + + + + PyTorch for AMD ROCm™ Platform now available as Python package | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Niles Burbank – Director PM at AMD, Mayank Daga – Director, Deep Learning Software at AMD + +

    +

With the PyTorch 1.8 release, we are delighted to announce a new installation option for users of PyTorch on the ROCm™ open software platform. An installable Python package is now hosted on pytorch.org, along with instructions for local installation in the same simple, selectable format as PyTorch packages for CPU-only configurations and other GPU platforms. PyTorch on ROCm includes full capability for mixed-precision and large-scale training using AMD’s MIOpen & RCCL libraries. This provides a new option for data scientists, researchers, students, and others in the community to get started with accelerated PyTorch using AMD GPUs.

    + +
    + +
    + +

    The ROCm Ecosystem

    + +

ROCm is AMD’s open source software platform for GPU-accelerated high performance computing and machine learning. Since the original ROCm release in 2016, the ROCm platform has evolved to support additional libraries and tools, a wider set of Linux® distributions, and a range of new GPUs. This includes the AMD Instinct™ MI100, the first GPU based on AMD CDNA™ architecture.

    + +

The ROCm ecosystem has an established history of support for PyTorch, which was initially implemented as a fork of the PyTorch project, and more recently through ROCm support in the upstream PyTorch code. PyTorch users can install PyTorch for ROCm using AMD’s public PyTorch docker image, and can of course build PyTorch for ROCm from source. With PyTorch 1.8, these existing installation options are now complemented by the availability of an installable Python package.

    + +

The primary focus of ROCm has always been high performance computing at scale. The combined capabilities of ROCm and AMD’s Instinct family of data center GPUs are particularly suited to the challenges of HPC at data center scale. PyTorch is a natural fit for this environment, as HPC and ML workflows become more intertwined.

    + +

    Getting started with PyTorch for ROCm

    + +

The scope for this build of PyTorch is AMD GPUs with ROCm support, running on Linux. The GPUs supported by ROCm include all of AMD’s Instinct family of compute-focused data center GPUs, along with some other select GPUs. A current list of supported GPUs can be found in the ROCm Github repository. After confirming that the target system includes supported GPUs and the current 4.0.1 release of ROCm, installation of PyTorch follows the same simple pip-based installation as any other Python package. As with PyTorch builds for other platforms, the configurator at https://pytorch.org/get-started/locally/ provides the specific command line to be run.

    + +

PyTorch for ROCm is built from the upstream PyTorch repository, and is a full featured implementation. Notably, it includes support for distributed training across multiple GPUs and supports accelerated mixed precision training.
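As a quick sanity check after installation, the short snippet below (an illustrative example, not part of the official instructions) verifies that the ROCm build of PyTorch can see the GPU. The ROCm build reuses the familiar torch.cuda namespace, so existing device-selection code keeps working unchanged.

import torch

print(torch.__version__)
print(torch.cuda.is_available())       # True on a supported AMD GPU with ROCm
print(torch.cuda.get_device_name(0))   # e.g. an AMD Instinct accelerator

# A tiny end-to-end check: run a matmul on the GPU and copy the result back.
x = torch.randn(4, 4, device="cuda")
print((x @ x.T).cpu())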

    + +

    More information

    + +

• A list of ROCm supported GPUs and operating systems can be found at https://github.com/RadeonOpenCompute/ROCm
• General documentation on the ROCm platform is available at https://rocmdocs.amd.com/en/latest/
• The ROCm Learning Center is at https://developer.amd.com/resources/rocm-resources/rocm-learning-center/
• General information on AMD’s offerings for HPC and ML can be found at https://amd.com/hpc

    + +

    Feedback

    +

An engaged user base is a tremendously important part of the PyTorch ecosystem. We would be deeply appreciative of feedback on the PyTorch for ROCm experience in the PyTorch discussion forum and, where appropriate, reporting any issues via Github.

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-hackathon-2021/index.html b/blog/pytorch-hackathon-2021/index.html new file mode 100644 index 000000000000..39356cb8954b --- /dev/null +++ b/blog/pytorch-hackathon-2021/index.html @@ -0,0 +1,695 @@ + + + + + + + + + + + + + Announcing PyTorch Annual Hackathon 2021 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    September 08, 2021

    +

    + Announcing PyTorch Annual Hackathon 2021 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We’re excited to announce the PyTorch Annual Hackathon 2021! This year, we’re looking to support the community in creating innovative PyTorch tools, libraries, and applications. 2021 is the third year we’re hosting this Hackathon, and we welcome you to join the PyTorch community and put your machine learning skills into action. Submissions start on September 8 and end on November 3. Good luck to everyone!

    + +
    + +
    + +

    Submission Categories

    +

    You can enter your PyTorch projects into three categories:

    + +
• PyTorch Responsible AI Development Tools & Libraries - Build an AI development tool or library that helps develop AI models and applications responsibly. These tools, libraries, and apps need to help a researcher or developer factor in fairness, security, and privacy throughout the entire machine learning development process of data gathering, model training, model validation, inference, monitoring, and more.
• Web and Mobile Applications Powered by PyTorch - Build an application with a web or mobile interface and/or an embedded device powered by PyTorch so that end users can interact with it. The submission must be built on PyTorch or use PyTorch-based libraries such as torchvision, torchtext, and fast.ai.
• PyTorch Developer Tools & Libraries - Build a creative, useful, and well-implemented tool or library for improving the productivity and efficiency of PyTorch researchers and developers. The submission must be a machine learning algorithm, model, or application built using PyTorch or PyTorch-based libraries.
    + +

    Prizes

    +

    Submissions will be judged on the idea’s quality, originality, implementation, and potential impact.

    + +
• First-Place Winners in each category of the Hackathon will receive $5,000 in cash, along with a 30-minute call with the PyTorch development team.
• Second-Place Winners will receive $3,000.
• Third-Place Winners will receive $2,000.
    + +

    All winners will also receive the opportunity to create blog posts that will be featured throughout PyTorch channels as well as an exclusive Github badge. Honorable Mentions will also be awarded to the following three highest-scoring entries in each category and will receive $1,000 each.

    + +

    Cloud Computing Credits

    +

    Request $100 in credits from Amazon Web Services or Google Cloud for your computing costs. Please allow 3 business days for your request to be reviewed. Credits will be provided to verified registrants until the supplies run out. For more information, see https://pytorch2021.devpost.com/details/sponsors.

    + +

    2020 Winning Projects

    + +

    DeMask won first place in the PyTorch Developer Tools category. Built using Asteroid, a PyTorch-based audio source separation toolkit, DeMask is an end-to-end model for enhancing speech while wearing face masks.

    + +

    Q&Aid won first place in the Web/Mobile Applications Powered by PyTorch category. Backed by PyTorch core algorithms and models, Q&Aid is a conceptual health care chatbot aimed at making health care diagnoses and facilitating communication between patients and doctors.

    + +

    FairTorch won first place in the PyTorch Responsible AI Development Tools category. FairTorch is a PyTorch fairness library that lets developers add constraints to their models to equalize metrics across subgroups by simply adding a few lines of code.

    + +

    How to Join

    +

    If you’re interested in joining this year’s PyTorch Hackathon, register at http://pytorch2021.devpost.com.

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-landscape/index.html b/blog/pytorch-landscape/index.html new file mode 100644 index 000000000000..0c1c2a1c6d27 --- /dev/null +++ b/blog/pytorch-landscape/index.html @@ -0,0 +1,680 @@ + + + + + + + + + + + + + Introducing the New PyTorch Landscape: Your Guide to the PyTorch Ecosystem | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We’re excited to reveal our brand new PyTorch Landscape. The PyTorch Landscape helps researchers, developers, and organizations easily locate useful, curated, community-built tools that augment the PyTorch core framework.

    + +

    landscape banner

    + +

    What the Landscape Offers

    + +

    The Landscape visually organizes projects into three categories—Modeling, Training, and Optimizations—making finding relevant frameworks, libraries, and projects easy. Users can quickly locate curated, valuable tools for a variety of use cases that complement the PyTorch framework. Each tool that is part of the Landscape has been reviewed and vetted by PyTorch project experts. The projects in the Landscape are considered to be mature and healthy and provide valuable capabilities that complement the PyTorch framework in their respective domains.

    + +

    Explore the AI Landscape

    + +

    The Explore page presents platforms, tools, and libraries, each with a logo, description, and links to GitHub and further details. This categorized, visual approach simplifies discovery and provides quick access to essential technologies.

    + +

    Guide Page: A Closer Look

    + +

    For deeper insights, the Guide page expands on each project, highlighting methodologies and trends shaping AI development, from adversarial robustness to self-supervised learning. There are also project statistics provided for each project, including metrics such as number of stars, contributors, commit history, languages used, license, and other valuable metrics that provide an in-depth understanding of the project and how it may be used.

    + +

    Tracking AI’s Growth: The Stats Page

    + +

    The Stats page provides insights into AI development trends, tracking repository activity, programming languages, and industry funding data.

    + +
      +
    • Repositories: 117 repositories, 20.5k contributors, and 797.2k stars across 815MB of source code.
    • +
    • Development Trends: Weekly commit activity over the last year.
    • +
    • Licensing Breakdown: Repositories are categorized by license type.
    • +
    • Funding & Acquisitions: Insights into investment trends, including funding rounds and acquisitions.
    • +
    + +

    Why Use the PyTorch Landscape?

    + +

    Finding useful and high quality open source projects that complement the PyTorch core system can be overwhelming. The PyTorch Landscape offers a clear, accessible way to explore the ecosystem of community-built tools, whether you’re researching, building models, or making strategic decisions.

    + +

    Stay ahead with the PyTorch Landscape — your guide to the PyTorch Ecosystem.

    + +

    Want to Contribute a Project to the PyTorch Landscape?

    + +

    Have you built a useful open source tool that you would like to share with the PyTorch community? Then help us grow the Ecosystem by contributing your tool! You can find the instructions to apply here. We welcome all contributions from the community!

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-library-updates-new-model-serving-library/index.html b/blog/pytorch-library-updates-new-model-serving-library/index.html new file mode 100644 index 000000000000..6619b31965d5 --- /dev/null +++ b/blog/pytorch-library-updates-new-model-serving-library/index.html @@ -0,0 +1,728 @@ + + + + + + + + + + + + + PyTorch library updates including new model serving library | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Along with the PyTorch 1.5 release, we are announcing new libraries for high-performance PyTorch model serving and tight integration with TorchElastic and Kubernetes. Additionally, we are releasing updated packages for torch_xla (Google Cloud TPUs), torchaudio, torchvision, and torchtext. All of these new libraries and enhanced capabilities are available today and accompany all of the core features released in PyTorch 1.5.

    + +

    TorchServe (Experimental)

    + +

    TorchServe is a flexible and easy to use library for serving PyTorch models in production performantly at scale. It is cloud and environment agnostic and supports features such as multi-model serving, logging, metrics, and the creation of RESTful endpoints for application integration. TorchServe was jointly developed by engineers from Facebook and AWS with feedback and engagement from the broader PyTorch community. The experimental release of TorchServe is available today. Some of the highlights include:

    + +
      +
    • Support for both Python-based and TorchScript-based models
    • +
    • Default handlers for common use cases (e.g., image segmentation, text classification) as well as the ability to write custom handlers for other use cases
    • +
    • Model versioning, the ability to run multiple versions of a model at the same time, and the ability to roll back to an earlier version
    • +
    • The ability to package a model, learning weights, and supporting files (e.g., class mappings, vocabularies) into a single, persistent artifact (a.k.a. the “model archive”)
    • +
    • Robust management capability, allowing full configuration of models, versions, and individual worker threads via command line, config file, or run-time API
    • +
    • Automatic batching of individual inferences across HTTP requests
    • +
    • Logging including common metrics, and the ability to incorporate custom metrics
    • +
    • Ready-made Dockerfile for easy deployment
    • +
    • HTTPS support for secure deployment
    • +
    + +
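To give a flavor of what the RESTful endpoints look like, the hypothetical snippet below sends an image to a locally running TorchServe instance; the model name, port, and file path are placeholders for whatever you registered when starting the server.

import requests

# Assumes TorchServe is running locally with a model registered under the
# name "my_classifier"; both the name and the port are placeholders.
url = "http://localhost:8080/predictions/my_classifier"

with open("example.jpg", "rb") as f:
    response = requests.post(url, data=f)

# TorchServe returns the handler's output, e.g. class probabilities, as JSON.
print(response.json())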

    To learn more about the APIs and the design of this feature, see the links below:

    +
      +
• See the reference architecture for a full multi-node deployment.
    • +
    • The full documentation can be found here.
    • +
    + +

    TorchElastic integration with Kubernetes (Experimental)

    + +

TorchElastic is a proven library for training deep neural networks at scale within companies like Facebook, where having the ability to dynamically adapt to server availability and scale as new compute resources come online is critical. Kubernetes enables customers using machine learning frameworks like PyTorch to run training jobs distributed across fleets of powerful GPU instances like the Amazon EC2 P3. Distributed training jobs, however, are not fault-tolerant, and a job cannot continue if a node failure or reclamation interrupts training. Further, jobs cannot start without acquiring all required resources, or scale up and down without being restarted. This lack of resiliency and flexibility results in increased training time and costs from idle resources. TorchElastic addresses these limitations by enabling distributed training jobs to be executed in a fault-tolerant and elastic manner. Until today, Kubernetes users needed to manage the Pods and Services required for TorchElastic training jobs manually.

    + +

    Through the joint collaboration of engineers at Facebook and AWS, TorchElastic, adding elasticity and fault tolerance, is now supported using vanilla Kubernetes and through the managed EKS service from AWS.

    + +

    To learn more see the TorchElastic repo for the controller implementation and docs on how to use it.

    + +

    torch_xla 1.5 now available

    + +

    torch_xla is a Python package that uses the XLA linear algebra compiler to accelerate the PyTorch deep learning framework on Cloud TPUs and Cloud TPU Pods. torch_xla aims to give PyTorch users the ability to do everything they can do on GPUs on Cloud TPUs as well while minimizing changes to the user experience. The project began with a conversation at NeurIPS 2017 and gathered momentum in 2018 when teams from Facebook and Google came together to create a proof of concept. We announced this collaboration at PTDC 2018 and made the PyTorch/XLA integration broadly available at PTDC 2019. The project already has 28 contributors, nearly 2k commits, and a repo that has been forked more than 100 times.

    + +

This release of torch_xla is aligned and tested with PyTorch 1.5 to reduce friction for developers and to provide a stable and mature PyTorch/XLA stack for training models using Cloud TPU hardware. You can try it for free in your browser on an 8-core Cloud TPU device with Google Colab, and you can use it at a much larger scale on Google Cloud.
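For readers who have not used torch_xla before, the minimal sketch below shows the general shape of a training step on an XLA device; it assumes a working torch_xla installation and is illustrative rather than a complete training script.

import torch
import torch_xla.core.xla_model as xm

# Acquire the XLA device (a Cloud TPU core when running on TPU).
device = xm.xla_device()

model = torch.nn.Linear(10, 2).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

data = torch.randn(8, 10, device=device)
target = torch.randn(8, 2, device=device)

loss = torch.nn.functional.mse_loss(model(data), target)
loss.backward()

# xm.optimizer_step also marks the step so XLA can compile and execute
# the accumulated graph.
xm.optimizer_step(optimizer, barrier=True)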

    + +

    See the full torch_xla release notes here. Full docs and tutorials can be found here and here.

    + +

    PyTorch Domain Libraries

    + +

    torchaudio, torchvision, and torchtext complement PyTorch with common datasets, models, and transforms in each domain area. We’re excited to share new releases for all three domain libraries alongside PyTorch 1.5 and the rest of the library updates. For this release, all three domain libraries are removing support for Python2 and will support Python3 only.

    + +

    torchaudio 0.5

    +

    The torchaudio 0.5 release includes new transforms, functionals, and datasets. Highlights for the release include:

    + +
      +
    • Added the Griffin-Lim functional and transform, InverseMelScale and Vol transforms, and DB_to_amplitude.
    • +
    • Added support for allpass, fade, bandpass, bandreject, band, treble, deemph, and riaa filters and transformations.
    • +
    • New datasets added including LJSpeech and SpeechCommands datasets.
    • +
    + +
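As a rough illustration of the new Griffin-Lim support, the sketch below computes a spectrogram and reconstructs a waveform from it; the synthetic waveform and the n_fft value are placeholder choices for the example.

import torch
import torchaudio

n_fft = 400

# A synthetic one-second mono waveform at 16 kHz stands in for real audio.
waveform = torch.randn(1, 16000)

# Power spectrogram of the waveform.
spectrogram = torchaudio.transforms.Spectrogram(n_fft=n_fft)(waveform)

# Griffin-Lim iteratively estimates the phase to invert the spectrogram.
reconstructed = torchaudio.transforms.GriffinLim(n_fft=n_fft)(spectrogram)
print(reconstructed.shape)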

    See the release full notes here and full docs can be found here.

    + +

    torchvision 0.6

    +

    The torchvision 0.6 release includes updates to datasets, models and a significant number of bug fixes. Highlights include:

    + +
      +
    • Faster R-CNN now supports negative samples which allows the feeding of images without annotations at training time.
    • +
    • Added aligned flag to RoIAlign to match Detectron2.
    • +
    • Refactored abstractions for C++ video decoder
    • +
    + +
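To make the aligned flag mentioned above concrete, here is a small self-contained sketch of calling torchvision.ops.roi_align with aligned=True; the feature map and box coordinates are arbitrary example values.

import torch
from torchvision.ops import roi_align

# A single 1x8x8 feature map and one region of interest.
features = torch.arange(64, dtype=torch.float32).reshape(1, 1, 8, 8)

# Boxes are (batch_index, x1, y1, x2, y2) in feature-map coordinates.
boxes = torch.tensor([[0.0, 1.0, 1.0, 5.0, 5.0]])

# aligned=True applies the half-pixel offset used to match Detectron2.
pooled = roi_align(features, boxes, output_size=(2, 2), spatial_scale=1.0,
                   sampling_ratio=2, aligned=True)
print(pooled.shape)  # torch.Size([1, 1, 2, 2])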

    See the release full notes here and full docs can be found here.

    + +

    torchtext 0.6

    +

The torchtext 0.6 release includes a number of bug fixes and improvements to documentation. Based on user feedback, dataset abstractions are also currently being redesigned. Highlights for the release include:

    + +
      +
    • Fixed an issue related to the SentencePiece dependency in conda package.
    • +
    • Added support for the experimental IMDB dataset to allow a custom vocab.
    • +
    • A number of documentation updates including adding a code of conduct and a deduplication of the docs on the torchtext site.
    • +
    + +

Your feedback and discussions on the experimental datasets API are welcome. You can send them to issue #664. We would also like to highlight the pull request here where the latest dataset abstraction is applied to the text classification datasets. This feedback will be beneficial in finalizing the abstraction.

    + +

    See the release full notes here and full docs can be found here.

    + +

    We’d like to thank the entire PyTorch team, the Amazon team and the community for all their contributions to this work.

    + +

    Cheers!

    + +

    Team PyTorch

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-native-architecture-optimization/index.html b/blog/pytorch-native-architecture-optimization/index.html new file mode 100644 index 000000000000..fb16059c504c --- /dev/null +++ b/blog/pytorch-native-architecture-optimization/index.html @@ -0,0 +1,1194 @@ + + + + + + + + + + + + + PyTorch Native Architecture Optimization: torchao | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We’re happy to officially launch torchao, a PyTorch native library that makes models faster and smaller by leveraging low bit dtypes, quantization and sparsity. torchao is an accessible toolkit of techniques written (mostly) in easy to read PyTorch code spanning both inference and training. This blog will help you pick which techniques matter for your workloads.

    + +

    We benchmarked our techniques on popular GenAI models like LLama 3 and Diffusion models and saw minimal drops in accuracy. Unless otherwise noted the baselines are bf16 run on A100 80GB GPU.

    + +

    Our topline metrics for llama 3 are

    +
      +
    • 97% speedup for Llama 3 8B inference using autoquant with int4 weight only quantization and hqq
    • +
    • 73% peak VRAM reduction for Llama 3.1 8B inference at 128K context length with a quantized KV cache
    • +
    • 50% speedup for Llama 3 70B pretraining using float8 training on H100
    • +
    • 30% peak VRAM reduction for Llama 3 8B using 4 bit quantized optimizers.
    • +
    + +

    Our topline metrics for diffusion model inference

    +
      +
• 53% speedup using float8 dynamic quantization inference with float8 row-wise scaling on flux1.dev on H100
    • +
    • 50% reduction in model VRAM for CogVideoX using int8 dynamic quantization
    • +
    + +

    Below we’ll walk through some of the techniques available in torchao you can apply to your models for inference and training.

    + +

    Inference

    + +

Our inference quantization algorithms work over arbitrary PyTorch models that contain nn.Linear layers. Weight only and dynamic activation quantization for various dtypes and sparse layouts can be chosen using our top level quantize_ API.

    + +
    from torchao.quantization import (  
    +    quantize_,  
    +    int4_weight_only,  
    +)  
    +quantize_(model, int4_weight_only())
    +
    + +

Sometimes quantizing a layer can make it slower because of overhead, so if you’d rather we just pick how to quantize each layer in a model for you, you can instead run

    + +
    model = torchao.autoquant(torch.compile(model, mode='max-autotune'))
    +
    + +

The quantize_ API has a few different options depending on whether your model is compute bound or memory bound.

    + +
    from torchao.quantization import (  
    +    # Memory bound models  
    +    int4_weight_only,  
    +    int8_weight_only,
    +
    +    # Compute bound models  
    +    int8_dynamic_activation_int8_semi_sparse_weight,  
    +    int8_dynamic_activation_int8_weight,  
    +      
    +    # Device capability 8.9+  
    +    float8_weight_only,  
    +    float8_dynamic_activation_float8_weight,  
    +)
    +
    + +
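For instance, a minimal sketch of applying one of the compute-bound options listed above and then compiling the model might look like the following; the toy model, dtype, and sizes are illustrative assumptions rather than a benchmarked configuration.

import torch
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

# A toy model standing in for a real compute-bound workload.
model = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024), torch.nn.ReLU(), torch.nn.Linear(1024, 1024)
).cuda().to(torch.bfloat16)

# Quantize the nn.Linear layers in place, then compile for best performance.
quantize_(model, int8_dynamic_activation_int8_weight())
model = torch.compile(model, mode="max-autotune")

x = torch.randn(16, 1024, device="cuda", dtype=torch.bfloat16)
print(model(x).shape)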

    We also have extensive benchmarks on diffusion models in collaboration with the HuggingFace diffusers team in diffusers-torchao where we demonstrated 53.88% speedup on Flux.1-Dev and 27.33% speedup on CogVideoX-5b

    + +

    + +

Our APIs are composable, so we’ve, for example, composed sparsity and quantization to bring a 5% speedup for ViT-H inference.

    + +

We can also do things like quantize weights to int4 and the KV cache to int8 to support Llama 3.1 8B at the full 128K context length, running in under 18.9GB of VRAM.
    +

    + +

    QAT

    + +

Post training quantization, especially at less than 4 bits, can suffer from serious accuracy degradation. Using Quantization Aware Training (QAT) we’ve managed to recover up to 96% of the accuracy degradation on hellaswag. We’ve integrated this as an end-to-end recipe in torchtune with a minimal tutorial.

    + +

    + +

    Training

    + +

    Low precision compute and communications

    + +

torchao provides easy to use e2e workflows for reducing the precision of training compute and distributed communications, starting with float8 for `torch.nn.Linear` layers. Here is a one-liner to convert the compute gemms of your training run to float8:

    + +
    from torchao.float8 import convert_to_float8_training  
    +convert_to_float8_training(model)
    +
    + +

    For an e2e example of how to speed up LLaMa 3 70B pretraining by up to 1.5x with float8, see our README, and torchtitan’s blog and float8 recipe.

    + +

    Performance and accuracy of float8 pretraining of LLaMa 3 70B, vs bfloat16

    + +

(source: https://dev-discuss.pytorch.org/t/enabling-float8-all-gather-in-fsdp2/2359)

    + +

    We are expanding our training workflows to more dtypes and layouts

    + +
1. NF4 QLoRA in torchtune
2. Prototype int8 training support
3. Accelerated sparse 2:4 training
    + +

    Low bit Optimizers

    + +

Inspired by Bits and Bytes, we’ve also added prototype support for 8 and 4 bit optimizers as a drop-in replacement for AdamW.

    + +
    from torchao.prototype.low_bit_optim import AdamW8bit, AdamW4bit  
    +optim = AdamW8bit(model.parameters())
    +
    + +

    + +

    Integrations

    + +

    We’ve been actively working on making sure torchao works well in some of the most important projects in open source.

    + +
1. Huggingface transformers as an inference backend
2. In diffusers-torchao as a reference implementation for accelerating diffusion models
3. In HQQ for fast 4 bit inference
4. In torchtune for PyTorch native QLoRA and QAT recipes
5. In torchchat for post training quantization
6. In SGLang for int4 and int8 post training quantization
    + +

    Conclusion

    + +

    If you’re interested in making your models faster and smaller for training or inference, we hope you’ll find torchao useful and easy to integrate.

    + +

    pip install torchao

    + +

    There are a lot of things we’re excited about next ranging from going lower than 4 bit, performant kernels for high-throughput inference, expanding to more layers, scaling types or granularities, MX hardware support and supporting more hardware backends. If any of the above sounds exciting you can follow our progress at: https://github.com/pytorch/ao

    + +

    If you’re interested in working on torchao, we’ve created a contributors guide, and if you have any questions we hang out on the #torchao channel on discord.gg/gpumode

    + +

    Acknowledgements

    + +

    We are fortunate to stand on the shoulders of giants and collaborate with some of the best people in open source. Thank you!

    + +
1. Bits and Bytes for pioneering work in low bit optimizers and QLoRA
2. Answer.ai for their engineering work to get FSDP and QLoRA composing
3. Mobius Labs for the lovely back and forths on quantization algorithms and low bit kernels
4. HuggingFace transformers for their help in battle testing and integrating our work
5. HuggingFace diffusers for our collaboration on extensive benchmarks and best practices
6. torch.compile so we could write our algorithms in pure PyTorch
7. GPU MODE for most of our early contributors
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-profiler-1.9-released/index.html b/blog/pytorch-profiler-1.9-released/index.html new file mode 100644 index 000000000000..f16695f2d6e0 --- /dev/null +++ b/blog/pytorch-profiler-1.9-released/index.html @@ -0,0 +1,854 @@ + + + + + + + + + + + + + What’s New in PyTorch Profiler 1.9? | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    August 03, 2021

    +

    + What’s New in PyTorch Profiler 1.9? +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Sabrina Smai, Program Manager on the AI Framework team at Microsoft + +

    +

PyTorch Profiler v1.9 has been released! The goal of this new release (previous PyTorch Profiler release) is to provide you with new state-of-the-art tools to help diagnose and fix machine learning performance issues regardless of whether you are working on one or numerous machines. The objective is to target the execution steps that are the most costly in time and/or memory, and visualize the workload distribution between GPUs and CPUs.

    + +

    Here is a summary of the five major features being released:

    + +
1. Distributed Training View: This helps you understand how much time and memory is consumed in your distributed training job. Many issues occur when you take a training model and split the load into worker nodes to be run in parallel as it can be a black box. The overall model goal is to speed up model training. This distributed training view will help you diagnose and debug issues within individual nodes.
2. Memory View: This view allows you to understand your memory usage better. This tool will help you avoid the famously pesky Out of Memory error by showing active memory allocations at various points of your program run.
3. GPU Utilization Visualization: This tool helps you make sure that your GPU is being fully utilized.
4. Cloud Storage Support: Tensorboard plugin can now read profiling data from Azure Blob Storage, Amazon S3, and Google Cloud Platform.
5. Jump to Source Code: This feature allows you to visualize stack tracing information and jump directly into the source code. This helps you quickly optimize and iterate on your code based on your profiling results.
    + +

    Getting Started with PyTorch Profiling Tool

    +

PyTorch includes a profiling functionality called “PyTorch Profiler”. The PyTorch Profiler tutorial can be found here.

    + +

    To instrument your PyTorch code for profiling, you must:

    + +

    $ pip install torch-tb-profiler

    + +
import torch.profiler as profiler

with profiler.profile(...) as prof:
    ...  # the code to be profiled runs inside this context manager
    + +

    Comments:

    + +

    • For CUDA and CPU profiling, see below:

    +
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA],
) as prof:
    ...
    + +

• with profiler.record_function(“$NAME”): allows putting a label (a tag associated with a name) on a block of code.

    + +

• The profile_memory=True parameter under profiler.profile allows you to profile the CPU and GPU memory footprint.

    + +
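Putting these pieces together, a small end-to-end sketch might look like the following; the toy model, the log directory, and the schedule values are illustrative choices rather than recommendations.

import torch
import torch.profiler

model = torch.nn.Linear(512, 512)
inputs = torch.randn(32, 512)

activities = [torch.profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(torch.profiler.ProfilerActivity.CUDA)

with torch.profiler.profile(
    activities=activities,
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/example"),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,  # required for the "jump to source code" feature
) as prof:
    for _ in range(8):
        with torch.profiler.record_function("forward_pass"):
            out = model(inputs)
        out.sum().backward()
        prof.step()  # marks a step boundary for the schedule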

    Visualizing PyTorch Model Performance using PyTorch Profiler

    + +

    Distributed Training

    + +

    Recent advances in deep learning argue for the value of large datasets and large models, which requires you to scale out model training to more computational resources. Distributed Data Parallel (DDP) and NVIDIA Collective Communications Library (NCCL) are the widely adopted paradigms in PyTorch for accelerating your deep learning training.

    + +

    In this release of PyTorch Profiler, DDP with NCCL backend is now supported.
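To see the distributed views in action, a rough sketch of profiling a DDP training loop with the NCCL backend is shown below; it assumes a machine with at least one (ideally two or more) GPUs, and the toy model, port, and log directory are placeholder choices.

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.profiler
from torch.nn.parallel import DistributedDataParallel as DDP

def run(rank, world_size):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    model = DDP(torch.nn.Linear(1024, 1024).cuda(rank), device_ids=[rank])
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA],
        on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/ddp"),
    ) as prof:
        for _ in range(5):
            out = model(torch.randn(64, 1024, device=f"cuda:{rank}"))
            out.sum().backward()   # triggers the NCCL allreduce under DDP
            optimizer.step()
            optimizer.zero_grad()
            prof.step()

    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    mp.spawn(run, args=(world_size,), nprocs=world_size)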

    + +
    + +
    + +

    Computation/Communication Overview

    + +

In the Computation/Communication overview under the Distributed training view, you can observe the computation-to-communication ratio of each worker and the load balancing (https://en.wikipedia.org/wiki/Load_balancing_(computing)) between workers, as measured by granularity.

    + +

    Scenario 1:

    + +

    If the computation and overlapping time of one worker is much larger than the others, this may suggest an issue in the workload balance or worker being a straggler. Computation is the sum of kernel time on GPU minus the overlapping time. The overlapping time is the time saved by interleaving communications during computation. The more overlapping time represents better parallelism between computation and communication. Ideally the computation and communication completely overlap with each other. Communication is the total communication time minus the overlapping time. The example image below displays how this scenario appears on Tensorboard.

    + +
    + +

    Figure: A straggler example

    +
    + +

    Scenario 2:

    + +

If there is a small batch size (i.e. less computation on each worker) or the data to be transferred is large, the computation-to-communication ratio may also be small and be seen in the profiler with low GPU utilization and long waiting times. This computation/communication view will allow you to diagnose your code to reduce communication by adopting gradient accumulation, or to decrease the communication proportion by increasing batch size. DDP communication time depends on model size. Batch size has no relationship with model size. So increasing batch size could make computation time longer and make the computation-to-communication ratio bigger.

    + +

    Synchronizing/Communication Overview

    + +

In the Synchronizing/Communication view, you can observe the efficiency of communication. This is done by taking the step time minus computation and communication time. Synchronizing time is part of the total communication time for waiting and synchronizing with other workers. The Synchronizing/Communication view includes initialization, data loader, CPU computation, and so on. Insights such as how much of the total communication time is really used for exchanging data, and how long workers sit idle waiting for data from other workers, can be drawn from this view.

    + +
    + +
    + +

    For example, if there is an inefficient workload balance or straggler issue, you’ll be able to identify it in this Synchronizing/Communication view. This view will show several workers’ waiting time being longer than others.

    + +
    + +
    + +

    This table view above allows you to see the detailed statistics of all communication ops in each node. This allows you to see what operation types are being called, how many times each op is called, what is the size of the data being transferred by each op, etc.

    + +

    Memory View:

    + +

    This memory view tool helps you understand the hardware resource consumption of the operators in your model. Understanding the time and memory consumption on the operator-level allows you to resolve performance bottlenecks and in turn, allow your model to execute faster. Given limited GPU memory size, optimizing the memory usage can:

    + +
1. Allow bigger model which can potentially generalize better on end level tasks.
2. Allow bigger batch size. Bigger batch sizes increase the training speed.
    + +

    The profiler records all the memory allocation during the profiler interval. Selecting the “Device” will allow you to see each operator’s memory usage on the GPU side or host side. You must enable profile_memory=True to generate the below memory data as shown here.

    + +
with torch.profiler.profile(
    profile_memory=True  # this will take 1 – 2 minutes to complete
) as prof:
    ...
    +
    + +

    Important Definitions:

    + +

    • “Size Increase” displays the sum of all allocation bytes and minus all the memory release bytes.

    + +

    • “Allocation Size” shows the sum of all allocation bytes without considering the memory release.

    + +

• “Self” means the allocated memory is not from any of its child operators, but is instead allocated by the operator itself.

    + +
    + +
    + +

    GPU Metric on Timeline:

    + +

This feature will help you debug performance issues when one or more GPUs are underutilized. Ideally, your program should have high GPU utilization (aiming for 100% GPU utilization), minimal CPU to GPU communication, and no overhead.

    + +

Overview:
The overview page highlights the results of three important GPU usage metrics at different levels (i.e. GPU Utilization, Est. SM Efficiency, and Est. Achieved Occupancy). Essentially, each GPU has a number of Streaming Multiprocessors (SMs), each with a number of warps that can execute many threads concurrently; the exact counts depend on the GPU. At a high level, this GPU Metric on Timeline tool lets you see the whole stack, which is useful.

    + +

    If the GPU utilization result is low, this suggests a potential bottleneck is present in your model. Common reasons:

    + +

• Insufficient parallelism in kernels (i.e., low batch size)

    + +

• Small kernels called in a loop. This is to say the launch overheads are not amortized.

    + +

• CPU or I/O bottlenecks lead to the GPU not receiving enough work to keep busy

    + +

The performance recommendation section of the overview page is where you’ll find potential suggestions on how to increase GPU utilization. In this example, GPU utilization is low, so the performance recommendation was to increase the batch size. Increasing the batch size from 4 to 32, as per the performance recommendation, increased the GPU Utilization by 60.68%.

    + +

GPU Utilization: the step interval time in the profiler when a GPU engine was executing a workload. The higher the utilization %, the better. The drawback of using GPU utilization solely to diagnose performance bottlenecks is that it is too high-level and coarse. It won’t be able to tell you how many Streaming Multiprocessors are in use. Note that while this metric is useful for detecting periods of idleness, a high value does not indicate efficient use of the GPU, only that it is doing anything at all. For instance, a kernel with a single thread running continuously will get a GPU Utilization of 100%.

    + +

Estimated Stream Multiprocessor Efficiency (Est. SM Efficiency) is a finer grained metric; it indicates what percentage of SMs are in use at any point in the trace. This metric reports the percentage of time where there is at least one active warp on an SM, counting warps that are stalled (NVIDIA doc). Est. SM Efficiency also has its limitations. For instance, a kernel with only one thread per block can’t fully use each SM. SM Efficiency does not tell us how busy each SM is, only that it is doing something at all, which can include stalling while waiting on the result of a memory load. To keep an SM busy, it is necessary to have a sufficient number of ready warps that can be run whenever a stall occurs.

    + +

Estimated Achieved Occupancy (Est. Achieved Occupancy) is a layer deeper than Est. SM Efficiency and GPU Utilization for diagnosing performance issues. Estimated Achieved Occupancy indicates how many warps can be active at once per SM. Having a sufficient number of active warps is usually key to achieving good throughput. Unlike GPU Utilization and SM Efficiency, it is not a goal to make this value as high as possible. As a rule of thumb, good throughput gains can be had by improving this metric to 15% and above. But at some point you will hit diminishing returns. If the value is already at 30% for example, further gains will be uncertain. This metric reports the average values of all warp schedulers for the kernel execution period (NVIDIA doc). The larger the Est. Achieved Occupancy value is, the better.

    + +
    + +

    Overview details: Resnet50_batchsize4

    +
    + +
    + +

    Overview details: Resnet50_batchsize32

    +
    + +

Kernel View
The kernel view has “Blocks per SM” and “Est. Achieved Occupancy” columns, which are great tools for comparing model runs.

    + +
    + +
    + +

Mean Blocks per SM:
Blocks per SM = Blocks of this kernel / SM number of this GPU. If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized. “Mean Blocks per SM” is the weighted average of all runs of this kernel name, using each run’s duration as the weight.

    + +

Mean Est. Achieved Occupancy:
Est. Achieved Occupancy is defined as above in the overview. “Mean Est. Achieved Occupancy” is the weighted average of all runs of this kernel name, using each run’s duration as the weight.

    + +

Trace View
This trace view displays a timeline that shows the duration of operators in your model and which system executed the operation. This view can help you identify whether the high consumption and long execution is because of input or model training. Currently, this trace view shows GPU Utilization and Est. SM Efficiency on a timeline.

    + +
    + +
    + +

GPU utilization is calculated independently and divided into multiple 10 millisecond buckets. The buckets’ GPU utilization values are drawn alongside the timeline between 0 – 100%. In the above example, the “ProfilerStep5” GPU utilization during thread 28022’s busy time is higher than that of the following “Optimizer.step”. This is where you can zoom in to investigate why that is.

    + +
    + +
    + +

From above, we can see the former’s kernels are longer than the latter’s kernels. The latter’s kernels are too short in execution, which results in lower GPU utilization.

    + +

Est. SM Efficiency: Each kernel has a calculated est. SM efficiency between 0 – 100%. For example, the below kernel has only 64 blocks, while the number of SMs in this GPU is 80. Then its “Est. SM Efficiency” is 64/80, which is 0.8.

    + +
    + +
    + +

    Cloud Storage Support

    + +

After running pip install tensorboard, you can install one of the following extras to have profiling data read from these cloud providers:

    + +
    torch-tb-profiler[blob] 
    +torch-tb-profiler[gs] 
    +torch-tb-profiler[s3] 
    +
    +

Run pip install torch-tb-profiler[blob], pip install torch-tb-profiler[gs], or pip install torch-tb-profiler[s3] to have data read from Azure Blob Storage, Google Cloud Storage, or Amazon S3 respectively. For more information, please refer to this README.

    + +

    Jump to Source Code:

    + +

One of the great benefits of having both TensorBoard and the PyTorch Profiler integrated directly into Visual Studio Code (VS Code) is the ability to jump directly to the source code (file and line) from the profiler stack traces. The VS Code Python Extension now supports TensorBoard Integration.

    + +

Jump to source is ONLY available when Tensorboard is launched within VS Code. Stack tracing will appear in the plugin UI if the profiling was run with with_stack=True. When you click on a stack trace from the PyTorch Profiler, VS Code will automatically open the corresponding file side by side and jump directly to the line of code of interest for you to debug. This allows you to quickly make actionable optimizations and changes to your code based on the profiling results and suggestions.

    + +
    + +

GIF: Jump to Source using the Visual Studio Code plug-in UI

    +
    + +

For how to optimize batch size performance, check out the step-by-step tutorial here. PyTorch Profiler is also integrated with PyTorch Lightning and you can simply launch your lightning training jobs with the --trainer.profiler=pytorch flag to generate the traces.

    + +

    What’s Next for the PyTorch Profiler?

    +

You just saw how PyTorch Profiler can help optimize a model. You can now try the Profiler by running pip install torch-tb-profiler to optimize your PyTorch model.

    + +

Look out for an advanced version of this tutorial in the future. We are also thrilled to continue to bring state-of-the-art tools to PyTorch users to improve ML performance. We’d love to hear from you. Feel free to open an issue here.

    + +

    For new and exciting features coming up with PyTorch Profiler, follow @PyTorch on Twitter and check us out on pytorch.org.

    + +

    Acknowledgements

    + +

    The author would like to thank the contributions of the following individuals to this piece. From the Facebook side: Geeta Chauhan, Gisle Dankel, Woo Kim, Sam Farahzad, and Mark Saroufim. On the Microsoft side: AI Framework engineers (Teng Gao, Mike Guo, and Yang Gu), Guoliang Hua, and Thuy Nguyen.

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-shanghai-notes/index.html b/blog/pytorch-shanghai-notes/index.html new file mode 100644 index 000000000000..090d9ed1b5ec --- /dev/null +++ b/blog/pytorch-shanghai-notes/index.html @@ -0,0 +1,683 @@ + + + + + + + + + + + + + PyTorch Shanghai Meetup Notes | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    September 08, 2024

    +

    + PyTorch Shanghai Meetup Notes +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Summary

    + +

    group photo

    + +

We were honored to successfully host the PyTorch Shanghai Meetup on August 15, 2024. This Meetup received great attention from the industry. We invited senior PyTorch developers from Intel and Huawei as guest speakers, who shared their valuable experience and the latest technical trends. In addition, this event also attracted PyTorch enthusiasts from many technology companies and well-known universities. A total of more than 40 participants gathered together to discuss and exchange the latest applications and technological advances of PyTorch.

    + +

    This Meetup not only strengthened the connection between PyTorch community members, but also provided a platform for local AI technology enthusiasts to learn, communicate and grow. We look forward to the next gathering to continue to promote the development of PyTorch technology in the local area.

    + +

    1. PyTorch Foundation Updates

    + +

    man instructing students

    + +

PyTorch Board member Fred Li shared the latest updates in the PyTorch community. He reviewed the development history of the PyTorch community, explained in detail the growth path of community developers, encouraged everyone to delve deeper into technology, and introduced matters related to the upcoming PyTorch Conference 2024.

    + +

    2. Intel’s Journey with PyTorch Democratizing AI with ubiquitous hardware and open software

    + +

PyTorch CPU module maintainer Jiong Gong shared six years of technical contributions from Intel to PyTorch and its ecosystem, and explored the remarkable advancements that Intel has made in both software and hardware to democratize AI, ensuring accessibility and optimizing performance across a diverse range of Intel hardware platforms.

    + +

    man instructing students

    + +

    3. Exploring Multi-Backend Support in PyTorch Ecosystem: A Case Study of Ascend

    + +

    man instructing students

    + +

    Fengchun Hua, a PyTorch contributor from Huawei, took Huawei Ascend NPU as an example to demonstrate the latest achievements in multi-backend support for PyTorch applications. He introduced the hardware features of Huawei Ascend NPU and the infrastructure of CANN (Compute Architecture for Neural Networks), and explained the key achievements and innovations in native support work. He also shared the current challenges and the next work plan.

    + +

    Yuanhao Ji, another PyTorch contributor from Huawei, then introduced the Autoload Device Extension proposal, explained its implementation details and value in improving the scalability of PyTorch, and introduced the latest work progress of the PyTorch Chinese community.

    + +

    4. Intel XPU Backend for Inductor

    + +

    man instructing students

    + +

Eikan is a PyTorch contributor from Intel. He focuses on the torch.compile stack for both Intel CPU and GPU. In this session, Eikan presented Intel’s efforts on torch.compile for Intel GPUs. He provided updates on the current status of Intel GPUs within PyTorch, covering both functionality and performance aspects. Additionally, Eikan used Intel GPU as a case study to demonstrate how to integrate a new backend into the Inductor using Triton.

    + +

    5. PyTorch PrivateUse1 Evolution Approaches and Insights

    + +

    man instructing students

    + +

Jiawei Li, a PyTorch collaborator from Huawei, introduced PyTorch’s Dispatch mechanism and emphasized the limitations of DispatchKey. He took Huawei Ascend NPU as an example to share the best practices of the PyTorch PrivateUse1 mechanism. He mentioned that while using the PrivateUse1 mechanism, Huawei also submitted many improvements and bug fixes for the mechanism to the PyTorch community. He also mentioned that due to the lack of upstream CI support for out-of-tree devices, changes in upstream code may affect their stability and quality, and this insight was recognized by everyone.

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-xla-spmd/index.html b/blog/pytorch-xla-spmd/index.html new file mode 100644 index 000000000000..11a2b2c1af3e --- /dev/null +++ b/blog/pytorch-xla-spmd/index.html @@ -0,0 +1,814 @@ + + + + + + + + + + + + + PyTorch/XLA SPMD: Scale Up Model Training and Serving with Automatic Parallelization | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Yeounoh Chung, Jon Bolin, Milad Mohammadi, Jiewen Tan, Jack Cao, Joe Spisak, Alex Spiridonov, Shauheen Zahirazami, Steven Krawczyk, Wonjoo Lee Mohit Khatwani, Wanchao Liang, Vaibhav Singh + +

    +

    Today, we are delighted to announce PyTorch/XLA SPMD: the integration of GSPMD into PyTorch with an easy to use API. PyTorch developers seeking superior performance and scale can train and serve the largest neural networks while maximizing utilization of AI accelerators, such as Google Cloud TPUs.

    + +

    Introduction

    + +

    GSPMD is an automatic parallelization system for ML workloads. The XLA compiler transforms the single device program into a partitioned one with proper collectives, based on the user provided sharding hints. This allows developers to write PyTorch programs as if they are on a single large device without any custom sharded computation and/or collective communication ops to scale models.

    + +

    PyTorch/XLA SPMD allows PyTorch users to parallelize their ML workloads with GSPMD with less effort and with better performance. Some of the key highlights are:

    + +
      +
    • Better developer experience. Everything happens with a few sharding annotations from the user, and PyTorch/XLA SPMD achieves comparable performance to the most efficient PyTorch sharding implementation (see the Examples and Results section below). PyTorch/XLA SPMD separates the task of programming an ML model from the challenge of parallelization. Its automated approach to model sharding frees up the user from implementing the sharded version of ops with proper collectives in place.
    • +
• A single API that enables a large variety of parallelism algorithms (including data parallelism, fully sharded data parallelism, spatial partitioning, tensor and pipeline parallelism, as well as combinations of these algorithms) for different ML workloads and model architectures.
    • +
    • Industry-leading performance in large model training. PyTorch/XLA SPMD brings the powerful XLA GSPMD to PyTorch, enabling users to harness the full power of Google Cloud TPUs.
    • +
• Enabling PyTorch and JAX developers to take advantage of the same underlying XLA API to scale models.
    • +
    + +

    Key Concepts

    + +

    The key concepts behind the sharding annotation API are: 1) Mesh, 2) Partition Spec, and 3) mark_sharding API to express sharding intent using Mesh and Partition Spec. A more detailed design overview is available as a user guide here.

    + +

    Mesh

    + +

    For a given cluster of devices, a physical mesh is a representation of the interconnect topology.

    + +

    We derive a logical mesh based on this topology to create sub-groups of devices which can be used for partitioning different axes of tensors in a model. We apply sharding annotations to map the program across the logical mesh; this automatically inserts communication collectives in the program graph to support functional correctness (see the figure below).

    + +

    SPMD on PyTorch/XLA

    + +

We abstract the logical mesh with the Mesh API. The axes of the logical mesh can be named. Here is an example:

    + +
    import numpy as np
    +import torch_xla.runtime as xr
    +import torch_xla.experimental.xla_sharding as xs
    +from torch_xla.experimental.xla_sharding import Mesh
    +
    +# Enable XLA SPMD execution mode.
    +xr.use_spmd()
    +
    +# Assuming you are running on a TPU host that has 8 devices attached
    +num_devices = xr.global_runtime_device_count()
    +# mesh shape will be (4,2) in this example
    +mesh_shape = (num_devices // 2, 2)
    +device_ids = np.array(range(num_devices))
+# axis_names 'x' and 'y' are optional
    +mesh = Mesh(device_ids, mesh_shape, ('x', 'y'))
    +
    +mesh.get_logical_mesh()
    +>> array([[0, 1],
    +          [2, 3],
    +          [4, 5],
    +          [6, 7]])
    +mesh.shape()
    +>> OrderedDict([('x', 4), ('y', 2)])
    +
    + +

    Partition Spec

    + +

    partition_spec has the same rank as the input tensor. Each dimension describes how the corresponding input tensor dimension is sharded across the device mesh (logically defined by mesh_shape). partition_spec is a tuple of device_mesh dimension index, None, or a tuple of mesh dimension indices. The index can be an int or str if the corresponding mesh dimension is named. This specifies how each input rank is sharded (index to mesh_shape) or replicated (None).

    + +
    # Provide optional mesh axis names and use them in the partition spec
    +mesh = Mesh(device_ids, (4, 2), ('data', 'model'))
    +partition_spec = ('model', 'data')
    +xs.mark_sharding(input_tensor, mesh, partition_spec)
    +
    + +

    We support all three types of sharding described in the original GSPMD paper. For instance, one can specify partial replication like this:

    + +
    # Provide optional mesh axis names and use them in the partition spec
    +mesh = Mesh(device_ids, (2, 2, 2), ('x', 'y', 'z'))
    +
    +# evenly shard across x and z and replicate among y
    +partition_spec = ('x', 'z')  # equivalent to ('x', None, 'z')
    +xs.mark_sharding(input_tensor, mesh, partition_spec)
    +
    + +

    Simple Example With Sharding Annotation

    + +

    Users can annotate native PyTorch tensors using the mark_sharding API (src). This takes torch.Tensor as input and returns a XLAShardedTensor as output.

    + +
    def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh, partition_spec: Tuple[Union[int, None]]) -> XLAShardedTensor
    +
    + +

The mark_sharding API takes a user-defined logical mesh and partition_spec and generates a sharding annotation for the XLA compiler. The sharding specification is attached to the XLATensor, as well as the original input tensor. Here is a simple usage example from the [RFC] to illustrate how the sharding annotation API works:

    + +
    import numpy as np
    +import torch
    +import torch_xla.core.xla_model as xm
    +import torch_xla.runtime as xr
    +import torch_xla.experimental.xla_sharding as xs
    +from torch_xla.experimental.xla_sharded_tensor import XLAShardedTensor
    +from torch_xla.experimental.xla_sharding import Mesh
    +
    +# Enable XLA SPMD execution mode.
    +xr.use_spmd()
    +
    +# Device mesh, this and partition spec as well as the input tensor shape define the individual shard shape.
    +num_devices = xr.global_runtime_device_count()
+mesh_shape = (2, num_devices // 2)  # 2x4 on v3-8, 2x2 on v4-8
    +device_ids = np.array(range(num_devices))
    +mesh = Mesh(device_ids, mesh_shape, ('x', 'y'))
    +
    +t = torch.randn(8, 4).to(xm.xla_device())
    +
    +# Mesh partitioning, each device holds 1/8-th of the input
    +partition_spec = (0, 1)
    +m1_sharded = xs.mark_sharding(t, mesh, partition_spec)
    +assert isinstance(m1_sharded, XLAShardedTensor) == True
+# Note that the sharding annotation is also updated in place on t
    +
    + +

    We can annotate different tensors in the PyTorch program to enable different parallelism techniques, as described in the comment below:

    + +
    # Sharding annotate the linear layer weights. SimpleLinear() is a nn.Module.
    +model = SimpleLinear().to(xm.xla_device())
    +xs.mark_sharding(model.fc1.weight, mesh, partition_spec)
    +
    +# Training loop
    +model.train()
    +for step, (data, target) in enumerate(loader):
    +  # Assumes `loader` returns data, target on XLA device
    +  optimizer.zero_grad()
    +  # Sharding annotate input data, we can shard any input
    +  # dimensions. Sharding the batch dimension enables 
    +  # data parallelism, sharding the feature dimension enables
    +  # spatial partitioning.
    +  xs.mark_sharding(data, mesh, partition_spec)
+  output = model(data)
+  loss = loss_fn(output, target)
+  loss.backward()
+  optimizer.step()
    +  xm.mark_step()
    +
    + +

    More complete unit test cases and integration test examples are available in the PyTorch/XLA repo.

    + +

    Results

    + +

    Performance

    + +

    We measured the performance of PyTorch/XLA SPMD using a GPT-2 model (src) and compared it with user-mode FSDP.

    + +

    Here, SPMD applies the same sharding scheme as the FSDP plot (i.e. 1D sharding). Users are expected to achieve better MFU results by exploring more advanced SPMD sharding schemes.

    + +

    SPMD vs. FSDP

    + +

    We use Model FLOPS Utilization (MFU) as a metric for comparison. MFU is “the ratio of the observed throughput relative to the theoretical maximum throughput of a system operating at peak FLOPs” (PaLM paper).

    + +
    flops_per_step = 6 * global_batch_size * seq_len * num_params
    +model_flops_utilization = flops_per_step / step_time(s) / chip_count / flops_per_chip
    +
    + +

This estimation assumes that the model dimensionality is much larger than the input sequence length (d_model >> seq_len). If this assumption is violated, the self-attention FLOPs become significant and this expression will underestimate the true MFU.
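As a quick illustration of the formula above, here is a small worked example; the batch size, sequence length, parameter count, step time, and chip specs below are hypothetical numbers chosen for illustration only.

# Hypothetical numbers, for illustration only
global_batch_size = 64
seq_len = 1024
num_params = 2e9            # 2B-parameter model
step_time = 1.0             # seconds per training step
chip_count = 8
flops_per_chip = 275e12     # peak FLOPs/s of a hypothetical accelerator

flops_per_step = 6 * global_batch_size * seq_len * num_params
mfu = flops_per_step / step_time / chip_count / flops_per_chip
print(f"MFU: {mfu:.2%}")    # ~35.7% in this made-up setting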

    + +

    Scalability

    + +

    One of the core benefits of SPMD is the flexible partitioning which can be used to save accelerator memory (HBM) usage and improve scalability. For scalability analysis, we present two studies: 1) we examine the peak HBM across 4 model sizes using Hugging Face transformers (GPT-2) as the base implementation; 2) we examine the peak HBM usage with spatial partitioning.

    + +

    Peak HBM Utilization

    + +

The above figure illustrates that the peak memory footprint of the unsharded 2B-parameter model stands at 26GB (red dashed line). Sharding the model weights (model parallelism) reduces the peak memory footprint and thus enables larger model training on a given TPU pod slice. In these experiments, we achieved up to 39.75% MFU on a 4B-parameter model on Google Cloud TPU v4-16.

    + +

    We also ran an input batch scalability test using spatial partitioning and a simple ResNet50 example (src) on Cloud TPU v4-8. Input batch is commonly sharded across the batch dimension for data parallelism (DDP, FSDP), but PyTorch/XLA SPMD enables input sharding across input feature dimensions for spatial sharding. As shown in the below figure, one can push the per-device batch size to 512 with spatial partitioning which is not possible with other data parallelism techniques.

    + +

    Batch size scaling with spatial partitioning

    + +

    The Road Forward for PyTorch/XLA SPMD

    + +

    We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. SPMD is still experimental, and we continuously add new features to it. In future releases, we plan to address async dataloading, partially replicated sharding, and other improvements. We’d love to hear from you, answer your questions about PyTorch/XLA SPMD, and learn how you use SPMD.

    + +

    Cheers!

    + +

    The PyTorch/XLA Team at Google

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-2-lib-updates/index.html b/blog/pytorch2-2-lib-updates/index.html new file mode 100644 index 000000000000..a1c1efb7c405 --- /dev/null +++ b/blog/pytorch2-2-lib-updates/index.html @@ -0,0 +1,754 @@ + + + + + + + + + + + + + New Library Updates in PyTorch 2.2 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    January 30, 2024

    +

    + New Library Updates in PyTorch 2.2 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Summary

    + +

    We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 2.2 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.

    + + + + + + + + + + + + + + + + + + + + +
    Latest Stable Library Versions (Full List)* +
    TorchArrow 0.1.0 + TorchRec 0.6.0 + TorchVision 0.17 +
    TorchAudio 2.2.0 + TorchServe 0.9.0 + TorchX 0.7.0 +
    TorchData 0.7.1 + TorchText 0.17.0 + PyTorch on XLA Devices 2.1 +
    + +

    *To see prior versions or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’.

    + +

    TorchRL

    + +

    Feature: TorchRL’s Offline RL Data Hub

    + +

    TorchRL now provides one of the largest dataset hubs for offline RL and imitation learning, and it all comes under a single data format (TED, for TorchRL Episode Data format). This makes it possible to easily swap from different sources in a single training loop. It is also now possible to easily combine datasets of different sources through the ReplayBufferEnsemble class. The data processing is fully customizable. Sources include simulated tasks (Minari, D4RL, VD4RL), robotic datasets (Roboset, OpenX Embodied dataset) and gaming (GenDGRL/ProcGen, Atari/DQN). Check these out in the documentation.

    + +

    Aside from these changes, our replay buffers can now be dumped on disk using the .dumps() method which will serialize the buffers on disk using the TensorDict API which is faster, safer and more efficient than using torch.save.

    + +

    Finally, replay buffers can now be read and written from separate processes on the same machine without any extra code needed from the user!

    + +

    TorchRL2Gym environment API

    + +

To facilitate TorchRL’s integration in existing codebases and let users enjoy all the features of TorchRL’s environment API (execution on device, batched operations, transforms…), we provide a TorchRL-to-gym API that allows users to register any environment they want in gym or gymnasium. This can be used in turn to make TorchRL a universal lib-to-gym converter that works across stateful (e.g., dm_control) and stateless (Brax, Jumanji) environments. The feature is thoroughly detailed in the doc. The info_dict reading API has also been improved.

    + +

    Environment speedups

    + +

We added the option of executing environments on a different device than the one used to deliver data in ParallelEnv. We also sped up the GymLikeEnv class to a level that now makes it competitive with gym itself.

    + +

    Scaling objectives

    + +

    The most popular objectives for RLHF and training at scale (PPO and A2C) are now compatible with FSDP and DDP models!

    + +

    TensorDict

    + +

    Feature: MemoryMappedTensor to replace MemmapTensor

    + +

We provide a much more efficient mmap backend for TensorDict: MemoryMappedTensor, which directly subclasses torch.Tensor. It comes with a bunch of construction utilities, such as from_tensor, empty and many more. MemoryMappedTensor is now much safer and faster than its counterpart. The library remains fully compatible with the previous class to facilitate transition.
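A minimal sketch of the idea, assuming the tensordict package is installed and MemoryMappedTensor is importable from its top level as in recent releases:

import torch
from tensordict import MemoryMappedTensor

# Create a memory-mapped tensor from an existing tensor (backed by a file on disk)
source = torch.randn(1024, 1024)
mmap_tensor = MemoryMappedTensor.from_tensor(source)

# It behaves like a regular torch.Tensor since it subclasses it
assert isinstance(mmap_tensor, torch.Tensor)
out = mmap_tensor + 1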

    + +

    We also introduce a new set of multithreaded serialization methods that make tensordict serialization highly competitive with torch.save, with serialization and deserialization speeds for LLMs more than 3x faster than with torch.save.

    + +

    Feature: Non-tensor data within TensorDict

    + +

It is now possible to carry non-tensor data through the NonTensorData tensorclass. This makes it possible to build tensordicts with metadata. The memmap-API is fully compatible with these values, allowing users to seamlessly serialize and deserialize such objects. To store non-tensor data in a tensordict, simply assign it using the __setitem__ method.
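A minimal sketch of how this looks in practice; the keys and values below are illustrative:

import torch
from tensordict import TensorDict

td = TensorDict({"obs": torch.zeros(3, 4)}, batch_size=[])
# Non-tensor metadata is stored via a regular __setitem__ assignment
td["env_name"] = "CartPole-v1"
print(td["env_name"])  # retrieves the stored metadata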

    + +

    Efficiency improvements

    + +

The runtime of several methods has been improved, such as unbind, split, map and even TensorDict instantiation. Check our benchmarks!

    + +

    TorchRec/fbgemm_gpu

    + +

    VBE

    + +

    TorchRec now natively supports VBE (variable batched embeddings) within the EmbeddingBagCollection module. This allows variable batch size per feature, unlocking sparse input data deduplication, which can greatly speed up embedding lookup and all-to-all time. To enable, simply initialize KeyedJaggedTensor with stride_per_key_per_rank and inverse_indices fields, which specify batch size per feature and inverse indices to reindex the embedding output respectively.

    + +

    In addition to the TorchRec library changes, fbgemm_gpu has added the support for variable batch size per feature in TBE. VBE is enabled on split TBE training for both weighted and unweighted cases. To use VBE, please make sure to use the latest fbgemm_gpu version.

    + +

    Embedding offloading

    + +

    This technique refers to using CUDA UVM to cache ‘hot’ embeddings (i.e. store embedding tables on host memory with cache on HBM memory), and prefetching the cache. Embedding offloading allows running a larger model with fewer GPUs, while maintaining competitive performance. Use the prefetching pipeline (PrefetchTrainPipelineSparseDist) and pass in per-table cache load factor and the prefetch_pipeline flag through constraints in the planner to use this feature.

    + +

    Fbgemm_gpu has introduced UVM cache pipeline prefetching in v0.5.0 for TBE performance speedup. This allows cache-insert to be executed in parallel with TBE forward/backward. To enable this feature, please be sure to use the latest fbgemm_gpu version.

    + +

    Trec.shard/shard_modules

    + +

These APIs replace embedding submodules with their sharded variants. The shard API applies to an individual embedding module, while the shard_modules API replaces all embedding modules and won’t touch other non-embedding submodules.

    + +

    Embedding sharding follows similar behavior to the prior TorchRec DistributedModuleParallel behavior, except the ShardedModules have been made composable, meaning the modules are backed by TableBatchedEmbeddingSlices which are views into the underlying TBE (including .grad). This means that fused parameters are now returned with named_parameters(), including in DistributedModuleParallel.

    + +

    TorchVision

    + +

    The V2 transforms are now stable!

    + +

    The torchvision.transforms.v2 namespace was still in BETA stage until now. It is now stable! Whether you’re new to Torchvision transforms, or you’re already experienced with them, we encourage you to start with Getting started with transforms v2 in order to learn more about what can be done with the new v2 transforms.

    + +

    Browse our main docs for general information and performance tips. The available transforms and functionals are listed in the API reference. Additional information and tutorials can also be found in our example gallery, e.g. Transforms v2: End-to-end object detection/segmentation example or How to write your own v2 transforms.
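For readers who want a feel for the API, here is a minimal sketch of a typical v2 pipeline; the specific transform choices below are illustrative:

import torch
from torchvision.transforms import v2

transforms = v2.Compose([
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# A uint8 CHW image tensor; PIL images and datapoints are also accepted
img = torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)
out = transforms(img)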

    + +

    Towards torch.compile() support

    + +

    We are progressively adding support for torch.compile() to torchvision interfaces, reducing graph breaks and allowing dynamic shape.

    + +

    The torchvision ops (nms, [ps_]roi_align, [ps_]roi_pool and deform_conv_2d) are now compatible with torch.compile and dynamic shapes.

    + +

    On the transforms side, the majority of low-level kernels (like resize_image() or crop_image()) should compile properly without graph breaks and with dynamic shapes. We are still addressing the remaining edge-cases, moving up towards full functional support and classes, and you should expect more progress on that front with the next release.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-2/index.html b/blog/pytorch2-2/index.html new file mode 100644 index 000000000000..3b2f83a86622 --- /dev/null +++ b/blog/pytorch2-2/index.html @@ -0,0 +1,768 @@ + + + + + + + + + + + + + PyTorch 2.2: FlashAttention-v2 integration, AOTInductor | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch® 2.2 (release note)! PyTorch 2.2 offers ~2x performance improvements to scaled_dot_product_attention via FlashAttention-v2 integration, as well as AOTInductor, a new ahead-of-time compilation and deployment tool built for non-python server-side deployments.

    + +

    This release also includes improved torch.compile support for Optimizers, a number of new inductor optimizations, and a new logging mechanism called TORCH_LOGS.

    + +

    Please note that we are deprecating macOS x86 support, and PyTorch 2.2.x will be the last version that supports macOS x64.

    + +

    Along with 2.2, we are also releasing a series of updates to the PyTorch domain libraries. More details can be found in the library updates blog.

    + +

    This release is composed of 3,628 commits and 521 contributors since PyTorch 2.1. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.2. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

    + +

    Summary:

    + +
      +
    • scaled_dot_product_attention (SDPA) now supports FlashAttention-2, yielding around 2x speedups compared to previous versions.
    • +
    • PyTorch 2.2 introduces a new ahead-of-time extension of TorchInductor called AOTInductor, designed to compile and deploy PyTorch programs for non-python server-side.
    • +
    • torch.distributed supports a new abstraction for initializing and representing ProcessGroups called device_mesh.
    • +
    • PyTorch 2.2 ships a standardized, configurable logging mechanism called TORCH_LOGS.
    • +
    • A number of torch.compile improvements are included in PyTorch 2.2, including improved support for compiling Optimizers and improved TorchInductor fusion and layout optimizations.
    • +
    • Please note that we are deprecating macOS x86 support, and PyTorch 2.2.x will be the last version that supports macOS x64.
    • +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +Stable + Beta + Performance Improvements +
    + FlashAttention-2 Integration + Inductor optimizations +
    + AOTInductor + aarch64 optimizations +
    + TORCH_LOGS + +
    + device_mesh + +
    + Optimizer compilation + +
    + +

    *To see a full list of public feature submissions click here.

    + +

    Beta Features

    + +

    [Beta] FlashAttention-2 support in torch.nn.functional.scaled_dot_product_attention

    + +

    torch.nn.functional.scaled_dot_product_attention (SDPA) now supports FlashAttention-2, yielding around 2x speedups (compared to the previous version) and reaching ~50-73% of theoretical maximum FLOPs/s on A100 GPUs.
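The speedup applies transparently to existing SDPA call sites. A minimal sketch of the call follows; the shapes are illustrative, it assumes a CUDA device, and the FlashAttention-2 path is selected automatically when the inputs and hardware allow it:

import torch
import torch.nn.functional as F

# (batch, num_heads, seq_len, head_dim); half precision on GPU is typically
# required for the FlashAttention backend to be eligible
q = torch.randn(8, 16, 1024, 64, device="cuda", dtype=torch.float16)
k = torch.randn(8, 16, 1024, 64, device="cuda", dtype=torch.float16)
v = torch.randn(8, 16, 1024, 64, device="cuda", dtype=torch.float16)

out = F.scaled_dot_product_attention(q, k, v, is_causal=True)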

    + +

    More information is available on FlashAttention-2 in this paper.

    + +

    For a tutorial on how to use SDPA please see this tutorial.

    + +

    [Beta] AOTInductor: ahead-of-time compilation and deployment for torch.export-ed programs

    + +

    AOTInductor is an extension of TorchInductor, designed to process exported PyTorch models, optimize them, and produce shared libraries as well as other relevant artifacts. These compiled artifacts can be deployed in non-Python environments, which are frequently employed for inference on the server-side. Note that AOTInductor supports the same backends as Inductor, including CUDA, ROCm, and CPU.

    + +

    For more information please see the AOTInductor tutorial.

    + +

    [Beta] Fine-grained configurable logging via TORCH_LOGS

    + +

    PyTorch now ships a standardized, configurable logging mechanism that can be used to analyze the status of various subsystems such as compilation and distributed operations.

    + +

Logs can be enabled via the TORCH_LOGS environment variable. For example, to set the log level of TorchDynamo to logging.ERROR and the log level of TorchInductor to logging.DEBUG pass TORCH_LOGS="-dynamo,+inductor" to PyTorch.
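The same configuration can also be applied from Python. A minimal sketch, assuming the torch._logging.set_logs helper (which mirrors the TORCH_LOGS environment variable):

import logging
import torch

# Roughly equivalent to TORCH_LOGS="-dynamo,+inductor"
torch._logging.set_logs(dynamo=logging.ERROR, inductor=logging.DEBUG)

compiled_fn = torch.compile(lambda x: x.sin() + x.cos())
compiled_fn(torch.randn(8))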

    + +

    For more information, please see the logging documentation and tutorial.

    + +

    [Beta] torch.distributed.device_mesh

    + +

PyTorch 2.2 introduces a new abstraction for representing the ProcessGroups involved in distributed parallelisms called torch.distributed.device_mesh. This abstraction allows users to represent inter-node and intra-node process groups via an N-dimensional array where, for example, one dimension can represent data parallelism in FSDP while another could represent tensor parallelism within FSDP.
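A minimal sketch of constructing a 2D mesh; the mesh shape and dimension names below are illustrative and assume 8 GPUs with torch.distributed already initialized:

from torch.distributed.device_mesh import init_device_mesh

# 2 hosts x 4 GPUs per host, for example: one mesh dimension for data
# parallelism (e.g. FSDP) and one for tensor parallelism.
mesh_2d = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "tp"))
print(mesh_2d)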

    + +

    For more information, see the device_mesh tutorial.

    + +

    [Beta] Improvements to torch.compile-ing Optimizers

    + +

A number of improvements have been made to torch.compile-ing Optimizers, including less overhead and support for CUDA graphs.
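A minimal sketch of what this looks like in user code; the model and optimizer below are placeholders:

import torch

model = torch.nn.Linear(128, 128)
opt = torch.optim.Adam(model.parameters())

# Compile only the optimizer step
@torch.compile
def opt_step():
    opt.step()

for _ in range(3):
    loss = model(torch.randn(32, 128)).sum()
    loss.backward()
    opt_step()
    opt.zero_grad()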

    + +

    More technical details of the improvements are available on dev-discuss, and a recipe for torch.compile-ing optimizers is available here.

    + +

    Performance Improvements

    + +

    Inductor Performance Optimizations

    + +

    A number of performance optimizations have been added to TorchInductor including horizontal fusion support for torch.concat, improved convolution layout optimizations, and improved scaled_dot_product_attention pattern matching.

    + +

    For a complete list of inductor optimizations, please see the Release Notes.

    + +

    aarch64 Performance Optimizations

    + +

    PyTorch 2.2 includes a number of performance enhancements for aarch64 including support for mkldnn weight pre-packing, improved ideep primitive caching, and improved inference speed via fixed format kernel improvements to OneDNN.

    + +

    For a complete list of aarch64 optimizations, please see the Release Notes.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-3/index.html b/blog/pytorch2-3/index.html new file mode 100644 index 000000000000..755ff68e46ed --- /dev/null +++ b/blog/pytorch2-3/index.html @@ -0,0 +1,734 @@ + + + + + + + + + + + + + PyTorch 2.3 Release Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    April 24, 2024

    +

    + PyTorch 2.3 Release Blog +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

We are excited to announce the release of PyTorch® 2.3 (release note)! PyTorch 2.3 offers support for user-defined Triton kernels in torch.compile, allowing users to migrate their own Triton kernels from eager without experiencing performance regressions or graph breaks. Tensor Parallelism improves the experience for training Large Language Models using native PyTorch functions, which has been validated on training runs for 100B parameter models. As well, semi-structured sparsity is implemented as a Tensor subclass, with observed speedups of up to 1.6x over dense matrix multiplication.

    + +

    This release is composed of 3393 commits and 426 contributors since PyTorch 2.2. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.3. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

    + + + + + + + + + + + + + + + + + + + + + + +
    Beta + Prototype + Performance Improvements +
    User-defined Triton kernels in torch.compile + torch.export adds new API to specify dynamic_shapes + Weight-Only-Quantization introduced into Inductor CPU backend +
    Tensor parallelism within PyTorch Distributed + Asynchronous checkpoint generation + +
    Support for semi-structured sparsity + + +
    + +

    *To see a full list of public feature submissions click here.

    + +

    Beta Features

    + +

    [Beta] Support for User-defined Triton kernels in torch.compile

    + +

    Allows for PyTorch code that contains triton kernels to be executed natively using torch.compile. This enables users to migrate code containing triton kernels from eager PyTorch to torch.compile without running into performance regressions or graph breaks. Native support also creates an opportunity for Torch Inductor to precompile the user-defined Triton kernel as well as better organize code around the Triton kernel allowing for further optimizations.
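A minimal sketch of the pattern; the kernel below is a toy elementwise add and assumes Triton is installed and a CUDA device is available:

import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)

@torch.compile(fullgraph=True)
def add(x, y):
    # The user-defined Triton kernel is captured natively by torch.compile
    out = torch.empty_like(x)
    n_elements = out.numel()
    grid = (triton.cdiv(n_elements, 1024),)
    add_kernel[grid](x, y, out, n_elements, BLOCK_SIZE=1024)
    return out

x = torch.randn(4096, device="cuda")
print(add(x, x))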

    + +

    You can find more information about how to utilize user defined Triton kernels in torch.compile within this tutorial.

    + +

    [Beta] Tensor Parallelism introduces more efficient ways to train LLMs

    + +

    The Tensor Parallel API facilitates various tensor manipulations across GPUs/hosts and integrates with FSDP for 2D Parallelism (Tensor parallelism across devices + Data Parallelism across hosts). It also offers a low-level API for constructing higher-level Tensor parallel APIs. This API has been validated to support the training of transformer models with over 100 billion parameters.
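A minimal sketch of the API; the module names and mesh layout below are illustrative and assume torch.distributed is initialized across 8 GPUs:

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

tp_mesh = init_device_mesh("cuda", (8,))

class FeedForward(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.w1 = torch.nn.Linear(1024, 4096)
        self.w2 = torch.nn.Linear(4096, 1024)

    def forward(self, x):
        return self.w2(torch.relu(self.w1(x)))

model = FeedForward().cuda()
# Shard w1 column-wise and w2 row-wise across the mesh
model = parallelize_module(
    model, tp_mesh, {"w1": ColwiseParallel(), "w2": RowwiseParallel()}
)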

    + +

    You can find more information on how to utilize this within your workflows within this tutorial.

    + +

    [Beta] Semi-structured sparsity provides users with a way to take advantage of accelerated sparse inference and memory savings

    + +

torch.sparse.SparseSemiStructuredTensor implements semi-structured sparsity as a Tensor subclass, which has observed speedups of up to 1.6x over dense matrix multiplication.

    + +

    In particular it adds:

    + +
      +
    • Additional support for quantization composability (mixed dtype, dequant fusion)
    • +
    • Updated cuSPARSELt and CUTLASS kernels
    • +
    • torch.compile support
    • +
    + +

    You can find more information on how to take advantage of semi-structured sparsity here.
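A minimal sketch of converting a 2:4-sparse matrix and using it in an accelerated matmul; it assumes a CUDA device with the required compute capability and a half-precision matrix that already follows the 2:4 sparsity pattern:

import torch
from torch.sparse import to_sparse_semi_structured

# A 64x64 fp16 matrix with a 2:4 sparsity pattern
# (2 zeros in every group of 4 elements along a row)
A = torch.Tensor([0, 0, 1, 1]).tile((64, 16)).half().cuda()
A_sparse = to_sparse_semi_structured(A)

B = torch.randn(64, 64, dtype=torch.float16, device="cuda")
# Accelerated sparse matmul, in place of the dense torch.mm(A, B)
out = torch.mm(A_sparse, B)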

    + +

    Prototype Features

    + +

    [PROTOTYPE] torch.export adds new API to specify dynamic_shapes

    + +

    You can now use torch.export.Dim to better represent dynamic shapes by enabling developers to specify ranges (min and max values) that can be reused across different input dimensions that are constrained to be equal.
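A minimal sketch of the new API; the module and the range below are illustrative:

import torch
from torch.export import Dim, export

class M(torch.nn.Module):
    def forward(self, x, y):
        return x + y

# Both inputs share the same dynamic batch dimension, constrained to [2, 1024]
batch = Dim("batch", min=2, max=1024)
example_args = (torch.randn(4, 8), torch.randn(4, 8))
ep = export(
    M(),
    example_args,
    dynamic_shapes={"x": {0: batch}, "y": {0: batch}},
)
print(ep)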

    + +

    To learn more about torch.export.Dim as well as how it can be used to express more interesting relationships (such as linear arithmetic expressions) check out the tutorial here.

    + +

    [PROTOTYPE] Asynchronous checkpoint generation

    + +

    Asynchronous checkpoint generation allows users to continue their training loops while checkpoints are being generated, essentially offloading much of the checkpointing cost.

    + +

    You can find out how to utilize this within your own workflows with this example.

    + +

    Performance Improvements

    + +

    [PROTOTYPE] Weight-Only-Quantization introduced into Inductor CPU backend

    + +

PyTorch 2.3 enhances LLM inference performance on the torch inductor CPU backend. The project gpt-fast offers a simple and efficient PyTorch native acceleration for transformer text generation with torch.compile. Prior to 2.3, only CUDA devices were supported; this feature enables the CPU counterpart by providing highly optimized kernels for int4 and int8 weight-only quantized Linear layers.

    + +

    For more information / how to utilize this feature please refer to the gpt-fast README.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-4/index.html b/blog/pytorch2-4/index.html new file mode 100644 index 000000000000..85e9cee1df67 --- /dev/null +++ b/blog/pytorch2-4/index.html @@ -0,0 +1,791 @@ + + + + + + + + + + + + + PyTorch 2.4 Release Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    July 24, 2024

    +

    + PyTorch 2.4 Release Blog +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch® 2.4 (release note)! PyTorch 2.4 adds support for the latest version of Python (3.12) for torch.compile. AOTInductor freezing gives developers running AOTInductor more performance-based optimizations by allowing the serialization of MKLDNN weights. As well, a new default TCPStore server backend utilizing libuv has been introduced which should significantly reduce initialization times for users running large-scale jobs. Finally, a new Python Custom Operator API makes it easier than before to integrate custom kernels into PyTorch, especially for torch.compile.

    + +

    This release is composed of 3661 commits and 475 contributors since PyTorch 2.3. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.4. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Beta + Prototype + Performance Improvements +
    Python 3.12 support for torch.compile + FSDP2: DTensor-based per-parameter-sharding FSDP + torch.compile optimizations for AWS Graviton (aarch64-linux) processors +
    AOTInductor Freezing for CPU + torch.distributed.pipelining, simplified pipeline parallelism + BF16 symbolic shape optimization in TorchInductor +
    New Higher-level Python Custom Operator API + Intel GPU is available through source build + Performance optimizations for GenAI projects utilizing CPU devices +
    Switching TCPStore’s default server backend to libuv + + +
    + +

    *To see a full list of public feature submissions click here.

    + +

    Beta Features

    + +

    [Beta] Python 3.12 support for torch.compile

    + +

    torch.compile() previously only supported Python 3.8-3.11. Users can now optimize models with torch.compile() with Python 3.12.

    + +

    [Beta] AOTInductor Freezing for CPU

    + +

This feature enables users to turn on the freezing flag when using AOTInductor on CPU. With this feature, AOTInductor can cover the same set of op scenarios and reach on-par performance with the Inductor CPP backend. Before this support, when models contained MKLDNN operators (when computation-intensive operators are involved, such as Convolution, Linear, ConvTranspose, and so on) and freezing was on, those models would fail to run since AOTInductor didn’t support serializing the MKLDNN weights, which have an opaque format.

    + +

    The workflow is as explained in the AOTInductor tutorial, in addition to that users could now add the freezing flag to get better performance:

    +
    export TORCHINDUCTOR_FREEZING=1
    +
    + +

    [Beta] New Higher-level Python Custom Operator API

    + +

    We’ve added a new higher-level Python Custom Operator API that makes it easier than before to extend PyTorch with custom operators that behave like PyTorch’s built-in operators. Operators registered using the new high-level torch.library APIs are guaranteed to be compatible with torch.compile and other PyTorch subsystems; authoring a custom operator in Python using the previous low-level torch.library APIs required deep understanding of PyTorch internals and has many footguns.
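A minimal sketch of the new API; the operator below is a toy example under a hypothetical "mylib" namespace:

import torch

@torch.library.custom_op("mylib::numpy_sin", mutates_args=())
def numpy_sin(x: torch.Tensor) -> torch.Tensor:
    # Arbitrary non-PyTorch code (here: NumPy) wrapped as a PyTorch operator
    import numpy as np
    return torch.from_numpy(np.sin(x.numpy()))

# A "fake" implementation tells torch.compile the output metadata
@numpy_sin.register_fake
def _(x):
    return torch.empty_like(x)

x = torch.randn(3)
print(torch.compile(lambda t: numpy_sin(t) + 1)(x))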

    + +

    Please see the tutorial for more information.

    + +

    [Beta] Switching TCPStore’s default server backend to libuv

    + +

Introduced a new default server backend for TCPStore built with libuv, which should offer significantly lower initialization times and better scalability. This should ideally benefit users with a much shorter startup time for large-scale jobs.

    + +

    For more information on the motivation + fallback instructions please refer to this tutorial.

    + +

    Prototype Features

    + +

    [PROTOTYPE] FSDP2: DTensor-based per-parameter-sharding FSDP

    + +

    FSDP2 is a new fully sharded data parallelism implementation that uses dim-0 per-parameter sharding to resolve fundamental composability challenges with FSDP1’s flat-parameter sharding.

    + +

    For more information regarding the motivation / design for FSDP2 please refer to the RFC on Github.

    + +

    [PROTOTYPE] torch.distributed.pipelining, simplified pipeline parallelism

    + +

    Pipeline Parallelism is one of the primitive parallelism techniques for deep learning. It allows the execution of a model to be partitioned such that multiple micro-batches can execute different parts of the model code concurrently.

    + +

    torch.distributed.pipelining provides a toolkit that allows for easy implementation of pipeline parallelism on general models while also offering composability with other common PyTorch distributed features like DDP, FSDP, or tensor parallel.

    + +

    For more information on this please refer to our documentation and tutorial.

    + +

    [PROTOTYPE] Intel GPU is available through source build

    + +

    Intel GPU in PyTorch on Linux systems offers fundamental functionalities on Intel® Data Center GPU Max Series: eager mode and torch.compile.

    + +

    For eager mode, the commonly used Aten operators are implemented by using SYCL programming language. The most performance-critical graphs and operators are highly optimized by using oneAPI Deep Neural Network (oneDNN). For torch.compile mode, Intel GPU backend is integrated to Inductor on top of Triton.

    + +

    For more information for Intel GPU source build please refer to our blog post and documentation.

    + +

    Performance Improvements

    + +

    torch.compile optimizations for AWS Graviton (aarch64-linux) processors

    + +

    AWS optimized the PyTorch torch.compile feature for AWS Graviton3 processors. This optimization results in up to 2x better performance for Hugging Face model inference (based on geomean of performance improvement for 33 models) and up to 1.35x better performance for TorchBench model inference (geomean of performance improvement for 45 models) compared to the default eager mode inference across several natural language processing (NLP), computer vision (CV), and recommendation models on AWS Graviton3-based Amazon EC2 instances.

    + +

    For more information regarding specific technical details please refer to the blog post.

    + +

    BF16 symbolic shape optimization in TorchInductor

    + +

    Pytorch users can now experience improved quality and performance gains with the beta BF16 symbolic shape support. While static shape may afford additional optimization opportunities compared to symbolic shape, it is insufficient for scenarios such as inference services with varying batch size and sequence length, or detection models with data-dependent output shape.

    + +

    Verification using TorchBench, Huggingface, and timms_model shows a similar pass rate and comparable speedup with the BF16 static shape scenario. Combining the benefits of symbolic shape with BF16 AMX instructions hardware acceleration provided by Intel CPUs and general Inductor CPU backend optimizations applicable to both static and symbolic shape in PyTorch 2.4, the performance for BF16 symbolic shape has significantly improved compared to PyTorch 2.3.

    + +

    The API to use this feature:

    + +
model = ...
+model.eval()
+with torch.autocast(device_type="cpu", dtype=torch.bfloat16), torch.no_grad():
    +   compiled_model = torch.compile(model, dynamic=True)
    +
    + +

    Performance optimizations for GenAI projects utilizing CPU devices

    + +

Highlighting the enhanced performance of PyTorch on CPU, as demonstrated through the optimizations made for the “Segment Anything Fast” and “Diffusion Fast” projects. Previously, only CUDA devices were supported in these projects. We have incorporated CPU support into the projects, enabling users to leverage the increased power of the CPU for running the projects’ experiments. Meanwhile, we have also employed a block-wise attention mask for SDPA, which can significantly reduce peak memory usage and improve performance. We have also optimized a series of layout propagation rules in Inductor CPU to improve performance.

    + +

    To facilitate this, we have updated the README file. The API to use this feature is given below, simply providing --device cpu in the command lines:

    + +
      +
    • +

      For Segment Anything Fast:

      + +
      export SEGMENT_ANYTHING_FAST_USE_FLASH_4=0
      +python run_experiments.py 16 vit_b <pytorch_github> <segment-anything_github>
      +<path_to_experiments_data> --run-experiments --num-workers 32 --device cpu
      +
      +
    • +
    • +

      For Diffusion Fast:

      + +
      python run_benchmark.py --compile_unet --compile_vae --enable_fused_projections --device=cpu
      +
      +
    • +
    + +

    Users can follow the guidelines to run the experiments and observe the performance improvements firsthand, as well as explore the performance improvement trends across FP32 and BF16 data types.

    + +

    Additionally, users can achieve good performance using torch.compile and SDPA. By observing the performance trends across these different factors, users can gain a deeper understanding of how various optimizations enhance PyTorch’s performance on CPU.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-5/index.html b/blog/pytorch2-5/index.html new file mode 100644 index 000000000000..e384dbba86eb --- /dev/null +++ b/blog/pytorch2-5/index.html @@ -0,0 +1,780 @@ + + + + + + + + + + + + + PyTorch 2.5 Release Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 17, 2024

    +

    + PyTorch 2.5 Release Blog +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch® 2.5 (release note)! This release features a new cuDNN backend for SDPA, enabling speedups by default for users of SDPA on H100s or newer GPUs. As well, regional compilation of torch.compile offers a way to reduce the cold start up time for torch.compile by allowing users to compile a repeated nn.Module (e.g. a transformer layer in LLM) without recompilations. Finally, TorchInductor CPP backend offers solid performance speedup with numerous enhancements like FP16 support, CPP wrapper, AOT-Inductor mode, and max-autotune mode.

    + +

    This release is composed of 4095 commits from 504 contributors since PyTorch 2.4. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.5. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

    + +

    As well, please check out our new ecosystem projects releases with TorchRec and TorchFix.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Beta + Prototype +
    cuDNN backend for SDPA + FlexAttention +
    torch.compile regional compilation without recompilations + Compiled Autograd +
    TorchDynamo added support for exception handling & MutableMapping types + Flight Recorder +
    TorchInductor CPU backend optimization + Max-autotune Support on CPU with GEMM Template +
    + TorchInductor on Windows +
    + FP16 support on CPU path for both eager mode and TorchInductor CPP backend +
    + Autoload Device Extension +
    + Enhanced Intel GPU support +
    + +

    *To see a full list of public feature submissions click here.

    + +

    BETA FEATURES

    + +

    [Beta] cuDNN backend for SDPA

    + +

    The cuDNN “Fused Flash Attention” backend was landed for torch.nn.functional.scaled_dot_product_attention. On NVIDIA H100 GPUs this can provide up to 75% speed-up over FlashAttentionV2. This speedup is enabled by default for all users of SDPA on H100 or newer GPUs.

    + +

    [Beta] torch.compile regional compilation without recompilations

    + +

Regional compilation without recompilations, via torch._dynamo.config.inline_inbuilt_nn_modules, which defaults to True in 2.5+. This option allows users to compile a repeated nn.Module (e.g. a transformer layer in LLM) without recompilations. Compared to compiling the full model, this option can result in smaller compilation latencies with 1%-5% performance degradation compared to full model compilation.
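A minimal sketch of the idea, with an illustrative toy model: instead of compiling the whole model, compile the repeated block so the blocks can reuse the same compiled code rather than triggering per-block recompiles (with inline_inbuilt_nn_modules enabled, as described above).

import torch
import torch.nn as nn

class Block(nn.Module):
    def __init__(self, dim=256):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))

    def forward(self, x):
        return x + self.ff(x)

class Model(nn.Module):
    def __init__(self, num_layers=12):
        super().__init__()
        # Compile the repeated region instead of the full model
        self.blocks = nn.ModuleList(
            [torch.compile(Block()) for _ in range(num_layers)]
        )

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return x

out = Model()(torch.randn(8, 256))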

    + +

    See the tutorial for more information.

    + +

    [Beta] TorchInductor CPU backend optimization

    + +

    This feature advances Inductor’s CPU backend optimization, including CPP backend code generation and FX fusions with customized CPU kernels. The Inductor CPU backend supports vectorization of common data types and all Inductor IR operations, along with the static and symbolic shapes. It is compatible with both Linux and Windows OS and supports the default Python wrapper, the CPP wrapper, and AOT-Inductor mode.

    + +

    Additionally, it extends the max-autotune mode of the GEMM template (prototyped in 2.5), offering further performance gains. The backend supports various FX fusions, lowering to customized kernels such as oneDNN for Linear/Conv operations and SDPA. The Inductor CPU backend consistently achieves performance speedups across three benchmark suites—TorchBench, Hugging Face, and timms—outperforming eager mode in 97.5% of the 193 models tested.

    + +

    PROTOTYPE FEATURES

    + +

    [Prototype] FlexAttention

    + +

    We’ve introduced a flexible API that enables implementing various attention mechanisms such as Sliding Window, Causal Mask, and PrefixLM with just a few lines of idiomatic PyTorch code. This API leverages torch.compile to generate a fused FlashAttention kernel, which eliminates extra memory allocation and achieves performance comparable to handwritten implementations. Additionally, we automatically generate the backwards pass using PyTorch’s autograd machinery. Furthermore, our API can take advantage of sparsity in the attention mask, resulting in significant improvements over standard attention implementations.
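A minimal sketch of the API; the score_mod below implements a simple relative-position bias, and it assumes the prototype torch.nn.attention.flex_attention module and a CUDA device:

import torch
from torch.nn.attention.flex_attention import flex_attention

def relative_position_bias(score, batch, head, q_idx, kv_idx):
    # Add a bias that depends on the distance between query and key positions
    return score + (q_idx - kv_idx)

q = torch.randn(2, 8, 1024, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 8, 1024, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 8, 1024, 64, device="cuda", dtype=torch.float16)

# torch.compile generates a fused kernel for this attention variant
flex_attention_compiled = torch.compile(flex_attention)
out = flex_attention_compiled(q, k, v, score_mod=relative_position_bias)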

    + +

    For more information and examples, please refer to the official blog post and Attention Gym.

    + +

    [Prototype] Compiled Autograd

    + +

    Compiled Autograd is an extension to the PT2 stack allowing the capture of the entire backward pass. Unlike the backward graph traced by AOT dispatcher, Compiled Autograd tracing is deferred until backward execution time, which makes it impervious to forward pass graph breaks, and allows it to record backward hooks into the graph.

    + +

    Please refer to the tutorial for more information.

    + +

    [Prototype] Flight Recorder

    + +

    Flight recorder is a new debugging tool that helps debug stuck jobs. The tool works by continuously capturing information about collectives as they run. Upon detecting a stuck job, the information can be used to quickly identify misbehaving ranks/machines along with code stack traces.

    + +

    For more information please refer to the following tutorial.

    + +

    [Prototype] Max-autotune Support on CPU with GEMM Template

    + +

    Max-autotune mode for the Inductor CPU backend in torch.compile profiles multiple implementations of operations at compile time and selects the best-performing one. This is particularly beneficial for GEMM-related operations, using a C++ template-based GEMM implementation as an alternative to the ATen-based approach with oneDNN and MKL libraries. We support FP32, BF16, FP16, and INT8 with epilogue fusions for x86 CPUs. We’ve seen up to 7% geomean speedup on the dynamo benchmark suites and up to 20% boost in next-token latency for LLM inference.

    + +

    For more information please refer to the tutorial.

    + +

    [Prototype] TorchInductor CPU on Windows

    + +

Inductor CPU backend in torch.compile now works on Windows. We currently support MSVC (cl), clang (clang-cl) and the Intel compiler (icx-cl) for Inductor on Windows.

    + +

    See the tutorial for more details.

    + +

    [Prototype] FP16 support on CPU path for both eager mode and TorchInductor CPP backend

    + +

    Float16 is a commonly used reduced floating point type for performance improvement in neural network inference/training. Since this release, float16 for both eager and TorchInductor is supported on the CPU path.

    + +

    [Prototype] Autoload Device Extension

    + +

    PyTorch now supports autoloading for out-of-tree device extensions, streamlining integration by eliminating the need for manual imports. This feature, enabled through the torch.backends entrypoint, simplifies usage by ensuring seamless extension loading, while allowing users to disable it via an environment variable if needed.

    + +

    See the tutorial for more information.

    + +

    [Prototype] Enhanced Intel GPU support

    + +

Enhanced Intel GPU support is now available for both Intel® Data Center GPU Max Series and Intel® Client GPUs (Intel® Core™ Ultra processors with built-in Intel® Arc™ graphics and Intel® Arc™ Graphics for dGPU parts), making it easier to accelerate your machine learning workflows on Intel GPUs in the PyTorch 2.5 release. We also enabled the initial support of PyTorch on Windows for Intel® Client GPUs in this release.

    + +
      +
    • Expanded PyTorch hardware backend support matrix to include both Intel Data Center and Client GPUs.  
    • +
    • The implementation of SYCL* kernels to enhance coverage and execution of Aten operators on Intel GPUs to boost performance in PyTorch eager mode.
    • +
    • Enhanced Intel GPU backend of torch.compile to improve inference and training performance for a wide range of deep learning workloads.
    • +
    + +

    These features are available through PyTorch preview and nightly binary PIP wheels. For more information regarding Intel GPU support, please refer to documentation.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-6/index.html b/blog/pytorch2-6/index.html new file mode 100644 index 000000000000..0b5898e63773 --- /dev/null +++ b/blog/pytorch2-6/index.html @@ -0,0 +1,780 @@ + + + + + + + + + + + + + PyTorch 2.6 Release Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    January 29, 2025

    +

    + PyTorch 2.6 Release Blog +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We are excited to announce the release of PyTorch® 2.6 (release notes)! This release features multiple improvements for PT2: torch.compile can now be used with Python 3.13; new performance-related knob torch.compiler.set_stance; several AOTInductor enhancements. Besides the PT2 improvements, another highlight is FP16 support on X86 CPUs.

    + +

    NOTE: Starting with this release we are not going to publish on Conda, please see [Announcement] Deprecating PyTorch’s official Anaconda channel for the details.

    + +

    For this release the experimental Linux binaries shipped with CUDA 12.6.3 (as well as Linux Aarch64, Linux ROCm 6.2.4, and Linux XPU binaries) are built with CXX11_ABI=1 and are using the Manylinux 2.28 build platform. If you build PyTorch extensions with custom C++ or CUDA extensions, please update these builds to use CXX_ABI=1 as well and report any issues you are seeing. For the next PyTorch 2.7 release we plan to switch all Linux builds to Manylinux 2.28 and CXX11_ABI=1, please see [RFC] PyTorch next wheel build platform: manylinux-2.28 for the details and discussion.

    + +

    Also in this release as an important security improvement measure we have changed the default value for weights_only parameter of torch.load. This is a backward compatibility-breaking change, please see this forum post for more details.

    + +

    This release is composed of 3892 commits from 520 contributors since PyTorch 2.5. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve PyTorch. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Beta + Prototype +
    torch.compiler.set_stance + Improved PyTorch user experience on Intel GPUs +
    torch.library.triton_op + FlexAttention support on X86 CPU for LLMs +
    torch.compile support for Python 3.13 + Dim.AUTO +
    New packaging APIs for AOTInductor + CUTLASS and CK GEMM/CONV Backends for AOTInductor +
    AOTInductor: minifier + +
    AOTInductor: ABI-compatible mode code generation + +
    FP16 support for X86 CPUs + +
    + +

    *To see a full list of public feature submissions click here.

    + +

    BETA FEATURES

    + +

    [Beta] torch.compiler.set_stance

    + +

    This feature enables the user to specify different behaviors (“stances”) that torch.compile can take between different invocations of compiled functions. One of the stances, for example, is

    + +

“eager_on_recompile”, which instructs PyTorch to run code eagerly when a recompile is necessary, reusing cached compiled code when possible.

    + +

    For more information please refer to the set_stance documentation and the Dynamic Compilation Control with torch.compiler.set_stance tutorial.
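A minimal sketch of how a stance can be applied; the stance name follows the description above, and exact usage is covered in the linked tutorial:

import torch

@torch.compile
def fn(x):
    return x.sin() + x.cos()

fn(torch.randn(8))  # first call compiles as usual

# From now on, fall back to eager instead of recompiling when a new
# input would otherwise trigger a recompile.
torch.compiler.set_stance("eager_on_recompile")
fn(torch.randn(16))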

    + +

    [Beta] torch.library.triton_op

    + +

    torch.library.triton_op offers a standard way of creating custom operators that are backed by user-defined triton kernels.

    + +

    When users turn user-defined triton kernels into custom operators, torch.library.triton_op allows torch.compile to peek into the implementation, enabling torch.compile to optimize the triton kernel inside it.

    + +

    For more information please refer to the triton_op documentation and the Using User-Defined Triton Kernels with torch.compile tutorial.
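    A rough sketch of the intended usage, assuming the triton_op / wrap_triton helpers as described in the linked documentation (requires a CUDA GPU with Triton installed; verify exact signatures against the docs):

    import torch
    import triton
    import triton.language as tl
    from torch.library import triton_op, wrap_triton

    @triton.jit
    def add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):
        pid = tl.program_id(0)
        offs = pid * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        x = tl.load(x_ptr + offs, mask=mask)
        y = tl.load(y_ptr + offs, mask=mask)
        tl.store(out_ptr + offs, x + y, mask=mask)

    # Expose the kernel as a custom operator that torch.compile can see into.
    @triton_op("mylib::my_add", mutates_args={})
    def my_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        n = out.numel()
        wrap_triton(add_kernel)[(triton.cdiv(n, 1024),)](x, y, out, n, BLOCK=1024)
        return out

    @torch.compile
    def f(x, y):
        return my_add(x, y)

    x = torch.randn(4096, device="cuda")
    y = torch.randn(4096, device="cuda")
    out = f(x, y)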

    + +

    [Beta] torch.compile support for Python 3.13

    + +

    torch.compile previously only supported Python up to version 3.12. Users can now optimize models with torch.compile in Python 3.13.

    + +

    [Beta] New packaging APIs for AOTInductor

    + +

    A new package format, “PT2 archive”, has been introduced. This is essentially a zipfile containing all the files AOTInductor needs, and it allows users to ship everything required to other environments. There is also functionality to package multiple models into one artifact, and to store additional metadata inside of the package.

    + +

    For more details please see the updated torch.export AOTInductor Tutorial for Python runtime.
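    A minimal sketch of the flow, assuming the aoti_compile_and_package / aoti_load_package entry points covered in the tutorial (the model and file names are illustrative):

    import torch

    class M(torch.nn.Module):
        def forward(self, x):
            return torch.nn.functional.relu(x @ x)

    example_inputs = (torch.randn(8, 8),)
    ep = torch.export.export(M(), example_inputs)

    # Compile with AOTInductor and bundle everything into a single PT2 archive.
    pkg = torch._inductor.aoti_compile_and_package(ep, package_path="m.pt2")

    # Later, possibly in a different environment, load and run the package.
    compiled = torch._inductor.aoti_load_package(pkg)
    print(compiled(*example_inputs))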

    + +

    [Beta] AOTInductor: minifier

    + +

    If a user encounters an error while using AOTInductor APIs, AOTInductor Minifier allows creation of a minimal nn.Module that reproduces the error.

    + +

    For more information please see the AOTInductor Minifier documentation.

    + +

    [Beta] AOTInductor: ABI-compatible mode code generation

    + +

    AOTInductor-generated model code depends on PyTorch C++ libraries. Because PyTorch evolves quickly, it is important that previously AOTInductor-compiled models continue to run on newer PyTorch versions, i.e. that AOTInductor is backward compatible.

    + +

    In order to guarantee application binary interface (ABI) backward compatibility, we have carefully defined a set of stable C interfaces in libtorch and ensure that AOTInductor generates code that refers only to this specific set of APIs and nothing else in libtorch. We will keep this set of C APIs stable across PyTorch versions and thus provide backward compatibility guarantees for AOTInductor-compiled models.

    + +

    [Beta] FP16 support for X86 CPUs (both eager and Inductor modes)

    + +

    The Float16 datatype is commonly used for reduced memory usage and faster computation in AI inference and training. CPUs like the recently launched Intel® Xeon® 6 with P-Cores support Float16 natively through the AMX accelerator. Float16 support on X86 CPUs was introduced in PyTorch 2.5 as a prototype feature. It has now been further improved for both eager mode and torch.compile + Inductor mode, making it a Beta-level feature, with both functionality and performance verified on a broad set of workloads.
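    As a rough sketch of exercising FP16 inference on an x86 CPU in both modes (assuming a machine and build where CPU float16 autocast is available; model and shapes are illustrative):

    import torch

    model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).eval()
    x = torch.randn(8, 64)

    # Eager mode: run inference in float16 via CPU autocast.
    with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.float16):
        y_eager = model(x)

    # torch.compile + Inductor mode with the same autocast context.
    compiled = torch.compile(model)
    with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.float16):
        y_compiled = compiled(x)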

    + +

    PROTOTYPE FEATURES

    + +

    [Prototype] Improved PyTorch user experience on Intel GPUs

    + +

    The PyTorch user experience on Intel GPUs is further improved with simplified installation steps, a Windows release binary distribution, and expanded coverage of supported GPU models, including the latest Intel® Arc™ B-Series discrete graphics. Application developers and researchers who want to fine-tune, run inference, and develop with PyTorch models on Intel® Core™ Ultra AI PCs and Intel® Arc™ discrete graphics can now install PyTorch directly from binary releases for Windows, Linux and Windows Subsystem for Linux 2.

    + +
      +
    • Simplified Intel GPU software stack setup enables one-click installation of the torch-xpu PIP wheels to run deep learning workloads out of the box, eliminating the complexity of installing and activating Intel GPU development software bundles.
    • Windows binary releases for torch core, torchvision and torchaudio have been made available for Intel GPUs, and the supported GPU models have been expanded from Intel® Core™ Ultra Processors with Intel® Arc™ Graphics, Intel® Core™ Ultra Series 2 with Intel® Arc™ Graphics and Intel® Arc™ A-Series Graphics to the latest Intel® Arc™ B-Series graphics.
    • Further enhanced coverage of ATen operators on Intel GPUs with SYCL* kernels for smooth eager mode execution, as well as bug fixes and performance optimizations for torch.compile on Intel GPUs.
    + +

    For more information regarding Intel GPU support, please refer to Getting Started Guide.

    + +

    [Prototype] FlexAttention support on X86 CPU for LLMs

    + +

    FlexAttention was initially introduced in PyTorch 2.5 to provide optimized implementations of attention variants with a flexible API. In PyTorch 2.6, X86 CPU support for FlexAttention was added through the TorchInductor CPP backend. This new feature leverages and extends the current CPP template abilities to support a broad range of attention variants (e.g., PagedAttention, which is critical for LLM inference) based on the existing FlexAttention API, and brings optimized performance on x86 CPUs. With this feature, it’s easy to use the FlexAttention API to compose attention solutions on CPU platforms and achieve good performance.
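    A minimal sketch of composing an attention variant through the FlexAttention API and compiling it for CPU (the causal score_mod and shapes below are illustrative, not part of the release notes):

    import torch
    from torch.nn.attention.flex_attention import flex_attention

    def causal(score, b, h, q_idx, kv_idx):
        # Mask out future positions by sending their scores to -inf.
        return torch.where(q_idx >= kv_idx, score, float("-inf"))

    B, H, S, D = 1, 8, 128, 64
    q, k, v = (torch.randn(B, H, S, D) for _ in range(3))  # CPU tensors

    # torch.compile lowers flex_attention through the Inductor CPP backend on x86.
    compiled_flex = torch.compile(flex_attention)
    out = compiled_flex(q, k, v, score_mod=causal)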

    + +

    [Prototype] Dim.AUTO

    + +

    Dim.AUTO allows usage of automatic dynamic shapes with torch.export. Users can export with Dim.AUTO and “discover” the dynamic behavior of their models, with min/max ranges, relations between dimensions, and static/dynamic behavior being automatically inferred.

    + +

    This is a more user-friendly experience compared to the existing named-Dims approach for specifying dynamic shapes, which requires the user to fully understand the dynamic behavior of their models at export time. Dim.AUTO allows users to write generic code that isn’t model-dependent, increasing ease-of-use for exporting with dynamic shapes.
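    A small sketch of what this can look like (the model and shapes are illustrative):

    import torch
    from torch.export import Dim, export

    class M(torch.nn.Module):
        def forward(self, x):
            return (x * 2).relu()

    # Mark a dimension as AUTO and let export discover whether it is truly
    # dynamic, along with its range and any relations to other dimensions.
    ep = export(M(), (torch.randn(4, 8),), dynamic_shapes={"x": {0: Dim.AUTO}})
    print(ep)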

    + +

    Please see torch.export tutorial for more information.

    + +

    [Prototype] CUTLASS and CK GEMM/CONV Backends for AOTInductor

    + +

    The CUTLASS and CK backends add kernel choices for GEMM autotuning in Inductor. This is now also available in AOTInductor, which can run in C++ runtime environments. Major improvements to the two backends include faster compile times (by eliminating redundant kernel binary compilations) and dynamic shapes support.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorchs-tracing-based-selective-build/index.html b/blog/pytorchs-tracing-based-selective-build/index.html new file mode 100644 index 000000000000..5c2425fd7d33 --- /dev/null +++ b/blog/pytorchs-tracing-based-selective-build/index.html @@ -0,0 +1,890 @@ + + + + + + + + + + + + + PyTorch’s Tracing Based Selective Build | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 17, 2022

    +

    + PyTorch’s Tracing Based Selective Build +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Dhruv Matani, Suraj Subramanian + +

    +

    Introduction

    + +

    TL;DR: It can be challenging to run PyTorch on mobile devices, SBCs (Single Board Computers), and IoT devices. When compiled, the PyTorch library is huge and includes dependencies that might not be needed for the on-device use case.

    + +

    To run a specific set of models on-device, we actually require only a small subset of the features in the PyTorch library. We found that using a PyTorch runtime generated using selective build can achieve up to 90% reduction in binary size (for the CPU and QuantizedCPU backends on an x86-64 build on Linux). In this blog, we share our experience of generating model-specific minimal runtimes using Selective Build and show you how to do the same.

    + +

    Why is this important for app developers?

    + +

    Using a PyTorch runtime generated by selective build can reduce the size of AI-powered apps by 30+ MB - a significant reduction for a typical mobile app! Making mobile applications more lightweight has many benefits - they are runnable on a wider variety of devices, consume less cellular data, and can be downloaded and updated faster on users’ devices.

    + +

    What does the Developer Experience look like?

    + +

    This method can work seamlessly with any existing PyTorch Mobile deployment workflows. All you need to do is replace the general PyTorch runtime library with a runtime customized for the specific models you wish to use in your application. The general steps in this process are:

    + +
      +
    1. Build the PyTorch Runtime in instrumentation mode (this is called an instrumentation build of PyTorch). This will record the used operators, kernels and features.
    2. Run your models through this instrumentation build by using the provided model_tracer binary. This will generate a single YAML file that stores all the features used by your model. These features will be preserved in the minimal runtime.
    3. Build PyTorch using this YAML file as input. This is the selective build technique, and it greatly reduces the size of the final PyTorch binary.
    4. Use this selectively-built PyTorch library to reduce the size of your mobile application!
    + +

    Building the PyTorch Runtime in a special “instrumentation” mode (by passing the TRACING_BASED=1 build option) generates an instrumentation build runtime of PyTorch, along with a model_tracer binary. Running a model with this build allows us to trace the parts of PyTorch used by the model.

    + +

    + +

    + +

    + Figure 1: Instrumentation build of PyTorch +

    + +
    # Clone the PyTorch repo
    +git clone https://github.com/pytorch/pytorch.git
    +cd pytorch
    +
    +# Build the model_tracer
    +USE_NUMPY=0 USE_DISTRIBUTED=0 USE_CUDA=0 TRACING_BASED=1 \
    +  python setup.py develop
    +
    + +

    Now this instrumentation build is used to run a model inference with representative inputs. The model_tracer binary observes the parts of the instrumentation build that were activated during the inference run, and dumps them to a YAML file.

    + +

    + +

    + +

    + Figure 2: YAML file generated by running model(s) on an instrumentation build +

    + +
    # Generate YAML file
    +./build/bin/model_tracer \
    +  --model_input_path /tmp/path_to_model.ptl \
    +  --build_yaml_path /tmp/selected_ops.yaml
    +
    + +

    Now we build the PyTorch Runtime again, but this time using the YAML file generated by the tracer. The runtime now only includes those parts that are needed for this model. This is called “Selectively built PyTorch runtime” in the diagram below.

    + +
    # Clean out cached configuration
    +make clean
    +
    +# Build PyTorch using Selected Operators (from the YAML file)
    +# using the host toolchain, and use this generated library
    +BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN=1 \
    +USE_LIGHTWEIGHT_DISPATCH=0 \
    +BUILD_LITE_INTERPRETER=1 \
    +SELECTED_OP_LIST=/tmp/selected_ops.yaml \
    +TRACING_BASED=1 \
    +  ./scripts/build_mobile.sh
    +
    + +

    + +

    + +

    + Figure 3: Selective Build of PyTorch and model execution on a selectively built PyTorch runtime +

    + +

    Show me the code!

    + +

    We’ve put together a notebook to illustrate what the process above looks like in code using a simple PyTorch model.

    + +

    For a more hands-on walkthrough of deploying this on Android/iOS, this tutorial should be helpful.

    + +

    Technical FAQs

    + +

    Why is Tracing needed for a Selective Build of PyTorch?

    + +

    In PyTorch, CPU kernels can call other operators via the PyTorch Dispatcher. Simply including the set of root operators called directly by the model is not sufficient as there might be many more being called under-the-hood transitively. Running the model on representative inputs and observing the actual list of operators called (aka “tracing”) is the most accurate way of determining what parts of PyTorch are used.

    + +

    Additionally, factors such as which dtypes a kernel should handle are also runtime features that depend on the actual inputs provided to the model. Hence, the tracing mechanism is extremely suitable for this purpose.
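    To make this concrete, here is a small sketch (the model is illustrative): torch.jit.export_opnames lists the root operators of a scripted model, but that list alone does not reveal transitively-called kernels or the dtypes they must handle, which is exactly what tracing captures.

    import torch

    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
    scripted = torch.jit.script(model)

    # Root operators referenced directly by the model's TorchScript code.
    print(torch.jit.export_opnames(scripted))

    # Tracing-based selective build goes further: running the model on
    # representative inputs records the kernels reached through the dispatcher
    # and the dtypes each kernel actually needs to handle.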

    + +

    Which features can be selected (in or out) by using Tracing Based Selective Build?

    + +

    The following features can be selected for the PyTorch runtime during the tracing based selective build process:

    + +
      +
    1. CPU/QuantizedCPU kernels for PyTorch’s ATen Operators: If a PyTorch Operator is not needed by a model targeted at a selectively built runtime, then the registration of that CPU kernel is omitted in the runtime. This is controlled via Torchgen code-gen.
    2. Primary Operators: This is controlled by a macro named TORCH_SELECTIVE_SCHEMA (via templated selective build) that either selects a primary operator or de-selects it based on information in a generated header file.
    3. Code that handles specific dtypes in CPU kernels: This is performed by generating exception throws in specific case statements in the switch case generated by the macro AT_PRIVATE_CHECK_SELECTIVE_BUILD.
    4. Registration of Custom C++ Classes that extend PyTorch: This is controlled by the macro TORCH_SELECTIVE_CLASS, which can be used when registering Custom C++ Classes. The torch::selective_class_<> helper is to be used in conjunction with the macro TORCH_SELECTIVE_CLASS.
    + +

    What is the structure of the YAML file used during the build?

    + +

    The YAML file generated after tracing looks like the example below. It encodes all the elements of the “selectable” build feature as specified above.

    + +
    include_all_non_op_selectives: false
    +build_features: []
    +operators:
    +    aten::add.Tensor:
    +        is_used_for_training: false
    +        is_root_operator: true
    +        include_all_overloads: false
    +    aten::len.t:
    +        is_used_for_training: false
    +        is_root_operator: true
    +        include_all_overloads: false
    +kernel_metadata:
    +    _local_scalar_dense_cpu:
    +    - Float
    +    add_stub:
    +    - Float
    +    copy_:
    +    - Bool
    +    - Byte
    +    mul_cpu:
    +    - Float
    +custom_classes: []
    +
    + +

    How exactly is code eliminated from the generated binary?

    + +

    Depending on the specific scenario, there are 2 main techniques used to hint to the compiler and linker about unused and unreachable code. This code is then cleaned up by the compiler or linker as unreachable code.

    + +

    [1] Unreferenced functions removed by the Linker

    + +

    When a function that isn’t transitively referenced from any visible function is present in the compiled object files that are being linked together, the linker will remove it (if the right build flags are provided). This is leveraged in 2 scenarios by the selective build system.

    + +
    Kernel Registration in the Dispatcher
    + +

    If an operator’s kernel isn’t needed, then it isn’t registered with the dispatcher. An unregistered kernel means that the function is unreachable, and it will be removed by the linker.

    + +
    Templated Selective Build
    + +

    The general idea here is that a class template specialization is used to select a class that either captures a reference to a function or not (depending on whether it’s used) and the linker can come along and clean out the unreferenced function.

    + +

    For example, in the code below, there’s no reference to the function “fn2”, so it will be cleaned up by the linker since it’s not referenced anywhere.

    + +
    #include <vector>
    +#include <cstdio>
    +
    +template <typename T, bool>
    +struct FunctionSelector {
    +    T fn_;
    +    FunctionSelector(T fn): fn_(fn) {}
    +    T get() { return this->fn_; }
    +};
    +
    +// The "false" specialization of this class does NOT retain the argument passed
    +// to the class constructor, which means that the function pointer passed in
    +// is considered to be unreferenced in the program (unless it is referenced
    +// elsewhere).
    +template <typename T>
    +struct FunctionSelector<T, false> {
    +    FunctionSelector(T) {}
    +};
    +
    +template <typename T>
    +FunctionSelector<T, true> make_function_selector_true(T fn) {
    +    return FunctionSelector<T, true>(fn);
    +}
    +
    +template <typename T>
    +FunctionSelector<T, false> make_function_selector_false(T fn) {
    +    return FunctionSelector<T, false>(fn);
    +}
    +
    +typedef void(*fn_ptr_type)();
    +
    +std::vector<fn_ptr_type> fns;
    +
    +template <typename T>
    +void add_fn(FunctionSelector<T, true> fs) {
    +    fns.push_back(fs.get());
    +}
    +
    +template <typename T>
    +void add_fn(FunctionSelector<T, false>) {
    +    // Do nothing.
    +}
    +
    +// fn1 will be kept by the linker since it is added to the vector "fns" at
    +// runtime.
    +void fn1() {
    +    printf("fn1\n");
    +}
    +
    +// fn2 will be removed by the linker since it isn't referenced at all.
    +void fn2() {
    +    printf("fn2\n");
    +}
    +
    +int main() {
    +    add_fn(make_function_selector_true(fn1));
    +    add_fn(make_function_selector_false(fn2));
    +}
    +
    + +

    [2] Dead Code Eliminated by the Compiler

    + +

    C++ Compilers can detect dead (unreachable) code by analyzing the code’s control flow statically. For example, if there’s a code-path that comes after an unconditional exception throw, then all the code after it will be marked as dead code and not converted to object code by the compiler. Typically, compilers require the use of the -fdce flag to eliminate dead code.

    + +

    In the example below, you can see that the C++ code on the left (in the red boxes) doesn’t have any corresponding generated object code on the right.

    + +

    + +

    + +

    + Figure 4: Dead Code Elimination by C++ Compilers +

    + +

    This property is leveraged in the bodies of PyTorch kernel implementations that have a lot of repeated code to handle multiple dtypes of a Tensor. A dtype is the underlying data-type that the Tensor stores elements of. This can be one of float, double, int64, bool, int8, etc…

    + +

    Almost every PyTorch CPU kernel uses a macro of the form AT_DISPATCH_ALL_TYPES* that is used to substitute some code specialized for every dtype that the kernel needs to handle. For example:

    + +
    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
    +    kBool, kHalf, kBFloat16, dtype, "copy_kernel", [&] {
    +  cpu_kernel_vec(
    +      iter,
    +      [=](scalar_t a) -> scalar_t { return a; },
    +      [=](Vectorized<scalar_t> a) -> Vectorized<scalar_t> { return a; });
    +});
    +
    + +

    The macro AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3 internally has a switch-case statement that looks like the code in Figure-4 above. The tracing process records the dtypes triggered for the kernel tag “copy_kernel” and the build process processes these tags and inserts throw statements in every case statement that is handling the dtype that isn’t required for this kernel tag.

    + +

    This is how dtype selectivity is implemented in PyTorch’s Tracing Based Selective Build.

    + +

    Conclusion

    + +

    Tracing Based Selective Build is a practical and scalable approach that selects only the parts of PyTorch an application actually uses, retaining code whose usage static analysis cannot detect. Such code is usually extremely data/input dependent in nature.

    + +

    This article provides detailed insights into how Tracing Based Selective Build works under the hood, and the technical details related to its implementation. These techniques can also be applied to other applications and situations that can benefit from reduced binary size.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/quantization-aware-training/index.html b/blog/quantization-aware-training/index.html new file mode 100644 index 000000000000..e7b79da37ea2 --- /dev/null +++ b/blog/quantization-aware-training/index.html @@ -0,0 +1,901 @@ + + + + + + + + + + + + + Quantization-Aware Training for Large Language Models with PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Andrew Or, Jerry Zhang, Evan Smothers, Kartikay Khandelwal, Supriya Rao + +

    +

    In this blog, we present an end-to-end Quantization-Aware Training (QAT) flow for large language models in PyTorch. We demonstrate how QAT in PyTorch can recover up to 96% of the accuracy degradation on hellaswag and 68% of the perplexity degradation on wikitext for Llama3 compared to post-training quantization (PTQ). We present the QAT APIs in torchao and showcase how users can leverage them for fine-tuning in torchtune.

    + +


    + +

    Figure 1: Llama3-8B fine-tuned on the C4 dataset (en subset) with and without QAT using int8 per token dynamic activations + int4 grouped per channel weights, evaluated on hellaswag and wikitext on an A100 GPU. Note the log scale for wikitext (lower is better).

    + +

    To demonstrate the effectiveness of QAT in an end-to-end flow, we further lowered the quantized model to XNNPACK, a highly optimized neural network library for backends including iOS and Android, through executorch. After lowering to XNNPACK, the QAT model saw 16.8% lower perplexity than the PTQ model, while maintaining the same model size and on-device inference and generation speeds.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Lowered model metric + PTQ + QAT +
    Wikitext word perplexity (↓) + 23.316 + 19.403 +
    Wikitext byte perplexity (↓) + 1.850 + 1.785 +
    Wikitext bits per byte (↓) + 0.887 + 0.836 +
    Model size + 3.881 GB + 3.881 GB +
    On-device inference speed + 5.065 tok/s + 5.265 tok/s +
    On-device generation speed + 8.369 tok/s + 8.701 tok/s +
    + +

    Table 1: QAT achieved 16.8% lower perplexity and unchanged model sizes and on-device inference and generation speeds on the Llama3-8B model lowered to XNNPACK. Linear layers are quantized using int8 per token dynamic activations + int4 grouped per channel weights, and embeddings are additionally quantized to int4 using a group size of 32 (QAT is only applied to linear layers). Wikitext evaluation is performed using 5 samples and a max sequence length of 127 on server CPU, since evaluation is not available on device (lower is better for all wikitext results). On-device inference and generation is benchmarked on the Samsung Galaxy S22 smartphone.

    + +

    QAT APIs

    + +

    We are excited for users to try our QAT API in torchao, which can be leveraged for both training and fine-tuning. This API involves two steps, prepare and convert: prepare applies a transformation on the linear layers in the model to simulate the numerics of quantization during training, and convert actually quantizes these layers into lower bit-widths after training. The converted model can then be used in the exact same way as the PTQ model:

    + +
    import torch
    +from torchtune.models.llama3 import llama3
    +from torchao.quantization.prototype.qat import Int8DynActInt4WeightQATQuantizer
    +
    +# Smaller version of llama3 to fit in a single GPU
    +model = llama3(
    +    vocab_size=4096,
    +    num_layers=16,
    +    num_heads=16,
    +    num_kv_heads=4,
    +    embed_dim=2048,
    +    max_seq_len=2048,
    +).cuda()
    +
    +# Quantizer for int8 dynamic per token activations +
    +# int4 grouped per channel weights, only for linear layers
    +qat_quantizer = Int8DynActInt4WeightQATQuantizer()
    +
    +# Insert "fake quantize" operations into linear layers.
    +# These operations simulate quantization numerics during
    +# training without performing any dtype casting
    +model = qat_quantizer.prepare(model)
    +
    +# Standard training loop
    +optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-5)
    +loss_fn = torch.nn.CrossEntropyLoss()
    +for i in range(10):
    +    example = torch.randint(0, 4096, (2, 16)).cuda()
    +    target = torch.randn((2, 16, 4096)).cuda()
    +    output = model(example)
    +    loss = loss_fn(output, target)
    +    loss.backward()
    +    optimizer.step()
    +    optimizer.zero_grad()
    +
    +# Convert fake quantize to actual quantize operations
    +# The quantized model has the exact same structure as the
    +# quantized model produced in the corresponding PTQ flow
    +# through `Int8DynActInt4WeightQuantizer`
    +model = qat_quantizer.convert(model)
    +
    +# inference or generate
    +
    + +

    Fine-tuning with torchtune

    + +

    We also integrated this QAT flow into torchtune and provided recipes to run this in a distributed setting, similar to the existing full fine-tune distributed recipe. Users can additionally apply QAT during LLM fine-tuning by running the following command. See this README for more details.

    + +
    tune run --nproc_per_node 8 qat_distributed --config llama3/8B_qat_full
    +
    + +

    What is Quantization-Aware Training?

    + +

    Quantization-Aware Training (QAT) is a common quantization technique for mitigating model accuracy/perplexity degradation that arises from quantization. This is achieved by simulating quantization numerics during training while keeping the weights and/or activations in the original data type, typically float, effectively “fake quantizing” the values instead of actually casting them to lower bit-widths:

    + +
    # PTQ: x_q is quantized and cast to int8
    +# scale and zero point (zp) refer to parameters used to quantize x_float
    +# qmin and qmax refer to the range of quantized values
    +x_q = (x_float / scale + zp).round().clamp(qmin, qmax).cast(int8)
    +
    +# QAT: x_fq is still in float
    +# Fake quantize simulates the numerics of quantize + dequantize
    +x_fq = (x_float / scale + zp).round().clamp(qmin, qmax)
    +x_fq = (x_fq - zp) * scale
    +
    + +

    Since quantization involves non-differentiable operations like rounding, the QAT backward pass typically uses straight-through estimators (STE), a mechanism to estimate the gradients flowing through non-smooth functions, to ensure the gradients passed to the original weights are still meaningful. In this manner, the gradients are computed with the knowledge that the weights will ultimately be quantized after training, effectively allowing the model to adjust for quantization noise during the training process. Note that an alternative to QAT is quantized training, which actually casts the values to lower bit dtypes during training, but prior efforts have only seen success up to 8-bits, whereas QAT is effective even at lower bit-widths.
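    As a minimal illustration of the idea (a sketch, not the torchao implementation), fake quantization with a straight-through estimator can be written so that autograd treats the round/clamp as the identity:

    import torch

    def fake_quantize(x, scale, zp, qmin=-128, qmax=127):
        # Simulate quantize + dequantize while staying in float.
        x_q = torch.clamp(torch.round(x / scale + zp), qmin, qmax)
        return (x_q - zp) * scale

    def fake_quantize_ste(x, scale, zp):
        # Straight-through estimator: the non-differentiable round/clamp is
        # detached, so gradients flow to x as if this were the identity.
        return x + (fake_quantize(x, scale, zp) - x).detach()

    w = torch.randn(4, 4, requires_grad=True)
    loss = fake_quantize_ste(w, scale=0.1, zp=0).sum()
    loss.backward()
    print(w.grad)  # all ones: the quantization step is invisible to autograd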

    + +

    QAT in PyTorch

    + +

    We added an initial QAT flow in torchao under prototype here. Currently we support int8 dynamic per-token activations + int4 grouped per-channel weights (abbreviated 8da4w) for linear layers. These settings are motivated by a combination of kernel availability on edge backends and prior research on LLM quantization, which found that per-token activation and per-group weight quantization achieves the best model quality for LLMs compared to other quantization schemes.

    + +


    + +

    Figure 2: torchao QAT flow. This flow involves two steps: (1) prepare, which inserts the fake quantization ops into the model’s linear layers, and (2) convert, which converts these fake quantization ops with actual quantize and dequantize ops after training.

    + +

    This flow produces the exact same quantized model as the PTQ flow using the same quantization settings (through Int8DynActInt4WeightQuantizer), but with quantized weights that achieve superior accuracies and perplexities. Thus, we can use the model converted from the QAT flow as a drop-in replacement for the PTQ model and reuse all the backend delegation logic and underlying kernels.

    + +

    Experimental Results

    + +

    All experiments in this blog post are performed using the torchtune QAT integration described above. We use 6-8 A100 GPUs with 80 GBs each to fine-tune Llama2-7B and Llama3-8B on the C4 dataset (en subset) for 5000 steps. For all experiments, we use batch size = 2, learning rate = 2e-5, max sequence length = 4096 for Llama2 and 8192 for Llama3, Fully Sharded Data Parallel (FSDP) as our distribution strategy, and activation checkpointing to reduce memory footprint. For 8da4w experiments, we use a group size of 256 for weights.

    + +

    Since the pre-training dataset is not easily accessible, we perform QAT during the fine-tuning process. Empirically, we found that disabling fake quantization for the first N steps led to better results, presumably because doing so allows the weights to stabilize before we start introducing quantization noise to the fine-tuning process. We disable fake quantization for the first 1000 steps for all our experiments.

    + +

    We evaluate our quantized models using the lm-evaluation-harness integration in torchtune. We report evaluation results from a variety of tasks commonly used to evaluate LLMs, including hellaswag, a commonsense sentence completion task, wikitext, a next token/byte prediction task, and a few question-answering tasks such as arc, openbookqa, and piqa. For wikitext, perplexity refers to the inverse of how well the model can predict the next word or byte (lower is better), and bits_per_byte refers to how many bits are needed to predict the next byte (lower is also better here). For all other tasks, acc_norm refers to the accuracy normalized by the byte-length of the target string.

    + +

    Int8 Dynamic Activations + Int4 Weight Quantization (8da4w)

    + +

    Starting with Llama2 8da4w quantization, we saw that QAT was able to recover 62% of the normalized accuracy degradation on hellaswag compared to PTQ, and 58% and 57% of the word and byte perplexity degradation (respectively) on wikitext. We see similar improvements for most of the other tasks.

    + +


    + +

    Figure 3a: Llama2-7B 8da4w quantization with and without QAT

    + +


    + +

    Figure 3b: Llama2-7B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better)

    + +

    Llama3 8da4w quantization saw even more pronounced improvements with QAT. On the hellaswag evaluation task, we were able to recover 96% of the normalized accuracy degradation on hellaswag compared to PTQ, with minimal overall degradation (<1%) compared to the non-quantized accuracy. On the wikitext evaluation task, QAT recovered 68% and 65% of the word and byte perplexity degradation (respectively). Even on arc_challenge, which was difficult for Llama2 QAT, we were able to recover 51% of the normalized accuracy degradation.

    + +


    + +

    Figure 4a: Llama3-8B 8da4w quantization with and without QAT

    + +


    + +

    Figure 4b: Llama3-8B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better)

    + +

    Lower Bit Weight Only Quantization

    + +

    We further extended the torchao QAT flow to 2-bit and 3-bit weight only quantization and repeated the same experiments for Llama3-8B. Quantization degradation is more severe at lower bit-widths, so we use a group size of 32 for all experiments for finer-grained quantization.

    + +

    However, this is still not enough for 2-bit PTQ, which saw wikitext perplexity explode. To mitigate this problem, we leverage knowledge from prior sensitivity analysis that the first 3 and last 2 layers of the Llama3 model are the most sensitive, and skip quantizing these layers in exchange for a moderate increase in quantized model size (1.78 GB for 2-bit and 1.65 GB for 3-bit). This brought the wikitext word perplexity down from 603336 to 6766, which is significant but still far from acceptable. To further improve the quantized model, we turn to QAT.

    + +


    + +

    Figure 5a: Llama3-8B 2-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale.

    + +

    We observe that applying QAT while skipping quantization for the first 3 and last 2 layers further brought the word perplexity down to a much more reasonable value of 30 (from 6766). More generally, QAT was able to recover 53% of the normalized accuracy degradation on hellaswag compared to PTQ, and 99% and 89% of the word and byte perplexity degradation (respectively) on wikitext. Without skipping the sensitive layers, however, QAT was far less effective at mitigating degradation in quantized model quality.

    + +


    + +

    Figure 5b: Llama3-8B 2-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization.

    + +

    For 3-bit weight only quantization, QAT was effective even without skipping the first 3 and last 2 layers, though skipping these layers still led to better results for both PTQ and QAT. In the skip case, QAT was able to recover 63% of the normalized accuracy degradation on hellaswag compared to PTQ, and 72% and 65% of the word and byte perplexity degradation (respectively) on wikitext.

    + +


    + +

    Figure 6a: Llama3-8B 3-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization.

    + +


    + +

    Figure 6b: Llama3-8B 3-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale.

    + +

    QAT Overhead

    + +

    QAT inserts many fake quantize operations throughout the model, adding considerable overhead to both the fine-tuning speed and the memory usage. For a model like Llama3-8B for example, we have (32 * 7) + 1 = 225 linear layers, each of which has at least 1 fake quantize for the weights and potentially 1 fake quantize for the input activations. Memory footprint increase is also significant, since we cannot mutate the weights in-place and so we need to clone them before applying fake quantization, though this overhead can be mostly mitigated by enabling activation checkpointing.

    + +

    In our microbenchmarks, we found that 8da4w QAT fine-tuning is ~34% slower than regular full fine-tuning. With activation checkpointing, the memory increase per GPU is around 2.35 GB. Most of these overheads are fundamental to how QAT works, though we may be able to speed up computation with torch.compile in the future.

    + + + + + + + + + + + + + + + + + +
    Per GPU statistics + Full fine-tuning + QAT fine-tuning +
    Median tokens per second + 546.314 tok/s + 359.637 tok/s +
    Median peak memory + 67.501 GB + 69.850 GB +
    + +

    Table 2: Llama3 QAT fine-tuning overhead for int8 per token dynamic activations + int4 grouped per channel weights on 6 A100 GPUs (each with 80GB memory).

    + +

    Looking Ahead

    + +

    In this blog, we presented a QAT flow for LLMs through torchao, integrated this flow with the fine-tuning APIs in torchtune, and demonstrated its potential to recover most of the quantization degradation compared to PTQ and match non-quantized performance on certain tasks. There are many directions for future explorations:

    + +
      +
    • Hyperparameter tuning. It is likely that extensive hyperparameter tuning can further improve the results of finetuning and QAT. In addition to the general hyperparameters like the learning rate, batch size, dataset size, and number of fine-tuning steps, we should also tune QAT-specific ones, such as when to start/stop fake quantization, how many steps to fake quantize, and regularization parameters for fake quantized values.
    • +
    • Outlier reduction techniques. In our experiments, we found that both PTQ and QAT were susceptible to outliers. In addition to simple clamping and regularization during fine-tuning, we can explore techniques that allow the network to learn how to control these outliers (e.g. learned quantization ranges, clipped softmax, and gated attention), or possibly even borrow outlier suppression techniques from post-training settings (e.g. SpinQuant, SmoothQuant) and apply them sparingly throughout the fine-tuning process.
    • +
    • Mixed-precision and more complex dtypes. Especially in the lower bit regime, we saw that skipping quantization for certain sensitive layers was effective for both PTQ and QAT. Did we need to skip quantizing these layers altogether, or can we still quantize them, just to lower bit-widths? It will be interesting to explore mixed-precision quantization in the context of QAT. Training with newer dtypes such as MX4 is another promising direction, especially given that the upcoming Blackwell GPUs will no longer support int4 tensor cores.
    • +
    • Composability with LoRA and QLoRA. Our QAT integration in torchtune currently only supports the full fine-tuning workflow. However, many users wish to fine-tune their models using low-ranked adaptors to substantially reduce their memory footprint. Composing QAT with techniques like LoRA / QLoRA will enable users to reap the memory and performance benefits of these approaches while producing a model that will ultimately be quantized with minimal model quality degradation.
    • +
    • Composability with torch.compile. This is another potential way to significantly speed up fake quantization computations in QAT while reducing memory footprint. torch.compile is currently not compatible with the distribution strategy used in full distributed fine-tuning recipes in torchtune (with or without QAT), but support will be added in the near future.
    • +
    • Quantizing other layers. In this work, we only explored quantizing the linear layers. However, in the context of long sequence lengths, the KV cache often becomes the throughput bottleneck and can reach tens of GBs, hence LLM-QAT explored quantizing the KV cache alongside activations and weights. Prior work has also had success with quantizing the embedding layer down to 2-bits in other transformer-based models.
    • +
    • End-to-end evaluation on performant cuda kernels. A natural extension of this work is to provide an end-to-end QAT flow evaluated on performant cuda kernels, similar to the existing 8da4w QAT flow lowered to XNNPACK kernels through executorch. For int4 weight only quantization, we can leverage the efficient int4 weight mm kernel with bitpacking for quantization, and there is ongoing work to add QAT support for this kernel: https://github.com/pytorch/ao/pull/383. For 8da4w quantization, mixed 4-bit/8-bit GEMM is also being added in cutlass. This will be needed to build an efficient 8da4w cuda kernel.
    • +
    + +

    The QAT code can be found here. Please refer to this torchtune tutorial to get started. If you have any further questions, please feel free to open an issue on the torchao github or reach out to andrewor@meta.com. We welcome your feedback and contributions!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/quantization-in-practice/index.html b/blog/quantization-in-practice/index.html new file mode 100644 index 000000000000..0ef96c37f1d4 --- /dev/null +++ b/blog/quantization-in-practice/index.html @@ -0,0 +1,1119 @@ + + + + + + + + + + + + + Practical Quantization in PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    February 08, 2022

    +

    + Practical Quantization in PyTorch +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Suraj Subramanian, Mark Saroufim, Jerry Zhang + +

    +

    Quantization is a cheap and easy way to make your DNN run faster and with lower memory requirements. PyTorch offers a few different approaches to quantize your model. In this blog post, we’ll lay a (quick) foundation of quantization in deep learning, and then take a look at how each technique looks in practice. Finally we’ll end with recommendations from the literature for using quantization in your workflows.

    + +

    + +
    + Fig 1. PyTorch <3 Quantization +

    + +

    Contents

    + +

    Fundamentals of Quantization

    + +
    +

    If someone asks you what time it is, you don’t respond “10:14:34:430705”, but you might say “a quarter past 10”.

    +
    + +

    Quantization has roots in information compression; in deep networks it refers to reducing the numerical precision of its weights and/or activations.

    + +

    Overparameterized DNNs have more degrees of freedom and this makes them good candidates for information compression [1]. When you quantize a model, two things generally happen - the model gets smaller and runs with better efficiency. Hardware vendors explicitly allow for faster processing of 8-bit data (than 32-bit data) resulting in higher throughput. A smaller model has lower memory footprint and power consumption [2], crucial for deployment at the edge.

    + +

    Mapping function

    +

    The mapping function is what you might guess - a function that maps values from floating-point to integer space. A commonly used mapping function is a linear transformation given by Q(r) = round(r/S + Z), where r is the input and S, Z are quantization parameters.

    + +

    To reconvert to floating point space, the inverse function is given by r' = (Q(r) - Z) * S.

    + +

    In general r' ≠ r, and their difference constitutes the quantization error.

    + +

    Quantization Parameters

    +

    The mapping function is parameterized by the scaling factor S and the zero-point Z.

    + +

    S is simply the ratio of the input range to the output range: S = (β - α) / (β_q - α_q)

    + +

    where [α, β] is the clipping range of the input, i.e. the boundaries of permissible inputs, and [α_q, β_q] is the range in quantized output space that it is mapped to. For 8-bit quantization, the output range satisfies β_q - α_q <= 2^8 - 1 = 255.

    + +

    Z acts as a bias to ensure that a 0 in the input space maps perfectly to a 0 in the quantized space: from Q(α) = α_q it follows that Z = α_q - α/S (rounded to an integer).
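    As a quick worked example with illustrative numbers, mapping an input clipping range of [-1.0, 1.0] to an unsigned 8-bit output range of [0, 255]:

    alpha, beta = -1.0, 1.0      # input clipping range [alpha, beta]
    alpha_q, beta_q = 0, 255     # 8-bit unsigned output range

    S = (beta - alpha) / (beta_q - alpha_q)   # 2.0 / 255 ≈ 0.00784
    Z = round(alpha_q - alpha / S)            # 128

    def quantize(r):
        return int(max(alpha_q, min(beta_q, round(r / S + Z))))

    def dequantize(q):
        return (q - Z) * S

    q = quantize(0.5)        # 192
    r_hat = dequantize(q)    # ≈ 0.502 -> quantization error ≈ 0.002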

    + +

    Calibration

    +

    The process of choosing the input clipping range is known as calibration. The simplest technique (also the default in PyTorch) is to record the running minimum and maximum values and assign them to α and β. TensorRT also uses entropy minimization (KL divergence), mean-square-error minimization, or percentiles of the input range.

    + +

    In PyTorch, Observer modules (code) collect statistics on the input values and calculate the qparams S and Z. Different calibration schemes result in different quantized outputs, and it’s best to empirically verify which scheme works best for your application and architecture (more on that later).

    + +
    import torch
    +from torch.quantization.observer import MinMaxObserver, MovingAverageMinMaxObserver, HistogramObserver
    +C, L = 3, 4
    +normal = torch.distributions.normal.Normal(0,1)
    +inputs = [normal.sample((C, L)), normal.sample((C, L))]
    +print(inputs)
    +
    +# >>>>>
    +# [tensor([[-0.0590,  1.1674,  0.7119, -1.1270],
    +#          [-1.3974,  0.5077, -0.5601,  0.0683],
    +#          [-0.0929,  0.9473,  0.7159, -0.4574]]]),
    +
    +# tensor([[-0.0236, -0.7599,  1.0290,  0.8914],
    +#          [-1.1727, -1.2556, -0.2271,  0.9568],
    +#          [-0.2500,  1.4579,  1.4707,  0.4043]])]
    +
    +observers = [MinMaxObserver(), MovingAverageMinMaxObserver(), HistogramObserver()]
    +for obs in observers:
    +  for x in inputs: obs(x) 
    +  print(obs.__class__.__name__, obs.calculate_qparams())
    +
    +# >>>>>
    +# MinMaxObserver (tensor([0.0112]), tensor([124], dtype=torch.int32))
    +# MovingAverageMinMaxObserver (tensor([0.0101]), tensor([139], dtype=torch.int32))
    +# HistogramObserver (tensor([0.0100]), tensor([106], dtype=torch.int32))
    +
    + +

    Affine and Symmetric Quantization Schemes

    +

    Affine or asymmetric quantization schemes assign the input range to the min and max observed values. Affine schemes generally offer tighter clipping ranges and are useful for quantizing non-negative activations (you don’t need the input range to contain negative values if your input tensors are never negative). The range is calculated as α = min(r), β = max(r). Affine quantization leads to more computationally expensive inference when used for weight tensors [3].

    + +

    Symmetric quantization schemes center the input range around 0, eliminating the need to calculate a zero-point offset. The range is calculated as -α = β = max(|r_max|, |r_min|). For skewed signals (like non-negative activations) this can result in bad quantization resolution because the clipping range includes values that never show up in the input (see the pyplot below).

    + +
    import torch
    +import numpy as np
    +import matplotlib.pyplot as plt
    +
    +act = torch.distributions.pareto.Pareto(1, 10).sample((1, 1024)).flatten()
    +weights = torch.distributions.normal.Normal(0, 0.12).sample((3, 64, 7, 7)).flatten()
    +
    +def get_symmetric_range(x):
    +  beta = torch.max(x.max(), x.min().abs())
    +  return -beta.item(), beta.item()
    +
    +def get_affine_range(x):
    +  return x.min().item(), x.max().item()
    +
    +def plot(plt, data, scheme):
    +  boundaries = get_affine_range(data) if scheme == 'affine' else get_symmetric_range(data)
    +  a, _, _ = plt.hist(data, density=True, bins=100)
    +  ymin, ymax = np.quantile(a[a>0], [0.25, 0.95])
    +  plt.vlines(x=boundaries, ls='--', colors='purple', ymin=ymin, ymax=ymax)
    +
    +fig, axs = plt.subplots(2,2)
    +plot(axs[0, 0], act, 'affine')
    +axs[0, 0].set_title("Activation, Affine-Quantized")
    +
    +plot(axs[0, 1], act, 'symmetric')
    +axs[0, 1].set_title("Activation, Symmetric-Quantized")
    +
    +plot(axs[1, 0], weights, 'affine')
    +axs[1, 0].set_title("Weights, Affine-Quantized")
    +
    +plot(axs[1, 1], weights, 'symmetric')
    +axs[1, 1].set_title("Weights, Symmetric-Quantized")
    +plt.show()
    +
    + +

    + +
    Fig 2. Clipping ranges (in purple) for affine and symmetric schemes +

    + +

    In PyTorch, you can specify affine or symmetric schemes while initializing the Observer. Note that not all observers support both schemes.

    + +
    for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
    +  obs = MovingAverageMinMaxObserver(qscheme=qscheme)
    +  for x in inputs: obs(x)
    +  print(f"Qscheme: {qscheme} | {obs.calculate_qparams()}")
    +
    +# >>>>>
    +# Qscheme: torch.per_tensor_affine | (tensor([0.0101]), tensor([139], dtype=torch.int32))
    +# Qscheme: torch.per_tensor_symmetric | (tensor([0.0109]), tensor([128]))
    +
    + +

    Per-Tensor and Per-Channel Quantization Schemes

    +

    Quantization parameters can be calculated for the layer’s entire weight tensor as a whole, or separately for each channel. In per-tensor quantization, the same clipping range is applied to all the channels in a layer.

    + +

    + +
    Fig 3. Per-Channel uses one set of qparams for each channel. Per-tensor uses the same qparams for the entire tensor. +

    + +

    For weight quantization, symmetric-per-channel quantization provides better accuracies; per-tensor quantization performs poorly, possibly due to high variance in conv weights across channels from batchnorm folding [3].

    + +
    from torch.quantization.observer import MovingAveragePerChannelMinMaxObserver
    +obs = MovingAveragePerChannelMinMaxObserver(ch_axis=0)  # calculate qparams for all `C` channels separately
    +for x in inputs: obs(x)
    +print(obs.calculate_qparams())
    +
    +# >>>>>
    +# (tensor([0.0090, 0.0075, 0.0055]), tensor([125, 187,  82], dtype=torch.int32))
    +
    + +

    Backend Engine

    +

    Currently, quantized operators run on x86 machines via the FBGEMM backend, or use QNNPACK primitives on ARM machines. Backend support for server GPUs (via TensorRT and cuDNN) is coming soon. Learn more about extending quantization to custom backends: RFC-0019.

    + +
    backend = 'fbgemm'  # running on an x86 CPU; use 'qnnpack' if running on ARM
    +qconfig = torch.quantization.get_default_qconfig(backend)  
    +torch.backends.quantized.engine = backend
    +
    + +

    QConfig

    + +

    The QConfig (code) NamedTuple stores the Observers and the quantization schemes used to quantize activations and weights.

    + +

    Be sure to pass the Observer class (not the instance), or a callable that can return Observer instances. Use with_args() to override the default arguments.

    + +
    my_qconfig = torch.quantization.QConfig(
    +  activation=MovingAverageMinMaxObserver.with_args(qscheme=torch.per_tensor_affine),
    +  weight=MovingAveragePerChannelMinMaxObserver.with_args(dtype=torch.qint8)
    +)
    +# >>>>>
    +# QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MovingAverageMinMaxObserver'>, qscheme=torch.per_tensor_affine){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MovingAveragePerChannelMinMaxObserver'>, dtype=torch.qint8){})
    +
    + +

    In PyTorch

    + +

    PyTorch allows you a few different ways to quantize your model depending on

    +
      +
    • if you prefer a flexible but manual, or a restricted automagic process (Eager Mode v/s FX Graph Mode)
    • +
    • if qparams for quantizing activations (layer outputs) are precomputed for all inputs, or calculated afresh with each input (static v/s dynamic),
    • +
    • if qparams are computed with or without retraining (quantization-aware training v/s post-training quantization)
    • +
    + +

    FX Graph Mode automatically fuses eligible modules, inserts Quant/DeQuant stubs, calibrates the model and returns a quantized module - all in two method calls - but only for networks that are symbolically traceable. The examples below contain the calls using Eager Mode and FX Graph Mode for comparison.

    + +

    In DNNs, eligible candidates for quantization are the FP32 weights (layer parameters) and activations (layer outputs). Quantizing weights reduces the model size. Quantized activations typically result in faster inference.

    + +

    As an example, the 50-layer ResNet network has ~26 million weight parameters and computes ~16 million activations in the forward pass.
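    A quick back-of-the-envelope calculation (using the parameter count above and ignoring overheads such as stored qparams) shows why this matters for model size:

    params = 26_000_000            # ~26M weights in ResNet-50
    fp32_mb = params * 4 / 1e6     # 4 bytes per FP32 weight -> ~104 MB
    int8_mb = params * 1 / 1e6     # 1 byte per INT8 weight  -> ~26 MB
    print(f"FP32 weights ≈ {fp32_mb:.0f} MB, INT8 weights ≈ {int8_mb:.0f} MB")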

    + +

    Post-Training Dynamic/Weight-only Quantization

    +

    Here the model’s weights are pre-quantized; the activations are quantized on-the-fly (“dynamic”) during inference. The simplest of all approaches, it has a one line API call in torch.quantization.quantize_dynamic. Currently only Linear and Recurrent (LSTM, GRU, RNN) layers are supported for dynamic quantization.

    + +

    (+) Can result in higher accuracies since the clipping range is exactly calibrated for each input [1].

    + +

    (+) Dynamic quantization is preferred for models like LSTMs and Transformers where writing/retrieving the model’s weights from memory dominate bandwidths [4].

    + +

    (-) Calibrating and quantizing the activations at each layer during runtime can add to the compute overhead.

    + +
    import torch
    +from torch import nn
    +
    +# toy model
    +m = nn.Sequential(
    +  nn.Conv2d(2, 64, (8,)),
    +  nn.ReLU(),
    +  nn.Linear(16,10),
    +  nn.LSTM(10, 10))
    +
    +m.eval()
    +
    +## EAGER MODE
    +from torch.quantization import quantize_dynamic
    +model_quantized = quantize_dynamic(
    +    model=m, qconfig_spec={nn.LSTM, nn.Linear}, dtype=torch.qint8, inplace=False
    +)
    +
    +## FX MODE
    +from torch.quantization import quantize_fx
    +qconfig_dict = {"": torch.quantization.default_dynamic_qconfig}  # An empty key denotes the default applied to all modules
    +model_prepared = quantize_fx.prepare_fx(m, qconfig_dict)
    +model_quantized = quantize_fx.convert_fx(model_prepared)
    +
    + +

    Post-Training Static Quantization (PTQ)

    +

    PTQ also pre-quantizes model weights but instead of calibrating activations on-the-fly, the clipping range is pre-calibrated and fixed (“static”) using validation data. Activations stay in quantized precision between operations during inference. About 100 mini-batches of representative data are sufficient to calibrate the observers [2]. The examples below use random data in calibration for convenience - using that in your application will result in bad qparams.

    + +

    + PTQ flowchart +
    + Fig 4. Steps in Post-Training Static Quantization +

    + +

    Module fusion combines multiple sequential modules (eg: [Conv2d, BatchNorm, ReLU]) into one. Fusing modules means the compiler needs to only run one kernel instead of many; this speeds things up and improves accuracy by reducing quantization error.

    + +

    (+) Static quantization has faster inference than dynamic quantization because it eliminates the float<->int conversion costs between layers.

    + +

    (-) Static quantized models may need regular re-calibration to stay robust against distribution-drift.

    + +
    # Static quantization of a model consists of the following steps:
    +
    +#     Fuse modules
    +#     Insert Quant/DeQuant Stubs
    +#     Prepare the fused module (insert observers before and after layers)
    +#     Calibrate the prepared module (pass it representative data)
    +#     Convert the calibrated module (replace with quantized version)
    +
    +import torch
    +from torch import nn
    +import copy
    +
    +backend = "fbgemm"  # running on a x86 CPU. Use "qnnpack" if running on ARM.
    +
    +model = nn.Sequential(
    +     nn.Conv2d(2,64,3),
    +     nn.ReLU(),
    +     nn.Conv2d(64, 128, 3),
    +     nn.ReLU()
    +)
    +
    +## EAGER MODE
    +m = copy.deepcopy(model)
    +m.eval()
    +"""Fuse
    +- Inplace fusion replaces the first module in the sequence with the fused module, and the rest with identity modules
    +"""
    +torch.quantization.fuse_modules(m, ['0','1'], inplace=True) # fuse first Conv-ReLU pair
    +torch.quantization.fuse_modules(m, ['2','3'], inplace=True) # fuse second Conv-ReLU pair
    +
    +"""Insert stubs"""
    +m = nn.Sequential(torch.quantization.QuantStub(), 
    +                  *m, 
    +                  torch.quantization.DeQuantStub())
    +
    +"""Prepare"""
    +m.qconfig = torch.quantization.get_default_qconfig(backend)
    +torch.quantization.prepare(m, inplace=True)
    +
    +"""Calibrate
    +- This example uses random data for convenience. Use representative (validation) data instead.
    +"""
    +with torch.inference_mode():
    +  for _ in range(10):
    +    x = torch.rand(1,2, 28, 28)
    +    m(x)
    +    
    +"""Convert"""
    +torch.quantization.convert(m, inplace=True)
    +
    +"""Check"""
+print(m[1].weight().element_size()) # 1 byte instead of 4 bytes for FP32
    +
    +
    +## FX GRAPH
    +from torch.quantization import quantize_fx
    +m = copy.deepcopy(model)
    +m.eval()
    +qconfig_dict = {"": torch.quantization.get_default_qconfig(backend)}
    +# Prepare
    +model_prepared = quantize_fx.prepare_fx(m, qconfig_dict)
    +# Calibrate - Use representative (validation) data.
    +with torch.inference_mode():
    +  for _ in range(10):
    +    x = torch.rand(1,2,28, 28)
    +    model_prepared(x)
    +# quantize
    +model_quantized = quantize_fx.convert_fx(model_prepared)
    +
    + +

    Quantization-aware Training (QAT)

    +

    + QAT flowchart +
    + Fig 5. Steps in Quantization-Aware Training +

    + +

The PTQ approach is great for large models, but accuracy suffers in smaller models [6]. This is of course due to the loss in numerical precision when adapting a model from FP32 to the INT8 realm (Figure 6(a)). QAT tackles this by including this quantization error in the training loss, thereby training an INT8-first model.

    + +

    + Fig. 6: Comparison of PTQ and QAT +
    + Fig 6. Comparison of PTQ and QAT convergence [3] +

    + +

    All weights and biases are stored in FP32, and backpropagation happens as usual. However in the forward pass, quantization is internally simulated via FakeQuantize modules. They are called fake because they quantize and immediately dequantize the data, adding quantization noise similar to what might be encountered during quantized inference. The final loss thus accounts for any expected quantization errors. Optimizing on this allows the model to identify a wider region in the loss function (Figure 6(b)), and identify FP32 parameters such that quantizing them to INT8 does not significantly affect accuracy.

    + +

    + Fake Quantization in the forward and backward pass +
    Fig 7. Fake Quantization in the forward and backward pass +
    Image source: https://developer.nvidia.com/blog/achieving-fp32-accuracy-for-int8-inference-using-quantization-aware-training-with-tensorrt +

    + +

    (+) QAT yields higher accuracies than PTQ.

    + +

    (+) Qparams can be learned during model training for more fine-grained accuracy (see LearnableFakeQuantize)

    + +

(-) The computational cost of QAT can be high - retraining a model can take several hundred epochs [1]

    + +
    # QAT follows the same steps as PTQ, with the exception of the training loop before you actually convert the model to its quantized version
    +
    +import torch
    +from torch import nn
    +
    +backend = "fbgemm"  # running on a x86 CPU. Use "qnnpack" if running on ARM.
    +
    +m = nn.Sequential(
    +     nn.Conv2d(2,64,8),
    +     nn.ReLU(),
    +     nn.Conv2d(64, 128, 8),
    +     nn.ReLU()
    +)
    +
    +"""Fuse"""
    +torch.quantization.fuse_modules(m, ['0','1'], inplace=True) # fuse first Conv-ReLU pair
    +torch.quantization.fuse_modules(m, ['2','3'], inplace=True) # fuse second Conv-ReLU pair
    +
    +"""Insert stubs"""
    +m = nn.Sequential(torch.quantization.QuantStub(), 
    +                  *m, 
    +                  torch.quantization.DeQuantStub())
    +
    +"""Prepare"""
    +m.train()
+m.qconfig = torch.quantization.get_default_qat_qconfig(backend)
    +torch.quantization.prepare_qat(m, inplace=True)
    +
    +"""Training Loop"""
    +n_epochs = 10
    +opt = torch.optim.SGD(m.parameters(), lr=0.1)
    +loss_fn = lambda out, tgt: torch.pow(tgt-out, 2).mean()
    +for epoch in range(n_epochs):
    +  x = torch.rand(10,2,24,24)
    +  out = m(x)
    +  loss = loss_fn(out, torch.rand_like(out))
    +  opt.zero_grad()
    +  loss.backward()
    +  opt.step()
    +
    +"""Convert"""
    +m.eval()
    +torch.quantization.convert(m, inplace=True)
    +
    + +

    Sensitivity Analysis

    +

Not all layers respond to quantization equally; some are more sensitive to precision drops than others. Identifying the optimal combination of layers that minimizes accuracy drop is time-consuming, so [3] suggests a one-at-a-time sensitivity analysis to identify which layers are most sensitive, and retaining FP32 precision on those. In their experiments, skipping just 2 conv layers (out of a total of 28 in MobileNet v1) gives them near-FP32 accuracy. Using FX Graph Mode, we can create custom qconfigs to do this easily:

    + +
    # ONE-AT-A-TIME SENSITIVITY ANALYSIS 
    +
    +for quantized_layer, _ in model.named_modules():
    +  print("Only quantizing layer: ", quantized_layer)
    +
    +  # The module_name key allows module-specific qconfigs. 
    +  qconfig_dict = {"": None, 
    +  "module_name":[(quantized_layer, torch.quantization.get_default_qconfig(backend))]}
    +
    +  model_prepared = quantize_fx.prepare_fx(model, qconfig_dict)
    +  # calibrate
    +  model_quantized = quantize_fx.convert_fx(model_prepared)
    +  # evaluate(model)
    +
    + +

Another approach is to compare the statistics of the FP32 and INT8 layers; commonly used metrics for this are SQNR (Signal to Quantized Noise Ratio) and Mean-Square-Error (MSE). Such a comparative analysis may also help in guiding further optimizations.

    + +

+ Fig 8. Comparing model weights and activations +

    + +

    PyTorch provides tools to help with this analysis under the Numeric Suite. Learn more about using Numeric Suite from the full tutorial.

    + +
    # extract from https://pytorch.org/tutorials/prototype/numeric_suite_tutorial.html
    +import torch.quantization._numeric_suite as ns
    +
    +def SQNR(x, y):
    +    # Higher is better
    +    Ps = torch.norm(x)
    +    Pn = torch.norm(x-y)
    +    return 20*torch.log10(Ps/Pn)
    +
+# fp32_model / int8_model are the float model and its quantized counterpart; input_data is a sample batch
+wt_compare_dict = ns.compare_weights(fp32_model.state_dict(), int8_model.state_dict())
+for key in wt_compare_dict:
+    print(key, SQNR(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'].dequantize()))
+
+act_compare_dict = ns.compare_model_outputs(fp32_model, int8_model, input_data)
+for key in act_compare_dict:
+    print(key, SQNR(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0].dequantize()))
    +
    +
    + +

    Recommendations for your workflow

    +

    + Suggested quantization workflow +
    + Fig 9. Suggested quantization workflow +

    +


    + +

    Points to note

    +
      +
    • Large (10M+ parameters) models are more robust to quantization error. [2]
    • +
• Quantizing a model from an FP32 checkpoint provides better accuracy than training an INT8 model from scratch. [2]
    • +
    • Profiling the model runtime is optional but it can help identify layers that bottleneck inference.
    • +
    • Dynamic Quantization is an easy first step, especially if your model has many Linear or Recurrent layers.
    • +
• Use symmetric-per-channel quantization with MinMax observers for quantizing weights. Use affine-per-tensor quantization with MovingAverageMinMax observers for quantizing activations [2, 3] (see the qconfig sketch after this list).
    • +
• Use metrics like SQNR to identify which layers are most susceptible to quantization error. Turn off quantization on these layers.
    • +
    • Use QAT to fine-tune for around 10% of the original training schedule with an annealing learning rate schedule starting at 1% of the initial training learning rate. [3]
    • +
    • If the above workflow didn’t work for you, we want to know more. Post a thread with details of your code (model architecture, accuracy metric, techniques tried). Feel free to cc me @suraj.pt.
    • +
    + +
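
As a concrete illustration of the observer recommendation above, here is a minimal sketch of a custom QConfig wired up with those observers; the qconfig name and its use in an FX qconfig dict are illustrative choices, not code from the original workflow:

+ +
import torch
+from torch.quantization import QConfig
+from torch.quantization.observer import MovingAverageMinMaxObserver, PerChannelMinMaxObserver
+
+# Affine per-tensor activations, symmetric per-channel weights, per the recommendation above
+recommended_qconfig = QConfig(
+    activation=MovingAverageMinMaxObserver.with_args(
+        dtype=torch.quint8, qscheme=torch.per_tensor_affine),
+    weight=PerChannelMinMaxObserver.with_args(
+        dtype=torch.qint8, qscheme=torch.per_channel_symmetric),
+)
+
+# e.g. apply it to all modules in FX Graph Mode
+qconfig_dict = {"": recommended_qconfig}
+
+ +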

That was a lot to digest, congratulations for sticking with it! Next, we’ll take a look at quantizing a “real-world” model that uses dynamic control structures (if-else, loops). These elements prevent symbolically tracing the model, which makes it a bit tricky to directly quantize the model out of the box. In the next post of this series, we’ll get our hands dirty on a model that is chock full of loops and if-else blocks, and even uses third-party libraries in the forward call.

    + +

We’ll also cover a cool new feature in PyTorch Quantization called Define-by-Run, which tries to ease this constraint by requiring only subsets of the model’s computational graph to be free of dynamic flow. Check out the Define-by-Run poster at PTDD’21 for a preview.

    + +

    References

    +

    [1] Gholami, A., Kim, S., Dong, Z., Yao, Z., Mahoney, M. W., & Keutzer, K. (2021). A survey of quantization methods for efficient neural network inference. arXiv preprint arXiv:2103.13630.

    + +

    [2] Krishnamoorthi, R. (2018). Quantizing deep convolutional networks for efficient inference: A whitepaper. arXiv preprint arXiv:1806.08342.

    + +

    [3] Wu, H., Judd, P., Zhang, X., Isaev, M., & Micikevicius, P. (2020). Integer quantization for deep learning inference: Principles and empirical evaluation. arXiv preprint arXiv:2004.09602.

    + +

    [4] PyTorch Quantization Docs

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/real-time-speech-rec/index.html b/blog/real-time-speech-rec/index.html new file mode 100644 index 000000000000..e0733c598a51 --- /dev/null +++ b/blog/real-time-speech-rec/index.html @@ -0,0 +1,877 @@ + + + + + + + + + + + + + Real-time Audio-visual Speech Recognition | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 10, 2023

    +

    + Real-time Audio-visual Speech Recognition +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Audio-Visual Speech Recognition (AV-ASR, or AVSR) is the task of transcribing text from audio and visual streams, which has recently attracted a lot of research attention due to its robustness to noise. The vast majority of work to date has focused on developing AV-ASR models for non-streaming recognition; studies on streaming AV-ASR are very limited.

    + +

    We have developed a compact real-time speech recognition system based on TorchAudio, a library for audio and signal processing with PyTorch. It can run locally on a laptop with high accuracy without accessing the cloud. Today, we are releasing the real-time AV-ASR recipe under a permissive open license (BSD-2-Clause license), enabling a broad set of applications and fostering further research on audio-visual models for speech recognition.

    + +

    This work is part of our approach to AV-ASR research. A promising aspect of this approach is its ability to automatically annotate large-scale audio-visual datasets, which enables the training of more accurate and robust speech recognition systems. Furthermore, this technology has the potential to run on smart devices since it achieves the latency and memory efficiency that such devices require for inference.

    + +

    In the future, speech recognition systems are expected to power applications in numerous domains. One of the primary applications of AV-ASR is to enhance the performance of ASR in noisy environments. Since visual streams are not affected by acoustic noise, integrating them into an audio-visual speech recognition model can compensate for the performance drop of ASR models. Our AV-ASR system has the potential to serve multiple purposes beyond speech recognition, such as text summarization, translation and even text-to-speech conversion. Moreover, the exclusive use of VSR can be useful in certain scenarios, e.g. where speaking is not allowed, in meetings, and where privacy in public conversations is desired.

    + +

    AV-ASR

    + +

    Fig. 1 The pipeline for audio-visual speech recognition system

    + +

    Fig. 1: The pipeline for audio-visual speech recognition system

    + +

Our real-time AV-ASR system is presented in Fig. 1. It consists of three components, a data collection module, a pre-processing module and an end-to-end model. The data collection module comprises hardware devices, such as a microphone and camera. Its role is to collect information from the real world. Once the information is collected, the pre-processing module locates and crops out the face. Next, we feed the raw audio stream and the pre-processed video stream into our end-to-end model for inference.

    + +

    Data collection

    + +

We use torchaudio.io.StreamReader to capture audio/video from streaming device input, e.g., the microphone and camera on a laptop. Once the raw video and audio streams are collected, the pre-processing module locates and crops out the face. It should be noted that data is immediately deleted during the streaming process.
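
A minimal sketch of this kind of chunked capture with torchaudio.io.StreamReader is shown below; the source string, chunk sizes and frame rate are illustrative assumptions, not the released recipe’s actual settings:

+ +
from torchaudio.io import StreamReader
+
+# A media file is used here for simplicity; a capture device (e.g. "/dev/video0"
+# with the appropriate format argument) can be used instead.
+streamer = StreamReader(src="input.mp4")
+streamer.add_basic_audio_stream(frames_per_chunk=16000, sample_rate=16000)
+streamer.add_basic_video_stream(frames_per_chunk=25, frame_rate=25, format="rgb24")
+
+for audio_chunk, video_chunk in streamer.stream():
+    # Early iterations may yield None for a stream while its buffer fills.
+    if audio_chunk is None or video_chunk is None:
+        continue
+    # audio_chunk: (frames, channels); video_chunk: (frames, channels, height, width)
+    ...  # pass the chunks to the pre-processing module and the model
+
+ +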

    + +

    Pre-processing

    + +

    Before feeding the raw stream into our model, each video sequence has to undergo a specific pre-processing procedure. This involves three critical steps. The first step is to perform face detection. Following that, each individual frame is aligned to a referenced frame, commonly known as the mean face, in order to normalize rotation and size differences across frames. The final step in the pre-processing module is to crop the face region from the aligned face image. We would like to clearly note that our model is fed with raw audio waveforms and pixels of the face, without any further preprocessing like face parsing or landmark detection. An example of the pre-processing procedure is illustrated in Table 1.

    + + + + + + + + + + + + + + +
    +Original image + + + +Detected image + + +Transformed image + + +Cropped image + +
    + 0. Original + +1. Detection + +2. Alignment + +3. Crop +
    + +

    Table 1: Preprocessing pipeline.

    + +

    Model

    + +

    Fig. 2 The architecture for the audio-visual speech recognition system.

    + +

    Fig. 2: The architecture for the audio-visual speech recognition system

    + +

We consider two configurations: Small with 12 Emformer blocks and Large with 28, with 34.9M and 383.3M parameters, respectively. Each AV-ASR model is composed of front-end encoders, a fusion module, an Emformer encoder, and a transducer model. To be specific, we use convolutional frontends to extract features from raw audio waveforms and facial images. The features are concatenated to form 1024-d features, which are then passed through a two-layer multi-layer perceptron and an Emformer transducer model. The entire network is trained using RNN-T loss. The architecture of the proposed AV-ASR model is illustrated in Fig. 2.
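
The fusion step described above can be pictured with a short, purely illustrative sketch (this is not the released recipe’s code; the 512-d per-modality feature size is an assumption chosen so the concatenation yields 1024-d features):

+ +
import torch
+from torch import nn
+
+class AVFusion(nn.Module):
+    """Concatenate time-aligned audio and video features and project with a 2-layer MLP."""
+    def __init__(self, audio_dim=512, video_dim=512, hidden_dim=1024):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(audio_dim + video_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, hidden_dim),
+        )
+
+    def forward(self, audio_feats, video_feats):
+        # both inputs: (batch, time, feature_dim)
+        fused = torch.cat([audio_feats, video_feats], dim=-1)  # -> (batch, time, 1024)
+        return self.mlp(fused)
+
+fused = AVFusion()(torch.rand(2, 50, 512), torch.rand(2, 50, 512))  # (2, 50, 1024)
+
+ +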

    + +

    Analysis

    + +

Datasets. We follow Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels and use publicly available audio-visual datasets, including LRS3, VoxCeleb2 and AVSpeech, for training. We do not use mouth ROIs, facial landmarks or attributes during either the training or testing stages.

    + +

Comparisons with the state-of-the-art. Non-streaming evaluation results on LRS3 are presented in Table 2. Our audio-visual model with an algorithmic latency of 800 ms (160 ms + 1280 ms × 0.5) yields a WER of 1.3%, which is on par with those achieved by state-of-the-art offline models such as AV-HuBERT, RAVEn, and Auto-AVSR.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Method + Total Hours + WER (%) +
ViT3D-CM + 90,000 + 1.6 +
AV-HuBERT + 1,759 + 1.4 +
RAVEn + 1,759 + 1.4 +
AutoAVSR + 3,448 + 0.9 +
Ours + 3,068 + 1.3 +
    + +

    Table 2: Non-streaming evaluation results for audio-visual models on the LRS3 dataset.

    + +

Noisy experiments. During training, 16 different noise types are randomly injected into the audio waveforms: 13 types from the DEMAND database (‘DLIVING’, ‘DKITCHEN’, ‘OMEETING’, ‘OOFFICE’, ‘PCAFETER’, ‘PRESTO’, ‘PSTATION’, ‘STRAFFIC’, ‘SPSQUARE’, ‘SCAFE’, ‘TMETRO’, ‘TBUS’ and ‘TCAR’), two types from the Speech Commands database (white and pink noise), and one type from the NOISEX-92 database (babble noise). SNR levels are sampled uniformly from [clean, 7.5dB, 2.5dB, -2.5dB, -7.5dB]. Results of ASR and AV-ASR models, when tested with babble noise, are shown in Table 3. With increasing noise level, the performance advantage of our audio-visual model over our audio-only model grows, indicating that incorporating visual data improves noise robustness.
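
A hedged sketch of this kind of SNR-controlled noise injection, using torchaudio.functional.add_noise (available in recent torchaudio releases), is shown below; the tensors and the sampling code are illustrative, not the training recipe itself:

+ +
import random
+import torch
+import torchaudio.functional as F
+
+snr_levels_db = [None, 7.5, 2.5, -2.5, -7.5]  # None means keep the clean waveform
+
+def inject_noise(waveform, noise):
+    snr = random.choice(snr_levels_db)
+    if snr is None:
+        return waveform
+    return F.add_noise(waveform, noise, torch.tensor([snr]))
+
+noisy = inject_noise(torch.rand(1, 16000), torch.rand(1, 16000))
+
+ +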

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Type + Clean + 10dB + 5dB + 0dB + -5dB + -10dB +
    A + 1.6 + 1.8 + 3.2 + 10.9 + 27.9 + 55.5 +
    A+V + 1.6 + 1.7 + 2.1 + 6.2 + 11.7 + 27.6 +
    + +

    Table 3: Streaming evaluation WER (%) results at various signal-to-noise ratios for our audio-only (A) and audio-visual (A+V) models on the LRS3 dataset under 0.80-second latency constraints.

    + +

Real-time factor. The real-time factor (RTF) is an important measure of a system’s ability to process real-time tasks efficiently. An RTF value of less than 1 indicates that the system meets real-time requirements. We measure RTF using a laptop with an Intel® Core™ i7-12700 CPU running at 2.70 GHz and an NVIDIA GeForce RTX 3070 Ti GPU. To the best of our knowledge, this is the first AV-ASR model that reports RTFs on the LRS3 benchmark. The Small model achieves a WER of 2.6% and an RTF of 0.87 on CPU (Table 4), demonstrating its potential for real-time on-device inference applications.
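
For reference, RTF is simply the ratio of processing time to input duration; a minimal, hypothetical measurement helper is sketched below (model and clip are placeholders for the AV-ASR pipeline and a pre-processed input, not objects from the recipe):

+ +
import time
+import torch
+
+def measure_rtf(model, clip, clip_duration_seconds):
+    start = time.perf_counter()
+    with torch.inference_mode():
+        model(clip)
+    elapsed = time.perf_counter() - start
+    return elapsed / clip_duration_seconds  # < 1.0 means faster than real time
+
+ +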

    + + + + + + + + + + + + + + + + + + + + + + + + +
    Model + Device + Streaming WER [%] + RTF +
    Large + GPU + 1.6 + 0.35 +
    Small + GPU + 2.6 + 0.33 +
Small + CPU + 2.6 + 0.87 +
    + +

    Table 4: Impact of AV-ASR model size and device on WER and RTF. Note that the RTF calculation includes the pre-processing step wherein the Ultra-Lightweight Face Detection Slim 320 model is used to generate face bounding boxes.

    + +

    Learn more about the system from the published works below:

    + +
      +
    • Shi, Yangyang, Yongqiang Wang, Chunyang Wu, Ching-Feng Yeh, Julian Chan, Frank Zhang, Duc Le, and Mike Seltzer. “Emformer: Efficient memory transformer based acoustic model for low latency streaming speech recognition.” In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6783-6787. IEEE, 2021.
    • +
    • Ma, Pingchuan, Alexandros Haliassos, Adriana Fernandez-Lopez, Honglie Chen, Stavros Petridis, and Maja Pantic. “Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels.” In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1-5. IEEE, 2023.
    • +
    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/rebellions/index.html b/blog/rebellions/index.html new file mode 100644 index 000000000000..c543c69f03b2 --- /dev/null +++ b/blog/rebellions/index.html @@ -0,0 +1,669 @@ + + + + + + + + + + + + + Rebellions Joins the PyTorch Foundation as a General Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Rebellions logo

    + +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Rebellions has joined as a general member.

    + +

    Rebellions is a South Korea-based semiconductor company specializing in the design and development of AI chips for data centers and edge devices. Their innovative hardware and software solutions aim to accelerate generative AI and machine learning workloads, focusing on high energy efficiency and performance. The company successfully launched and deployed its AI chip ‘ATOM’ targeting data centers in 2023 and is developing its next-generation AI accelerator ‘REBEL’.

    + +

    “We’re thrilled to welcome Rebellions as a new general member of the PyTorch Foundation,” said Matt White, Executive Director of the PyTorch Foundation. “Rebellions brings a unique perspective to the PyTorch ecosystem with their focus on advancing the integration of NPU architectures for AI acceleration with PyTorch. Their expertise will play a vital role in ensuring PyTorch continues to evolve as a versatile framework, accommodating the diverse needs of modern AI workloads. We look forward to collaborating with Rebellions to drive innovation and strengthen the PyTorch ecosystem for developers worldwide.”

    + +

Rebellions has introduced native support for PyTorch 2.0 in their RBLN SDK. This integration includes compatibility with torch.compile, a pivotal feature of PyTorch 2.0 that enhances model performance. Through this development, Rebellions has empowered developers to seamlessly harness the full potential of their AI accelerator lineup within the PyTorch environment.

    + +

Rebellions is also deeply committed to advancing the PyTorch ecosystem through collaborative innovation starting in Korea. The company has established a Special Interest Group (SIG) focusing on PyTorch Core within the PyTorch Korea community and is actively working with volunteers recruited through MODULABS, an open research institute, to integrate native support for the deep learning framework into their Neural Processing Unit (NPU).

    + +

In addition, Rebellions is collaborating with academic institutions, such as Yonsei University, Hanyang University, University of Science & Technology (UST) and national agencies, such as the Electronics and Telecommunications Research Institute (ETRI), to offer undergraduate and graduate courses on PyTorch and enable them to leverage PyTorch as their research platform.

    + +

    These initiatives highlight Rebellions’ dedication to optimizing the PyTorch experience for developers and researchers alike, while also fostering education and innovation in the field.

    + +

    “By integrating our hardware innovations with PyTorch, we’re building Native NPU support to accelerate diverse AI workloads.” said Hong-seok Kim, the Chief Software Architect at Rebellions. “We’re excited to contribute to the PyTorch community by community-driven initiatives and partnerships, advancing NPU architecture support for next-generation AI solutions. Together with the PyTorch community, we aim to pioneer new possibilities in AI acceleration and empower developers worldwide with efficient computing solutions.”

    + +

    To learn more about how your organization can be a part of the PyTorch Foundation, visit our website.

    + +

    About Rebellions

    + +

    Rebellions is a South Korea-based semiconductor company specializing in the design and development of AI chips for data centers and edge devices. Their innovative hardware and software solutions aim to accelerate generative AI and machine learning workloads, focusing on high energy efficiency and performance. The company successfully launched and deployed its AI chip ‘ATOM’ targeting data centers in 2023 and is developing its next-generation AI accelerator ‘REBEL’ incorporating a scalable chiplet architecture and high-bandwidth memory.

    + +

    About PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/reducing-checkpointing-times/index.html b/blog/reducing-checkpointing-times/index.html new file mode 100644 index 000000000000..df48557e36c1 --- /dev/null +++ b/blog/reducing-checkpointing-times/index.html @@ -0,0 +1,708 @@ + + + + + + + + + + + + + Reducing Model Checkpointing Times by Over 10x with PyTorch Distributed Asynchronous Checkpointing | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Meta: Lucas Pasqualin, Less Wright, Iris Zhang (PyTorch), Chien-Chin Huang; IBM Research: Swaminathan Sundararaman, Saransh Gupta, Raghu Ganti + +

    +

Summary: With PyTorch distributed’s new asynchronous checkpointing feature, developed with feedback from IBM, we show how the IBM Research team was able to implement it and reduce effective checkpointing time by a factor of 10-20x. For example, the ‘down time’ for a 7B model checkpoint goes from an average of 148.8 seconds to 6.3 seconds, or 23.62x faster.

    + +

This directly translates into either more net training progress in every 24-hour period while continuing to checkpoint robustly, or more frequent checkpoints to shorten the recovery window/time.

    + +

    In this note, we showcase the usage code and architecture that makes asynchronous checkpointing possible, along with timing results verified by IBM’s Research team.

    + +

    Async Checkpointing vs Standard Checkpointing

    + +

Model checkpointing is a vital part of large model training, but checkpointing is an expensive process as each checkpoint process involves blocking training progress in order to save out the latest model weights. However, not checkpointing or reducing checkpointing frequency can result in a significant loss in training progress. For example, failures such as deadlocks, stragglers, and GPU errors require the training process to be restarted. In order to restart from a failure, all (training) workers must stop their training process and be restarted from the last saved checkpoint.

    + +

    Thus, the inherent tension between robustness to failures vs training progress plays out as a tradeoff, but now with asynchronous checkpointing, PyTorch Distributed is able to significantly reduce this tension and enable frequent checkpoint with minimal impact to the overall training time.

    + +

    For background, it was almost exactly a year ago that we showcased how distributed checkpointing had massively sped up checkpointing times from the original torch.save() functionality. As IBM Research had noted, torch.save could take up to 30 minutes to checkpoint a single 11B model (PyTorch 1.13).

    + +

    With advancements in distributed checkpointing, checkpoints could be done in under 4 minutes for up to 30B model sizes.

    + +

    With asynchronous checkpointing, the training time lost due to checkpointing now moves to under 30 seconds, and often as short as 6 seconds.

    + +

    To be clear, asynchronous checkpointing does not compress the actual serialization checkpointing time as the previous update showcased. Rather it moves the final checkpointing process off the critical path (to cpu threads) to allow GPU training to continue while finalizing the checkpoint under separate threads.

    + +

    However, to the user, the effect is nearly the same in that down time for training due to checkpointing is substantially reduced, in many cases by 10x or even 20x.

    + +

    Async Dist Checkpointing

    + +

    As the above speedup chart shows, asynchronous checkpointing produces a 10x to 23x further improvement over the previous large improvements from a year ago.

    + +

    How does Asynchronous Checkpointing work?

    + +

    Asynchronous checkpointing modularizes the checkpointing process into two parts rather than one monolithic process. The first phase copies the data from each gpu/rank from GPU to CPU. This is the visible downtime to the user and can take from 6 - 14 seconds for 7B-13B model sizes. The second phase asynchronously copies the data from CPU memory to disk to persist the checkpoint.

    + +

    Once data is copied to CPU in the first phase, the GPU is free to immediately resume training. Hence with asynchronous checkpointing the downtime for checkpointing is simply the time needed to copy over the latest model states to CPU.

    + +

    At the same time that training resumes, non-blocking CPU threads work with the freshly arrived data in memory to complete the full checkpointing/serialization process to disk (i.e. persistent save).

    + +

    flow diagram

    + +

    Note that PyTorch’s Distributed Checkpointer relies on collective communication calls for per-rank metadata necessary to optimize saves, as well as a final synchronization which marks checkpointing as complete and makes the action atomic. This can interfere with distributed training (as distributed training also relies upon similar calls to synchronize training across multiple GPUs) if the Checkpointing thread utilizes the same process group used for training.

    + +

Specifically, a race condition between the calls could potentially cause training and async checkpointing save threads to wait on collective calls at the same time, resulting in a true collective hang.

    + +

    We avoided this scenario by initializing a separate process group for async checkpointing. This separates the checkpointing collectives into their own logical process group, which thus ensures it will not interfere with collective calls in the main training threads.

    + +

    How do I use Asynchronous Checkpointing in my training?

    + +

Usage of asynchronous checkpointing is relatively straightforward. Using the latest nightly version of PyTorch, you will want to initialize your process group with both nccl and gloo. Gloo is required for the CPU threads portion.

    + +

From there, create a duplicate process group which the asynchronous checkpointing will utilize. Then train as usual, but at the point when you want to checkpoint, use the asynchronous save API, passing in the states to save, the checkpoint id and the checkpoint process group.

    + +

    Code snippet

    + +
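
For illustration, a minimal sketch of that flow is shown below. It assumes a recent PyTorch build where torch.distributed.checkpoint.async_save is available; the tiny model, optimizer and checkpoint path are stand-ins, not the code from the snippet above:

+ +
import torch
+from torch import nn
+import torch.distributed as dist
+import torch.distributed.checkpoint as dcp
+
+# Initialize with both nccl (GPU collectives) and gloo (needed for the CPU-thread portion);
+# the device:backend syntax here is one common way to request both backends.
+dist.init_process_group(backend="cpu:gloo,cuda:nccl")
+
+# Duplicate process group so checkpointing collectives don't interfere with training
+checkpoint_pg = dist.new_group(backend="gloo")
+
+model = nn.Linear(8, 8).cuda()            # stand-in for your (FSDP-wrapped) model
+optimizer = torch.optim.AdamW(model.parameters())
+
+# ... training steps ...
+
+future = dcp.async_save(
+    {"model": model.state_dict(), "optimizer": optimizer.state_dict()},
+    checkpoint_id="/tmp/checkpoints/step_100",  # illustrative path
+    process_group=checkpoint_pg,
+)
+# Training continues immediately; future.result() blocks until the checkpoint is persisted.
+
+ +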

Asynchronous checkpointing is also fully implemented in torchtitan. Here, it is implemented for use with pre-training your own Llama 2 or Llama 3 model. Using it is as simple as updating the toml config file:

    + +

    Code snippet

    + +

    Future work

    + +

Checkpointing has made huge strides over the past year. We have moved from checkpoints that took almost half an hour to under 5 minutes with distributed checkpointing, and now to under 30 seconds with asynchronous checkpointing.

    + +

The last frontier is zero-overhead checkpointing, where even the < 30 seconds is eliminated by streaming the updated weights during the backward pass, such that checkpoint data is already on CPU at the point asynchronous checkpointing would kick in.

    + +

    This would effectively move large model training to where checkpointing has no disruption or downtime enabling both more robustness (as checkpoints could be taken more frequently) and faster training progress due to no downtime for checkpointing.

    + +

    Source code link: https://github.com/pytorch/pytorch/blob/main/torch/distributed/checkpoint/state_dict_saver.py

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/running-pytorch-models-on-jetson-nano/index.html b/blog/running-pytorch-models-on-jetson-nano/index.html new file mode 100644 index 000000000000..603c1d740706 --- /dev/null +++ b/blog/running-pytorch-models-on-jetson-nano/index.html @@ -0,0 +1,910 @@ + + + + + + + + + + + + + Running PyTorch Models on Jetson Nano | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    March 16, 2022

    +

    + Running PyTorch Models on Jetson Nano +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Jeff Tang, Hamid Shojanazeri, Geeta Chauhan + +

    +

    Overview

    +

NVIDIA Jetson Nano, part of the Jetson family of products or Jetson modules, is a small yet powerful Linux (Ubuntu) based embedded computer with 2/4GB of memory shared between the CPU and GPU. With it, you can run many PyTorch models efficiently. This document summarizes our experience of running different deep learning models using 3 different mechanisms on Jetson Nano:

    + +
      +
    1. +

Jetson Inference, the higher-level NVIDIA API that has built-in support for running most common computer vision models, which can be transfer-learned with PyTorch on the Jetson platform.

      +
    2. +
    3. +

      TensorRT, an SDK for high-performance inference from NVIDIA that requires the conversion of a PyTorch model to ONNX, and then to the TensorRT engine file that the TensorRT runtime can run.

      +
    4. +
    5. +

      PyTorch with the direct PyTorch API torch.nn for inference.

      +
    6. +
    + +

    Setting up Jetson Nano

    +

    After purchasing a Jetson Nano here, simply follow the clear step-by-step instructions to download and write the Jetson Nano Developer Kit SD Card Image to a microSD card, and complete the setup. After the setup is done and the Nano is booted, you’ll see the standard Linux prompt along with the username and the Nano name used in the setup.

    + +

    To check the GPU status on Nano, run the following commands:

    + +
    sudo pip3 install jetson-stats
    +sudo jtop
    +
    + +

    You’ll see information, including:

    + +
    + +
    + +

    You can also see the installed CUDA version:

    + +
    $ ls -lt /usr/local
    +lrwxrwxrwx  1 root root   22 Aug  2 01:47 cuda -> /etc/alternatives/cuda
    +lrwxrwxrwx  1 root root   25 Aug  2 01:47 cuda-10 -> /etc/alternatives/cuda-10
    +drwxr-xr-x 12 root root 4096 Aug  2 01:47 cuda-10.2
    +
    + +

    To use a camera on Jetson Nano, for example, Arducam 8MP IMX219, follow the instructions here or run the commands below after installing a camera module:

    + +
    cd ~
    +wget https://github.com/ArduCAM/MIPI_Camera/releases/download/v0.0.3/install_full.sh
    +chmod +x install_full.sh
    +./install_full.sh -m arducam
    +
    + +

    Another way to do this is to use the original Jetson Nano camera driver:

    + +
    sudo dpkg -r arducam-nvidia-l4t-kernel
    +sudo shutdown -r now
    +
    + +

    Then, use ls /dev/video0 to confirm the camera is found:

    + +
    $ ls /dev/video0
    +/dev/video0
    +
    + +

    And finally, the following command to see the camera in action:

    + +
    nvgstcapture-1.0 --orientation=2
    +
    + +

    Using Jetson Inference

    +

    NVIDIA Jetson Inference API offers the easiest way to run image recognition, object detection, semantic segmentation, and pose estimation models on Jetson Nano. Jetson Inference has TensorRT built-in, so it’s very fast.

    + +

    To test run Jetson Inference, first clone the repo and download the models:

    + +
    git clone --recursive https://github.com/dusty-nv/jetson-inference
    +cd jetson-inference
    +
    + +

    Then use the pre-built Docker Container that already has PyTorch installed to test run the models:

    + +
    docker/run.sh --volume ~/jetson_inference:/jetson_inference
    +
    + +

    To run image recognition, object detection, semantic segmentation, and pose estimation models on test images, use the following:

    + +
    cd build/aarch64/bin
    +./imagenet.py images/jellyfish.jpg /jetson_inference/jellyfish.jpg
    +./segnet.py images/dog.jpg /jetson_inference/dog.jpeg
    +./detectnet.py images/peds_0.jpg /jetson_inference/peds_0.jpg
    +./posenet.py images/humans_0.jpg /jetson_inference/pose_humans_0.jpg
    +
    + +

    Four result images from running the four different models will be generated. Exit the docker image to see them:

    + +
    $ ls -lt ~/jetson_inference/
    +-rw-r--r-- 1 root root  68834 Oct 15 21:30 pose_humans_0.jpg
    +-rw-r--r-- 1 root root 914058 Oct 15 21:30 peds_0.jpg
    +-rw-r--r-- 1 root root 666239 Oct 15 21:30 dog.jpeg
    +-rw-r--r-- 1 root root 179760 Oct 15 21:29 jellyfish.jpg
    +
    + +
+ Using Jetson Inference example 1 + Using Jetson Inference example 2 +
    + +
+ Using Jetson Inference example 3 + Using Jetson Inference example 4 +
    + +

    You can also use the docker image to run PyTorch models because the image has PyTorch, torchvision and torchaudio installed:

    + +
    # pip list|grep torch
    +torch (1.9.0)
    +torchaudio (0.9.0a0+33b2469)
    +torchvision (0.10.0a0+300a8a4)
    +
    + +

    Although Jetson Inference includes models already converted to the TensorRT engine file format, you can fine-tune the models by following the steps in Transfer Learning with PyTorch (for Jetson Inference) here.

    + +

    Using TensorRT

    +

    TensorRT is an SDK for high-performance inference from NVIDIA. Jetson Nano supports TensorRT via the Jetpack SDK, included in the SD Card image used to set up Jetson Nano. To confirm that TensorRT is already installed in Nano, run dpkg -l|grep -i tensorrt:

    + +
    + +
    + +

    Theoretically, TensorRT can be used to “take a trained PyTorch model and optimize it to run more efficiently during inference on an NVIDIA GPU.” Follow the instructions and code in the notebook to see how to use PyTorch with TensorRT through ONNX on a torchvision Resnet50 model:

    + +
      +
    1. +

      How to convert the model from PyTorch to ONNX;

      +
    2. +
    3. +

      How to convert the ONNX model to a TensorRT engine file;

      +
    4. +
    5. +

      How to run the engine file with the TensorRT runtime for performance improvement: inference time improved from the original 31.5ms/19.4ms (FP32/FP16 precision) to 6.28ms (TensorRT).

      +
    6. +
    + +

    You can replace the Resnet50 model in the notebook code with another PyTorch model, go through the conversion process above, and run the finally converted model TensorRT engine file with the TensorRT runtime to see the optimized performance. But be aware that due to the Nano GPU memory size, models larger than 100MB are likely to fail to run, with the following error information:

    + +

    Error Code 1: Cuda Runtime (all CUDA-capable devices are busy or unavailable)

    + +

    You may also see an error when converting a PyTorch model to ONNX model, which may be fixed by replacing:

    + +

    torch.onnx.export(resnet50, dummy_input, "resnet50_pytorch.onnx", verbose=False)

    + +

    with:

    + +

    torch.onnx.export(model, dummy_input, "deeplabv3_pytorch.onnx", opset_version=11, verbose=False)

    + +
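
Putting these pieces together, a self-contained sketch of the PyTorch-to-ONNX step for the torchvision ResNet50 mentioned above would look like the following (file names are illustrative):

+ +
import torch
+import torchvision
+
+resnet50 = torchvision.models.resnet50(pretrained=True).eval()
+dummy_input = torch.rand(1, 3, 224, 224)
+
+torch.onnx.export(
+    resnet50,
+    dummy_input,
+    "resnet50_pytorch.onnx",
+    opset_version=11,  # an explicit opset avoids the export error mentioned above
+    verbose=False,
+)
+# The resulting .onnx file can then be converted to a TensorRT engine file
+# (e.g. following the notebook linked above) and run with the TensorRT runtime.
+
+ +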

    Using PyTorch

    +

    First, to download and install PyTorch 1.9 on Nano, run the following commands (see here for more information):

    + +
wget https://nvidia.box.com/shared/static/p57jwntv436lfrd78inwl7iml6p13fzh.whl -O torch-1.9.0-cp36-cp36m-linux_aarch64.whl
    +sudo apt-get install python3-pip libopenblas-base libopenmpi-dev 
    +pip3 install Cython
    +pip3 install numpy torch-1.9.0-cp36-cp36m-linux_aarch64.whl
    +
    + +

    To download and install torchvision 0.10 on Nano, run the commands below:

    + +
# download the torchvision 0.10 wheel from the link below, then install it:
+# https://drive.google.com/uc?id=1tU6YlPjrP605j4z8PMnqwCSoP6sSC91Z
    +pip3 install torchvision-0.10.0a0+300a8a4-cp36-cp36m-linux_aarch64.whl
    +
    + +

    After the steps above, run this to confirm:

    +
    $ pip3 list|grep torch
    +torch (1.9.0)
    +torchvision (0.10.0)
    +
    + +

    You can also use the docker image described in the section Using Jetson Inference (which also has PyTorch and torchvision installed), to skip the manual steps above.

    + +

    The official YOLOv5 repo is used to run the PyTorch YOLOv5 model on Jetson Nano. After logging in to Jetson Nano, follow the steps below:

    + +
      +
    • Get the repo and install what’s required:
    • +
    + +
    git clone https://github.com/ultralytics/yolov5
    +cd yolov5
    +pip install -r requirements.txt
    +
    + +
      +
    • Run python3 detect.py, which by default uses the PyTorch yolov5s.pt model. You should see something like:
    • +
    + +
    detect: weights=yolov5s.pt, source=data/images, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False
    +YOLOv5 🚀 v5.0-499-g48b00db torch 1.9.0 CUDA:0 (NVIDIA Tegra X1, 3956.1015625MB)
    +
    +Fusing layers... 
    +Model Summary: 224 layers, 7266973 parameters, 0 gradients
    +image 1/5 /home/jeff/repos/yolov5-new/yolov5/data/images/bus.jpg: 640x480 4 persons, 1 bus, 1 fire hydrant, Done. (0.142s)
    +...
    +
    + +

    The inference time on Jetson Nano GPU is about 140ms, more than twice as fast as the inference time on iOS or Android (about 330ms).

    + +

    If you get an error “ImportError: The _imagingft C module is not installed.” then you need to reinstall pillow:

    +
    sudo apt-get install libpng-dev
    +sudo apt-get install libfreetype6-dev
    +pip3 uninstall pillow
    +pip3 install --no-cache-dir pillow
    +
    + +

After successfully completing the python3 detect.py run, the object detection results of the test images located in data/images will be in the runs/detect/exp directory. To test the detection with a live webcam instead of local images, use the --source 0 parameter when running python3 detect.py:

    + +
    ~/repos/yolov5$ ls -lt runs/detect/exp10
    +total 1456
    +-rw-rw-r-- 1 jeff jeff 254895 Oct 15 16:12 zidane.jpg
    +-rw-rw-r-- 1 jeff jeff 202674 Oct 15 16:12 test3.png
    +-rw-rw-r-- 1 jeff jeff 217117 Oct 15 16:12 test2.jpg
    +-rw-rw-r-- 1 jeff jeff 305826 Oct 15 16:12 test1.png
    +-rw-rw-r-- 1 jeff jeff 495760 Oct 15 16:12 bus.jpg
    +
    + +

    Using the same test files used in the PyTorch iOS YOLOv5 demo app or Android YOLOv5 demo app, you can compare the results generated with running the YOLOv5 PyTorch model on mobile devices and Jetson Nano:

    + +
    + PyTorch YOLOv5 on Jetson Nano, example with a dog + PyTorch YOLOv5 on Jetson Nano, example with a horse and a rider +
    +

    Figure 1. PyTorch YOLOv5 on Jetson Nano.

    + +
    + PyTorch YOLOv5 on iOS, example with a dog + PyTorch YOLOv5 on iOS, example with a horse and a rider +
    +

    Figure 2. PyTorch YOLOv5 on iOS.

    + +
    + PyTorch YOLOv5 on Android, example with a dog + PyTorch YOLOv5 on Android, example with a horse and a rider +
    +

    Figure 3. PyTorch YOLOv5 on Android.

    + +

    Summary

    +

    Based on our experience of running different PyTorch models for potential demo apps on Jetson Nano, we see that even Jetson Nano, a lower-end of the Jetson family of products, provides a powerful GPU and embedded system that can directly run some of the latest PyTorch models, pre-trained or transfer learned, efficiently.

    + +

    Building PyTorch demo apps on Jetson Nano can be similar to building PyTorch apps on Linux, but you can also choose to use TensorRT after converting the PyTorch models to the TensorRT engine file format.

    + +

    But if you just need to run some common computer vision models on Jetson Nano using NVIDIA’s Jetson Inference which supports image recognition, object detection, semantic segmentation, and pose estimation models, then this is the easiest way.

    + +

    References

    +

    Torch-TensorRT, a compiler for PyTorch via TensorRT: +https://github.com/NVIDIA/Torch-TensorRT/

    + +

    Jetson Inference docker image details: +https://github.com/dusty-nv/jetson-inference/blob/master/docs/aux-docker.md

    + +

    A guide to using TensorRT on the NVIDIA Jetson Nano: +https://docs.donkeycar.com/guide/robot_sbc/tensorrt_jetson_nano/ +including:

    + +
      +
    1. +

      Use Jetson as a portable GPU device to run an NN chess engine model: +https://medium.com/@ezchess/jetson-lc0-running-leela-chess-zero-on-nvidia-jetson-a-portable-gpu-device-a213afc9c018

      +
    2. +
    3. +

      A MaskEraser app using PyTorch and torchvision, installed directly with pip: +https://github.com/INTEC-ATI/MaskEraser#install-pytorch

      +
    4. +
    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed/index.html b/blog/scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed/index.html new file mode 100644 index 000000000000..d349f2b42cb2 --- /dev/null +++ b/blog/scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed/index.html @@ -0,0 +1,885 @@ + + + + + + + + + + + + + Scaling Multimodal Foundation Models in TorchMultimodal with Pytorch Distributed | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Ankita De, Edward Wang (EcoF), Rohan Varma, Anjali Sridhar, Kartikay Khandelwal + +

    +

    Introduction

    + +

    In recent years, scaling model sizes has become a promising area of research. In the field of NLP, language models have gone from hundreds of millions of parameters (BERT) to hundreds of billions of parameters (GPT-3) demonstrating significant improvements on downstream tasks. The scaling laws for large scale language models have also been studied extensively in the industry. A similar trend can be observed in the vision field, with the community moving to transformer based models (like Vision Transformer, Masked Auto Encoders) as well. It is clear that individual modalities - text, image, video - have benefited massively from recent advancements in scale, and frameworks have quickly adapted to accommodate larger models.

    + +

    At the same time, multimodality is becoming increasingly important in research with tasks like image-text retrieval, visual question-answering, visual dialog and text to image generation gaining traction in real world applications. Training large scale multimodal models is the natural next step and we already see several efforts in this area like CLIP from OpenAI, Parti from Google and CM3 from Meta.

    + +

    In this blog, we present a case study demonstrating the scaling of FLAVA to 10B params using techniques from PyTorch Distributed. FLAVA is a vision and language foundation model, available in TorchMultimodal, which has shown competitive performance on both unimodal and multimodal benchmarks. We also give the relevant code pointers in this blog. The instructions for running an example script to scale FLAVA can be found here.

    + +

    Scaling FLAVA Overview

    + +

    FLAVA is a foundation multimodal model which consists of transformer based image and text encoders followed by a transformer-based multimodal fusion module. It is pretrained on both unimodal and multimodal data with a diverse set of losses. This includes masked language, image and multimodal modeling losses that require the model to reconstruct the original input from its context (self-supervised learning). It also uses image text matching loss over positive and negative examples of aligned image-text pairs as well as CLIP style contrastive loss. In addition to multimodal tasks (like image-text retrieval), FLAVA demonstrated competitive performance on unimodal benchmarks as well (GLUE tasks for NLP and image classification for vision).

    + +

    + +

    + +

    The original FLAVA model has ~350M parameters and uses ViT-B16 configurations (from the Vision Transformer paper) for image and text encoders. The multimodal fusion transformer follows the unimodal encoders but with half the number of layers. We explore increasing the size of each encoder to larger ViT variants.

    + +

    Another aspect of scaling is adding the ability to increase the batch size. FLAVA makes use of contrastive loss over in-batch negatives, which typically benefits from large batch size (as studied here). The largest training efficiency or throughput is also generally achieved when operating near maximum possible batch sizes as determined by the amount of GPU memory available (also see the experiments section).

    + +
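
To make the in-batch-negatives point concrete, here is an illustrative CLIP-style contrastive loss sketch (not TorchMultimodal’s implementation): every image embedding is scored against every text embedding in the batch, so a larger batch directly supplies more negatives per example.

+ +
import torch
+import torch.nn.functional as F
+
+def contrastive_loss(image_emb, text_emb, temperature=0.07):
+    image_emb = F.normalize(image_emb, dim=-1)
+    text_emb = F.normalize(text_emb, dim=-1)
+    logits = image_emb @ text_emb.t() / temperature          # (batch, batch) similarities
+    targets = torch.arange(image_emb.size(0), device=image_emb.device)
+    # symmetric cross-entropy: match image->text and text->image
+    return (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets)) / 2
+
+loss = contrastive_loss(torch.randn(32, 512), torch.randn(32, 512))
+
+ +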

    The following table displays the different model configurations we experimented with. We also determine the maximum batch size that was able to fit in memory for each configuration in the experiments section.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Approx Model params + Hidden size + MLP size + Heads + Unimodal layers + Multimodal layers + Model size (fp32)
350M (original) + 768 + 3072 + 12 + 12 + 6 + 1.33GB
900M + 1024 + 4096 + 16 + 24 + 12 + 3.48GB
1.8B + 1280 + 5120 + 16 + 32 + 16 + 6.66GB
2.7B + 1408 + 6144 + 16 + 40 + 20 + 10.3GB
4.8B + 1664 + 8192 + 16 + 48 + 24 + 18.1GB
10B + 2048 + 10240 + 16 + 64 + 40 + 38GB
    + +

    Optimization overview

    + +

    PyTorch offers several native techniques to efficiently scale models. In the following sections, we go over some of these techniques and show how they can be applied to scale up a FLAVA model to 10 billion parameters.

    + +

    Distributed Data Parallel

    + +

    A common starting point for distributed training is data parallelism. Data parallelism replicates the model across each worker (GPU), and partitions the dataset across the workers. Different workers process different data partitions in parallel and synchronize their gradients (via all reduce) before model weights are updated. The figure below showcases the flow (forward, backward, and weight update steps) for processing a single example for data parallelism:

    + +

    + +

    + +

    + Source: https://engineering.fb.com/2021/07/15/open-source/fsdp/ +

    + +

    PyTorch provides a native API, DistributedDataParallel (DDP) to enable data parallelism which can be used as a module wrapper as showcased below. Please see PyTorch Distributed documentation for more details.

    + +
    from torchmultimodal.models.flava.model import flava_model_for_pretraining
    +import torch
    +import torch.distributed as dist
    +
    +model = flava_model_for_pretraining().cuda()
    +# Initialize PyTorch Distributed process groups
    +# Please see https://pytorch.org/tutorials/intermediate/dist_tuto.html for details
+dist.init_process_group(backend="nccl")
    +# Wrap model in DDP
    +model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[torch.cuda.current_device()])
    +
    + +

    Fully Sharded Data Parallel

    + +

    GPU memory usage of a training application can roughly be broken down into model inputs, intermediate activations (needed for gradient computation), model parameters, gradients, and optimizer states. Scaling a model will typically increase each of these elements. Scaling a model with DDP can eventually result in out-of-memory issues when a single GPU’s memory becomes insufficient since it replicates the parameters, gradients, and optimizer states on all workers.

    + +

    To reduce this replication and save GPU memory, we can shard the model parameters, gradients, and optimizer states across all workers with each worker only managing a single shard. This technique was popularized by the ZeRO-3 approach developed by Microsoft. A PyTorch-native implementation of this approach is available as FullyShardedDataParallel (FSDP) API, released as a beta feature in PyTorch 1.12. During a module’s forward and backward passes, FSDP unshards the model parameters as needed for computation (using all-gather) and reshards them after computation. It synchronizes gradients using the reduce-scatter collective to ensure sharded gradients are globally averaged. The forward and backward pass flow of a model wrapped in FSDP are detailed below:

    (Figure: forward and backward pass flow of a model wrapped in FSDP. Source: https://engineering.fb.com/2021/07/15/open-source/fsdp/)

    + +

    To use FSDP, the submodules of a model need to be wrapped with the API to control when specific submodules are sharded or unsharded. FSDP provides an auto-wrapping API (see the auto_wrap_policy argument) that can be used out of the box as well as several wrapping policies and the ability to write your own policy.

    + +

    The following example demonstrates wrapping the FLAVA model with FSDP. We specify the auto-wrapping policy as transformer_auto_wrap_policy, which wraps the individual transformer layers (TransformerEncoderLayer), the image transformer (ImageTransformer), the text encoder (BertTextEncoder), and the multimodal encoder (FLAVATransformerWithoutEmbeddings) as individual FSDP units. This recursive wrapping approach enables efficient memory management: for example, after an individual transformer layer's forward or backward pass finishes, its parameters are freed, reducing peak memory usage.

    + +

    FSDP also provides a number of configurable options to tune the performance of applications. For example, in our use case, we illustrate the use of the new limit_all_gathers flag, which prevents all-gathering model parameters too early thereby alleviating memory pressure on the application. We encourage users to experiment with this flag which can potentially improve the performance of applications with high active memory usage.

    + +
    from functools import partial

    import torch
    import torch.distributed as dist
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
    from torchmultimodal.models.flava.model import flava_model_for_pretraining
    from torchmultimodal.models.flava.text_encoder import BertTextEncoder
    from torchmultimodal.models.flava.image_encoder import ImageTransformer
    from torchmultimodal.models.flava.transformer import FLAVATransformerWithoutEmbeddings
    from torchmultimodal.modules.layers.transformer import TransformerEncoderLayer

    model = flava_model_for_pretraining().cuda()
    dist.init_process_group(backend="nccl")

    model = FSDP(
        model,
        device_id=torch.cuda.current_device(),
        auto_wrap_policy=partial(
            transformer_auto_wrap_policy,
            transformer_layer_cls={
                TransformerEncoderLayer,
                ImageTransformer,
                BertTextEncoder,
                FLAVATransformerWithoutEmbeddings,
            },
        ),
        limit_all_gathers=True,
    )
    + +

    Activation Checkpointing

    + +

    As discussed above, intermediate activations, model parameters, gradients, and optimizer states contribute to the overall GPU memory usage. FSDP can reduce memory consumption due to the latter three but does not reduce memory consumed by activations. Memory used by activations increases with increase in batch size or number of hidden layers. Activation checkpointing is a technique to decrease this memory usage by recomputing the activations during the backward pass instead of holding them in memory for a specific checkpointed module. For example, we observed ~4x reduction in the peak active memory after forward pass by applying activation checkpointing to the 2.7B parameter model.

    + +

    PyTorch offers a wrapper based activation checkpointing API. In particular, checkpoint_wrapper allows users to wrap an individual module with checkpointing, and apply_activation_checkpointing allows users to specify a policy with which to wrap modules within an overall module with checkpointing. Both these APIs can be applied to most models as they do not require any modifications to the model definition code. However, if more granular control over checkpointed segments, such as checkpointing specific functions within a module, is required, the functional torch.utils.checkpoint API can be leveraged, although this requires modification to the model code. The application of the activation checkpointing wrapper to individual FLAVA transformer layers (denoted by TransformerEncoderLayer) is shown below. For a thorough description of activation checkpointing, please see the description in the PyTorch documentation.

    + +
    from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
        apply_activation_checkpointing,
        checkpoint_wrapper,
    )
    from torchmultimodal.models.flava.model import flava_model_for_pretraining
    from torchmultimodal.modules.layers.transformer import TransformerEncoderLayer

    model = flava_model_for_pretraining()

    # Checkpoint every FLAVA transformer layer
    checkpoint_tformer_layers_policy = lambda submodule: isinstance(submodule, TransformerEncoderLayer)

    apply_activation_checkpointing(
        model,
        checkpoint_wrapper_fn=checkpoint_wrapper,
        check_fn=checkpoint_tformer_layers_policy,
    )

    Wrapping the FLAVA transformer layers with activation checkpointing and wrapping the overall model with FSDP, as demonstrated above, allows us to scale FLAVA to 10 billion parameters.
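    Putting these together, a minimal end-to-end sketch (activation checkpointing applied to the transformer layers first, then FSDP wrapping; for brevity only TransformerEncoderLayer is auto-wrapped here) might look like the following:

    from functools import partial

    import torch
    import torch.distributed as dist
    from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
        apply_activation_checkpointing,
        checkpoint_wrapper,
    )
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
    from torchmultimodal.models.flava.model import flava_model_for_pretraining
    from torchmultimodal.modules.layers.transformer import TransformerEncoderLayer

    dist.init_process_group(backend="nccl")
    model = flava_model_for_pretraining()

    # 1) Recompute each transformer layer's activations during the backward pass
    apply_activation_checkpointing(
        model,
        checkpoint_wrapper_fn=checkpoint_wrapper,
        check_fn=lambda submodule: isinstance(submodule, TransformerEncoderLayer),
    )

    # 2) Shard parameters, gradients, and optimizer state with FSDP
    model = FSDP(
        model,
        device_id=torch.cuda.current_device(),
        auto_wrap_policy=partial(
            transformer_auto_wrap_policy,
            transformer_layer_cls={TransformerEncoderLayer},
        ),
        limit_all_gathers=True,
    )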

    + +

    Experiments

    + +

    We conduct an empirical study about the impact of the different optimizations from the previous section on system performance. For all our experiments, we use a single node with 8 A100 40GB GPUs and run the pretraining for 1000 iterations. All runs also used PyTorch’s automatic mixed precision with the bfloat16 data type. TensorFloat32 format is also enabled to improve matmul performance on the A100. We define throughput as the average number of items (text or image) processed per second (we ignore the first 100 iterations while measuring throughput to account for warmup). We leave training to convergence and its impact on downstream task metrics as an area for future study.
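    For reference, the mixed-precision settings described above can be reproduced with stock PyTorch APIs; the tiny model and loss below are stand-ins for the wrapped FLAVA model and its pretraining loss:

    import torch
    import torch.nn as nn

    # Enable TensorFloat32 matmuls/convolutions on A100-class GPUs
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    model = nn.Linear(1024, 1024).cuda()           # stand-in for the FSDP-wrapped FLAVA model
    batch = torch.randn(8, 1024, device="cuda")    # stand-in for a pretraining batch

    # Automatic mixed precision with the bfloat16 data type
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        loss = model(batch).float().pow(2).mean()  # stand-in loss
    loss.backward()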

    + +

    Figure 1 plots the throughput for each model configuration and optimization, both with a local batch size of 8 and then with the maximum batch size possible on 1 node. Absence of a data point for a model variant for an optimization indicates that the model could not be trained on a single node.

    + +

    Figure 2 plots the maximum possible batch size per worker for each optimization. We observe a few things:

    + +
    1. Scaling model size: DDP is only able to fit the 350M and 900M models on a node. With FSDP, thanks to the memory savings, we are able to train ~3x bigger models compared to DDP (i.e. the 1.8B and 2.7B variants). Combining activation checkpointing (AC) with FSDP enables training even bigger models, on the order of ~10x compared to DDP (i.e. the 4.8B and 10B variants).
    2. Throughput:
      • For smaller model sizes, at a constant batch size of 8, the throughput for DDP is slightly higher than or equal to FSDP, which can be explained by the additional communication required by FSDP. Throughput is lowest for FSDP and AC combined. This is because AC re-runs checkpointed forward passes during the backward pass, trading additional computation for memory savings. However, in the case of the 2.7B model, FSDP + AC actually has higher throughput than FSDP alone. This is because the 2.7B model with FSDP operates close to the memory limit even at batch size 8, triggering CUDA malloc retries, which tend to slow down training. AC helps reduce the memory pressure and leads to no retries.
      • For DDP and FSDP + AC, the throughput increases with batch size for each model. For FSDP alone, this holds for the smaller variants. However, with the 1.8B and 2.7B parameter models, we observe throughput degradation when increasing the batch size. A potential reason, as noted above, is that at the memory limit, PyTorch's CUDA memory management may have to retry cudaMalloc calls and/or run expensive defragmentation steps to find free memory blocks, which can result in training slowdown.
      • For the larger models that can only be trained with FSDP (1.8B, 2.7B, 4.8B), the highest throughput is achieved with FSDP + AC scaled to the maximum batch size. For 10B, we observe nearly equal throughput for the smaller and the maximum batch size. This might be counterintuitive, since AC results in increased computation and maxing out the batch size potentially leads to expensive defragmentation operations due to operating at the CUDA memory limit. However, for these large models, the increase in batch size is large enough to mask this overhead.

    Figure 1: Training throughput for different configurations

    3. Batch size: FSDP alone enables slightly higher batch sizes than DDP. Using FSDP + AC enables a ~3x larger batch size compared to DDP for the 350M parameter model and ~5.5x for the 900M parameter model. Even for 10B, the maximum batch size is ~20, which is fairly decent. FSDP + AC essentially enables a larger global batch size using fewer GPUs, which is especially useful for contrastive learning tasks.

    Figure 2: Maximum local batch size possible for different configurations

    + +

    Conclusion

    + +

    As the world moves towards multimodal foundation models, scaling model parameters and efficient training is becoming an area of focus. The PyTorch ecosystem aims to accelerate innovation in this field by providing different tools to the research community, both for training and scaling multimodal models. With FLAVA, we laid out an example of scaling a model for multimodal understanding. In the future, we plan to add support for other kinds of models like the ones for multimodal generation and demonstrate their scaling factors. We also hope to automate many of these scaling and memory saving techniques (such as sharding and activation checkpointing) to reduce the amount of user experimentation needed to achieve the desired scale and maximum training throughput.

    Scaling PyTorch FSDP for Training Foundation Models on IBM Cloud
    by Linsong Chu, Less Wright, Hamid Shojanazeri, Sophia Wen, Raghu Ganti, Geeta Chauhan

    Large model training using a cloud native approach is of growing interest for many enterprises given the emergence and success of foundation models. Some AI practitioners may assume that the only way they can achieve high GPU utilization for distributed training jobs is to run them on HPC systems, such as those inter-connected with Infiniband and may not consider Ethernet connected systems. We demonstrate how the latest distributed training technique, Fully Sharded Data Parallel (FSDP) from PyTorch, successfully scales to models of size 10B+ parameters using commodity Ethernet networking in IBM Cloud.

    + +

    PyTorch FSDP Scaling

    + +

    As models get larger, the standard techniques for data parallel training work only if the GPU can hold a full replica of the model, along with its training state (optimizer, activations, etc.). However, GPU memory increases have not kept up with the model size increases and new techniques for training such models have emerged (e.g., Fully Sharded Data Parallel, DeepSpeed), which allow us to efficiently distribute the model and data over multiple GPUs during training. In this blog post, we demonstrate a path to achieve remarkable scaling of model training to 64 nodes (512 GPUs) using PyTorch native FSDP APIs as we increase model sizes to 11B.

    + +

    What is Fully Sharded Data Parallel?

    + +

    FSDP extends the distributed data parallel (DDP) training approach by sharding model parameters, gradients, and optimizer states into K FSDP units, determined by a wrapping policy. FSDP achieves large model training efficiency in terms of resources and performance by significantly reducing the memory footprint on each GPU and overlapping computation and communication.

    + +

    Resource efficiency is achieved with memory footprint reduction by having all GPUs own a portion of each FSDP unit. To process a given FSDP unit, all GPUs share their locally owned portion via all_gather communication calls.

    + +

    Performance efficiency is accomplished by overlapping all_gather communication calls for upcoming FSDP units with computation of the current FSDP unit. Once the current FSDP unit has been processed, the non-locally owned parameters are dropped, freeing memory for the upcoming FSDP units. This process achieves training efficiency by the overlap of computation and communication, while also reducing the peak memory needed by each GPU.

    + +

    In what follows, we demonstrate how FSDP allows us to keep hundreds of GPUs highly utilized throughout a distributed training job, while running over standard Ethernet networking (system description towards the end of the blog). We chose the T5 architecture for our experiments and leveraged the code from the FSDP workshop. In each of our experiments, we start with a single node experiment to create a baseline and report the metric seconds/iteration normalized by the batch size as well as compute the teraflops based on the Megatron-LM paper (see Appendix for details of teraflop computation for T5). Our experiments aim to maximize the batch size (while avoiding cudaMalloc retries) to take full advantage of overlap in computation and communications, as discussed below. Scaling is defined as the ratio of the seconds/iteration normalized by batch size for N nodes versus a single node, representing how well we can utilize the additional GPUs as more nodes are added.
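    As an illustration, wrapping a T5 model with the transformer wrapping policy and BF16 mixed precision might look roughly like the sketch below. It assumes the Hugging Face T5 implementation and its T5Block class rather than the exact workshop code, so treat the class and checkpoint names as placeholders.

    from functools import partial

    import torch
    import torch.distributed as dist
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision
    from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
    from transformers import T5ForConditionalGeneration
    from transformers.models.t5.modeling_t5 import T5Block

    dist.init_process_group(backend="nccl")
    model = T5ForConditionalGeneration.from_pretrained("t5-3b")

    model = FSDP(
        model,
        device_id=torch.cuda.current_device(),
        # Shard at the granularity of individual T5 blocks (transformer wrapping policy)
        auto_wrap_policy=partial(transformer_auto_wrap_policy, transformer_layer_cls={T5Block}),
        # BF16 mixed precision
        mixed_precision=MixedPrecision(
            param_dtype=torch.bfloat16,
            reduce_dtype=torch.bfloat16,
            buffer_dtype=torch.bfloat16,
        ),
    )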

    + +

    Experimental Results

    + +

    Our first set of experiments using the T5-3B configuration (mixed precision with BF16, activation checkpointing, and transformer wrapping policy) demonstrated scaling efficiency of 95% as we increased the number of GPUs from 8 to 512 (1 to 64 nodes, respectively). We achieved these results without any modifications to the existing FSDP APIs. We observed that, for this scale, over Ethernet based network, there is sufficient bandwidth to enable continuous overlap of communication and computation.

    + +

    However, when we increased the T5 model size to 11B, the scaling efficiency declined substantially to 20%. The PyTorch profiler shows that overlap of communication and computation was very limited. Further investigation into the network bandwidth usage revealed that the poor overlap is being caused by latency in the communication of individual packets and not the bandwidth required (in fact, our peak bandwidth utilization is 1/4th of that available). This led us to hypothesize that if we can increase the compute time by increasing the batch size, we can better overlap communication and computation. However, given we are already at maximum GPU memory allocation, we must identify opportunities to rebalance the memory allocation to allow for increase in batch size. We identified that the model state was being allocated a lot more memory than was needed. The primary function of these reservations is to have pre-reserved memory ready to aggressively send/receive tensors during the communication periods and too few buffers can result in increased wait times, whereas too many buffers result in smaller batch sizes.

    + +

    To achieve better efficiency, the PyTorch distributed team introduced a new control knob, the rate_limiter which controls how much memory is allocated for send/receive of tensors, alleviating the memory pressure and providing room for higher batch sizes. In our case, the rate_limiter could increase the batch size from 20 to 50, thus increasing compute time by 2.5x and allowing for much greater overlap of communication and computation. With this fix, we increased the scaling efficiency to >75% (at 32 nodes)!

    + +

    Continued investigation into the factors limiting scaling efficiency uncovered that the rate limiter was creating a recurring pipeline bubble of GPU idle time. This was due to the rate limiter using a block and flush approach for the allocation and release of each set of memory buffers. By waiting for the entire block to complete before initiating a new all_gather, the GPU was idling at the start of each block, while waiting for the new set of all_gather parameters to arrive. This bubble was alleviated by moving to a sliding window approach. Upon the completion of a single all_gather step and its computation (rather than a block of them), the memory is freed and the next all_gather is immediately issued in a much more uniform manner. This improvement eliminated the pipeline bubble and boosted the scaling efficiencies to >90% (at 32 nodes).

    + +

    Figure 1: Scaling of T5-XL (3B) and T5-XXL (11B) from 1 node to 64 nodes

    + +

    Figure 2: TFLOPs/sec usage for T5-XL (3B) and T5-XXL (11B) as we increase the number of nodes

    + +

    IBM Cloud AI System and Middleware

    + +

    The AI infrastructure used for this work is a large-scale AI system on IBM Cloud consisting of nearly 200 nodes, each node with 8 NVIDIA A100 80GB cards, 96 vCPUs, and 1.2TB CPU RAM. The GPU cards within a node are connected via NVLink with a card-to-card bandwidth of 600GBps. Nodes are connected by 2 x 100Gbps Ethernet links with SRIOV based TCP/IP stack, providing a usable bandwidth of 120Gbps.

    + +

    The IBM Cloud AI System has been production-ready since May of 2022 and is configured with the OpenShift container platform to run AI workloads. We also built a software stack for production AI workloads that provide end-to-end tools for training workloads. The middleware leverages Ray for pre and post processing workloads and PyTorch for training of models. We also integrate a Kubernetes native scheduler, MCAD, that manages multiple jobs with job queuing, gang scheduling, prioritization, and quota management. A multi-NIC CNI discovers all available network interfaces and handles them as a single NIC pool enabling optimized use of the network interfaces in Kubernetes. Finally, CodeFlare CLI supports a single pane for observability of the full stack using a desktop CLI (e.g., GPU utilization, application metrics like loss, gradient norm).

    + +

    Figure 3: Foundation Model Middleware Stack

    + +

    Conclusion and Future Work

    + +

    In conclusion, we demonstrated how we can achieve remarkable scaling of FSDP APIs over non-InfiniBand networks. We identified the bottleneck that had limited scaling to less than 20% efficiency for 11B parameter model training. After identifying the issue, we were able to correct this with a new rate limiter control to ensure a more optimal balance of reserved memory and communication overlap relative to compute time. With this improvement, we were able to achieve 90% scaling efficiency (a 4.5x improvement), at 256 GPUs and 80% at 512 GPUs for training of the 11B parameter model. In addition, the 3B parameter model scales extremely well with 95% efficiency even as we increase the number of GPUs to 512.

    + +

    This is a first in the industry to achieve such scaling efficiencies for up to 11B parameter models using Kubernetes with vanilla Ethernet and PyTorch native FSDP API’s. This improvement enables users to train huge models on a Hybrid Cloud platform in a cost efficient and sustainable manner.

    + +

    We plan on continuing to investigate scaling with decoder only models and increasing the size of these models to 100B+ parameters. From a system design perspective, we are exploring capabilities such as RoCE and GDR that can improve latencies of communications over Ethernet networks.

    + +

    Acknowledgements

    + +

    This blog was possible because of contributions from both PyTorch Distributed and IBM Research teams.

    + +

    From the PyTorch Distributed team, we would like to thank Less Wright, Hamid Shojanazeri, Geeta Chauhan, Shen Li, Rohan Varma, Yanli Zhao, Andrew Gu, Anjali Sridhar, Chien-Chin Huang, and Bernard Nguyen.

    + +

    From the IBM Research team, we would like to thank Linsong Chu, Sophia Wen, Lixiang (Eric) Luo, Marquita Ellis, Davis Wertheimer, Supriyo Chakraborty, Raghu Ganti, Mudhakar Srivatsa, Seetharami Seelam, Carlos Costa, Abhishek Malvankar, Diana Arroyo, Alaa Youssef, Nick Mitchell.

    + +

    Appendix

    + +

    Teraflop computation

    + +

    The T5-XXL (11B) architecture has two types of T5 blocks: one is an encoder and the other is a decoder. We follow the approach of Megatron-LM, where each matrix multiplication of an m×k matrix by a k×n matrix requires 2mkn FLOPs. The encoder block consists of self-attention and feed-forward layers, whereas the decoder block consists of self-attention, cross-attention, and feed-forward layers.

    + +

    The attention block (both self and cross) consists of a QKV projection, which requires 6Bsh² operations, an attention matrix computation requiring 2Bs²h operations, an attention over values which needs 2Bs²h computations, and a post-attention linear projection which requires 2Bsh² operations. Finally, the feed-forward layer requires 15Bsh² operations.

    + +

    The total for an encoder block is 23Bsh² + 4Bs²h, whereas for a decoder block it comes to 31Bsh² + 8Bs²h. With a total of 24 encoder and 24 decoder blocks, 2 forward passes (as we discard the activations), and one backward pass (equivalent to two forward passes), the final FLOPs computation comes to 96×(54Bsh² + 12Bs²h) + 6BshV. Here, B is the batch size per GPU, s is the sequence length, h is the hidden state size, and V is the vocabulary size. We repeat a similar computation for the T5-XL (3B) architecture, which is slightly different.
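    For concreteness, the final formula is easy to evaluate directly. The values below are placeholders for illustration only, not the exact configuration used in our runs:

    def t5_xxl_flops_per_step(B, s, h, V):
        # 96 x (54*B*s*h^2 + 12*B*s^2*h) + 6*B*s*h*V, from the derivation above
        return 96 * (54 * B * s * h**2 + 12 * B * s**2 * h) + 6 * B * s * h * V


    # Placeholder values for illustration only
    B, s, h, V = 50, 512, 1024, 32128
    seconds_per_iteration = 1.0  # a measured per-GPU step time would go here

    tflops_per_gpu = t5_xxl_flops_per_step(B, s, h, V) / seconds_per_iteration / 1e12
    print(f"{tflops_per_gpu:.1f} TFLOPs/s per GPU")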

    Scaling PyTorch models on Cloud TPUs with FSDP

    by Ronghang Hu, Vaibhav Singh, Jack Cao, Milad Mohammadi, Yeounoh Chung, Shauheen Zahirazami, Ross Girshick

    Introduction

    + +

    The research community has witnessed a lot of successes with large models across NLP, computer vision, and other domains in recent years. Many of these successes were enabled by Cloud TPUs – which are powerful hardware for distributed training. To support TPUs in PyTorch, the PyTorch/XLA library provides a backend for XLA devices (most notably TPUs) and lays the groundwork for scaling large PyTorch models on TPUs.

    + +

    However, most existing model scaling tools in the PyTorch ecosystem assume GPU (or CPU) devices, often depend on specific features in CUDA, and do not work directly on TPUs. The lack of scaling tools makes it challenging to build large models that cannot fit into the memory of a single TPU chip.

    + +

    To support model scaling on TPUs, we implemented the widely-adopted Fully Sharded Data Parallel (FSDP) algorithm for XLA devices as part of the PyTorch/XLA 1.12 release. We provide an FSDP interface with a similar high-level design to the CUDA-based PyTorch FSDP class while also handling several restrictions in XLA (see Design Notes below for more details). This FSDP interface allowed us to easily build models with e.g. 10B+ parameters on TPUs and has enabled many research explorations.

    + +

    Using Fully Sharded Data Parallel (FSDP) in PyTorch/XLA

    + +

    We provide a wrapper class XlaFullyShardedDataParallel over a given PyTorch model to shard its parameters across data-parallel workers. An example usage is as follows:

    + +
    import torch
    import torch_xla.core.xla_model as xm
    from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP

    model = FSDP(my_module)
    optim = torch.optim.Adam(model.parameters(), lr=0.0001)
    output = model(x, y)
    loss = output.sum()
    loss.backward()
    optim.step()
    + +

    Wrapping an nn.Module instance with XlaFullyShardedDataParallel enables the ZeRO-2 algorithm on it, where its gradients and the optimizer states are sharded for the entire training process. During its forward and backward passes, the full parameters of the wrapped module are first reconstructed from their corresponding shards for computation.

    + +

    Nested FSDP wrapping can be used to further save memory. For nested FSDP, one should first wrap individual submodules with an inner FSDP before wrapping the base model with an outer FSDP. This allows the model to store only the full parameters of one individual layer at any given time, while the outer wrapper handles any leftover parameters, corresponding to the ZeRO-3 algorithm. Nested FSDP wrapping can be applied at any depth of submodules, and there can be more than 2 layers of nesting.
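    A minimal structural sketch of nested wrapping on a toy module (arbitrary layer sizes; only meant to show the wrapping order, not a tuned setup) could look like:

    import torch.nn as nn
    import torch_xla.core.xla_model as xm
    from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP

    device = xm.xla_device()

    # Inner FSDP on each block first, then an outer FSDP on the base model to
    # pick up any leftover parameters (ZeRO-3 style nested wrapping).
    blocks = nn.Sequential(*[FSDP(nn.Linear(1024, 1024).to(device)) for _ in range(8)])
    model = FSDP(
        nn.Sequential(nn.Embedding(1000, 1024).to(device), blocks, nn.Linear(1024, 10).to(device))
    )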

    + +

    Model checkpoint saving and loading for models and optimizers can be done as before by saving and loading their .state_dict(). Meanwhile, each training process should save its own checkpoint file of the sharded model parameters and optimizer states, and load the checkpoint file for the corresponding rank when resuming (regardless of ZeRO-2 or ZeRO-3, i.e. nested wrapping or not). A command line tool and a Python interface are provided to consolidate the sharded model checkpoint files into a full/unsharded model checkpoint file.

    + +

    Gradient checkpointing (also referred to as “activation checkpointing” or “rematerialization”) is another common technique for model scaling and can be used in conjunction with FSDP. We provide checkpoint_module, a wrapper function over a given nn.Module instance for gradient checkpointing (based on torch_xla.utils.checkpoint.checkpoint).

    + +

    The MNIST and ImageNet examples below provide illustrative usages of (plain or nested) FSDP, saving and consolidation of model checkpoints, as well as gradient checkpointing.

    + +

    Starting examples of FSDP in PyTorch/XLA

    + +

    Training MNIST and ImageNet with FSDP

    + +

    MNIST and ImageNet classification can often be used as starting points to build more complicated deep learning models. We provide the following FSDP examples on these two datasets:

    + + + +

    A comparison of them with the vanilla data-parallel examples of MNIST and ImageNet illustrates how to adapt a training script to use FSDP. A major distinction to keep in mind is that when stepping the optimizer on an FSDP-wrapped model, one should directly call optimizer.step() instead of xm.optimizer_step(optimizer). The latter reduces the gradients across ranks, which is not what we need in FSDP, where the gradients are already reduced and sharded (from a reduce-scatter op in its backward pass).
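    In other words, the inner training step stays a plain PyTorch step; a sketch reusing the model, inputs, and optimizer from the first example:

    import torch_xla.core.xla_model as xm

    optim.zero_grad()
    loss = model(x, y).sum()
    loss.backward()   # gradients are already reduced and sharded via reduce-scatter here
    optim.step()      # NOT xm.optimizer_step(optim), which would reduce the gradients again
    xm.mark_step()    # dispatch the lazily traced step to the XLA device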

    + +

    Installation

    + +

    FSDP is available from the PyTorch/XLA 1.12 and newer nightly releases. Please refer to https://github.com/pytorch/xla#-available-images-and-wheels for a guide on installation as well as Cloud TPU allocation. Then clone PyTorch/XLA repo on a TPU VM as follows

    + +
    mkdir -p ~/pytorch && cd ~/pytorch
    git clone --recursive https://github.com/pytorch/xla.git
    cd ~/
    + +

    Train MNIST on v3-8 TPU

    + +

    It reaches around 98.9% accuracy after 2 epochs:

    + +
    python3 ~/pytorch/xla/test/test_train_mp_mnist_fsdp_with_ckpt.py \
      --batch_size 16 --drop_last --num_epochs 2 \
      --use_nested_fsdp
    + +

    The script above automatically tests consolidation of the sharded model checkpoints at the end. You can also manually consolidate the sharded checkpoint files via

    + +
    python3 -m torch_xla.distributed.fsdp.consolidate_sharded_ckpts \
      --ckpt_prefix /tmp/mnist-fsdp/final_ckpt \
      --ckpt_suffix "_rank-*-of-*.pth"
    + +

    Train ImageNet with ResNet-50 on v3-8 TPU

    + +

    It reaches around 75.9% accuracy after 100 epochs, the same as what one would get without using FSDP. First download and preprocess the ImageNet-1k dataset to /datasets/imagenet-1k, then run:

    + +
    python3 ~/pytorch/xla/test/test_train_mp_imagenet_fsdp.py \
      --datadir /datasets/imagenet-1k --drop_last \
      --model resnet50 --test_set_batch_size 64 --eval_interval 10 \
      --lr 0.4 --batch_size 128 --num_warmup_epochs 5 \
      --lr_scheduler_divide_every_n_epochs 30 --lr_scheduler_divisor 10 \
      --num_epochs 100 \
      --use_nested_fsdp
    + +

    You can also explore other options in these two examples, such as --use_gradient_checkpointing to apply gradient checkpointing (i.e. activation checkpointing) on the ResNet blocks, or --compute_dtype bfloat16 to perform forward and backward passes in bfloat16 precision.

    + +

    Examples on large-scale models

    + +

    When building large models on TPUs, we often need to be aware of the memory constraints (e.g. 16 GB per core in TPU v3 and 32 GB per chip in TPU v4). For large models that cannot fit into a single TPU's memory or the host CPU memory, one should use nested FSDP to implement the ZeRO-3 algorithm and interleave submodule construction with inner FSDP wrapping, so that the full model never needs to be stored in memory during construction.

    + +

    We illustrate these cases in https://github.com/ronghanghu/ptxla_scaling_examples, which provides examples of training a Vision Transformer (ViT) model with 10B+ parameters on a TPU v3 pod (with 128 cores) as well as other cases.

    + +

    Design Notes

    + +

    One might wonder why we need to develop a separate FSDP class in PyTorch/XLA instead of directly reusing PyTorch’s FSDP class or extending it to the XLA backend. The main motivation behind a separate FSDP class in PyTorch/XLA is that the native PyTorch’s FSDP class heavily relies on CUDA features that are not supported by XLA devices, while XLA also has several unique characteristics that need special handling. These distinctions require a different implementation of FSDP that would be much easier to build in a separate class.

    + +

    Changes in API calls

    +

    One prominent distinction is that the native PyTorch FSDP is built upon separate CUDA streams for asynchronous execution in eager mode, while PyTorch/XLA runs in lazy mode and also does not support streams. In addition, TPU requires that all devices homogeneously run the same program. As a result, in the PyTorch/XLA FSDP implementation, CUDA calls and per-process heterogeneity need to be replaced by XLA APIs and alternative homogeneous implementations.

    + +

    Tensor Storage Handling

    + +

    Another prominent distinction is how to free a tensor's storage, which is much harder in XLA than in CUDA. To implement ZeRO-3, one needs to free the storage of full parameters after a module's forward pass, so that the next module can reuse this memory buffer for subsequent computation. PyTorch's FSDP accomplishes this on CUDA by freeing the actual storage of a parameter p via p.data.storage().resize_(0). However, XLA tensors do not have this .storage() handle, given that the XLA HLO IRs are completely functional and do not provide any ops to deallocate a tensor or resize its storage. Below the PyTorch interface, only the XLA compiler can decide when to free the TPU device memory corresponding to an XLA tensor, and a prerequisite is that the memory can only be released when the tensor object gets deallocated in Python – which cannot happen in FSDP because these parameter tensors are referenced as module attributes and also saved by PyTorch autograd for the backward pass.

    + +

    Our solution to this issue is to split a tensor’s value properties from its autograd Variable properties, and to free a nn.Parameter tensor by setting its .data attribute to a dummy scalar of size 1. This way the actual data tensor for the full parameter gets dereferenced in Python so that XLA can recycle its memory for other computation, while autograd can still trace the base nn.Parameter as a weak reference to the parameter data. To get this to work, one also needs to handle views over the parameters as views in PyTorch also hold references to its actual data (this required fixing a shape-related issue with views in PyTorch/XLA).
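    The core idea can be sketched in a few lines of plain PyTorch (this is an illustration of the trick, not the actual PyTorch/XLA internals):

    import torch
    import torch.nn as nn

    p = nn.Parameter(torch.randn(4096, 4096))

    # Keep a handle to the full data so it can be re-materialized later via all-gather;
    # then point .data at a dummy one-element tensor so the large buffer can be reclaimed.
    full_data = p.data
    p.data = torch.zeros(1, dtype=p.dtype)

    # The nn.Parameter object (referenced by the module and by autograd) stays alive,
    # but it no longer pins the full parameter storage.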

    + +

    Working with XLA compiler

    + +

    The solution above should be enough to free full parameters if the XLA compiler faithfully preserves the operations and their execution order in our PyTorch program. But there is another problem – XLA attempts to optimize the program to speed up its execution by applying common subexpression elimination (CSE) to the HLO IRs. In a naive implementation of FSDP, the XLA compiler typically eliminates the 2nd all-gather in the backward pass to reconstruct the full parameters when it sees that it is a repeated computation from the forward pass, and directly holds and reuses the full parameters we want to free up after the forward pass. To guard against this undesired compiler behavior, we introduced the optimization barrier op into PyTorch/XLA and used it to stop eliminating the 2nd all-gather. This optimization barrier is also applied to a similar case of gradient checkpointing to prevent CSE between forward and backward passes that could eliminate the rematerialization.

    + +

    In the future, if the distinctions between CUDA and XLA become not as prominent as mentioned above, it could be worth considering a merge of the PyTorch/XLA FSDP with the native PyTorch FSDP to have a unified interface.

    + +

    Acknowledgments

    + +

    Thanks to Junmin Hao from AWS for reviewing the PyTorch/XLA FSDP pull request. Thanks to Brian Hirsh from the Meta PyTorch team for support on the PyTorch core issues. Thanks to Isaack Karanja, Will Cromar, and Blake Hechtman from Google for support on GCP, XLA, and TPU issues.

    + +

    Thanks to Piotr Dollar, Wan-Yen Lo, Alex Berg, Ryan Mark, Kaiming He, Xinlei Chen, Saining Xie, Shoubhik Debnath, Min Xu, and Vaibhav Aggarwal from Meta FAIR for various TPU-related discussions.

    Scaling Recommendation Systems Training to Thousands of GPUs with 2D Sparse Parallelism

    by PyTorch Team at Meta: Chunzhi Yang, Rich Zhu, Zain Huda, Liangbei Xu, Xin Zhang, Jiyan Yang, Dennis van der Staay, Wang Zhou, Jin Fang, Jade Nie, Yuxi Hu

    At Meta, recommendation systems are the cornerstone of delivering relevant and personalized ads to billions of users globally. Through technologies like PyTorch’s TorchRec, we’ve successfully developed solutions that enable model training across hundreds of GPUs. While these systems have served us well, recent research on scaling laws has revealed a compelling opportunity: we can achieve significantly better model performance by training dramatically larger neural networks.

    + +

    However, this insight presents us with a new challenge. Our current training infrastructure, though highly optimized for hundreds of GPUs, cannot efficiently scale to the thousands of GPUs needed to train these larger models. The leap from hundreds to thousands of GPUs introduces complex technical challenges, particularly around handling sparse operations in recommendation models. These challenges require fundamentally new approaches to distributed training, which we address with a novel parallelization strategy.

    + +

    To address these issues, we introduced 2D embedding parallel, a novel parallelism strategy that overcomes the sparse scaling challenges inherent in training large recommendation models across thousands of GPUs. This is available today in TorchRec through the DMPCollection API. This approach combines two complementary parallelization techniques: data parallelism for the sparse components of the model, and model parallelism for the embedding tables, leveraging TorchRec’s robust sharding capabilities. By strategically integrating these techniques, we’ve created a solution that scales to thousands of GPUs and now powers Meta’s largest recommendation model training runs.

    + +

    What are the sparse scaling challenges?

    + +

    We identified three key challenges that prevented us from naively scaling our model to thousands of GPUs:

    + +
    • Imbalance and stragglers: with more GPUs it is harder to achieve balanced sharding; some ranks can have a much heavier workload for embedding computations, which can slow down the entire training run.
    • Communication across nodes: as training jobs utilize an increased number of GPUs, the all-to-all communication bandwidth can drop under certain network topologies, which can increase communication latency significantly.
    • Memory overhead: the memory used by input features is often negligible; however, as we use thousands of GPUs, we can introduce larger input features and the memory requirements can become significant.

    With 2D embedding parallel, we can describe our new parallelism scheme as follows. In this example, we have 2 model replicas (Replica 1: GPU1/GPU3, Replica 2: GPU2/GPU4):

    + +

    Figure 1: Layout illustration of 2D Sparse Parallelism

    + +

    With 2D sparse parallelism we address these challenges: instead of sharding tables across all ranks, we first evenly divide all ranks into several parallel groups:

    + +
    1. Within each group, we use model parallelism for the embedding tables, such as column-wise/row-wise sharding. At scale, for our largest tables, we have also developed grid sharding, which shards embedding tables along both the row and column dimensions.
    2. Across groups, we use data parallelism, such that each rank in a group has a corresponding replica rank in the other groups (a replica rank stores the same embedding table shards). After each group has completed its own backward pass, we all-reduce the embedding table weights across the replicas to keep them synchronized.

    Our production solution

    + +

    TorchRec is our library for building the sparse part of recommendation models in native PyTorch. The traditional entry point is DistributedModelParallel, which applies model parallelism to the embedding tables. We introduce a new API alongside it, known as DMPCollection, which serves as the main entry point for enabling 2D parallelism on TorchRec models. We designed it to be as easy a change as applying FSDP/DDP.

    + +

    To understand what DMPCollection does, we have to understand what DistributedModelParallel (DMP) does first:

    + +
    1. Create the embedding tables, known as EmbeddingBagCollection and EmbeddingCollection.
    2. Generate a sharding plan with respect to the GPU topology, embedding tables, available memory, input data, and more.
    3. Wrap the model with DMP and the associated sharding plan passed in.
    4. DMP initializes and shards the embedding tables in accordance with the sharding plan.
    5. On a train step, DMP takes an input batch, communicates it to the appropriate GPUs containing the embedding table shards of interest, looks up the values, and returns them to the GPU that requested them. This is all done on the global process group, with some exceptions for special sharding (such as table-row-wise sharding).
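    As a rough sketch of this classic DMP flow (the table and feature names are made up, and a distributed process group is assumed to be initialized already, e.g. via torchrun):

    import torch
    import torchrec
    from torchrec.distributed.model_parallel import DistributedModelParallel

    # One small embedding table; names are illustrative only
    tables = [
        torchrec.EmbeddingBagConfig(
            name="product_table",
            embedding_dim=64,
            num_embeddings=4096,
            feature_names=["product"],
            pooling=torchrec.PoolingType.SUM,
        )
    ]

    # Build the table on the meta device, then let DMP shard and materialize it
    ebc = torchrec.EmbeddingBagCollection(tables=tables, device=torch.device("meta"))
    model = DistributedModelParallel(ebc, device=torch.device("cuda"))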

    DistributedModelParallel was built for model parallel with many parts working under the assumption of sharding and working around the global world size. We need to change these parts in a way where we can introduce additional dimensions of parallelism without losing the optimizations and feature set of TorchRec.

    + +

    DMPCollection changes a few key parts to enable 2D parallel in an extensible way,

    + +
    • Generate the sharding plan once for the smaller sharding group; once it is passed in, we communicate it to the appropriate ranks across the global group and remap the ranks to fit the new sharding group ranks.
    • Create two new NCCL process groups, known as the sharding and replica process groups. The sharding process group is passed into the sharding and train-step components of TorchRec. The replica process group is used for weight and optimizer state synchronization; the all-reduce call happens over this process group. These sub NCCL process groups allow us to efficiently communicate only between the ranks that are relevant for a particular collective; each rank has two associated process groups.

    To the user, the change is very simple, while taking away all the complexity around applying the parallelism strategies to the model.

    + +

    How do we create these sharding and replication groups?

    + +

    These process groups are one of the keys to DMPCollection's performant implementation. Our earlier diagram showed a simple 2x2 GPU setup; at scale, however, how do we decide which ranks are part of a given sharding group, and what are their replica ranks across the sharding groups?

    + +

    Consider the following setup with 2 nodes, each with 4 GPUs. The sharding and replication groups under 2D parallel will be,

    Sharding Group | Sharding Ranks
    0              | 0, 2, 4, 6
    1              | 1, 3, 5, 7

    Replication Group | Replication Ranks
    0                 | 0, 1
    1                 | 2, 3
    2                 | 4, 5
    3                 | 6, 7

    We use the following formulation,

    + +
    1. Divide all trainers into G sharding groups, each with L trainers. The number of groups is given by G = T / L, where T is the total number of trainers.
    2. For each group, we assign non-contiguous trainer ranks based on the group it is in, following [i, G+i, 2G+i, …, (L-1)G+i], where i = 0 to G-1.
    3. From the groups we create the replication groups: every G contiguous ranks (0 to G-1, G to 2G-1, and so on) form one replication group, and each such set stores duplicate embedding table shards. A small helper reproducing this formulation is sketched below.
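    The following helper is for illustration only (it is not the TorchRec implementation) and reproduces the 2-node example above:

    def build_groups(total_ranks: int, sharding_group_size: int):
        G = total_ranks // sharding_group_size                 # number of sharding groups
        L = sharding_group_size                                # ranks per sharding group
        sharding_groups = [[g + G * r for r in range(L)] for g in range(G)]
        replication_groups = [list(range(i, i + G)) for i in range(0, total_ranks, G)]
        return sharding_groups, replication_groups


    # 2 nodes x 4 GPUs with a sharding group size of 4:
    # sharding groups    [[0, 2, 4, 6], [1, 3, 5, 7]]
    # replication groups [[0, 1], [2, 3], [4, 5], [6, 7]]
    print(build_groups(total_ranks=8, sharding_group_size=4))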

    This means our sharding groups, G, are of size L, which can be known as the number of ranks to apply model parallel across. This, in turn, gives us replica groups, each of size G, which are the ranks we data parallel across.

    + +

    In DMPCollection, we are able to create these process groups efficiently using DeviceMesh. We lay out the entire GPU topology as a 2D matrix, with each row representing a group of sharding ranks and each column representing the corresponding replica ranks:

    + +
    # Pseudocode for building the process groups
    create peer matrix
    num_groups = global_world_size // sharding_group_size
    for each group_rank in num_groups:
        peers = [num_groups * rank + group_rank for rank in range(sharding_group_size)]
        add peers to peer matrix

    initialize DeviceMesh with two dimensions (shard, replicate)
    slice DeviceMesh on shard for the sharding process group
    slice DeviceMesh on replicate for the replica process group
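    A hedged, runnable version of this sketch using torch.distributed's DeviceMesh (8 ranks with a sharding group size of 4; this mirrors the idea above rather than the exact TorchRec internals, and assumes the process group is already initialized):

    import torch
    from torch.distributed.device_mesh import DeviceMesh

    global_world_size = 8
    sharding_group_size = 4
    num_groups = global_world_size // sharding_group_size   # number of model replicas

    # Peer matrix: each row is one sharding group -> [[0, 2, 4, 6], [1, 3, 5, 7]]
    peers = (
        torch.arange(global_world_size)
        .reshape(sharding_group_size, num_groups)
        .T.contiguous()
    )

    mesh = DeviceMesh("cuda", peers, mesh_dim_names=("replicate", "shard"))
    sharding_pg = mesh["shard"].get_group()      # passed to sharding / train-step components
    replica_pg = mesh["replicate"].get_group()   # used for the weight/optimizer all-reduce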
    + +

    With our DeviceMesh approach, should we want to change the topology or provide further flexibility in the future, we can easily extend our creation logic to any form of topologies and even extend for further dimensions of parallelism if needed.

    + +

    Performance of 2D parallel

    + +

    Our rank partitioning strategy optimizes communication patterns by strategically placing the model replica ranks for each shard within the same compute node. This architecture provides significant performance benefits for the weight synchronization operation. After the backward pass, we perform all-reduce operations to synchronize model weights, which is an expensive process given the large parameter counts we have to communicate and synchronize. By placing replicas on the same node, we leverage the node's high intra-node bandwidth rather than relying on the slower inter-node bandwidth.

    + +

    The effect of this design choice on the other communication collectives generally improves the latencies. The improvement stems from two factors.

    + +
    1. By sharding the embedding tables over a reduced number of ranks and conducting the model's communication within the smaller group, we achieve a lower all-to-all latency.
    2. With the replication in 2D parallel, our per-rank embedding lookup latency is reduced: we can reduce the local batch size to 1/Nth of the equivalent global batch size, where N is the number of model replicas.

    A production model trace exemplifies these two factors. Here we run the 2D parallel job on 1024 GPUs, with a sharding group size of 256 GPUs.

    + +

    Figure 2: Comparing latencies between non-2D parallel and 2D parallel workloads

    + +

    There are two key levers users have to tune to maximize performance for their workloads:

    + +
    1. The size of the model sharding group relative to the global world size. The global world size divided by the sharding group size gives the number of model replicas we will have.
      • To maximize performance, users can scale their model up by up to 8x; this scaling factor keeps the all-reduce intra-host.
      • For further scaling, the all-reduce would have to happen inter-host. In our experiments, we did not see an obvious performance regression, and in fact we note advantages of an inter-host all-reduce: we can change our sharding and replica topology to inter-host all-reduce, which can help us introduce fault-tolerance strategies should a particular host go down.
    2. The frequency of all-reduce synchronization. DMPCollection comes with a sync() call, which can be tuned to be called every N training steps, performing a form of local SGD training. With scale, reducing the frequency of synchronization can bring significant gains in performance.

    Future Work

    + +

    Readers should note that 2D sparse parallel training differs from non-parallelized training because we synchronize the embedding table weights rather than the gradients. This approach is made possible by TorchRec’s use of FBGEMM, which provides optimized kernels under the hood. One of FBGEMM’s key optimizations is the fusion of the optimizer in the backward pass. Instead of fully materializing the embedding table gradients—which would consume significant memory—they are passed directly to the optimizer update. Attempting to materialize and synchronize these gradients would create substantial overhead, making that approach impractical.

    + +

    Our exploration revealed that to achieve training results comparable to the baseline, we synchronize optimizer states on a delayed schedule, with the timing dependent on the number of sharding/replica groups (ie: for Adagrad we update the momentum behind by one sync step). This approach also enables users to implement local SGD or semi-synchronized training strategies, which can achieve convergence and potentially produce better loss curves than the baseline.

    + +

    We thank you for reading our post! This is an exciting direction we have come across that we hope to develop further to maximize performance of recommendation systems and push the state of the art.

    Scaling Vision Model Training Platforms with PyTorch

    by Vaibhav Aggarwal, Mannat Singh, Anjali Sridhar, Yanghao Li, Shoubhik Debnath, Ronghang Hu, Will Feng, Xinlei Chen, Tingting Markstrum, Diana Liskovich, Anupam Bhatnagar, Chay Ryali, Haoqi Fan, Tete Xiao, Min Xu, Rahul Iyer, Christoph Feichtenhofer, Ross Girshick, Piotr Dollar, Aaron Adcock, Wan-Yen Lo, CK Luk

    TL;DR: We demonstrate the use of PyTorch with FairScale’s FullyShardedDataParallel (FSDP) API in writing large vision transformer models. We discuss our techniques for scaling and optimizing these models on a GPU cluster. The goal of this platform scaling effort is to enable research at scale. This blog does not discuss model accuracy, new model architectures, or new training recipes.

    + +

    1. Introduction

    + +

    Latest vision research [1, 2] demonstrates model scaling as a promising research direction. In this project, we aim to enable our platforms to train massive vision transformer (ViT) [3] models. We present our work on scaling the largest trainable ViT from 1B to 120B parameters in FAIR vision platforms. We wrote ViT in PyTorch and leveraged its support for large-scale, distributed training on a GPU cluster.

    + +

    In the rest of this blog, we will first discuss the main challenges, namely scalability, optimization, and numerical stability. Then we will discuss how we tackle them with techniques including data and model parallelism, automatic mixed precision, kernel fusion, and bfloat16. Finally, we present our results and conclude.

    + +

    2. Main Challenges

    + +

    2.1 Scalability

    + +

    The key scalability challenge is to efficiently shard a model’s operations and state across multiple GPUs. A 100B parameter model requires ~200GB of RAM just for parameters, assuming fp16 representation. So, it is impossible to fit the model on a single GPU (A100 has at most 80GB RAM). Therefore, we need some way to efficiently shard a model’s data (input, parameters, activations, and optimizer state) across multiple GPUs.

    + +

    Another aspect of this problem is to scale without significantly changing the training recipe. E.g. Certain representation learning recipes use a global batch size of up to 4096 beyond which we start to see accuracy degradation. We cannot scale to more than 4096 GPUs without using some form of tensor or pipeline parallelism.

    + +

    2.2 Optimization

    + +

    The key optimization challenge is to maintain high GPU utilization even as we scale the number of model parameters and flops. When we scale models to teraflops and beyond, we start to hit major bottlenecks in our software stack that super-linearly increase training time and reduce accelerator utilization. We require hundreds or thousands of GPUs to run just a single experiment. Improvements in accelerator utilization can lead to significant reductions in cost and improve fleet utilization. It enables us to fund more projects and run more experiments in parallel.

    + +

    2.3 Numerical Stability

    + +

    The key stability challenge is to avoid numerical instability and divergence at large scale. We empirically observed in our experiments that the training instability gets severe and hard to deal with when we scale up model sizes, data, batch sizes, learning rate, etc. Vision Transformers particularly face training instability even at a lower parameter threshold. E.g., we find it challenging to train even ViT-H (with just 630M parameters) in mixed-precision mode without using strong data augmentation. We need to study the model properties and training recipes to make sure that the models train stably and converge.

    + +

    3. Our Solutions

    + +

    Figure 1 depicts our solutions to each of the challenges.

    + +

    + +

    + +

    3.1 Addressing scaling challenges with data parallelism and model parallelism

    + +

    We apply various forms of data and model parallelism to enable fitting very large models in GPU memory.

    + +

    We use FairScale’s FullyShardedDataParallel (FSDP) API [4], based on PyTorch, to shard parameters, gradients, and optimizer state across multiple GPUs, thereby reducing the memory footprint per GPU. This process consists of the following three steps:

    + +
• Step 1: We wrapped the entire model in a single FSDP instance. This shards the model parameters at the end of a forward pass and gathers parameters at the beginning of a forward pass. This enabled us to scale ~3x from 1.5B to 4.5B parameters.

• Step 2: We experimented with wrapping individual model layers in separate FSDP instances. This nested wrapping further reduced the memory footprint by sharding and gathering the parameters of individual model layers instead of the entire model. In this mode, the peak memory is determined by an individually wrapped transformer block in GPU memory rather than by the entire model.

• Step 3: We used activation checkpointing to reduce the memory consumed by activations. It saves the input tensors and discards the intermediate activation tensors during the forward pass; these are recomputed during the backward pass. A minimal sketch of these steps follows this list.
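The sketch below illustrates roughly what these three steps can look like with FairScale’s FSDP API. It is a minimal, hypothetical example rather than the exact code used in this work; import paths and arguments vary across FairScale versions, and it assumes torch.distributed has already been initialized.

import torch.nn as nn
from fairscale.nn import FullyShardedDataParallel as FSDP
from fairscale.nn.checkpoint import checkpoint_wrapper

def shard_vit(blocks: nn.ModuleList, head: nn.Module) -> nn.Module:
    # Steps 2 and 3: wrap each transformer block in its own FSDP instance
    # (nested wrapping) and add activation checkpointing so intermediate
    # activations are recomputed during the backward pass.
    wrapped = [FSDP(checkpoint_wrapper(block), flatten_parameters=True) for block in blocks]
    # Step 1: wrap the whole model in an outer FSDP instance.
    return FSDP(nn.Sequential(*wrapped, head), flatten_parameters=True)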

    In addition, we experimented with model-parallelism techniques such as pipeline parallelism [5], which allow us to scale to more GPUs without increasing the batch size.

    + +

    3.2 Addressing optimization challenges with advanced AMP and kernel fusion

    + +

    Advanced AMP

    + +

Automatic Mixed Precision (AMP) [6] training refers to training models at a lower precision than the default FP32 while still maintaining accuracy. We experimented with the three levels of AMP described below:

    + +
• AMP O1: Training in mixed precision, where weights remain in FP32 and some operations run in FP16. With AMP O1, the ops that might impact accuracy remain in FP32 and are not autocast to FP16.

• AMP O2: Training in mixed precision, with more weights and ops in FP16 than in O1. Weights do not implicitly remain in FP32; they are cast to FP16, and a copy of the master weights is maintained in FP32 for use by the optimizer. If we want the normalization-layer weights in FP32, we need to explicitly use layer wrapping to ensure that.

• Full FP16: Training entirely in FP16, where both weights and operations are in FP16. Full FP16 is challenging to enable for training due to convergence issues.

    We found that AMP O2 with LayerNorm wrapping in FP32 leads to the best performance without sacrificing accuracy.
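For reference, a minimal O1-style training step with PyTorch’s native AMP (autocast plus gradient scaling) might look like the sketch below; model, optimizer, loss_fn, and the input batch are assumed to exist, and this is illustrative rather than the exact recipe used in these experiments.

import torch

scaler = torch.cuda.amp.GradScaler()

def train_step(model, optimizer, loss_fn, images, targets):
    optimizer.zero_grad(set_to_none=True)
    # Autocast-eligible ops (e.g., matmuls) run in FP16, while precision-sensitive
    # ops stay in FP32, matching the O1 behavior described above.
    with torch.cuda.amp.autocast(dtype=torch.float16):
        loss = loss_fn(model(images), targets)
    scaler.scale(loss).backward()  # scale the loss to avoid FP16 gradient underflow
    scaler.step(optimizer)
    scaler.update()
    return loss.detach()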

    + +

    Kernel Fusion

    + +
• To reduce GPU kernel launch overhead and increase GPU work granularity, we experimented with kernel fusions, including fused dropout and fused layer norm, using the xformers library [7].

    3.3 Addressing stability challenges by studying ops numerical stability and training recipes

    + +

    BFloat16 in general but with LayerNorm in FP32

    + +

    The bfloat16 (BF16) [8] floating-point format provides the same dynamic range as FP32 with a memory footprint identical to FP16. We found that we could train models in the BF16 format using the same set of hyperparameters as in FP32, without special parameter tuning. Nevertheless, we found that we need to keep LayerNorm in FP32 mode in order for the training to converge.
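As an illustration of this recipe, the sketch below runs the forward pass under bfloat16 autocast while forcing LayerNorm to compute in FP32. FP32LayerNorm is a hypothetical helper written for this sketch, not a PyTorch or FairScale API.

import torch
import torch.nn as nn
import torch.nn.functional as F

class FP32LayerNorm(nn.LayerNorm):
    """LayerNorm that always normalizes in FP32, then casts back to the input dtype."""
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = F.layer_norm(
            x.float(),
            self.normalized_shape,
            self.weight.float() if self.weight is not None else None,
            self.bias.float() if self.bias is not None else None,
            self.eps,
        )
        return out.to(x.dtype)

def forward_bf16(model: nn.Module, images: torch.Tensor) -> torch.Tensor:
    # Matmul-heavy ops run in BF16; FP32LayerNorm keeps normalization in FP32.
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        return model(images)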

    + +

    3.4 Final training recipe

    + +

A summary of the final training recipe:

1. Wrap the outer model in an FSDP instance. Enable parameter sharding after the forward pass.
2. Wrap individual ViT blocks with activation checkpointing, nested FSDP wrapping, and parameter flattening.
3. Enable mixed-precision mode (AMP O2) with bfloat16 representation. Maintain the optimizer state in FP32 precision to enhance numerical stability.
4. Wrap normalization layers like LayerNorm in FP32 for better numerical stability.
5. Maximize Nvidia Tensor Core utilization by keeping matrix dimensions multiples of 8. For more details, check the Nvidia Tensor Core Performance Guide (a small helper sketch follows this list).
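For the last point, here is a tiny illustrative helper (not from the original post) that rounds a dimension up to a multiple of 8:

def pad_to_multiple_of_8(dim: int) -> int:
    # Round a hidden/embedding dimension up so the GEMMs map well onto Tensor Cores.
    return ((dim + 7) // 8) * 8

print(pad_to_multiple_of_8(1024))  # 1024 (already a multiple of 8)
print(pad_to_multiple_of_8(1027))  # 1032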

    4. Results

    + +

In this section, we show the scaling results of ViT on three types of tasks: (1) image classification, (2) object detection, and (3) video understanding. Our key result is that we are able to train massive ViT backbones across these vision tasks after applying the discussed scaling and optimization techniques, which enables vision research at a much larger scale. We trained the models to convergence to verify that we maintain the current baselines even with all the optimizations. A common trend in Figures 2, 3, and 4 is that we are able to train up to 25B-parameter models with an epoch time of less than 4 hours on 128 A100 GPUs. The 60B and 120B models are relatively slower to train.

    + +

    Figure 2 shows the image-classification scaling result. It plots the epoch time for training ViTs on ImageNet using 128 A100-80GB GPUs with different model sizes.

    + +

    + +

    + +

Figure 2: Image-classification scaling result.

    + +

    Figure 3 shows the object-detection scaling result. It plots the epoch time for training ViTDet [9] with different ViT backbones on COCO using 128 A100-80GB GPUs.

    + +

    + +

    + +

Figure 3: Object-detection scaling result.

    + +

    Figure 4 shows the video-understanding scaling result. It plots the epoch time for training MViTv2 [10] models on Kinetics 400 [11] using 128 V100 (32 GB) GPUs in FP32.

    + +

    + +

    + +

Figure 4: Video-understanding scaling result.

    + +

Figure 5 shows the optimization result with the ViT-H model in Figure 2 on 8 A100-40GB GPUs. Three versions are used: (1) the baseline uses PyTorch’s DDP [12] with AMP O1, (2) FSDP + AMP O2 + other optimizations, and (3) FSDP + FP16 + other optimizations. These optimizations altogether speed up the training by up to 2.2x.

    + +

    + +

    + +

Figure 5: Training speedups from various optimizations.

    + +

    5. Concluding Remarks

    + +

We have demonstrated the use of PyTorch with FairScale’s FullyShardedDataParallel (FSDP) API in writing large vision transformer models. We discussed our techniques for scaling and optimizing these models on a GPU cluster. We hope that this article can motivate others to develop large-scale ML models with PyTorch and its ecosystem.

    + +

    References

    + +

    [1] Masked Autoencoders Are Scalable Vision Learners

    + +

    [2] Revisiting Weakly Supervised Pre-Training of Visual Perception Models

    + +

    [3] An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale

    + +

    [4] fairscale.nn.FullyShardedDataParallel

    + +

    [5] Pipeline parallelism in PyTorch

    + +

    [6] Automatic Mixed Precision (AMP) in PyTorch

    + +

    [7] xformers

    + +

    [8] The bfloat16 numerical format

    + +

    [9] Exploring Plain Vision Transformer Backbones for Object Detection

    + +

    [10] MViTv2: Improved Multiscale Vision Transformers for Classification and Detection

    + +

    [11] https://www.deepmind.com/open-source/kinetics

    + +

    [12] Getting Started with Distributed Data Parallel (DDP)

SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine

by SGLang Team

    +

    sglang logo

    + +

    We’re thrilled to announce that the SGLang project has been integrated into the PyTorch ecosystem! This integration ensures that SGLang aligns with PyTorch’s standards and practices, providing developers with a reliable and community-supported framework for fast and flexible serving of LLMs.

    + +

    To view the PyTorch Ecosystem, see the PyTorch Landscape and learn more about how projects can join the PyTorch Ecosystem.

    + +

    About SGLang

    + +

    SGLang is a fast-serving engine for large language models and vision language models. It makes the interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

    + +

    The core features include:

    + +
• Fast Backend Runtime: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
• Flexible Frontend Language: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
• Extensive Model Support: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
• Active Community: SGLang is open source and backed by an active community with industry adoption.

SGLang is known for its speed: it can often significantly outperform other state-of-the-art frameworks in serving throughput and latency. You can learn more about the underlying techniques from the past release blog posts: v0.2 blog, v0.3 blog, v0.4 blog.

    + +

    SGLang has been widely adopted by leading industry companies and frontier research labs. For example, xAI uses SGLang to serve its flagship model, Grok 3, which is currently the best model according to the Chatbot Arena leaderboard. Microsoft Azure uses SGLang to serve DeepSeek R1 on AMD GPUs, which is currently the best open source model.

    + +

    Serving DeepSeek Models

    + +

    You can easily launch a Docker container to serve a DeepSeek model with the following command:

    + +
# Pull the latest image
docker pull lmsysorg/sglang:latest

# Launch a server
docker run --gpus all --shm-size 32g -p 30000:30000 -v ~/.cache/huggingface:/root/.cache/huggingface --ipc=host --network=host --privileged lmsysorg/sglang:latest \
    python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code --port 30000
    + +

    Then you can query the server with the OpenAI-compatible API

    + +
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=64,
)
# Print the model's reply
print(response.choices[0].message.content)
    + +

    The server launch command above works for 8xH200. You can find detailed instructions for other hardware (MI300X, H100, A100, H20, L40S) at https://docs.sglang.ai/references/deepseek.html.

    + +

    SGLang integrates DeepSeek-specific optimizations, such as MLA throughput optimizations, MLA-optimized kernels, data-parallel attention, multi-token prediction, and DeepGemm, making it the top choice for serving DeepSeek models by dozens of companies, including AMD, NVIDIA, and many cloud providers. The team is actively working on integrating more optimizations following the 2025 H1 roadmap below.

    + +

    Serving Llama Models

    + +

    Similarly, you can launch the server for a Llama 3.1 text model with:

    + +
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
    + +

    Or a Llama 3.2 multimodal model with:

    + +
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct --chat-template=llama_3_vision
    + +

    Roadmap

    + +

This year, the SGLang team will continue to push the boundaries of system efficiency. You can find the 2025 H1 roadmap here. The focus areas are:

• Throughput-oriented large-scale deployment similar to the DeepSeek inference system
• Long context optimizations
• Low latency speculative decoding
• Reinforcement learning training framework integration
• Kernel optimizations

    Community

    + +

    SGLang has been deployed to large-scale production, generating trillions of tokens every day. It has an active community with over three hundred contributors on GitHub. It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, iFlytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

    + +

    logos

    + +

    Conclusion

    + +

    We’re excited to welcome SGLang to the PyTorch ecosystem. SGLang accelerates the serving of large language and vision language models. It’s widely adopted by industry, powering the large-scale online serving of frontier models like Grok and DeepSeek.

    + +

    We invite you to explore the SGLang GitHub repo, join the community on Slack, and reach out to contact@sglang.ai for inquiries or collaboration opportunities. Together, we can make powerful AI models accessible to everyone.

Snowflake Joins the PyTorch Foundation as a General Member

by Team PyTorch

    Snowflake logo

    + +

    The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Snowflake has joined as a general member.

    + +

    Snowflake enables thousands of organizations to unite siloed data, discover and securely share data, power data applications, and execute diverse AI/ML and analytic workloads across multiple clouds and geographies.

    + +

    “By joining the PyTorch community, we know that Snowflake will help accelerate data warehousing solutions and cutting-edge AI frameworks. This showcases the commitment to advancing innovation for data and artificial intelligence,” said Ibrahim Haddad, Executive Director, PyTorch Foundation. “We are thrilled to have Snowflake join the PyTorch Foundation, marking a significant stride in the convergence of data management and deep learning technologies.”

    + +

    Snowflake enables collaboration with AI technologies to handle the storage and analysis of large datasets generated by machine learning and AI applications through scalability and SQL support.

    + +

    With the integrated repository of Python libraries from Anaconda in Snowpark, Snowflake users have always had a streamlined experience to deploy pre-trained PyTorch models in Snowflake to easily and securely make them a part of applications. Now with the addition of GPU instances in Snowpark Container Services (in private preview), training and other computationally intensive processing using PyTorch will also be streamlined, providing teams with an end-to-end solution for AI development and deployment.

    + +

    “Most if not all of our customers incorporate open source software as part of their data stacks, so it is critical for us to work with open source ecosystems like the PyTorch Foundation, alongside incorporating open source to meet the needs of our customers,” said Adrien Treuille, Co-Founder of Streamlit, Director of Product Management at Snowflake. “As AI developers continue to integrate their models as part of applications, the power of Snowflake and PyTorch — coupled with Streamlit as the powerful front-end — creates near-limitless innovation for developers looking to build next-generation apps and unlock even more use cases.”

    + +

    To learn more about the power of Snowflake and PyTorch, tune into Snowflake’s developer conference for AI and apps, BUILD.

    + +

    To learn more about how you can be a part of the PyTorch Foundation, visit our website.

    + +

    About Snowflake

    + +

    Snowflake enables every organization to mobilize their data with Snowflake’s Data Cloud. Customers use the Data Cloud to unite siloed data, discover and securely share data, power data applications, and execute diverse AI/ML and analytic workloads. Wherever data or users live, Snowflake delivers a single data experience that spans multiple clouds and geographies. Thousands of customers across many industries, including 639 of the 2023 Forbes Global 2000 (G2K) as of July 31, 2023, use Snowflake Data Cloud to power their businesses. Learn more at snowflake.com.

    + +

    About PyTorch Foundation

    + +

    The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

    + +

    About The Linux Foundation

    + +

    The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

May 14, 2024

Speeding up ViTs using Block Sparsity

by FAIR at Meta: Mostafa Elhoushi; Sensors and Systems at Meta Reality Labs Research: Syed Shakib Sarwar, Aaryan Kothapalli, Mia Kasperek, Barbara De Salvo; PyTorch at Meta: Christian Puhrsch, Jesse Cai, Joe Isaacson; Quansight: Andrew James, Pearu Peterson, Nikita Vedeneev

    +

    TLDR: We show promising results of up to a 1.46x speedup with <2% drop in accuracy on float32 Vision Transformers on A100 GPUs by applying block sparsity on MLP module’s weights. This approach can potentially be applied to other types of transformers including large language models. Our implementation and benchmarks to reproduce our results are available at https://github.com/pytorch-labs/superblock.

    + +

    Introduction

    + +

PyTorch has landed a lot of improvements to the CUDA kernels that implement block-sparse matrix multiplications. Recent updates to PyTorch can deliver up to a 4.8x speedup over dense baselines on large matrix-multiplication shapes with high sparsity levels.

    + +

    In this blog, we show the promising results of applying block sparsity on weights of linear layers of MLP (multi-layer perceptron) layers in vision transformers (ViTs) and show end-to-end model speedups on A100 Nvidia GPUs.

    + +

    As a recap, block sparsity sparsifies weights in tiles of blocks of predetermined size, rather than sparsifying individual elements. This particular sparsity pattern is interesting because it is amenable to GPU acceleration via fast sparse kernels. For more information about the differences between different sparsity patterns, or about sparsity as a whole, please check out torchao.

    + +

Illustrations of different types of sparsity.

    + +

    Approach

    + +

    Our approach can be broken down into two distinct steps:

    + +
1. Training the model from scratch using block-sparse masked subnets.
2. Folding these masks into our weights to accelerate them for inference.

We explain our training and inference steps below.

    + +

    Training

    + +

Starting with an uninitialized Vision Transformer, we apply random trainable masks with a specified block size and sparsity level to the weights of the output-projection linear layer of the attention blocks, the weights of the two linear layers inside the MLP (a.k.a. FFN, feed-forward network), as well as the final linear classification layer. The forward pass during training follows the supermask approach: each mask is converted to a binary map using a threshold tuned to the sparsity requirement; e.g., if we want 80% sparsity, the threshold is automatically tuned to keep the top 20% of weights. The masks consist of square blocks of <block size>x<block size> elements, where <block size> is a hyperparameter. The priority of the weights depends on the trained mask value, or score. We multiply the binary mask of each layer with its weights to sparsify the model.
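To make the thresholding concrete, here is a hedged sketch of turning per-block trainable scores into a binary mask that keeps the top (1 - sparsity) fraction of blocks. The names and signature are illustrative, not the Superblock API.

import torch

def block_mask(scores: torch.Tensor, sparsity: float) -> torch.Tensor:
    # scores: one trainable value per (block_size x block_size) tile of the weight.
    num_keep = max(1, int(round((1.0 - sparsity) * scores.numel())))
    threshold = torch.topk(scores.flatten(), num_keep).values.min()
    return (scores >= threshold).float()

scores = torch.rand(12, 48)            # e.g., a 768x3072 weight tiled into 64x64 blocks
mask = block_mask(scores, sparsity=0.8)
print(mask.mean())                     # ~0.2 of the blocks survive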

    + +

Illustration of the Supermask sparsification approach.

    + +

    Inference

    + +

After training, the dense weights can be turned into sparse weights by multiplying with the mask, and stored for inference. At this stage, although the weights have a high percentage of zero values, they are still stored in dense format. We use PyTorch’s to_sparse_bsr() API to convert the weights to the Block Sparse Representation (BSR) format, which stores only the non-zero values and the indices of their blocks. This step only needs to be done once and the results can be cached for runtime.

    + +

During runtime, no changes in code are required: we just pass any input tensor to the model, and when the forward() function of a sparsified linear layer is invoked, PyTorch takes care of invoking the optimized matrix multiplication for block-sparse weights. This should work on A100 as well as H100 NVIDIA GPUs.
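The snippet below sketches that conversion flow on a single linear layer. It is illustrative only and assumes a CUDA build with BSR kernel support (the post used a PyTorch 2.3 nightly on A100 GPUs, float32); the mask-folding here is random, standing in for a trained supermask.

import torch
import torch.nn as nn

block = 64
layer = nn.Linear(3072, 768).cuda().float()

with torch.no_grad():
    # Stand-in for folding a trained ~90%-block-sparse mask into the dense weights.
    keep = torch.rand(768 // block, 3072 // block, device="cuda") > 0.9
    mask = keep.repeat_interleave(block, 0).repeat_interleave(block, 1)
    layer.weight.mul_(mask)
    # One-time conversion to Block Sparse Representation (BSR) storage.
    bsr_weight = layer.weight.to_sparse_bsr(blocksize=(block, block))
    layer.weight = nn.Parameter(bsr_weight, requires_grad=False)

x = torch.randn(256, 3072, device="cuda")
y = layer(x)  # dispatches to the block-sparse matmul path described above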

    + +

    Results: Microbenchmarks

    + +

    To validate the viability of block sparsity from a performance standpoint, we first ran a series of microbenchmarks using this simple script. Using the linear shapes from ViT-b, we compared the speedup of our block sparse kernels across a single linear layer as we varied the sparsity level and block size of the weight matrix.

    + +

We ran using a PyTorch 2.3.0.dev20240305+cu121 nightly on NVIDIA A100s and report the speedup of each sparsity configuration compared to the dense baseline. We observed positive speedups when the block size is >= 32 or the sparsity level is >= 0.8 for float32, while for bfloat16 we observed smaller speedups, and usually only for block size 64 and higher sparsities. Hence, for end-to-end speedups on the model, we focus in this blog on float32 and leave bfloat16 for future work.

    + +

Micro-benchmarking results on linear layers of ViT-b-16.

    + +

    Results: Vision Transformers

    + +

    Once we confirmed that we were able to show speedups over the linear layers, we focused on showing end-to-end speedups on ViT_B_16.

    + +

    We trained this model from scratch on ImageNet dataset using the standard ViT_B_16 recipe. We show speedups for sparsifying MLP modules and leave sparsifying weights of input and output projections of attention for future work.

    + +

    We looked at wall-clock inference speedup, focusing on batch size 256. We found that:

    + +
• For 90% sparsity, we can get 1.24x, 1.37x, and 1.65x speedups for block sizes 16, 32, and 64 respectively.
• To obtain a speedup, the minimum sparsities for block sizes 16, 32, and 64 are 0.86, 0.82, and 0.7 respectively. Hence, as expected, the larger the block size, the smaller the sparsity needed to obtain a speedup.

We note a limitation of the to_sparse_bsr() API: layer dimensions need to be multiples of the block size. Since the dimensions of the last FC classification layer in ViT were not a multiple of the block size, it was not converted to the BSR representation in our experiments.

    + +

Speedup on ViT-b-16 with batch size 256 on MLP modules across different sparsities and block sizes.

    + +

    We also explored the speedup for different batch sizes for 90% sparsity. We observed a speedup over the baseline for batch sizes starting from 16 and upwards. While bigger block sizes have bigger speedups at the largest batch sizes, the smallest possible batch size to obtain >1 speedup is smaller for smaller block sizes.

    + +

    We believe on-device hardware can obtain speedups for batch size 1 as they - unlike server GPUs - can be fully utilized at such small batch sizes.

    + +

Speedup on ViT-b-16 with 90% sparsity on MLP modules across different batch sizes and block sizes.

    + +

Looking at the Top-1 accuracy on the ImageNet-blurred test set of the sparsified models for different block sizes and sparsities, we see a few expected results:

    + +
• Low levels of sparsity (<=70%) have no meaningful regression in accuracy.
• Mid levels of sparsity (>=80% to <90%) have limited regression in accuracy.
• High levels of sparsity (>=90%) remove so many weights that accuracy is significantly impacted.

    More research could be done to improve accuracies of higher sparsities and larger block sizes. We hope that the block sparsity support in PyTorch and the illustrated speedups in this blog will encourage researchers to explore more accurate sparsification approaches.

    + +

Accuracies on training ViT-b-16 on ImageNet-blurred using the SuperMask approach.

    + +

    Next Steps

    + +

    We have shown promising speedups for block sparsifying MLP modules ViT in float32 precision. There is still more work to be done in order to observe speedups on bfloat16 and we hope to obtain progress on that soon. Possible next steps to further optimize block sparsity on vision transformers and transformers in general:

    + +
• Perform block sparsity on attention input and output projections.
• Perform block sparsity during finetuning rather than training from scratch.
• Perform further optimizations on the matmul kernels for ViT’s linear operator specific shapes (especially for 80% and lower sparsity).
• Combine with other optimizations such as int8 and torch.compile().
• Explore other weight sparsification algorithms, e.g., Spartan, to improve accuracy.
• Explore selecting weights to sparsify (e.g., specific transformer layers).

    Please reach out to melhoushi@meta.com if you have questions or are interested in contributing to block sparsification!

    + +

    Additionally if you’re broadly interested in sparsity please feel free to reach out to @jcaip / jessecai@meta.com and please come check out torchao, a community we’re building for architecture optimization techniques like quantization and sparsity.

April 29, 2019

Stochastic Weight Averaging in PyTorch

by Pavel Izmailov and Andrew Gordon Wilson

    +

    In this blogpost we describe the recently proposed Stochastic Weight Averaging (SWA) technique [1, 2], and its new implementation in torchcontrib. SWA is a simple procedure that improves generalization in deep learning over Stochastic Gradient Descent (SGD) at no additional cost, and can be used as a drop-in replacement for any other optimizer in PyTorch. SWA has a wide range of applications and features:

    + +
1. SWA has been shown to significantly improve generalization in computer vision tasks, including VGG, ResNets, Wide ResNets and DenseNets on ImageNet and CIFAR benchmarks [1, 2].
2. SWA provides state-of-the-art performance on key benchmarks in semi-supervised learning and domain adaptation [2].
3. SWA is shown to improve the stability of training as well as the final average rewards of policy-gradient methods in deep reinforcement learning [3].
4. An extension of SWA can obtain efficient Bayesian model averaging, as well as high quality uncertainty estimates and calibration in deep learning [4].
5. SWA for low precision training, SWALP, can match the performance of full-precision SGD even with all numbers quantized down to 8 bits, including gradient accumulators [5].

    In short, SWA performs an equal average of the weights traversed by SGD with a modified learning rate schedule (see the left panel of Figure 1.). SWA solutions end up in the center of a wide flat region of loss, while SGD tends to converge to the boundary of the low-loss region, making it susceptible to the shift between train and test error surfaces (see the middle and right panels of Figure 1).

    + +
    + +
    + +

    Figure 1. Illustrations of SWA and SGD with a Preactivation ResNet-164 on CIFAR-100 [1]. Left: test error surface for three FGE samples and the corresponding SWA solution (averaging in weight space). Middle and Right: test error and train loss surfaces showing the weights proposed by SGD (at convergence) and SWA, starting from the same initialization of SGD after 125 training epochs. Please see [1] for details on how these figures were constructed.

    + +

    With our new implementation in torchcontrib using SWA is as easy as using any other optimizer in PyTorch:

    + +
from torchcontrib.optim import SWA

...
...

# training loop
base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = torchcontrib.optim.SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
for _ in range(100):
    opt.zero_grad()
    loss_fn(model(input), target).backward()
    opt.step()
opt.swap_swa_sgd()
    + +

    You can wrap any optimizer from torch.optim using the SWA class, and then train your model as usual. When training is complete you simply call swap_swa_sgd() to set the weights of your model to their SWA averages. Below we explain the SWA procedure and the parameters of the SWA class in detail. We emphasize that SWA can be combined with any optimization procedure, such as Adam, in the same way that it can be combined with SGD.

    + +

    Is this just Averaged SGD?

    + +

    At a high level, averaging SGD iterates dates back several decades in convex optimization [6, 7], where it is sometimes referred to as Polyak-Ruppert averaging, or averaged SGD. But the details matter. Averaged SGD is often employed in conjunction with a decaying learning rate, and an exponentially moving average, typically for convex optimization. In convex optimization, the focus has been on improved rates of convergence. In deep learning, this form of averaged SGD smooths the trajectory of SGD iterates, but does not perform very differently.

    + +

    By contrast, SWA is focused on an equal average of SGD iterates with a modified cyclical or high constant learning rate, and exploits the flatness of training objectives [8] specific to deep learning for improved generalization.

    + +

    Stochastic Weight Averaging

    + +

    There are two important ingredients that make SWA work. First, SWA uses a modified learning rate schedule so that SGD continues to explore the set of high-performing networks instead of simply converging to a single solution. For example, we can use the standard decaying learning rate strategy for the first 75% of training time, and then set the learning rate to a reasonably high constant value for the remaining 25% of the time (see the Figure 2 below). The second ingredient is to average the weights of the networks traversed by SGD. For example, we can maintain a running average of the weights obtained in the end of every epoch within the last 25% of training time (see Figure 2).
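The running equal average described here is just the cumulative mean of the collected weights. A small illustrative helper is shown below; the torchcontrib SWA class does this bookkeeping for you.

import torch

@torch.no_grad()
def update_swa_average(swa_state: dict, model: torch.nn.Module, n_averaged: int) -> int:
    # Maintain w_swa <- (w_swa * n + w) / (n + 1) for every parameter.
    for name, param in model.named_parameters():
        if name not in swa_state:
            swa_state[name] = param.detach().clone()
        else:
            swa_state[name] += (param.detach() - swa_state[name]) / (n_averaged + 1)
    return n_averaged + 1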

    +
    + +
    + +

    Figure 2. Illustration of the learning rate schedule adopted by SWA. Standard decaying schedule is used for the first 75% of the training and then a high constant value is used for the remaining 25%. The SWA averages are formed during the last 25% of training.

    + +

    In our implementation the auto mode of the SWA optimizer allows us to run the procedure described above. To run SWA in auto mode you just need to wrap your optimizer base_opt of choice (can be SGD, Adam, or any other torch.optim.Optimizer) with SWA(base_opt, swa_start, swa_freq, swa_lr). After swa_start optimization steps the learning rate will be switched to a constant value swa_lr, and in the end of every swa_freq optimization steps a snapshot of the weights will be added to the SWA running average. Once you run opt.swap_swa_sgd(), the weights of your model are replaced with their SWA running averages.

    + +

    Batch Normalization

    + +

    One important detail to keep in mind is batch normalization. Batch normalization layers compute running statistics of activations during training. Note that the SWA averages of the weights are never used to make predictions during training, and so the batch normalization layers do not have the activation statistics computed after you reset the weights of your model with opt.swap_swa_sgd(). To compute the activation statistics you can just make a forward pass on your training data using the SWA model once the training is finished. In the SWA class we provide a helper function opt.bn_update(train_loader, model). It updates the activation statistics for every batch normalization layer in the model by making a forward pass on the train_loader data loader. You only need to call this function once in the end of training.
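Continuing the auto-mode example from earlier, the end of training might therefore look like this (illustrative; train_loader and model are assumed to be the usual DataLoader and network):

# Swap the model weights for their SWA averages...
opt.swap_swa_sgd()
# ...then recompute batch-norm running statistics with one pass over the training data.
opt.bn_update(train_loader, model)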

    + +

    Advanced Learning-Rate Schedules

    + +

    SWA can be used with any learning rate schedule that encourages exploration of the flat region of solutions. For example, you can use cyclical learning rates in the last 25% of the training time instead of a constant value, and average the weights of the networks corresponding to the lowest values of the learning rate within each cycle (see Figure 3).

    + +
    + +
    + +

    Figure 3. Illustration of SWA with an alternative learning rate schedule. Cyclical learning rates are adopted in the last 25% of training, and models for averaging are collected in the end of each cycle.

    + +

    In our implementation you can implement custom learning rate and weight averaging strategies by using SWA in the manual mode. The following code is equivalent to the auto mode code presented in the beginning of this blogpost.

    + +
opt = torchcontrib.optim.SWA(base_opt)
for i in range(100):
    opt.zero_grad()
    loss_fn(model(input), target).backward()
    opt.step()
    if i > 10 and i % 5 == 0:
        opt.update_swa()
opt.swap_swa_sgd()
    + +

    In manual mode you don’t specify swa_start, swa_lr and swa_freq, and just call opt.update_swa() whenever you want to update the SWA running averages (for example in the end of each learning rate cycle). In manual mode SWA doesn’t change the learning rate, so you can use any schedule you want as you would normally do with any other torch.optim.Optimizer.

    + +

    Why does it work?

    + +

    SGD converges to a solution within a wide flat region of loss. The weight space is extremely high-dimensional, and most of the volume of the flat region is concentrated near the boundary, so SGD solutions will always be found near the boundary of the flat region of the loss. SWA on the other hand averages multiple SGD solutions, which allows it to move towards the center of the flat region.

    + +

    We expect solutions that are centered in the flat region of the loss to generalize better than those near the boundary. Indeed, train and test error surfaces are not perfectly aligned in the weight space. Solutions that are centered in the flat region are not as susceptible to the shifts between train and test error surfaces as those near the boundary. In Figure 4 below we show the train loss and test error surfaces along the direction connecting the SWA and SGD solutions. As you can see, while SWA solution has a higher train loss compared to the SGD solution, it is centered in the region of low loss, and has a substantially better test error.

    + +
    + +
    + +

    Figure 4. Train loss and test error along the line connecting the SWA solution (circle) and SGD solution (square). SWA solution is centered in a wide region of low train loss while the SGD solution lies near the boundary. Because of the shift between train loss and test error surfaces, SWA solution leads to much better generalization.

    + +

    Examples and Results

    + +

    We released a GitHub repo here with examples of using the torchcontrib implementation of SWA for training DNNs. For example, these examples can be used to achieve the following results on CIFAR-100:

DNN (Budget)           | SGD           | SWA 1 Budget  | SWA 1.25 Budgets | SWA 1.5 Budgets
VGG16 (200)            | 72.55 ± 0.10  | 73.91 ± 0.12  | 74.17 ± 0.15     | 74.27 ± 0.25
PreResNet110 (150)     | 76.77 ± 0.38  | 78.75 ± 0.16  | 78.91 ± 0.29     | 79.10 ± 0.21
PreResNet164 (150)     | 78.49 ± 0.36  | 79.77 ± 0.17  | 80.18 ± 0.23     | 80.35 ± 0.16
WideResNet28x10 (200)  | 80.82 ± 0.23  | 81.46 ± 0.23  | 81.91 ± 0.27     | 82.15 ± 0.27

    Semi-Supervised Learning

    + +

    In a follow-up paper SWA was applied to semi-supervised learning, where it illustrated improvements beyond the best reported results in multiple settings. For example, with SWA you can get 95% accuracy on CIFAR-10 if you only have the training labels for 4k training data points (the previous best reported result on this problem was 93.7%). This paper also explores averaging multiple times within epochs, which can accelerate convergence and find still flatter solutions in a given time.

    +
    + +
    + +

    Figure 5. Performance of fast-SWA on semi-supervised learning with CIFAR-10. fast-SWA achieves record results in every setting considered.

    + +

    Calibration and Uncertainty Estimates

    +

    SWA-Gaussian (SWAG) is a simple, scalable and convenient approach to uncertainty estimation and calibration in Bayesian deep learning. Similarly to SWA, which maintains a running average of SGD iterates, SWAG estimates the first and second moments of the iterates to construct a Gaussian distribution over weights. SWAG distribution approximates the shape of the true posterior: Figure 6 below shows the SWAG distribution on top of the posterior log-density for PreResNet-164 on CIFAR-100.

    +
    + +
    +

    Figure 6. SWAG distribution on top of posterior log-density for PreResNet-164 on CIFAR-100. The shape of SWAG distribution is aligned with the posterior.

    + +

    Empirically, SWAG performs on par or better than popular alternatives including MC dropout, KFAC Laplace, and temperature scaling on uncertainty quantification, out-of-distribution detection, calibration and transfer learning in computer vision tasks. Code for SWAG is available here.

    + +

    Reinforcement Learning

    + +

    In another follow-up paper SWA was shown to improve the performance of policy gradient methods A2C and DDPG on several Atari games and MuJoCo environments.

Environment    | A2C             | A2C + SWA
Breakout       | 522 ± 34        | 703 ± 60
Qbert          | 18777 ± 778     | 21272 ± 655
SpaceInvaders  | 7727 ± 1121     | 21676 ± 8897
Seaquest       | 1779 ± 4        | 1795 ± 4
CrazyClimber   | 147030 ± 10239  | 139752 ± 11618
BeamRider      | 9999 ± 402      | 11321 ± 1065

    Low Precision Training

    +

    We can filter through quantization noise by combining weights that have been rounded down with weights that have been rounded up. Moreover, by averaging weights to find a flat region of the loss surface, large perturbations of the weights will not affect the quality of the solution (Figures 7 and 8). Recent work shows that by adapting SWA to the low precision setting, in a method called SWALP, one can match the performance of full-precision SGD even with all training in 8 bits [5]. This is quite a practically important result, given that (1) SGD training in 8 bits performs notably worse than full precision SGD, and (2) low precision training is significantly harder than predictions in low precision after training (the usual setting). For example, a ResNet-164 trained on CIFAR-100 with float (16-bit) SGD achieves 22.2% error, while 8-bit SGD achieves 24.0% error. By contrast, SWALP with 8 bit training achieves 21.8% error.

    +
    + +
    + +

    Figure 7. Quantizing in a flat region can still provide solutions with low loss.

    + +
    + +
    + +

    Figure 8. Low precision SGD training (with a modified learning rate schedule) and SWALP.

    + +

    Conclusion

    + +

    One of the greatest open questions in deep learning is why SGD manages to find good solutions, given that the training objectives are highly multimodal, and there are in principle many settings of parameters that achieve no training loss but poor generalization. By understanding geometric features such as flatness, which relate to generalization, we can begin to resolve these questions and build optimizers that provide even better generalization, and many other useful features, such as uncertainty representation. We have presented SWA, a simple drop-in replacement for standard SGD, which can in principle benefit anyone training a deep neural network. SWA has been demonstrated to have strong performance in a number of areas, including computer vision, semi-supervised learning, reinforcement learning, uncertainty representation, calibration, Bayesian model averaging, and low precision training.

    + +

We encourage you to try out SWA! Using SWA is now as easy as using any other optimizer in PyTorch. And even if you have already trained your model with SGD (or any other optimizer), it’s very easy to realize the benefits of SWA by running SWA for a small number of epochs starting from the pre-trained model.

    + +
• [1] Averaging Weights Leads to Wider Optima and Better Generalization; Pavel Izmailov, Dmitry Podoprikhin, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson; Uncertainty in Artificial Intelligence (UAI), 2018.
• [2] There Are Many Consistent Explanations of Unlabeled Data: Why You Should Average; Ben Athiwaratkun, Marc Finzi, Pavel Izmailov, Andrew Gordon Wilson; International Conference on Learning Representations (ICLR), 2019.
• [3] Improving Stability in Deep Reinforcement Learning with Weight Averaging; Evgenii Nikishin, Pavel Izmailov, Ben Athiwaratkun, Dmitrii Podoprikhin, Timur Garipov, Pavel Shvechikov, Dmitry Vetrov, Andrew Gordon Wilson; UAI 2018 Workshop: Uncertainty in Deep Learning, 2018.
• [4] A Simple Baseline for Bayesian Uncertainty in Deep Learning; Wesley Maddox, Timur Garipov, Pavel Izmailov, Andrew Gordon Wilson; arXiv preprint, 2019: https://arxiv.org/abs/1902.02476.
• [5] SWALP: Stochastic Weight Averaging in Low Precision Training; Guandao Yang, Tianyi Zhang, Polina Kirichenko, Junwen Bai, Andrew Gordon Wilson, Christopher De Sa; International Conference on Machine Learning (ICML), 2019.
• [6] Efficient Estimations from a Slowly Convergent Robbins-Monro Process; David Ruppert; Technical report, Cornell University Operations Research and Industrial Engineering, 1988.
• [7] Acceleration of Stochastic Approximation by Averaging; Boris T. Polyak and Anatoli B. Juditsky; SIAM Journal on Control and Optimization, 30(4):838–855, 1992.
• [8] Loss Surfaces, Mode Connectivity, and Fast Ensembling of DNNs; Timur Garipov, Pavel Izmailov, Dmitrii Podoprikhin, Dmitry Vetrov, Andrew Gordon Wilson; Neural Information Processing Systems (NeurIPS), 2018.
Straggler Mitigation On PyTorch DDP By Hierarchical SGD

by Yi Wang (Cruise AI), Rohan Varma (Meta AI)

    +

PyTorch DDP has been widely adopted across the industry for distributed training, and by default it runs synchronous SGD to synchronize gradients across model replicas at every step. The performance of this technique is critical for fast iteration during model exploration as well as for resource and cost savings. To resolve a ubiquitous performance bottleneck introduced by slow nodes in large-scale training, Cruise and Meta co-developed a solution based on the Hierarchical SGD algorithm to significantly accelerate training in the presence of these stragglers.

    + +

    The Need For Straggler Mitigation

    + +

In a DDP setup, a straggler problem can occur when one or more processes run much slower ("stragglers") than the others. When this happens, all the processes have to wait for the stragglers before synchronizing gradients and completing the communication, which essentially bottlenecks distributed performance to the slowest worker. As a result, even when training relatively small models, the communication cost can still be a major performance bottleneck.

    + +

    Potential Causes of Stragglers

    + +

    Severe straggler issues are usually caused by workload imbalance before synchronization, and many factors can contribute to this imbalance. For instance, some data loader workers in the distributed environment can become stragglers, because some input examples can be outliers in terms of the data size, or the data transfer of some examples can be drastically slowed down due to unstable network I/O, or the on-the-fly data transformation costs can have a high variance.

    + +

    Besides data loading, other phases before gradient synchronization can also cause stragglers, such as unbalanced workloads of embedding table lookup during the forward pass in recommendation systems.

    + +

    The Appearance of Stragglers

    + +

    If we profile DDP training jobs that have stragglers, we can find that some processes may have much higher gradient synchronization costs (a.k.a., allreducing gradients) than other processes at a certain step. As a result, the distributed performance can be dominated by the communication cost even if the model size is very small. In this case, some processes run faster than the straggler(s) at a step, and hence they have to wait for the stragglers and spend a much longer time on allreduce.

    + +

Below are screenshots of two trace files output by the PyTorch profiler in one use case. Each screenshot profiles 3 steps.

• The first screenshot shows that a process has a very high allreduce cost in both the first and the third steps, because this process reaches the synchronization phase earlier than the straggler(s) and has to spend more time waiting. Its allreduce cost is relatively small in the second step, which suggests that either 1) there is no straggler at this step, or 2) this process is itself the straggler among all the processes and hence does not need to wait for any other process.

    chart showing allreduce cost

    + +

    Both the 1st and the 3rd Steps Are Slowed Down by Stragglers

    + +
• The second screenshot shows a normal case without stragglers. In this case, all the gradient synchronizations are relatively short.

    chart showing normal case without stragglers

    + +

    Normal Case Without Stragglers

    + +

    Hierarchical SGD in PyTorch

    + +

Recently, hierarchical SGD has been proposed to optimize communication costs, mainly by reducing the total amount of data transferred in large-scale distributed training, and multiple convergence analyses have been provided (example). The main novelty of this post is that, at Cruise, we were able to leverage hierarchical SGD to mitigate stragglers, which can also occur when training relatively small models. Our implementation was upstreamed by Cruise to PyTorch in early 2022.

    + +

    How Does Hierarchical SGD Work?

    + +

    As the name implies, hierarchical SGD organizes all the processes into groups at different levels as a hierarchy, and runs synchronization by following the rules below:

    + +
• All the groups at the same level have the same number of processes, and the processes in these groups synchronize at the same frequency concurrently, where the synchronization period is pre-defined by the user.
• The higher the level of a group, the larger the synchronization period used, as the synchronization becomes more expensive.
• When multiple overlapping groups are supposed to synchronize according to their periods, to reduce redundant synchronization and avoid data races across groups, only the highest-level group runs synchronization.

    The following figure illustrates an example of 4-level hierarchy SGD among 16 processes on 8 machines, each of which has 2 GPUs:

    + +
1. Level 1: Each process runs mini-batch SGD locally.
2. Level 2: Each 4-process group across 2 machines runs synchronization every 2 steps.
3. Level 3: Each 8-process group across 4 machines runs synchronization every 4 steps.
4. Level 4: The global process group of all 16 processes over 8 machines runs synchronization every 8 steps.

In particular, when the step number is divisible by 8, only the Level 4 (global) synchronization is executed; when the step number is divisible by 4 but not by 8, only the Level 3 synchronization is executed; and when it is divisible by 2 but not by 4, only the Level 2 synchronization is executed.
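The small helper below (purely illustrative, not part of PyTorch) makes this schedule concrete for the 16-process example above, returning the highest level that synchronizes at a given step.

def sync_level(step: int) -> int:
    # Level -> synchronization period for the 4-level example above.
    periods = {4: 8, 3: 4, 2: 2}
    for level in sorted(periods, reverse=True):
        if step % periods[level] == 0:
            return level
    return 1  # only local mini-batch SGD at this step

print([sync_level(s) for s in range(1, 9)])  # [1, 2, 1, 3, 1, 2, 1, 4]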

    + +

    An example of 4-level hierarchy SGD among 16 processes on 8 machines, each of which has 2 GPUs

    + +

    Intuitively, hierarchical SGD can be viewed as an extension of local SGD, which only has a two-level hierarchy – every process runs mini-batch SGD locally and then synchronizes globally at a certain frequency. This can also help explain that, just like local SGD, hierarchical SGD synchronizes model parameters instead of gradients. Otherwise the gradient descent will be mathematically incorrect when the frequency is greater than 1.

    + +

    Why Can Hierarchical SGD Mitigate Stragglers?

    + +

    The key insight here is that, when there is a random straggler, it only directly slows down a relatively small group of processes instead of all the processes. Next time another random straggler is very likely to slow down a different small group, and hence a hierarchy can help smooth out the straggler effect.

    + +

    The example below assumes that there is a random straggler among totally 8 processes at every step. After 4 steps, vanilla DDP that runs synchronous SGD will be slowed down by straggler 4 times, because it runs global synchronization at every step. In contrast, hierarchical SGD runs synchronization with the groups of 4 processes after the first two steps, and then a global synchronization after another two steps. We can see that both the first two and the last two stragglers have a large overlap, and hence the performance loss can be mitigated.

    + +

    flow diagram

    + +

    Essentially, the mitigation effect of this hierarchical SGD example actually is between local SGD at a frequency of every 2 steps and every 4 steps. The main advantage of hierarchical SGD over local SGD is a better convergence efficiency of the same global synchronization frequency, because hierarchical SGD allows more low-level synchronization. Moreover, it is possible for hierarchical SGD to provide a global synchronization frequency lower than local SGD with model parity, leading to a higher training performance, especially in a large-scale distributed training.

    + +

    Ease of Use

    + +

Straggler mitigation is not a novel topic in distributed training. Multiple approaches have been proposed, such as gossip SGD, data encoding, and gradient coding, as well as some designed particularly for the parameter-server architecture, including backup workers and stale synchronous parallel. However, to the best of our knowledge, before this effort we had not found a good open-source PyTorch implementation of straggler mitigation that could work like a plugin to our training system at Cruise. In contrast, our implementation requires only minimal changes: no need to modify the existing code or tune any existing hyperparameters. This is a very appealing advantage for industry users.

    + +

As the code example below shows, only a few lines need to be added to the DDP model setup, and the training loop code can remain untouched. As explained previously, hierarchical SGD is an extended form of local SGD, so it can be enabled in much the same way as local SGD (see the PyTorch docs of PostLocalSGDOptimizer):

1. Register a post-local SGD communication hook to run a warmup stage of fully synchronous SGD and defer hierarchical SGD.
2. Create a post-local SGD optimizer that wraps an existing local optimizer and a hierarchical SGD configuration.
from collections import OrderedDict
+
+import torch.distributed
+import torch.nn as nn
+import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hierarchicalSGD
+from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
+    PostLocalSGDState,
+    post_localSGD_hook,
+)
+from torch.distributed.optim import PostLocalSGDOptimizer
+
+# `model`, `rank`, and the wrapped local `optim` are assumed to be defined by the existing training script.
    +
    +ddp_model = nn.parallel.DistributedDataParallel(
    +    module=model,
    +    device_ids=[rank],
    +)
    +
    +# Register a post-local SGD communication hook for the warmup.
    +subgroup, _ = torch.distributed.new_subgroups()
    +state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=1_000)
    +ddp_model.register_comm_hook(state, post_localSGD_hook)
    +
    +# Wraps the existing (local) optimizer to run hierarchical model averaging.
    +optim = PostLocalSGDOptimizer(
    +  optim=optim,
    +  averager=hierarchicalSGD.HierarchicalModelAverager(
    +    # The config runs a 4-level hierarchy SGD among 128 processes:
    +    # 1) Each process runs mini-batch SGD locally;
    +    # 2) Each 8-process group synchronize every 2 steps;
    +    # 3) Each 32-process group synchronize every 4 steps;
    +    # 4) All 128 processes synchronize every 8 steps.
    +    period_group_size_dict=OrderedDict([(2, 8), (4, 32), (8, 128)]),
    +    # Do not run hierarchical SGD until 1K steps for model parity.
    +    warmup_steps=1_000)
    +)
    +
    + +

    Algorithm Hyperparameters

    + +

    Hierarchical SGD has two major hyperparameters: period_group_size_dict and warmup_steps.

• period_group_size_dict is an ordered dictionary mapping synchronization periods to process group sizes, used to initialize process groups of different sizes in a hierarchy that synchronize parameters concurrently. A larger group is expected to use a larger synchronization period.
• warmup_steps specifies the number of steps of fully synchronous SGD to run as a warmup stage before hierarchical SGD. Similar to the post-local SGD algorithm, a warmup stage is usually recommended to achieve higher accuracy. The value should be the same as the start_localSGD_iter arg used in PostLocalSGDState when post_localSGD_hook is registered. Typically the warmup stage should at least cover the beginning of training, when the loss decreases drastically.

A subtle difference between the PyTorch implementation and the initial design proposed by the relevant papers is that, after the warmup stage, by default the processes within each host still run intra-host gradient synchronization at every step. This is because:

1. The intra-host communication is relatively cheap, and it can usually significantly accelerate convergence;
2. The intra-host group (of size 4 or 8 for most industry users) is usually a good choice for the smallest group of processes, which synchronizes most frequently in hierarchical SGD. If the synchronization period is 1, gradient synchronization is faster than model parameter synchronization (a.k.a. model averaging), because DDP automatically overlaps gradient synchronization with the backward pass.

Such intra-host gradient synchronization can be disabled by unsetting the post_local_gradient_allreduce arg in PostLocalSGDState, for example as sketched below.
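A minimal sketch, assuming the same subgroup, DDP model, and warmup setup as in the earlier code example:

# Sketch: skip the default intra-host gradient all-reduce after the warmup stage.
state = PostLocalSGDState(
    process_group=None,                   # default (global) process group for the warmup
    subgroup=subgroup,
    start_localSGD_iter=1_000,
    post_local_gradient_allreduce=False,  # disable intra-host gradient sync after warmup
)
ddp_model.register_comm_hook(state, post_localSGD_hook)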

    + +

    Demonstration

    + +

    Now we demonstrate that hierarchical SGD can accelerate distributed training by mitigating stragglers.

    + +

    Experimental Setup

    + +

We compared the performance of hierarchical SGD against local SGD and synchronous SGD on ResNet18 (model size: 45MB). Since the model is so small, the training is not bottlenecked by data transfer cost during synchronization. To avoid the noise incurred by data loading from remote storage, the input data was randomly generated in memory. We varied the number of GPUs used for training from 64 to 256. The batch size per worker is 32, and the number of training iterations is 1,000. Since we don’t evaluate convergence efficiency in this set of experiments, warmup is not enabled.

    + +

We also emulated stragglers at a rate of 1% on 128 and 256 GPUs, and 2% on 64 GPUs, to ensure at least one straggler at every step on average. These stragglers randomly appear on different CUDA devices. Each straggler stalls for 1 second in addition to the normal per-step training time (~55ms in our setup). This can be perceived as a practical scenario where 1% or 2% of input data are outliers in terms of the data pre-processing cost (I/O and/or data transformation on the fly) during training, and such cost is 20X+ larger than the average.

    + +

The code snippet below shows how a straggler can be emulated in the training loop. We applied it to a ResNet model, but it can easily be applied to other models as well.

    + +
     loss = loss_fn(y_pred, y)
+     # Emulate a straggler that lags for 1 second at a rate of 1%.
+     # (Requires `import random` and `import time` in the training script.)
+     if random.randint(1, 100) == 1:
+         time.sleep(1)
+     loss.backward()
+     optimizer.step()
+
    + +

The experiments are conducted on a us-central1 GCP cluster. Each machine has 4 NVIDIA Tesla T4 GPUs with 16 GB memory per GPU, connected through a 32 Gbit/s ethernet network. Each instance also features 96 vCPUs and 360 GB RAM.

Architecture:        ResNet18 (45MB)
Workers:             64, 128, 256
Backend:             NCCL
GPU:                 Tesla T4, 16 GB memory
Batch size:          32 x # of workers
Straggler Duration:  1 sec
Straggler Rate:      1% on 128 and 256 GPUs, 2% on 64 GPUs

We used multiple configurations for both local SGD and hierarchical SGD. Local SGD runs global synchronization every 2, 4, or 8 steps.
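For reference, the local SGD baselines can be set up in the same way as the hierarchical SGD example above, just with a single global period; a minimal sketch (assuming the same DDP and communication hook setup):

import torch.distributed.algorithms.model_averaging.averagers as averagers

# Local SGD baseline: one global synchronization period instead of a hierarchy.
optim = PostLocalSGDOptimizer(
    optim=optim,
    averager=averagers.PeriodicModelAverager(period=4, warmup_steps=1_000),
)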

    + +

    We ran hierarchical SGD with the following configurations:

1. On 64 GPUs:
   1. Each 8-process group, 32-process group, and the global 64-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as “HSGD 2-8,4-32,8-64”.
   2. Each 32-process group and the global 64-process group synchronizes every 4 and 8 steps, respectively. Denoted as “HSGD 4-32,8-64”.
2. On 128 GPUs:
   1. Each 8-process group, 32-process group, and the global 128-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as “HSGD 2-8,4-32,8-128”.
   2. Each 32-process group and the global 128-process group synchronizes every 4 and 8 steps, respectively. Denoted as “HSGD 4-32,8-128”.
3. On 256 GPUs:
   1. Each 4-process group, 16-process group, 64-process group, and the global 256-process group synchronizes every 1, 2, 4, and 8 steps, respectively. Denoted as “HSGD 1-4,2-16,4-64,8-256”.
   2. Each 8-process group, 64-process group, and the global 256-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as “HSGD 2-8,4-64,8-256”.
   3. Each 16-process group and the global 256-process group synchronizes every 4 and 8 steps, respectively. Denoted as “HSGD 4-16,8-256”.

    Experimental Results

    + +

    The figures below show the speedups of different communication schemes against the baseline of synchronous SGD, with the emulated stragglers. We can make the following observations:

1. As expected, both hierarchical SGD and local SGD achieve a higher speedup with a lower synchronization frequency.
2. The speedups of the hierarchical SGD schemes are 2.08X-2.45X on 64 GPUs, 2.57X-2.68X on 128 GPUs, and 2.63X-3.25X on 256 GPUs. This shows that hierarchical SGD can significantly mitigate stragglers, and the mitigation can be more effective at a larger scale.
3. The performance of local SGD with synchronization periods of 2 steps and 8 steps can be viewed as the lower bound and upper bound, respectively, of the experimented hierarchical SGD schemes. This is because the hierarchical SGD schemes synchronize less frequently than every 2 steps globally, while their low-level synchronizations within small groups are extra overheads compared with global synchronization every 8 steps.

Overall, hierarchical SGD can provide a finer-grained trade-off between communication cost and model quality than local SGD. Therefore, when local SGD at a relatively large synchronization period like 8 or 4 cannot give satisfactory convergence efficiency, hierarchical SGD has a much better chance of achieving both a good speedup and model parity.

    + +

Since only simulated data is used in the experiments, we did not demonstrate model parity here, which in practice can be achieved in two ways:

1. Tuning the hyperparameters, including both the hierarchy and the warmup steps;
2. In some cases, hierarchical SGD could lead to slightly lower quality than the original model for the same number of training steps (i.e., a lower convergence rate), but with a speedup like 2X+ per training step, it is still possible to reach model parity with more steps and still less total training time.

    Speedups on 64 GPUs

    + +

    Speedups on 128 GPUs

    + +

    Speedups on 256 GPUs

    + +

    Limitations

    + +

Before applying hierarchical SGD for straggler mitigation, the user should be aware of a few limitations of this approach:

1. This approach can only mitigate non-persistent stragglers, which occur on different workers at different times. For persistent stragglers, which can be caused by hardware degradation or a network issue on a specific host, the same low-level subgroup is slowed down every time, leading to nearly no straggler mitigation.
2. This approach can only mitigate low-frequency stragglers. E.g., if 30% of the workers can randomly become stragglers at every step, then most low-level synchronizations will still be slowed down by stragglers. As a result, hierarchical SGD may not show an obvious performance advantage over synchronous SGD.
3. Since hierarchical SGD applies model averaging, which does not overlap with the backward pass like the gradient averaging used by vanilla DDP, its performance gain from straggler mitigation must outweigh the performance loss of not overlapping communication with the backward pass. Therefore, if stragglers slow down training by less than 10%, hierarchical SGD may not bring much speedup. This limitation can be addressed by overlapping the optimizer step and the backward pass in the future.
4. Since hierarchical SGD is less well-studied than local SGD, there is no guarantee that hierarchical SGD with a finer-grained synchronization granularity converges faster than certain advanced forms of local SGD, such as SlowMo, which can improve convergence efficiency with slow momentum. However, to the best of our knowledge, these advanced algorithms cannot yet be natively supported as a PyTorch DDP plugin like hierarchical SGD.

    Acknowledgements

    + +

    We would like to thank Cruise teammates Bo Tian, Sergei Vorobev, Eugene Selivonchyk, Tsugn-Hsien Lee, Dan Ring, Ian Ackerman, Lei Chen, Maegan Chew, Viet Anh To, Xiaohui Long, Zeyu Chen, Alexander Sidorov, Igor Tsvetkov, Xin Hu, Manav Kataria, Marina Rubtsova, and Mohamed Fawzy, as well as Meta teammates Shen Li, Yanli Zhao, Suraj Subramanian, Hamid Shojanzeri, Anjali Sridhar and Bernard Nguyen for the support.

📣 Submit to Speak at PyTorch Conference + Save on Registration

by Team PyTorch

    Step into the Future of AI at PyTorch Conference 2025.

    + +

    banner ad for conference

    + +

    The Call for Proposals for PyTorch Conference 2025 is officially open!

    + +

    Join us in San Francisco from October 22–23, 2025, to showcase your expertise and innovations with PyTorch—the industry-leading, open-source machine learning framework powering innovations from bare-metal infrastructure to sophisticated application and agent layers. This is your opportunity to share insights, breakthroughs, and case studies with a global audience of AI and Generative AI practitioners, researchers, and developers.

    + +

    people watching presentation at conference

    + +

    Submit your proposals and prepare to engage, learn, and network alongside some of the brightest minds in the AI/ML community. We’re seeking sessions, Birds of a Feather discussions, lightning talks, and poster sessions on the following topics:

    + +
• Core PyTorch Framework
• PyTorch on Accelerator Hardware
• PyTorch Ecosystem and Tools
• AI Applications and Use Cases
• AI in Research and Academia
• AI in Industry and Enterprise Applications
• AI Infrastructure and Scalability
• Ethical AI, Governance, and Regulation
• Training, Fine-Tuning, and Alignment
• Inference, Deployment, and Serving
• Performance Measurement and Benchmarking
• Data Engineering and Management for AI
• Generative AI and Large Language Models (LLMs)
• Model Optimization and Efficiency
• Open Source Collaboration, Education and Community Building
• Edge AI and On-Device
• DL Compilers and Kernel Authoring
    +

    Learn more and submit your talk by Sunday, June 1, at 11:59 PDT!

    + + SUBMIT TO SPEAK + +
    + +
    + +

    people arriving at conference

    + +

    Save up to USD$500 with Super Early Bird Pricing!

    + +
• Reserve your pass by 11:59 PM PDT on March 21 and score Super Early Bird pricing for just USD$499. That’s a savings of up to USD$500!
• Student or faculty? Learn more about our discounted academic rate.
• Need help covering travel costs? We offer discretionary travel funding for those community members who would otherwise not be able to attend. Learn more.
    + + + +
    + +

    Become a Sponsor at PyTorch Conference 2025!

    + +

    Seize your opportunity to influence the future of Generative AI and Machine Learning by sponsoring PyTorch Conference 2025. PyTorch is at the forefront of innovation—empowering rapid experimentation, flexible model development, and efficient deployment into production environments with its powerful, versatile ecosystem of tools and thriving community of dedicated users.

    + +

    As a sponsor, you’ll gain more than visibility; you’ll strategically position your organization at the heart of a vibrant, global AI/ML ecosystem. Connect directly with 3,000+ expert attendees, researchers, engineers, and decision-makers, and actively shape the conversations driving the next generation of AI advancements.

    + + + +

    For more details on CFP submissions, registration, and sponsorship, visit the PyTorch Conference Website.

PyTorch Foundation Technical Advisory Council Elects New Leadership

by Team PyTorch

    We are pleased to announce the first-ever Chair and Vice Chair of the PyTorch Foundation’s Technical Advisory Council (TAC): Luca Antiga as the Chair and Jiong Gong as Vice Chair. Both leaders bring extensive experience and deep commitment to the PyTorch community, and they are set to guide the TAC in its mission to foster an open, diverse, and innovative PyTorch technical community.

    + +

    Meet the New Leadership

    + +

    Luca Antiga

    + +

Luca Antiga has been the CTO at Lightning AI since 2022. He is an early contributor to PyTorch core and co-authored “Deep Learning with PyTorch” (published by Manning). He started his journey as a researcher in Bioengineering, and later co-founded Orobix, a company focused on building and deploying AI in production settings.

    + +

    “I am looking forward to taking on the role of the chair of the PyTorch TAC,” says Luca. “As the TAC chair, I will ensure effective, timely topic selection and enhance visibility of technical needs from the board members and from the ecosystem at large. I will strive for directional, cohesive messaging throughout the transition of PyTorch from Meta to the Linux Foundation.”

    + +

    Jiong Gong

    + +

    Jiong Gong is a Principal Engineer and SW Architect for PyTorch Optimization from Intel. He serves as one of the PyTorch CPU module maintainers and is an active contributor to the TorchInductor CPU backend.

    + +

    “I plan to further strengthen the collaboration between PyTorch developers and hardware vendors, promoting innovation and performance optimization across various hardware platforms, enhancing PyTorch ecosystem and streamlining the decision-making process,” says Jiong. “I am honored to serve as the vice chair of the TAC.”

    + +

    What Does the TAC Do?

    + +

    The PyTorch Foundation’s TAC provides a forum for technical communication, leadership, and collaboration for the PyTorch Foundation. The committee members are members of the PyTorch Foundation. The committee holds open meetings once a month that anyone in the community can attend. The committee provides thought leadership on technical topics, knowledge sharing, and a forum to discuss issues with other technical experts in the community.

    + +

    New TAC Webpage

    + +

    Stay connected with the PyTorch Foundation’s Technical Advisory Council (TAC) by visiting our new TAC webpage. Here you can find the TAC members, where to view upcoming meeting agendas, access presentations, attend public meetings, watch meeting recordings and participate in discussions on key technical topics.

    + +

    Plus stay tuned on our blog for regular updates from the PyTorch Foundation TAC leadership.


    March 05, 2018

    +

    + Tensor Comprehensions in PyTorch +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Priya Goyal (FAIR), Nicolas Vasilache (FAIR), Oleksandr Zinenko (Inria & DI ENS), Theodoros Theodoridis (ETH Zürich), Zachary DeVito (FAIR), William S. Moses (MIT CSAIL), Sven Verdoolaege (FAIR), Andrew Adams (FAIR), Albert Cohen (Inria & DI ENS & FAIR) + +

    +

    Tensor Comprehensions (TC) is a tool that lowers the barrier for writing high-performance code. It generates GPU code from a simple high-level language and autotunes the code for specific input sizes.

    + +

    We highly recommend reading the Tensor Comprehensions blogpost first.

    + +

    If you ran into any of the following scenarios, TC is a useful tool for you.

    + +
• Your PyTorch layer is large and slow, and you contemplated writing dedicated C++ or CUDA code for it. But you don’t know how to program in CUDA or write low-level code.
• You wrote a CUDA layer, but it took a week to write, debug, and optimize for speed. You wished you could do this in an hour.
• You want to fuse multiple layers like Conv-ReLU-BatchNorm or Linear-ReLU-Linear-ReLU in your network for speed, but it was quite difficult to comprehend.
• Your research involves weird Tensor shapes that CuDNN and MKL are not optimized for. For example, you do convolutions of 13 x 24 with an input image of 143 x 55. You tried running it with CuDNN and it was slower than you wished.
• Your code is slowed down by transposing Tensors constantly to fit a particular memory layout. You wish it was easy to write custom code that operates efficiently on your input layout.

    Tensor Comprehensions are seamless to use in PyTorch, interoperating with PyTorch Tensors and nn Variables.

    + +

    Let us run through using TC with PyTorch.

    + +

    1. Install the package

    + +
    conda install -c pytorch -c tensorcomp tensor_comprehensions
    +
    + +

    At this time we only provide Linux-64 binaries which have been tested on Ubuntu 16.04 and CentOS7.

    + +

    TC depends on heavyweight C++ projects such as Halide, Tapir-LLVM and ISL. Hence, we rely on Anaconda to distribute these dependencies reliably. For the same reason, TC is not available via PyPI.

    + +

    2. Import the python package

    + +
    import tensor_comprehensions as tc
    +
    + +

    3. Define the TC expression and create a python function

    + +
    lang = """
    +def fcrelu(float(B,M) I, float(N,M) W1, float(N) B1) -> (O1) {
    +    O1(b, n) +=! I(b, m) * W1(n, m)
    +    O1(b, n) = O1(b, n) + B1(n)
    +    O1(b, n) = fmax(O1(b, n), 0)
    +}
    +"""
    +fcrelu = tc.define(lang, name="fcrelu")
    +
    + +

    This fcrelu function takes PyTorch Tensors as input and returns a PyTorch Tensor. It takes input I, weight W1, bias B1 and returns output O1.

    + +

    4. Let’s create some dummy input tensors

    + +
    B, M, N = 100, 128, 100
    +I, W1, B1 = torch.randn(B, M).cuda(), torch.randn(N, M).cuda(), torch.randn(N).cuda()
    +
    + +

    5. Now autotune the function for your input sizes

    + +
    fcrelu.autotune(I, W1, B1, cache="fcrelu_100_128_100.tc")
    +
    + +

    The autotuner is your biggest friend. You generally do not want to use a tc function without autotuning it first.

    + +

    When the autotuning is running, the current best performance is displayed. If you are satisfied with the current result or you are out of time, stop the tuning procedure by pressing Ctrl+C.

    + +

The cache argument saves the results of the autotuned kernel search to the file fcrelu_100_128_100.tc. The next time you call the same line of code, it loads the results of the autotuning without recomputing them.

    + +

    The autotuner has a few hyperparameters (just like your ConvNet has learning rate, number of layers, etc.). We pick reasonable defaults, but you can read about using advanced options here.

    + +

    6. Call the function with the inputs, to get your result

    + +
    out = fcrelu(I, W1, B1)
    +
    + +

    Now, let’s look at how to write TC expressions.

    + +

    A quick primer on the TC language

    + +

The TC notation focuses on the mathematical nature of the layer, leaving performance considerations to its backend code, which uses Halide and polyhedral compilation techniques that accumulate decades of cutting-edge Loop Nest Optimization (LNO) research.

    + +

TC is close to np.einsum. We shall quickly learn TC by example.

    + +
    lang = """
    +def matmul(float(M,N) A, float(N,K) B) -> (output) {
    +  output(i, j) +=! A(i, kk) * B(kk, j)
    +}
    +"""
    +
    + +

In this example, we define a function matmul which takes two inputs A and B of shapes M x N and N x K and returns a single output. The shape of output is automatically inferred by the TC language (discussed below).

    + +

    Let’s look at this line:

    + +
    output(i, j) +=! A(i, kk) * B(kk, j)
    +
    + +

    It says:

    + +
• output(i, j) means output is 2D.
• For each location output(i, j), we add (+=) A(i, kk) * B(kk, j).
• i is well-defined as all locations in A dim=0, i.e. i in range(0, M)
• j is well-defined as all locations in B dim=1, i.e. j in range(0, K)
• kk is inferred as all locations from 0 to N

    The shape of output is inferred from the maximum values i and j can take, which is M and K, so output is of size M x K.

    + +

    The ! symbol initializes output with 0.0. It is equivalent to:

    + +
    output(i, j) = 0
    +output(i, j) += A(i, kk) * B(kk, j)
    +
    + +
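Following the same pattern as the fcrelu example earlier, this matmul definition can be used as follows (a quick sketch that reuses the lang string above; the shapes are illustrative):

import torch
import tensor_comprehensions as tc

matmul = tc.define(lang, name="matmul")
A, B = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
out = matmul(A, B)  # output shape 3 x 5 is inferred from the TC definition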

    Scalar inputs and range constraints: implement AvgPool2d

    + +
    """
    +
    +def avgpool(float(B, C, H, W) input) -> (output) {{
    +  output(b, c, h, w) += input(b, c, h * {sH} + kh, w * {sW} + kw) where kh in 0:{kH}, kw in 0:{kW}
    +}}
    +
    +"""
    +avgpool = tc.define(LANG, name="avgpool", constants={"sH":1, "sW":1, "kH":2, "kW":2})
    +
    + +

Here the where keyword can take ranges of values to operate on. 0:{kH} is equivalent to range(kH) in Python.

    + +

    Note: the syntax for passing in scalars is subject to change in the next release.

    + +

    torch.nn layers

    + +

    We added some sugar-coating around the basic PyTorch integration of TC to make it easy to integrate TC into larger torch.nn models by defining the forward and backward TC expressions and taking Variable inputs / outputs.

    + +

    Some essentials that you will miss (we’re working on them)

    + +

    Autotuning for variable-length sequences

    + +

    The TC auto-tuner requires all input sizes to be specified before-hand. For example, if you have input I1 which is an image batch, the autotuner wants to know the exact shape of I1 to generate an optimized kernel. You cannot specify: image with height between 200 and 300. This is more essential in sequence data such as NLP, where each sentence can have a different length.

    + +

The reason the autotuner is non-parametric is that auto-tuning parametric constraints is much harder; this is an area of active research. Hence, for the first release, we made a conscious decision to give you the tool in a form where we know it works well.

    + +

    As a work-around, if you know that you have a few specific shapes of interest, you can run the autotuner with these multiple shapes.

    + +
    relu = tc.define(LANG, name="relu")
    +batch, channels = 16, 3
+relu.autotune((batch, channels, 32, 32)) # image of size 32 x 32
+relu.autotune((batch, channels, 48, 48)) # image of size 48 x 48
+relu.autotune((batch, channels, 64, 64)) # image of size 64 x 64
    +
    + +

    Now the autotuner is tuned for these three specific image sizes 32x32, 48x48 and 64x64.

    + +

    Lack of loops

    + +

    If you want to write an RNN, it’s easy to see it as a for loop over time. However, the TC language does not have loops yet. If you really want to write RNNs, you can write unrolled loops.

    + +

    Strided-Tensors

    + +

    The TC backend does not support non-contiguous Tensors yet. If the inputs you give are not contiguous, they are made contiguous before passing to the TC backend.

    + +

    Reshaping Tensors within a TC expression

    + +

You cannot write this operation in TC: torch.matmul(...).view(...).mean(...). Whenever a view is needed to change the shape of an input, you have to get the output and view it at the PyTorch level.

    + +

    Getting Started

    + +
• Walk through the Tutorial to quickly get started with understanding and using the Tensor Comprehensions PyTorch package.
• Over 20 examples of various ML layers with TC, including avgpool, maxpool, matmul, matmul - give output buffers and batch-matmul, convolution, strided-convolution, batchnorm, copy, cosine similarity, Linear, Linear + ReLU, group-convolutions, strided group-convolutions, indexing, Embedding (lookup table), small-mobilenet, softmax, tensordot, transpose.
• Detailed docs on Tensor Comprehensions and integration with PyTorch.

    Communication

    + +
• Slack: For discussion around framework integration, build support, collaboration, etc., join our Slack channel.
• Email: tensorcomp@fb.com
• GitHub: bug reports, feature requests, install issues, RFCs, thoughts, etc.

    Acknowledgements

    + +

We would like to thank Soumith Chintala, Edward Yang and Sam Gross for their immense guidance and help in making the integration API nice and smooth. We would also like to thank the rest of the PyTorch team and our pre-release users for their helpful feedback that guided us in making the integration better.

Efficient PyTorch: Tensor Memory Format Matters

by Dhruv Matani, Suraj Subramanian

    +

    Ensuring the right memory format for your inputs can significantly impact the running time of your PyTorch vision models. When in doubt, choose a Channels Last memory format.

    + +

When dealing with vision models in PyTorch that accept multimedia (for example image Tensors) as input, the Tensor’s memory format can significantly impact the inference execution speed of your model on mobile platforms when using the CPU backend along with XNNPACK. This holds true for training and inference on server platforms as well, but latency is particularly critical for mobile devices and users.

    + + + +

    Outline of this article

    +
1. Deep Dive into matrix storage/memory representation in C++. Introduction to Row and Column major order.
2. Impact of looping over a matrix in the same or different order as the storage representation, along with an example.
3. Introduction to Cachegrind, a tool to inspect the cache friendliness of your code.
4. Memory formats supported by PyTorch Operators.
5. Best practices example to ensure efficient model execution with XNNPACK optimizations.

    Matrix Storage Representation in C++

    + +

    Images are fed into PyTorch ML models as multi-dimensional Tensors. These Tensors have specific memory formats. To understand this concept better, let’s take a look at how a 2-d matrix may be stored in memory.

    + +

    Broadly speaking, there are 2 main ways of efficiently storing multi-dimensional data in memory.

    +
1. Row Major Order: In this format, the matrix is stored in row order, with each row stored before the next row in memory. I.e. row N comes before row N+1.
2. Column Major Order: In this format, the matrix is stored in column-order, with each column stored before the next column in memory. I.e. column N comes before column N+1.

    You can see the differences graphically below.

    + +

C++ stores multi-dimensional data in row-major format.

    + +

    Efficiently accessing elements of a 2d matrix

    + +

    Similar to the storage format, there are 2 ways to access data in a 2d matrix.

    + +
1. Loop Over Rows first: All elements of a row are processed before any element of the next row.
2. Loop Over Columns first: All elements of a column are processed before any element of the next column.

    For maximum efficiency, one should always access data in the same format in which it is stored. I.e. if the data is stored in row-major order, then one should try to access it in that order.

    + +

    The code below (main.cpp) shows 2 ways of accessing all the elements of a 2d 4000x4000 matrix.

    + +
    #include <iostream>
    +#include <chrono>
    +
    +// loop1 accesses data in matrix 'a' in row major order,
    +// since i is the outer loop variable, and j is the
    +// inner loop variable.
    +int loop1(int a[4000][4000]) {
    + int s = 0;
    + for (int i = 0; i < 4000; ++i) {
    +   for (int j = 0; j < 4000; ++j) {
    +     s += a[i][j];
    +   }
    + }
    + return s;
    +}
    +
    +// loop2 accesses data in matrix 'a' in column major order
    +// since j is the outer loop variable, and i is the
    +// inner loop variable.
    +int loop2(int a[4000][4000]) {
    + int s = 0;
    + for (int j = 0; j < 4000; ++j) {
    +   for (int i = 0; i < 4000; ++i) {
    +     s += a[i][j];
    +   }
    + }
    + return s;
    +}
    +
    +int main() {
    + static int a[4000][4000] = {0};
    + for (int i = 0; i < 100; ++i) {
    +   int x = rand() % 4000;
    +   int y = rand() % 4000;
    +   a[x][y] = rand() % 1000;
    + }
    +
    + auto start = std::chrono::high_resolution_clock::now();
    + auto end = start;
    + int s = 0;
    +
    +#if defined RUN_LOOP1
    + start = std::chrono::high_resolution_clock::now();
    +
    + s = 0;
    + for (int i = 0; i < 10; ++i) {
    +   s += loop1(a);
    +   s = s % 100;
    + }
    + end = std::chrono::high_resolution_clock::now();
    +
    + std::cout << "s = " << s << std::endl;
    + std::cout << "Time for loop1: "
    +   << std::chrono::duration<double, std::milli>(end - start).count()
    +   << "ms" << std::endl;
    +#endif
    +
    +#if defined RUN_LOOP2
    + start = std::chrono::high_resolution_clock::now();
    + s = 0;
    + for (int i = 0; i < 10; ++i) {
    +   s += loop2(a);
    +   s = s % 100;
    + }
    + end = std::chrono::high_resolution_clock::now();
    +
    + std::cout << "s = " << s << std::endl;
    + std::cout << "Time for loop2: "
    +   << std::chrono::duration<double, std::milli>(end - start).count()
    +   << "ms" << std::endl;
    +#endif
    +}
    +
    +
+Let’s build and run this program and see what it prints.
    +
    +g++ -O2 main.cpp -DRUN_LOOP1 -DRUN_LOOP2
    +./a.out
    +
    +
    +Prints the following:
    +
    +s = 70
    +Time for loop1: 77.0687ms
    +s = 70
    +Time for loop2: 1219.49ms
    +
    + +

    loop1() is 15x faster than loop2(). Why is that? Let’s find out below!

    + +

    Measure cache misses using Cachegrind

    + +

    Cachegrind is a cache profiling tool used to see how many I1 (first level instruction), D1 (first level data), and LL (last level) cache misses your program caused.

    + +

    Let’s build our program with just loop1() and just loop2() to see how cache friendly each of these functions is.

    + +

    Build and run/profile just loop1()

    + +
    g++ -O2 main.cpp -DRUN_LOOP1
    +valgrind --tool=cachegrind ./a.out
    +
    + +

    Prints:

    + +
    ==3299700==
    +==3299700== I   refs:      643,156,721
    +==3299700== I1  misses:          2,077
    +==3299700== LLi misses:          2,021
    +==3299700== I1  miss rate:        0.00%
    +==3299700== LLi miss rate:        0.00%
    +==3299700==
    +==3299700== D   refs:      160,952,192  (160,695,444 rd   + 256,748 wr)
    +==3299700== D1  misses:     10,021,300  ( 10,018,723 rd   +   2,577 wr)
    +==3299700== LLd misses:     10,010,916  ( 10,009,147 rd   +   1,769 wr)
    +==3299700== D1  miss rate:         6.2% (        6.2%     +     1.0%  )
    +==3299700== LLd miss rate:         6.2% (        6.2%     +     0.7%  )
    +==3299700==
    +==3299700== LL refs:        10,023,377  ( 10,020,800 rd   +   2,577 wr)
    +==3299700== LL misses:      10,012,937  ( 10,011,168 rd   +   1,769 wr)
    +==3299700== LL miss rate:          1.2% (        1.2%     +     0.7%  )
    +
    + +

    Build and run/profile just loop2()

    + +
    g++ -O2 main.cpp -DRUN_LOOP2
    +valgrind --tool=cachegrind ./a.out
    +
    + +

    Prints:

    + +
    ==3300389==
    +==3300389== I   refs:      643,156,726
    +==3300389== I1  misses:          2,075
    +==3300389== LLi misses:          2,018
    +==3300389== I1  miss rate:        0.00%
    +==3300389== LLi miss rate:        0.00%
    +==3300389==
    +==3300389== D   refs:      160,952,196  (160,695,447 rd   + 256,749 wr)
    +==3300389== D1  misses:    160,021,290  (160,018,713 rd   +   2,577 wr)
    +==3300389== LLd misses:     10,014,907  ( 10,013,138 rd   +   1,769 wr)
    +==3300389== D1  miss rate:        99.4% (       99.6%     +     1.0%  )
    +==3300389== LLd miss rate:         6.2% (        6.2%     +     0.7%  )
    +==3300389==
    +==3300389== LL refs:       160,023,365  (160,020,788 rd   +   2,577 wr)
    +==3300389== LL misses:      10,016,925  ( 10,015,156 rd   +   1,769 wr)
    +==3300389== LL miss rate:          1.2% (        1.2%     +     0.7%  )
    +
    + +

    The main differences between the 2 runs are:

    +
1. D1 misses: 10M vs. 160M
2. D1 miss rate: 6.2% vs. 99.4%

As you can see, loop2() causes many more (~16x more) L1 data cache misses than loop1(). This is why loop1() is ~15x faster than loop2().

    + +

    Memory Formats supported by PyTorch Operators

    + +

    While PyTorch operators expect all tensors to be in Channels First (NCHW) dimension format, PyTorch operators support 3 output memory formats.

    + +
1. Contiguous: Tensor memory is in the same order as the tensor’s dimensions.
2. ChannelsLast: Irrespective of the dimension order, the 2d (image) tensor is laid out as an HWC or NHWC (N: batch, H: height, W: width, C: channels) tensor in memory. The dimensions could be permuted in any order.
3. ChannelsLast3d: For 3d tensors (video tensors), the memory is laid out in THWC (Time, Height, Width, Channels) or NTHWC (N: batch, T: time, H: height, W: width, C: channels) format. The dimensions could be permuted in any order.

    The reason that ChannelsLast is preferred for vision models is because XNNPACK (kernel acceleration library) used by PyTorch expects all inputs to be in Channels Last format, so if the input to the model isn’t channels last, then it must first be converted to channels last, which is an additional operation.

    + +

    Additionally, most PyTorch operators preserve the input tensor’s memory format, so if the input is Channels First, then the operator needs to first convert to Channels Last, then perform the operation, and then convert back to Channels First.
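For example, a quick way to see this in practice is to check the memory format of an operator’s output (a small sketch; the exact behavior can vary by operator and PyTorch build):

import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)
x = torch.rand(1, 3, 32, 32).to(memory_format=torch.channels_last)

y = conv(x)
# The convolution is expected to preserve the input's channels-last memory format.
print(y.is_contiguous(memory_format=torch.channels_last))  # True on recent builds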

    + +

When you combine this with the fact that accelerated operators work better with a channels-last memory format, you’ll notice that having an operator return a channels-last output is better for subsequent operator calls; otherwise every operator would have to convert its input to channels-last whenever that is more efficient for that specific operator.

    + +

    From the XNNPACK home page:

    + +
    +

    “All operators in XNNPACK support NHWC layout, but additionally allow custom stride along the Channel dimension”.

    +
    + +

    PyTorch Best Practice

    + +

    The best way to get the most performance from your PyTorch vision models is to ensure that your input tensor is in a Channels Last memory format before it is fed into the model.

    + +

    You can get even more speedups by optimizing your model to use the XNNPACK backend (by simply calling optimize_for_mobile() on your torchscripted model). Note that XNNPACK models will run slower if the inputs are contiguous, so definitely make sure it is in Channels-Last format.

    + +

    Working example showing speedup

    + +

    Run this example on Google Colab - note that runtimes on colab CPUs might not reflect accurate performance; it is recommended to run this code on your local machine.

    + +
    import torch
    +from torch.utils.mobile_optimizer import optimize_for_mobile
    +import torch.backends.xnnpack
    +import time
    +
    +print("XNNPACK is enabled: ", torch.backends.xnnpack.enabled, "\n")
    +
    +N, C, H, W = 1, 3, 200, 200
    +x = torch.rand(N, C, H, W)
    +print("Contiguous shape: ", x.shape)
    +print("Contiguous stride: ", x.stride())
    +print()
    +
    +xcl = x.to(memory_format=torch.channels_last)
    +print("Channels-Last shape: ", xcl.shape)
    +print("Channels-Last stride: ", xcl.stride())
    +
    +## Outputs:
    + 
    +# XNNPACK is enabled:  True
    + 
    +# Contiguous shape:  torch.Size([1, 3, 200, 200])
    +# Contiguous stride:  (120000, 40000, 200, 1)
    + 
    +# Channels-Last shape:  torch.Size([1, 3, 200, 200])
    +# Channels-Last stride:  (120000, 1, 600, 3)
    +
    +
    + +

The input shape stays the same for contiguous and channels-last formats. Internally however, the tensor’s layout has changed, as you can see in the strides. Now, the number of jumps required to go across channels is only 1 (instead of 40000 in the contiguous tensor). This better data locality means convolution layers can access all the channels for a given pixel much faster. Let’s see now how the memory format affects runtime:

    + +
    from torchvision.models import resnet34, resnet50, resnet101
    +
    +m = resnet34(pretrained=False)
    +# m = resnet50(pretrained=False)
    +# m = resnet101(pretrained=False)
    +
    +def get_optimized_model(mm):
    +  mm = mm.eval()
    +  scripted = torch.jit.script(mm)
    +  optimized = optimize_for_mobile(scripted)  # explicitly call the xnnpack rewrite 
    +  return scripted, optimized
    +
    +
    +def compare_contiguous_CL(mm):
    +  # inference on contiguous
    +  start = time.perf_counter()
    +  for i in range(20):
    +    mm(x)
    +  end = time.perf_counter()
    +  print("Contiguous: ", end-start)
    +
    +  # inference on channels-last
    +  start = time.perf_counter()
    +  for i in range(20):
    +    mm(xcl)
    +  end = time.perf_counter()
    +  print("Channels-Last: ", end-start)
    +
    +with torch.inference_mode():
    +  scripted, optimized = get_optimized_model(m)
    +
    +  print("Runtimes for torchscripted model: ")
    +  compare_contiguous_CL(scripted.eval())
    +  print()
    +  print("Runtimes for mobile-optimized model: ")
    +  compare_contiguous_CL(optimized.eval())
    +
    +   
    +## Outputs (on an Intel Core i9 CPU):
    + 
    +# Runtimes for torchscripted model:
    +# Contiguous:  1.6711160129999598
    +# Channels-Last:  1.6678222839999535
    + 
    +# Runtimes for mobile-optimized model:
    +# Contiguous:  0.5712863490000473
    +# Channels-Last:  0.46113000699995155
    +
    +
    + +

    Conclusion

    + +

    The Memory Layout of an input tensor can significantly impact a model’s running time. For Vision Models, prefer a Channels Last memory format to get the most out of your PyTorch models.

    + +

The road to 1.0: production ready PyTorch

by The PyTorch Team

    +

We would like to give you a preview of the roadmap for PyTorch 1.0, the next release of PyTorch. Over the last year, we’ve had 0.2, 0.3 and 0.4 transform PyTorch from a [Torch+Chainer]-like interface into something cleaner, adding double-backwards, numpy-like functions, advanced indexing and removing Variable boilerplate. At this time, we’re confident that the API is in a reasonable and stable state to release 1.0.

    + +

    However, 1.0 isn’t just about stability of the interface.

    + +

    One of PyTorch’s biggest strengths is its first-class Python integration, imperative style, simplicity of the API and options. These are aspects that make PyTorch good for research and hackability.

    + +

    One of its biggest downsides has been production-support. What we mean by production-support is the countless things one has to do to models to run them efficiently at massive scale:

    + +
• exporting to C++-only runtimes for use in larger projects
• optimizing mobile systems on iPhone, Android, Qualcomm and other systems
• using more efficient data layouts and performing kernel fusion to do faster inference (saving 10% of speed or memory at scale is a big win)
• quantized inference (such as 8-bit inference)

    Startups, large companies and anyone who wants to build a product around PyTorch have asked for production support. At Facebook (the largest stakeholder for PyTorch) we have Caffe2, which has been the production-ready platform, running in our datacenters and shipping to more than 1 billion phones spanning eight generations of iPhones and six generations of Android CPU architectures. It has server-optimized inference on Intel / ARM, TensorRT support, and all the necessary bits for production. Considering all this value locked-in to a platform that the PyTorch team works quite closely with, we decided to marry PyTorch and Caffe2 which gives the production-level readiness for PyTorch.

    + +

    Supporting production features without adding usability issues for our researchers and end-users needs creative solutions.

    + +

    Production != Pain for researchers

    + +

    Adding production capabilities involves increasing the API complexity and number of configurable options for models. One configures memory-layouts (NCHW vs NHWC vs N,C/32,H,W,32, each providing different performance characteristics), quantization (8-bit? 3-bit?), fusion of low-level kernels (you used a Conv + BatchNorm + ReLU, let’s fuse them into a single kernel), separate backend options (MKLDNN backend for a few layers and NNPACK backend for other layers), etc.

    + +

    PyTorch’s central goal is to provide a great platform for research and hackability. So, while we add all these optimizations, we’ve been working with a hard design constraint to never trade these off against usability.

    + +

    To pull this off, we are introducing torch.jit, a just-in-time (JIT) compiler that at runtime takes your PyTorch models and rewrites them to run at production-efficiency. The JIT compiler can also export your model to run in a C++-only runtime based on Caffe2 bits.

    + +
    +

    In 1.0, your code continues to work as-is, we’re not making any big changes to the existing API.

    +
    + +

Making your model production-ready is an opt-in annotation, which uses the torch.jit compiler to export your model to a Python-less environment and improve its performance. Let’s walk through the JIT compiler in detail.

    + +

    torch.jit: A JIT-compiler for your models

    + +

    We strongly believe that it’s hard to match the productivity you get from specifying your models directly as idiomatic Python code. This is what makes PyTorch so flexible, but it also means that PyTorch pretty much never knows the operation you’ll run next. This however is a big blocker for export/productionization and heavyweight automatic performance optimizations because they need full upfront knowledge of how the computation will look before it even gets executed.

    + +

    We provide two opt-in ways of recovering this information from your code, one based on tracing native python code and one based on compiling a subset of the python language annotated into a python-free intermediate representation. After thorough discussions we concluded that they’re both going to be useful in different contexts, and as such you will be able to mix and match them freely.

    + +

    Tracing Mode

    + +

    The PyTorch tracer, torch.jit.trace, is a function that records all the native PyTorch operations performed in a code region, along with the data dependencies between them. In fact, PyTorch has had a tracer since 0.3, which has been used for exporting models through ONNX. What changes now, is that you no longer necessarily need to take the trace and run it elsewhere - PyTorch can re-execute it for you, using a carefully designed high-performance C++ runtime. As we develop PyTorch 1.0 this runtime will integrate all the optimizations and hardware integrations that Caffe2 provides.

    + +

    The biggest benefit of this approach is that it doesn’t really care how your Python code is structured — you can trace through generators or coroutines, modules or pure functions. Since we only record native PyTorch operators, these details have no effect on the trace recorded. This behavior, however, is a double-edged sword. For example, if you have a loop in your model, it will get unrolled in the trace, inserting a copy of the loop body for as many times as the loop ran. This opens up opportunities for zero-cost abstraction (e.g. you can loop over modules, and the actual trace will be loop-overhead free!), but on the other hand this will also affect data dependent loops (think of e.g. processing sequences of varying lengths), effectively hard-coding a single length into the trace.

    + +

    For networks that do not contain loops and if statements, tracing is non-invasive and is robust enough to handle a wide variety of coding styles. This code example illustrates what tracing looks like:

    + +
    # This will run your nn.Module or regular Python function with the example
    +# input that you provided. The returned callable can be used to re-execute
    +# all operations that happened during the example run, but it will no longer
    +# use the Python interpreter.
    +from torch.jit import trace
    +traced_model = trace(model, example_input=input)
    +traced_fn = trace(fn, example_input=input)
    +
    +# The training loop doesn't change. Traced model behaves exactly like an
    +# nn.Module, except that you can't edit what it does or change its attributes.
    +# Think of it as a "frozen module".
    +for input, target in data_loader:
    +    loss = loss_fn(traced_model(input), target)
    +
    + +

    Script Mode

    + +

    Tracing mode is a great way to minimize the impact on your code, but we’re also very excited about the models that fundamentally make use of control flow such as RNNs. Our solution to this is a scripting mode.

    + +

    In this case you write out a regular Python function, except that you can no longer use certain more complicated language features. Once you isolated the desired functionality, you let us know that you’d like the function to get compiled by decorating it with an @script decorator. This annotation will transform your python function directly into our high-performance C++ runtime. This lets us recover all the PyTorch operations along with loops and conditionals. They will be embedded into our internal representation of this function, and will be accounted for every time this function is run.

    + +
    from torch.jit import script
    +
    +@script
    +def rnn_loop(x):
    +    hidden = None
    +    for x_t in x.split(1):
+        x, hidden = model(x_t, hidden)
    +    return x
    +
    + +

    Optimization and Export

    + +

    Regardless of whether you use tracing or @script, the result is a python-free representation of your model, which can be used to optimize the model or to export the model from python for use in production environments.

    + +

    Extracting bigger segments of the model into an intermediate representation makes it possible to do sophisticated whole-program optimizations and to offload computation to specialized AI accelerators which operate on graphs of computation. We have already been developing the beginnings of these optimizations, including passes that fuse GPU operations together to improve the performance of smaller RNN models.

    + +

    It also allows us to use existing high-performance backends available in Caffe2 today to run the model efficiently. Additionally, @script functions (and modules!) can be fully exported to ONNX in a way that retains their dynamic nature, such that you can easily run them in a Python-free environment using the model executors from Caffe2 or by transferring the model to any other framework supporting ONNX.

    + +

    Usability

    + +

We care deeply about maintaining our current level of usability, and we know that executing code outside of Python makes debugging harder. This is something we think about a lot, and we’re making sure that you don’t get locked into a completely different programming language.

    + +

    First, we follow the principle of pay for what you use — if you don’t need to optimize or export your model, you do not have to use these new features and won’t see any downsides. Furthermore, use of traced or @script modules/functions can be done incrementally. For instance, all of these behaviors are allowed: You can trace part of your model and use the trace in a larger non-traced model. You can use tracing for 90% of your model, and use @script for the one sub-module that actually has some control flow in it. You can write a function using @script and have it call a native python function. If something appears incorrect in an @script function, you can remove the annotation and the code will execute in native python where it is easy to debug using your favorite tools and methods. Think of tracing and @script like type annotations using MyPy or TypeScript — each additional annotation can be tested incrementally, and none are required until you want to optimize or productionize.
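To make this incremental workflow concrete, here is a minimal sketch of mixing the two modes. It is written against the torch.jit API that eventually shipped (torch.jit.trace / torch.jit.script) rather than the exact preview signatures shown in this post, and the module and function names are purely illustrative.

import torch
import torch.nn as nn

# Control flow is preserved by scripting, so this loop survives compilation.
@torch.jit.script
def clip_loop(x: torch.Tensor, steps: int) -> torch.Tensor:
    for _ in range(steps):
        x = torch.clamp(x, -1.0, 1.0) * 0.9
    return x

# A plain nn.Module with no control flow is a good candidate for tracing.
encoder = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
traced_encoder = torch.jit.trace(encoder, torch.randn(2, 8))

def forward(inp: torch.Tensor, steps: int) -> torch.Tensor:
    # Traced and scripted pieces compose freely with ordinary Python code.
    return clip_loop(traced_encoder(inp), steps)

out = forward(torch.randn(2, 8), steps=3)

Removing either annotation leaves plain eager-mode PyTorch, which is what keeps the adoption incremental.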

    + +

    Most importantly, these modes will be built into the core of PyTorch so that mixing and matching them with your existing code can happen seamlessly.

    + +

Note: The name JIT for these components is a bit of a misnomer and exists for historical reasons. The tracing/function execution in PyTorch started out as an optimizing JIT compiler that generated fused CUDA kernels, but then grew to encompass optimization, @script, and export. When it is ready for release we will likely rename this functionality to the hybrid frontend, but we wanted to present it here as it is named in the code so that you can follow along as we develop it.

    + +

    Other changes and improvements

    + +

Production support is the big feature for 1.0, but we will continue optimizing and fixing other parts of PyTorch as part of the standard release process.

    + +

    On the backend side of things, PyTorch will see some changes, which might affect user-written C and C++ extensions. We are replacing (or refactoring) the backend ATen library to incorporate features and optimizations from Caffe2.

    + +

    Last Words

    + +

We aim to release 1.0 some time during the summer. You can follow along with our progress on the Pull Requests page.

    + +

    You can read this from the perspective of the Caffe2 project at: https://caffe2.ai/blog/2018/05/02/Caffe2_PyTorch_1_0.html

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/the-torch.fft-module-accelerated-fast-fourier-transforms-with-autograd-in-pyTorch/index.html b/blog/the-torch.fft-module-accelerated-fast-fourier-transforms-with-autograd-in-pyTorch/index.html new file mode 100644 index 000000000000..86b4354b8a03 --- /dev/null +++ b/blog/the-torch.fft-module-accelerated-fast-fourier-transforms-with-autograd-in-pyTorch/index.html @@ -0,0 +1,711 @@ + + + + + + + + + + + + + The torch.fft module: Accelerated Fast Fourier Transforms with Autograd in PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Mike Ruberry, Peter Bell, and Joe Spisak + +

    +

    The Fast Fourier Transform (FFT) calculates the Discrete Fourier Transform in O(n log n) time. It is foundational to a wide variety of numerical algorithms and signal processing techniques since it makes working in signals’ “frequency domains” as tractable as working in their spatial or temporal domains.

    + +

    As part of PyTorch’s goal to support hardware-accelerated deep learning and scientific computing, we have invested in improving our FFT support, and with PyTorch 1.8, we are releasing the torch.fft module. This module implements the same functions as NumPy’s np.fft module, but with support for accelerators, like GPUs, and autograd.

    + +

    Getting started

    + +

Getting started with the new torch.fft module is easy whether you are familiar with NumPy’s np.fft module or not. While complete documentation for each function in the module can be found here, in brief the module offers:

    + +
      +
    • fft, which computes a complex FFT over a single dimension, and ifft, its inverse
    • +
    • the more general fftn and ifftn, which support multiple dimensions
    • +
    • The “real” FFT functions, rfft, irfft, rfftn, irfftn, designed to work with signals that are real-valued in their time domains
    • +
    • The “Hermitian” FFT functions, hfft and ihfft, designed to work with signals that are real-valued in their frequency domains
    • +
    • Helper functions, like fftfreq, rfftfreq, fftshift, ifftshift, that make it easier to manipulate signals
    • +
    + +

    We think these functions provide a straightforward interface for FFT functionality, as vetted by the NumPy community, although we are always interested in feedback and suggestions!

    + +

    To better illustrate how easy it is to move from NumPy’s np.fft module to PyTorch’s torch.fft module, let’s look at a NumPy implementation of a simple low-pass filter that removes high-frequency variance from a 2-dimensional image, a form of noise reduction or blurring:

    + +
    import numpy as np
    +import numpy.fft as fft
    +
    +def lowpass_np(input, limit):
    +    pass1 = np.abs(fft.rfftfreq(input.shape[-1])) < limit
    +    pass2 = np.abs(fft.fftfreq(input.shape[-2])) < limit
    +    kernel = np.outer(pass2, pass1)
    +    
    +    fft_input = fft.rfft2(input)
    +    return fft.irfft2(fft_input * kernel, s=input.shape[-2:])
    +
    + +

    Now let’s see the same filter implemented in PyTorch:

    + +
    import torch
    +import torch.fft as fft
    +
    +def lowpass_torch(input, limit):
    +    pass1 = torch.abs(fft.rfftfreq(input.shape[-1])) < limit
    +    pass2 = torch.abs(fft.fftfreq(input.shape[-2])) < limit
    +    kernel = torch.outer(pass2, pass1)
    +    
    +    fft_input = fft.rfft2(input)
    +    return fft.irfft2(fft_input * kernel, s=input.shape[-2:])
    +
    + +

    Not only do current uses of NumPy’s np.fft module translate directly to torch.fft, the torch.fft operations also support tensors on accelerators, like GPUs and autograd. This makes it possible to (among other things) develop new neural network modules using the FFT.
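As a small illustration of that last point, here is a minimal sketch (not part of the original example) showing that torch.fft functions run on a CUDA device when one is available and participate in autograd, which is what makes FFT-based network modules trainable:

import torch
import torch.fft as fft

device = "cuda" if torch.cuda.is_available() else "cpu"

# A batch of real-valued signals we want gradients for.
signal = torch.randn(4, 1024, device=device, requires_grad=True)

spectrum = fft.rfft(signal)            # complex output, computed on `device`
energy = spectrum.abs().pow(2).sum()   # a scalar "loss" defined on the spectrum
energy.backward()                      # gradients flow back through the FFT

print(signal.grad.shape)               # torch.Size([4, 1024])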

    + +

    Performance

    + +

The torch.fft module is not only easy to use — it is also fast! PyTorch natively supports Intel’s MKL-FFT library on Intel CPUs, and NVIDIA’s cuFFT library on CUDA devices, and we have carefully optimized how we use those libraries to maximize performance. While your own results will depend on your CPU and CUDA hardware, computing Fast Fourier Transforms on CUDA devices can be many times faster than computing them on the CPU, especially for larger signals.
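If you want to measure this on your own hardware, a quick sketch using PyTorch’s built-in benchmark utility might look like the following (the sizes are arbitrary and the timings depend entirely on your machine):

import torch
import torch.utils.benchmark as benchmark

x_cpu = torch.randn(4096, 4096)

# benchmark.Timer makes torch available in its globals by default.
t_cpu = benchmark.Timer(stmt="torch.fft.rfft2(x)", globals={"x": x_cpu})
print(t_cpu.timeit(10))

if torch.cuda.is_available():
    x_cuda = x_cpu.cuda()
    t_cuda = benchmark.Timer(stmt="torch.fft.rfft2(x)", globals={"x": x_cuda})
    print(t_cuda.timeit(10))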

    + +

    In the future, we may add support for additional math libraries to support more hardware. See below for where you can request additional hardware support.

    + +

    Updating from older PyTorch versions

    + +

    Some PyTorch users might know that older versions of PyTorch also offered FFT functionality with the torch.fft() function. Unfortunately, this function had to be removed because its name conflicted with the new module’s name, and we think the new functionality is the best way to use the Fast Fourier Transform in PyTorch. In particular, torch.fft() was developed before PyTorch supported complex tensors, while the torch.fft module was designed to work with them.

    + +

    PyTorch also has a “Short Time Fourier Transform”, torch.stft, and its inverse torch.istft. These functions are being kept but updated to support complex tensors.
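For reference, here is a minimal sketch (not part of the original post) of the complex-tensor form of these two functions:

import torch

signal = torch.randn(4000)
window = torch.hann_window(512)

# return_complex=True yields a complex spectrogram instead of stacked real/imag parts.
spec = torch.stft(signal, n_fft=512, window=window, return_complex=True)
reconstructed = torch.istft(spec, n_fft=512, window=window, length=signal.numel())

print(spec.dtype, spec.shape)                            # complex64, (257, n_frames)
print(torch.allclose(reconstructed, signal, atol=1e-5))  # True, up to float error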

    + +

    Future

    + +

    As mentioned, PyTorch 1.8 offers the torch.fft module, which makes it easy to use the Fast Fourier Transform (FFT) on accelerators and with support for autograd. We encourage you to try it out!

    + +

    While this module has been modeled after NumPy’s np.fft module so far, we are not stopping there. We are eager to hear from you, our community, on what FFT-related functionality you need, and we encourage you to create posts on our forums at https://discuss.pytorch.org/, or file issues on our Github with your feedback and requests. Early adopters have already started asking about Discrete Cosine Transforms and support for more hardware platforms, for example, and we are investigating those features now.

    + +

    We look forward to hearing from you and seeing what the community does with PyTorch’s new FFT functionality!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torch-linalg-autograd/index.html b/blog/torch-linalg-autograd/index.html new file mode 100644 index 000000000000..89fbb2968324 --- /dev/null +++ b/blog/torch-linalg-autograd/index.html @@ -0,0 +1,793 @@ + + + + + + + + + + + + + The torch.linalg module: Accelerated Linear Algebra with Autograd in PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Mike Ruberry, Ivan Yashchuk, Xiao Wang, Mario Lezcano and Natalia Gimelshein + +

    +

Linear algebra is essential to deep learning and scientific computing, and it’s always been a core part of PyTorch. PyTorch 1.9 extends PyTorch’s support for linear algebra operations with the torch.linalg module. This module, documented here, has 26 operators, including faster and easier-to-use versions of older PyTorch operators, every function from NumPy’s linear algebra module extended with accelerator and autograd support, and a few operators that are completely new. This makes the torch.linalg module immediately familiar to NumPy users and an exciting update to PyTorch’s linear algebra support.

    + +

    NumPy-like linear algebra in PyTorch

    + +

If you’re familiar with NumPy’s linear algebra module then it’ll be easy to start using torch.linalg. In most cases it’s a drop-in replacement. As a motivating example, let’s look at drawing samples from a multivariate normal distribution using the Cholesky decomposition:

    + +
    import numpy as np
    +
    +# Creates inputs
    +np.random.seed(0)
    +mu_np = np.random.rand(4)
    +L = np.random.rand(4, 4)
    +# Covariance matrix sigma is positive-definite
    +sigma_np = L @ L.T + np.eye(4)
    +normal_noise_np = np.random.standard_normal(mu_np.size)
    +
    +def multivariate_normal_sample_np(mu, sigma, normal_noise):
    +    return mu + np.linalg.cholesky(sigma) @ normal_noise
    +
    +print("Random sample: ", 
    +      multivariate_normal_sample_np(mu_np, sigma_np, normal_noise_np))
    +: Random sample: [2.9502426 1.78518077 1.83168697 0.90798228]
    +
    + +

    Now let’s see the same sampler implemented in PyTorch:

    + +
    import torch
    +
    +def multivariate_normal_sample_torch(mu, sigma, normal_noise):
    +    return mu + torch.linalg.cholesky(sigma) @ normal_noise
    +
    + +

    The two functions are identical, and we can validate their behavior by calling the function with the same arguments wrapped as PyTorch tensors:

    + +
    # NumPy arrays are wrapped as tensors and share their memory
    +mu_torch = torch.from_numpy(mu_np)
    +sigma_torch = torch.from_numpy(sigma_np)
    +normal_noise_torch = torch.from_numpy(normal_noise_np)
    +
    +multivariate_normal_sample_torch(mu_torch, sigma_torch, normal_noise_torch)
    +: tensor([2.9502, 1.7852, 1.8317, 0.9080], dtype=torch.float64)
    +
    + +

    The only difference is in how PyTorch prints tensors by default.

    + +

The Cholesky decomposition can also help us quickly compute the probability density function of the non-degenerate multivariate normal distribution. One of the expensive terms in that computation is the square root of the determinant of the covariance matrix. Using properties of the determinant and the Cholesky decomposition, however, we can calculate the same result faster than the naive computation. Here’s the NumPy program that demonstrates this:

    + +
    sqrt_sigma_det_np = np.sqrt(np.linalg.det(sigma_np))
    +sqrt_L_det_np = np.prod(np.diag(np.linalg.cholesky(sigma_np)))
    +
    +print("|sigma|^0.5 = ", sqrt_sigma_det_np)
    +: |sigma|^0.5 = 4.237127491242027
    + 
    +print("|L| = ", sqrt_L_det_np)
    +: |L| = 4.237127491242028
    +
    + +

    And here’s the same validation in PyTorch:

    + +
    sqrt_sigma_det_torch = torch.sqrt(torch.linalg.det(sigma_torch))
    +sqrt_L_det_torch = torch.prod(torch.diag(torch.linalg.cholesky(sigma_torch)))
    +
    +print("|sigma|^0.5 = ", sqrt_sigma_det_torch)
    +: |sigma|^0.5 = tensor(4.2371, dtype=torch.float64) 
    +
    +print("|L| = ", sqrt_L_det_torch)
    +: |L| = tensor(4.2371, dtype=torch.float64)
    +
    + +

    We can measure the difference in run time using PyTorch’s built-in benchmark utility:

    + +
    import torch.utils.benchmark as benchmark
    +
    +t0 = benchmark.Timer(
    +    stmt='torch.sqrt(torch.linalg.det(sigma))',
    +    globals={'sigma': sigma_torch})
    +
    +t1 = benchmark.Timer(
    +    stmt='torch.prod(torch.diag(torch.linalg.cholesky(sigma)))',
    +    globals={'sigma': sigma_torch})
    +
    +print(t0.timeit(100))
    +: torch.sqrt(torch.linalg.det(sigma))
    +  80.80 us
    +  1 measurement, 100 runs , 1 thread
    +
    +
    +print(t1.timeit(100))
    +: torch.prod(torch.diag(torch.linalg.cholesky(sigma)))
    +  11.56 us
    +  1 measurement, 100 runs , 1 thread
    +
    + +

This demonstrates that the approach using the Cholesky decomposition can be significantly faster. Behind the scenes, PyTorch’s linear algebra module uses OpenBLAS or MKL implementations of the LAPACK standard to maximize its CPU performance.

    + +

    Autograd Support

    + +

    PyTorch’s linear algebra module doesn’t just implement the same functions as NumPy’s linear algebra module (and a few more), it also extends them with autograd and CUDA support.

    + +

    Let’s look at a very simple program that just computes an inverse and the gradient of that operation to show how autograd works:

    + +
    t = torch.tensor(((1, 2), (3, 4)), dtype=torch.float32, requires_grad=True)
    +
    +inv = torch.linalg.inv(t)
    +inv.backward(torch.ones_like(inv))
    +
    +print(t.grad)
    +: tensor([[-0.5000, 0.5000],
    +          [ 0.5000, -0.5000]])
    +
    + +

    We can mimic the same computation in NumPy by defining the autograd formula ourselves:

    + +
    a = np.array(((1, 2), (3, 4)), dtype=np.float32)
    +
    +inv_np = np.linalg.inv(a)
    +
    +def inv_backward(result, grad):
    +    return -(result.transpose(-2, -1) @ (grad @ result.transpose(-2, -1)))
    +grad_np = inv_backward(inv_np, np.ones_like(inv_np))
    +
    +print(grad_np)
    +: [[-0.5 0.5]
    +   [ 0.5 -0.5]]
    +
    + +

    Of course, as programs become more complicated it’s convenient to have builtin autograd support, and PyTorch’s linear algebra module supports both real and complex autograd.

    + +

    CUDA Support

    + +

    Support for autograd and accelerators, like CUDA devices, is a core part of PyTorch. The torch.linalg module was developed with NVIDIA’s PyTorch and cuSOLVER teams, who helped optimize its performance on CUDA devices with the cuSOLVER, cuBLAS, and MAGMA libraries. These improvements make PyTorch’s CUDA linear algebra operations faster than ever. For example, let’s look at the performance of PyTorch 1.9’s torch.linalg.cholesky vs. PyTorch 1.8’s (now deprecated) torch.cholesky:

    + +
    + +
    + +

    (The above charts were created using an Ampere A100 GPU with CUDA 11.3, cuSOLVER 11.1.1.58, and MAGMA 2.5.2. Matrices are in double precision.)

    + +

    These charts show that performance has increased significantly on larger matrices, and that batched performance is better across the board. Other linear algebra operations, including torch.linalg.qr and torch.linalg.lstsq, have also had their CUDA performance improved.

    + +

    Beyond NumPy

    + +

    In addition to offering all the functions in NumPy’s linear algebra module with support for autograd and accelerators, torch.linalg has a few new functions of its own. NumPy’s linalg.norm does not allow users to compute vector norms over arbitrary subsets of dimensions, so to enable this functionality we added torch.linalg.vector_norm. We’ve also started modernizing other linear algebra functionality in PyTorch, so we created torch.linalg.householder_product to replace the older torch.orgqr, and we plan to continue adding more linear algebra functionality in the future, too.
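As a small, hedged sketch of that first addition (the shapes below are illustrative, not from the post), torch.linalg.vector_norm reduces over whatever subset of dimensions you pass:

import torch

batch = torch.randn(8, 3, 64, 64)

# One Euclidean norm per (batch, channel) pair, reducing over the spatial dims.
per_channel = torch.linalg.vector_norm(batch, ord=2, dim=(-2, -1))
print(per_channel.shape)  # torch.Size([8, 3])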

    + +

    The Future of Linear Algebra in PyTorch

    + +

    The torch.linalg module is fast and familiar with great support for autograd and accelerators. It’s already being used in libraries like botorch, too. But we’re not stopping here. We plan to continue updating more of PyTorch’s existing linear algebra functionality (like torch.lobpcg) and offering more support for low rank and sparse linear algebra. We also want to hear your feedback on how we can improve, so start a conversation on the forum or file an issue on our Github and share your thoughts.

    + +

    We look forward to hearing from you and seeing what the community does with PyTorch’s new linear algebra functionality!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchchat-local-llm-inference/index.html b/blog/torchchat-local-llm-inference/index.html new file mode 100644 index 000000000000..0babc629281e --- /dev/null +++ b/blog/torchchat-local-llm-inference/index.html @@ -0,0 +1,797 @@ + + + + + + + + + + + + + Introducing torchchat: Accelerating Local LLM Inference on Laptop, Desktop and Mobile | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Team PyTorch + +

    +

    Today, we’re releasing torchchat, a library showcasing how to seamlessly and performantly run Llama 3, 3.1, and other large language models across laptop, desktop, and mobile.

    + +

In our previous blog posts, we showed how to use native PyTorch 2 to run LLMs with great performance using CUDA. Torchchat expands on this with more target environments, models, and execution modes. Additionally, it provides important functions such as export, quantization, and eval in an easy-to-understand way, providing an end-to-end story for those who want to build a local inference solution.

    + +

    You will find the project organized into three areas:

    + +
      +
    • Python: Torchchat provides a REST API that is called via a Python CLI or can be accessed via the browser
    • +
    • C++: Torchchat produces a desktop-friendly binary using PyTorch’s AOTInductor backend
    • +
    • Mobile devices: Torchchat uses ExecuTorch to export a .pte binary file for on-device inference
    • +
    + +

    torchchat schema

    + +

    Performance

    + +

    The following table tracks the performance of torchchat for Llama 3 for a variety of configurations.
    +Numbers for Llama 3.1 are coming soon.

    + +

    Llama 3 8B Instruct on Apple MacBook Pro M1 Max 64GB Laptop

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Mode        | DType   | Llama 3 8B Tokens/Sec
------------|---------|----------------------
Arm Compile | float16 | 5.84
Arm Compile | int8    | 1.63
Arm Compile | int4    | 3.99
Arm AOTI    | float16 | 4.05
Arm AOTI    | int8    | 1.05
Arm AOTI    | int4    | 3.28
MPS Eager   | float16 | 12.63
MPS Eager   | int8    | 16.9
MPS Eager   | int4    | 17.15
    + +

    Llama 3 8B Instruct on Linux x86 and CUDA
    +Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz with 180GB Ram + A100 (80GB)

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Mode         | DType    | Llama 3 8B Tokens/Sec
-------------|----------|----------------------
x86 Compile  | bfloat16 | 2.76
x86 Compile  | int8     | 3.15
x86 Compile  | int4     | 5.33
CUDA Compile | bfloat16 | 83.23
CUDA Compile | int8     | 118.17
CUDA Compile | int4     | 135.16
    + +

Llama 3 8B Instruct on Mobile
+Torchchat achieves > 8 tokens/sec on the Samsung Galaxy S23 and iPhone using 4-bit GPTQ via ExecuTorch.

    + +

    Conclusion

    + +

We encourage you to clone the torchchat repo and give it a spin, explore its capabilities, and share your feedback as we continue to empower the PyTorch community to run LLMs locally and on constrained devices. Together, let’s unlock the full potential of generative AI and LLMs on any device. Please submit issues as you see them, since we are still iterating quickly. We’re also inviting community contributions across a broad range of areas, including additional models, target hardware support, new quantization schemes, and performance improvements. Happy experimenting!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchcodec/index.html b/blog/torchcodec/index.html new file mode 100644 index 000000000000..00397f912c10 --- /dev/null +++ b/blog/torchcodec/index.html @@ -0,0 +1,749 @@ + + + + + + + + + + + + + torchcodec: Easy and Efficient Video Decoding for PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Team PyTorch + +

    +

    We are pleased to officially announce torchcodec, a library for decoding videos into PyTorch tensors. It is fast, accurate, and easy to use. When running PyTorch models on videos, torchcodec is our recommended way to turn those videos into data your model can use.

    + +

    Highlights of torchcodec include:

    + +
      +
    • An intuitive decoding API that treats a video file as a Python sequence of frames. We support both index-based and presentation-time-based frame retrieval.
    • +
    • An emphasis on accuracy: we ensure you get the frames you requested, even if your video has variable frame rates.
    • +
    • A rich sampling API that makes it easy and efficient to retrieve batches of frames.
    • +
    • Best-in-class CPU decoding performance.
    • +
    • CUDA accelerated decoding that enables high throughput when decoding many videos at once.
    • +
    • Support for all codecs available in your installed version of FFmpeg.
    • +
    • Simple binary installs for Linux and Mac.
    • +
    + +

    Easy to Use

    + +

    A simple, intuitive API was one of our main design principles. We start with simple decoding and extracting specific frames of a video:

    + +
    from torchcodec.decoders import VideoDecoder
    +from torch import Tensor
    +
    +decoder = VideoDecoder("my_video.mp4")
    +
    +# Index based frame retrieval.
+first_ten_frames: Tensor = decoder[:10]
    +last_ten_frames: Tensor = decoder[-10:]
    +
    +# Multi-frame retrieval, index and time based.
    +frames = decoder.get_frames_at(indices=[10, 0, 15])
    +frames = decoder.get_frames_played_at(seconds=[0.2, 3, 4.5])
    +
    + +

    All decoded frames are already PyTorch tensors, ready to be fed into models for training.

    + +

    Of course, more common in ML training pipelines is sampling multiple clips from videos. A clip is just a sequence of frames in presentation order—but the frames are often not consecutive. Our sampling API makes this easy:

    + +
    from torchcodec.samplers import clips_at_regular_timestamps
    +
    +clips = clips_at_regular_timestamps(
    +  decoder,
    +  seconds_between_clip_starts=10,
    +  num_frames_per_clip=5,
    +  seconds_between_frames=0.2,
    +)
    +
    + +

    The above call yields a batch of clips where each clip starts 10 seconds apart, each clip has 5 frames, and those frames are 0.2 seconds apart. See our tutorials on decoding and sampling for more!

    + +

    Fast Performance

    + +

    Performance was our other main design principle. Decoding videos for ML training has different performance requirements than decoding videos for playback. A typical ML video training pipeline will process many different videos (sometimes in the millions!), but only sample a small number of frames (dozens to hundreds) from each video.

    + +

    For this reason, we’ve paid particular attention to our decoder’s performance when seeking multiple times in a video, decoding a small number of frames after each seek. We present experiments with the following four scenarios:

    + +
      +
    1. +

      Decoding and transforming frames from multiple videos at once, inspired by what we have seen in data loading for large-scale training pipelines:

      + +

      a. Ten threads decode batches of 50 videos in parallel.
      +b. For each video, decode 10 frames at evenly spaced times.
      +c. For each frame, resize it to a 256x256 resolution.

      +
    2. +
    3. Decoding 10 frames at random locations in a single video.
    4. +
    5. Decoding 10 frames at evenly spaced times of a single video.
    6. +
    7. Decoding the first 100 frames of a single video.
    8. +
    + +

    We compare the following video decoders:

    + +
      +
    • Torchaudio, CPU decoding only.
    • +
    • Torchvision, using the video_reader backend which is CPU decoding only.
    • +
    • Torchcodec, GPU decoding with CUDA.
    • +
    • Torchcodec, CPU decoding only.
    • +
    + +

    Using the following three videos:

    + +
      +
    1. A synthetically generated video using FFmpeg’s mandelbrot generation pattern. The video is 10 seconds long, 60 frames per second and 1920x1080.
    2. +
    3. Same as above, except the video is 120 seconds long.
    4. +
    5. A promotional video from NASA that is 206 seconds long, 29.7 frames per second and 960x540.
    6. +
    + +

    The experimental script is in our repo. Our experiments run on a Linux system with an Intel processor that has 22 available cores and an NVIDIA GPU. For CPU decoding, all libraries were instructed to automatically determine the best number of threads to use.

    + +

    Benchmark chart

    + +

    From our experiments, we draw several conclusions:

    + +
      +
    • Torchcodec is consistently the best-performing library for the primary use case we designed it for: decoding many videos at once as a part of a training data loading pipeline. In particular, high-resolution videos see great gains with CUDA where decoding and transforms both happen on the GPU.
    • +
    • Torchcodec is competitive on the CPU with seek-heavy use cases such as random and uniform sampling. Currently, torchcodec’s performance is better with shorter videos that have a smaller file size. This performance is due to torchcodec’s emphasis on seek-accuracy, which involves an initial linear scan.
    • +
    • Torchcodec is not as competitive when there is no seeking; that is, opening a video file and decoding from the beginning. This is again due to our emphasis on seek-accuracy and the initial linear scan.
    • +
    + +

    Implementing an approximate seeking mode in torchcodec should resolve these performance gaps, and it’s our highest priority feature for video decoding.

    + +

    What’s Next?

    + +

    As the name implies, the long-term future for torchcodec is more than just video decoding. Our next big feature is audio support—both decoding audio streams from video, and from audio-only media. In the long term, we want torchcodec to be the media decoding library for PyTorch. That means as we implement functionality in torchcodec, we will deprecate and eventually remove complementary features from torchaudio and torchvision.

    + +

    We also have video decoding improvements lined up, such as the previously mentioned approximate seeking mode for those who are willing to sacrifice accuracy for performance.

    + +

    Most importantly, we’re looking for feedback from the community! We’re most interested in working on features that the community finds valuable. Come share your needs and influence our future direction!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchcsprng-release-blog/index.html b/blog/torchcsprng-release-blog/index.html new file mode 100644 index 000000000000..64c8e3e45e55 --- /dev/null +++ b/blog/torchcsprng-release-blog/index.html @@ -0,0 +1,711 @@ + + + + + + + + + + + + + PyTorch framework for cryptographically secure random number generation, torchcsprng, now available | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Team PyTorch + +

    +

    One of the key components of modern cryptography is the pseudorandom number generator. Katz and Lindell stated, “The use of badly designed or inappropriate random number generators can often leave a good cryptosystem vulnerable to attack. Particular care must be taken to use a random number generator that is designed for cryptographic use, rather than a ‘general-purpose’ random number generator which may be fine for some applications but not ones that are required to be cryptographically secure.”[1] Additionally, most pseudorandom number generators scale poorly to massively parallel high-performance computation because of their sequential nature. Others don’t satisfy cryptographically secure properties.

    + +

    torchcsprng is a PyTorch C++/CUDA extension that provides cryptographically secure pseudorandom number generators for PyTorch.

    + +

    torchcsprng overview

    + +

Historically, PyTorch had only two pseudorandom number generator implementations: Mersenne Twister for CPU and Nvidia’s cuRAND Philox for CUDA. Despite good performance properties, neither of them is suitable for cryptographic applications. Over the course of the past several months, the PyTorch team developed the torchcsprng extension API. Based on the PyTorch dispatch mechanism and operator registration, it allows users to extend c10::GeneratorImpl and implement their own custom pseudorandom number generators.

    + +

    torchcsprng generates a random 128-bit key on the CPU using one of its generators and then runs AES128 in CTR mode either on CPU or GPU using CUDA. This then generates a random 128-bit state and applies a transformation function to map it to target tensor values. This approach is based on Parallel Random Numbers: As Easy as 1, 2, 3 (John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, D. E. Shaw Research). It makes torchcsprng both crypto-secure and parallel on both CPU and CUDA.

    + +
    + +
    + +

    Since torchcsprng is a PyTorch extension, it is available on the platforms where PyTorch is available (support for Windows-CUDA will be available in the coming months).

    + +

    Using torchcsprng

    + +

    The torchcsprng API is very simple to use and is fully compatible with the PyTorch random infrastructure:

    + +

    Step 1: Install via binary distribution

    + +

    Anaconda:

    + +
    conda install torchcsprng -c pytorch
    +
    + +

    pip:

    + +
    pip install torchcsprng
    +
    + +

    Step 2: import packages as usual but add csprng

    + +
    import torch
    +import torchcsprng as csprng
    +
    + +

    Step 3: Create a cryptographically secure pseudorandom number generator from /dev/urandom:

    + +
    urandom_gen = csprng.create_random_device_generator('/dev/urandom')
    +
    + +

    and simply use it with the existing PyTorch methods:

    + +
    torch.randn(10, device='cpu', generator=urandom_gen)
    +
    + +

    Step 4: Test with Cuda

    + +

    One of the advantages of torchcsprng generators is that they can be used with both CPU and CUDA tensors:

    + +
    torch.randn(10, device='cuda', generator=urandom_gen)
    +
    + +

    Another advantage of torchcsprng generators is that they are parallel on CPU unlike the default PyTorch CPU generator.

    + +

    Getting Started

    + +

    The easiest way to get started with torchcsprng is by visiting the GitHub page where you can find installation and build instructions, and more how-to examples.

    + +

    Cheers,

    + +

    The PyTorch Team

    + +

    [1] Introduction to Modern Cryptography: Principles and Protocols (Chapman & Hall/CRC Cryptography and Network Security Series) by Jonathan Katz and Yehuda Lindell

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchrec-fbgemm-1/index.html b/blog/torchrec-fbgemm-1/index.html new file mode 100644 index 000000000000..bd6bb2e1ca20 --- /dev/null +++ b/blog/torchrec-fbgemm-1/index.html @@ -0,0 +1,759 @@ + + + + + + + + + + + + + TorchRec and FBGEMM 1.0 Stable Release | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    October 23, 2024

    +

    + TorchRec and FBGEMM 1.0 Stable Release +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Paul Zhang, Zain Huda, Sarunya Pumma, Shintaro Iwasaki, Supadchaya Puangpontip, Benson Ma + +

    +

    We are happy to announce the stable release, 1.0, for TorchRec and FBGEMM. TorchRec is the PyTorch native recommendation systems library, powered by FBGEMM’s (Facebook GEneral Matrix Multiplication) efficient, low-level kernels.

    + +

    TorchRec

    + +

    Initially open sourced in 2022, TorchRec provides common primitives for creating state-of-the-art personalization models:

    + +
      +
    • Simple, optimized APIs for distributed training across hundreds of GPUs
    • +
    • Advanced sharding techniques for embeddings
    • +
    • Modules common in authoring recommendation systems
    • +
    • Frictionless path to distributed inference with APIs for quantization and sharding of TorchRec models
    • +
    + +

    Since then, TorchRec has matured significantly, with wide internal adoption across many Meta production recommendation models for training and inference, alongside new features such as: variable batched embeddings, embedding offloading, zero collision hashing, etc. Furthermore, TorchRec has a presence outside of Meta, such as in recommendation models at Databricks and in the Twitter algorithm. As a result, standard TorchRec features have been marked as stable, with PyTorch style BC guarantees, and can be seen on the revamped TorchRec documentation.

    + +

    FBGEMM

    + +

    FBGEMM is a library that provides high-performance kernels for CPUs and GPUs. Since 2018, FBGEMM has supported the efficient execution of Meta-internal and external AI/ML workloads by expanding its scope from performance-critical kernels for inference on CPUs to more complex sparse operators for both training and inference – and recently for Generative AI – on CPUs and GPUs.

    + +

    FBGEMM has been empowering TorchRec through its backend high-performance kernel implementations for recommendation workloads, ranging from embedding bag kernels to jagged tensor operations. Together with TorchRec, we released FBGEMM 1.0, which guarantees the functionality and backward-compatibility of several stable APIs serving its core features with enhanced documentation.

    + +

    Performance

    + +

    DLRM (Deep Learning Recommendation Model) is the standard neural network architecture for powering recommendations at Meta, with categorical features being processed through embeddings, while continuous (dense) features are processed with a bottom multilayer perceptron. The following diagram depicts the basic architecture of DLRM, with a second order interaction layer between the dense and sparse features and a top MLP for generating the prediction.

    + +

    flow diagram

    + +

    TorchRec provides standardized modules with significant optimizations in fusing embedding lookups. EBC is a traditional PyTorch embedding module implementation, containing a collection of torch.nn.EmbeddingBags. FusedEBC, powered by FBGEMM for high performance operations on embedding tables with a fused optimizer and UVM caching/management for alleviating memory constraints, is the optimized version present in sharded TorchRec modules for distributed training and inference. The below benchmark demonstrates the vast performance improvements of FusedEBC in comparison to a traditional PyTorch embedding module implementation (EBC) and the ability for FusedEBC to handle much larger embeddings than what is available on GPU memory with UVM caching.

    + +

    performance chart

    + +

    TorchRec Data Types

    + +

    TorchRec provides standard data types and modules for easy handling of distributed embeddings. Here is a simple example setting up a collection of embedding tables through TorchRec:

    + +
import torch
+import torchrec
+from torchrec import EmbeddingBagCollection
+from torchrec import KeyedJaggedTensor
+from torchrec import JaggedTensor
    +
    +ebc = torchrec.EmbeddingBagCollection(
    +    device="cpu",
    +    tables=[
    +        torchrec.EmbeddingBagConfig(
    +            name="product_table",
    +            embedding_dim=64,
    +            num_embeddings=4096,
    +            feature_names=["product"],
    +            pooling=torchrec.PoolingType.SUM,
    +        ),
    +        torchrec.EmbeddingBagConfig(
    +            name="user_table",
    +            embedding_dim=64,
    +            num_embeddings=4096,
    +            feature_names=["user"],
    +            pooling=torchrec.PoolingType.SUM,
    +        )
    +    ]
    +)
    +
    +product_jt = JaggedTensor(
    +    values=torch.tensor([1, 2, 1, 5]), lengths=torch.tensor([3, 1])
    +)
    +user_jt = JaggedTensor(values=torch.tensor([2, 3, 4, 1]), lengths=torch.tensor([2, 2]))
    +
    +kjt = KeyedJaggedTensor.from_jt_dict({"product": product_jt, "user": user_jt})
    +
    +print("Call EmbeddingBagCollection Forward: ", ebc(kjt))
    +
    + +

    Sharding

    + +

    TorchRec provides a planner class that automatically generates an optimized sharding plan across many GPUs. Here we demonstrate generating a sharding plan across two GPUs:

    + +
    from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
    +
    +planner = EmbeddingShardingPlanner(
    +    topology=Topology(
    +        world_size=2,
    +        compute_device="cuda",
    +    )
    +)
    +
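+# `sharder` is the module sharder (e.g. an EmbeddingBagCollectionSharder) and `pg`
+# is the torch.distributed process group; both are assumed to be created elsewhere.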
    +plan = planner.collective_plan(ebc, [sharder], pg)
    +
    +print(f"Sharding Plan generated: {plan}")
    +
    + +

    Model Parallel

    + +

TorchRec’s main distributed training API is DistributedModelParallel, which calls the planner to generate a sharding plan (demonstrated above) and shards TorchRec modules according to that plan. Here we apply DistributedModelParallel to our EmbeddingBagCollection to shard its embeddings for distributed training:

    + +
    model = torchrec.distributed.DistributedModelParallel(ebc, device=torch.device("cuda"))
    +
    + +

    Inference

    + +

    TorchRec provides simple APIs for quantizing and sharding embeddings for a model for distributed inference. The usage is demonstrated below:

    + +
    from torchrec.inference.modules import (
    +    quantize_inference_model,
    +    shard_quant_model,
    +)
    +quant_model = quantize_inference_model(ebc)
    +sharded_model, _ = shard_quant_model(
    +    quant_model, compute_device=device, sharding_device=device
    +)
    +
    + +

    Conclusion

    + +

    TorchRec and FBGEMM are now stable, with optimized features for large scale recommendation systems.

    + +

    For setting up TorchRec and FBGEMM, check out the getting started guide.
    +
    +We also recommend the comprehensive, end-to-end tutorial for introducing the features in TorchRec and FBGEMM.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchserve-performance-tuning/index.html b/blog/torchserve-performance-tuning/index.html new file mode 100644 index 000000000000..05543156e5b0 --- /dev/null +++ b/blog/torchserve-performance-tuning/index.html @@ -0,0 +1,1084 @@ + + + + + + + + + + + + + Torchserve Performance Tuning, Animated Drawings Case-Study | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Hamid Shojanazeri, Geeta Chauhan, Mark Saroufim, Jesse Smith + +

    +

    In this post we discuss performance tuning of Torchserve for serving your models in production. One of the biggest challenges in the life cycle of a ML project is deploying models in production. This requires a reliable serving solution along with solutions that address the MLOps needs. A robust serving solution needs to provide support for multi model serving, model versioning, metric logging, monitoring and scaling to serve the peak traffic. In this post, we will have an overview of Torchserve and how to tune its performance for production use-cases. We discuss the Animated Drawings app from Meta that can turn your human figure sketches to animations and how it could serve the peak traffic with Torchserve. The Animated Drawing’s workflow is below.

    + +

    + +

    + +

    https://ai.facebook.com/blog/using-ai-to-bring-childrens-drawings-to-life/

    + +

Many AI systems and tools are designed to handle realistic images of humans; children’s drawings add a level of complexity and unpredictability, as they are often constructed in abstract, fanciful ways. These types of morphological and stylistic variations can confuse even state-of-the-art AI systems that excel at spotting objects in photorealistic images and drawings.
Meta AI researchers are working to overcome this challenge so that AI systems will be better able to recognize drawings of human figures in the wildly varied ways that children create them. This great blog post provides more details about Animated Drawings and the approach taken.

    + +

    Torchserve

    + +

    + +

    Fig1. Overall flow of Torchserve performance tuning
    +

    + +

Once you have trained your model, it needs to be integrated into a larger system to form a full-fledged application; we use the term “model serving” to refer to this integration. Put simply, model serving makes your trained model available to run inference and to be used by downstream applications.

    + +

Torchserve is PyTorch’s preferred solution for serving models in production. It is a performant and scalable tool that wraps your model in an HTTP or HTTPS API. It has a frontend implemented in Java that handles multiple tasks, from assigning workers for serving models to handling the connection between client and server. Torchserve has a Python backend that is responsible for handling the inference service.

    + +

Torchserve supports multi-model serving, model versioning for A/B testing, dynamic batching, logging, and metrics. It exposes four APIs: inference, explanations, management, and metrics.

    + +

The inference API listens on port 8080 and is accessible through localhost by default; this can be changed in the Torchserve configuration. It enables getting predictions from the served model.
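As a quick sketch of what a client call looks like (the model name and input file here are hypothetical), the inference API serves predictions at /predictions/{model_name}:

import requests

# Send an image to a model registered as "my_model" on the default inference port.
with open("sketch.jpg", "rb") as f:
    response = requests.post("http://localhost:8080/predictions/my_model", data=f)

print(response.status_code)
print(response.text)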

    + +

The explanation API uses Captum under the hood to provide explanations for the model being served, and it also listens on port 8080.

    + +

The management API allows you to register, unregister, and describe a model. It also enables users to scale the number of workers serving a model up or down.

    + +

The metrics API listens on port 8082 by default and enables us to monitor the model being served.

    + +

Torchserve lets you scale your model serving and handle peak traffic by supporting batch inference and multiple workers that serve your model. Scaling can be done through the management API, and related settings can be provided through a configuration file. The metrics API also helps you monitor your model serving through default and custom metrics.
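For example, here is a hedged sketch of scaling workers for an already-registered model through the management API (the model name is hypothetical; the management API listens on port 8081 by default):

import requests

# Ask Torchserve to run exactly four workers for "my_model".
response = requests.put(
    "http://localhost:8081/models/my_model",
    params={"min_worker": 4, "max_worker": 4},
)
print(response.status_code, response.text)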

    + +

Other advanced settings, such as the length of the queue for received requests and the maximum wait time for a batch of inputs, are also configurable through a config file that can be passed to Torchserve when it is started.

    + +

    Steps to serve your model with Torchserve

    + +
      +
    1. Install Torchserve, model archiver and its requirements.
    2. +
    3. Choose a default handler that fits your task (e.g image classification, etc) or author a custom handler.
    4. +
    5. Package your model artifacts (trained model checkpoint and all other necessary files for loading and running your model) and the handler into a “.mar” file using Torcharchive and place it in the model store.
    6. +
    7. Start serving your model.
    8. +
    9. Run inference. +We will discuss model handlers and metrics in more detail here.
    10. +
    + +

    Model handlers

    + +

Torchserve uses a handler in the backend to load the models, preprocess the received data, run inference, and post-process the response. A handler in Torchserve is a Python script that contains all of the model initialization, preprocessing, inference, and post-processing logic.

    + +

Torchserve provides out-of-the-box handlers for a number of applications like image classification, segmentation, object detection, and text classification. It also supports custom handlers, in case your use case is not covered by the default handlers.

    + +

Custom handlers provide great flexibility, which potentially makes Torchserve a multi-framework serving tool. Custom handlers let you define your own logic to initialize a model, which can also be used to load models from other frameworks such as ONNX.

    + +

A Torchserve handler is made up of four main functions (initialize, preprocess, inference, and postprocess), each of which returns a list. The code snippet below shows an example of a custom handler. Custom handlers inherit from BaseHandler in Torchserve and can override any of the main functions. Here is an example of the handler used for loading the Detectron2 model for figure detection; this model has been exported to Torchscript and uses model.half() to run inference with FP16. The details are explained in another section of this post.

    + +
    
+import io
+import json
+import os
+
+import cv2
+import numpy as np
+import torch
+
+from ts.torch_handler.base_handler import BaseHandler
+
+
+class MyModelHandler(BaseHandler):
+    def initialize(self, context):
+        self.manifest = context.manifest
+        properties = context.system_properties
    +        model_dir = properties.get("model_dir")
    +        serialized_file = self.manifest["model"]["serializedFile"]
    +        model_pt_path = os.path.join(model_dir, serialized_file)
    +
    +        self.device = torch.device(
    +        "cuda:" + str(properties.get("gpu_id"))
    +        if torch.cuda.is_available() and properties.get("gpu_id") is not None
    +        else "cpu"
    +        )
    +        self.model = torch.jit.load(model_pt_path, map_location=self.device)
    +
    +        self.model = self.model.half()
    +
    +    def preprocess(self, data):
    +
    +        inputs = []
+        for request in data:
    +
    +            request_body = request.get("body")
    +
    +            input_ = io.BytesIO(request_body)
+            image = cv2.imdecode(np.frombuffer(input_.read(), np.uint8), 1)
    +            input = torch.Tensor(image).permute(2, 0, 1)
    +            input = input.to(self.device)
    +            input = input.half()
    +            inputs.append({"image": input})
    +
    +        return inputs
    +
    +    def inference(self,inputs):
    +        predictions = self.model(**inputs)
    +        return predictions
    +
    +    def postprocess(self, output):
    +        responses = []
+        for inference_output in output:
    +            responses_json = {
    +            'classes': inference_output['pred_classes'].tolist(),
    +            'scores': inference_output['scores'].tolist(),
    +            "boxes": inference_output['pred_boxes'].tolist()
    +            }
    +            responses.append(json.dumps(responses_json))
    +
    +        return responses
    +
    + +

    Metrics

    + +

    An essential component in serving models in production is the ability to monitor them. Torchserve collects system level metrics regularly and allows adding custom metrics as well.

    + +

    System level metrics consist of CPU utilization, available and used disk space and memory on the host machine along with number of requests with different response codes (e.g 200-300, 400-500 and above 500). Custom metrics can be added to the metrics as explained here. TorchServe logs these two sets of metrics to different log files. Metrics are collected by default at:

    + +
      +
    • System metrics - log_directory/ts_metrics.log
    • +
    • Custom metrics - log directory/model_metrics.log
    • +
    + +

    As mentioned before, Torchserve also exposes metric API, that by default listens to port 8082 and enables users to query and monitor the collected metrics. The default metrics endpoint returns Prometheus formatted metrics. You can query metrics using curl requests or point a Prometheus Server to the endpoint and use Grafana for dashboards.

    + +

    While serving a model you can query metrics using curl request as follows:

    + +
    curl http://127.0.0.1:8082/metrics
    +
    + +

    In case you are looking into exporting the logged metrics, please refer to this example that uses mtail to export metrics to Prometheus. Tracking these metrics in a dashboard allows you to monitor performance regressions that may have been sporadic or hard to spot during an offline benchmark run.

    + +

    What to consider for tuning performance of a model in production

    + +

The workflow suggested in Fig. 1 is the general idea of how to approach model deployment in production with Torchserve.

    + +

In many cases, serving models in production is optimized based on throughput or latency service level agreements (SLAs). Usually real-time applications are more concerned about latency, whereas offline applications may care more about higher throughput.

    + +

    There are a number of main factors contributing to the performance of a serving model in production. In particular, we are focusing on serving Pytorch models with Torchserve here, however most of these factors generalize to all models from other frameworks as well.

    + +
      +
    • Model optimizations: this is a pre-step for deploying models into production. This is a very broad discussion that we will get into in a series of future blogs. This includes techniques like quantization, pruning to decrease the size of the model, using Intermediate representations (IR graphs) such as Torchscript in Pytorch, fusing kernels and many others. Currently torchprep provides many of these techniques as a CLI tool.
    • +
• Batch inference: this refers to feeding multiple inputs into a model. While it is essential during training, it can also be very helpful for managing cost at inference time. Hardware accelerators are optimized for parallelism, and batching helps saturate the compute capacity, often leading to higher throughput. The main difference at inference time is that you can’t wait too long for a batch to be filled by clients, something we call dynamic batching.
    • +
    • +

Number of workers: Torchserve uses workers to serve models. Torchserve workers are Python processes that hold a copy of the model weights for running inference. Too few workers means you're not benefiting from enough parallelism, but too many can cause worker contention and degrade end-to-end performance.

      +
    • +
• Hardware: choosing the appropriate hardware based on the model, the application, and the latency and throughput budget. This could be any of the hardware supported by Torchserve: CPU, GPU or AWS Inferentia. Some hardware configurations are intended for best-in-class performance and others are better suited for cost-effective inference. From our experiments we've found that GPUs shine at larger batch sizes, whereas the right CPUs and AWS Inferentia can be far more cost effective for lower batch sizes and low latency.
    • +
    + +

    Best Practices for Performance tuning on Torchserve

    + +

To get the best performance out of your model while serving it with Torchserve, we share some best practices here. Torchserve provides a benchmark suite that offers helpful insight for making informed decisions on the different choices detailed below.

    + +
      +
• Optimize your model as the first step; see the PyTorch model optimization tutorials. Model optimization choices are also closely tied to the hardware of choice. We will discuss this in more detail in another blog post.
    • +
• Deciding the hardware for model deployment is closely related to the latency and throughput budget and the cost per inference. This varies depending on the size of the model and the application: for some models, such as computer vision models, it has historically not been affordable to run them in production on CPU. However, with optimizations such as IPEX, recently added to Torchserve, this has become much more affordable and cost beneficial, and you can learn more in this investigative case study.
    • +
    • +

Workers in Torchserve are Python processes that provide parallelism, so the number of workers should be set carefully. By default, Torchserve launches a number of workers equal to the VCPUs or available GPUs on the host, which can add a considerable amount of time to Torchserve startup.

      + +

Torchserve exposes a config property to set the number of workers. To provide efficient parallelism through multiple workers while keeping them from competing over resources, as a baseline we recommend the following settings on CPU and GPU:

      + +

CPU: in the handler, call torch.set_num_threads(1), then set the number of workers to num physical cores / 2. The best threading configuration can be achieved by leveraging the Intel CPU launcher script.

      + +

GPU: the number of available GPUs can be set through number_gpus in config.properties. Torchserve uses round robin to assign workers to GPUs. We recommend setting the number of workers as follows: Number of workers = (Number of available GPUs) / (Number of unique models). Note that pre-Ampere GPUs do not provide any resource isolation with Multi-Instance GPU.

      +
    • +
• Batch size can directly affect latency and throughput. To better utilize the compute resources, the batch size needs to be increased; however, there is a tradeoff between latency and throughput. Larger batch sizes can increase throughput but result in higher latency as well. Batch size can be set in Torchserve in two ways: either through the model config in config.properties or while registering the model using the Management API (see the sketch after this list).
    • +
    + +
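To make the last two knobs concrete, here is a hedged sketch of registering a model through the Management API (default port 8081) with an explicit number of initial workers, batch size and batch delay; the archive name is a placeholder, not from the original app:

import requests

# Hypothetical example: register "my_model.mar" (assumed to already be in the
# model store) with 1 initial worker, batch size 8 and a 100 ms batch delay.
resp = requests.post(
    "http://127.0.0.1:8081/models",
    params={
        "url": "my_model.mar",
        "initial_workers": 1,
        "batch_size": 8,
        "max_batch_delay": 100,
    },
)
print(resp.status_code, resp.text)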

In the next section, we are going to use the Torchserve benchmark suite to decide the best combination of model optimization, hardware, workers and batch size.

    + +

    Animated Drawings Performance Tuning

    + +

To use the Torchserve benchmark suite, we first need an archive (".mar") file, as discussed above, that contains the model, the handler and all other artifacts needed to load and run inference. Animated Drawings uses Detectron2's implementation of Mask-RCNN as its object detection model.

    + +

    How to run benchmark suite

    + +

The automated benchmark suite in Torchserve lets you benchmark multiple models with different settings, including batch size and number of workers, and finally generates a report for you. To get started:

    + +
    git clone https://github.com/pytorch/serve.git
    +
    +cd serve/benchmarks
    +
    +pip install -r requirements-ab.txt
    +
    +apt-get install apache2-utils
    +
    + +

Model-level settings can be configured in a yaml file similar to the following:

    + +
    
    +Model_name:
    +    eager_mode:
    +        benchmark_engine: "ab"
    +        url: "Path to .mar file"
    +        workers:
    +            - 1
    +            - 4
    +        batch_delay: 100
    +        batch_size:
    +            - 1
    +            - 2
    +            - 4
    +            - 8
    +        requests: 10000
    +        concurrency: 10
    +        input: "Path to model input"
    +        backend_profiling: False
    +        exec_env: "local"
    +        processors:
    +            - "cpu"
    +            - "gpus": "all"
    +
    +
    + +

This yaml file is referenced in the benchmark_config_template.yaml file, which includes other settings for generating reports; it can optionally work with AWS CloudWatch for logs as well.

    + +
    python benchmarks/auto_benchmark.py --input benchmark_config_template.yaml
    +
    + +

After running the benchmarks, the results are written to a CSV file that can be found at "/tmp/benchmark/ab_report.csv", with a full report at "/tmp/ts_benchmark/report.md". They include items such as Torchserve average latency, model P99 latency, throughput, concurrency, number of requests, handler time and some other metrics. Here we focus on some of the important ones that we track to tune the performance: concurrency, model P99 latency and throughput. We look at these numbers specifically in combination with batch size, the device used, the number of workers and whether any model optimization has been done.

    + +

The latency SLA for this model has been set to 100 ms. This is a real-time application and, as we discussed earlier, latency is the main concern; throughput should ideally be as high as possible while not violating the latency SLA.

    + +

Searching the space over different batch sizes (1-32), numbers of workers (1-16) and devices (CPU, GPU), we ran a set of experiments and summarized the best ones in the table below.

Device | Concurrency | # Requests | # Workers | Batch size | Payload/image | Optimization | Throughput | Latency P99
CPU | 10 | 1000 | 1 | 1 | small | N/A | 3.45 | 305.3 ms
CPU | 1 | 1000 | 1 | 1 | small | N/A | 3.45 | 291.8 ms
GPU | 10 | 1000 | 1 | 1 | small | N/A | 41.05 | 25.48 ms
GPU | 1 | 1000 | 1 | 1 | small | N/A | 42.21 | 23.6 ms
GPU | 10 | 1000 | 1 | 4 | small | N/A | 54.78 | 73.62 ms
GPU | 10 | 1000 | 1 | 4 | small | model.half() | 78.62 | 50.69 ms
GPU | 10 | 1000 | 1 | 8 | small | model.half() | 85.29 | 94.4 ms
    + +

The latency of this model on CPU did not meet the SLA with any of the settings we tried for batch size, concurrency and number of workers; in fact, it was ~13x higher.

    + +

Moving the model serving to GPU immediately improved the latency ~13x, from 305 ms down to 23.6 ms.

    + +

One of the simplest optimizations we could apply to the model was lowering its precision to fp16. It is a one-liner (model.half()) that reduced the model P99 latency by 32% and increased the throughput by almost the same amount.
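As a rough sketch of what this looks like in practice (illustrative helper functions, not the app's actual handler code; note that the inputs must be cast to the same dtype as the model):

import torch

def to_fp16(model, device="cuda"):
    # The one-liner from the text, wrapped for clarity; assumes a CUDA device.
    return model.half().to(device).eval()

def infer_fp16(model, batch, device="cuda"):
    # Inputs are cast to fp16 to match the model's parameters.
    with torch.no_grad():
        return model(batch.to(device).half())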

    + +

Other optimizations, such as TorchScripting the model and using optimize_for_inference, or other tricks including ONNX or TensorRT runtime optimizations that leverage aggressive fusions, are out of the scope of this post. We will discuss model optimizations in a separate post.

    + +

We found that, both on CPU and GPU, setting number of workers = 1 worked best in this case.

    + +
      +
• Moving the model to GPU, using number of workers = 1 and batch size = 1, increased the throughput ~12x and reduced the latency ~13x compared to CPU.
    • +
• Moving the model to GPU, using model.half(), number of workers = 1 and batch size = 8, yielded the best results in terms of throughput with a tolerable latency. Throughput increased ~25x compared to CPU, with latency still meeting the SLA (94.4 ms).
    • +
    + +

Note: if you are running the benchmark suite, make sure you set a proper batch_delay and set the request concurrency to a number proportional to your batch size. Concurrency here means the number of concurrent requests being sent to the server.

    + +

    Conclusion

    + +

In this post, we discussed the considerations and knobs that Torchserve exposes to tune performance in production. We discussed the Torchserve benchmark suite as a means to tune performance and get insights on possible choices for model optimization, hardware and cost in general. We used the Animated Drawings app, which uses Detectron2's Mask-RCNN model, as a case study to showcase performance tuning with the benchmark suite.

    + +

For more details on performance tuning in Torchserve, please refer to our documentation here. Also feel free to open a ticket on the Torchserve repo for any further questions and feedback.

    + +

    Acknowledgement

    + +

We would like to thank Somya Jain (Meta) and Christopher Gustave (Meta) for their great support and guidance throughout many steps of this blog and for providing insights into the Sketch Animator workflow. Also, special thanks to Li Ning from AWS for the great efforts to make performance tuning much easier on Torchserve with the automated benchmark suite.

    + + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchtune-fine-tune-llms/index.html b/blog/torchtune-fine-tune-llms/index.html new file mode 100644 index 000000000000..65869f2476d1 --- /dev/null +++ b/blog/torchtune-fine-tune-llms/index.html @@ -0,0 +1,694 @@ + + + + + + + + + + + + + torchtune: Easily fine-tune LLMs using PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We’re pleased to announce the alpha release of torchtune, a PyTorch-native library for easily fine-tuning large language models.

    + +

    Staying true to PyTorch’s design principles, torchtune provides composable and modular building blocks along with easy-to-extend training recipes to fine-tune popular LLMs on a variety of consumer-grade and professional GPUs.

    + +

    torchtune supports the full fine-tuning workflow from start to finish, including

    + +
      +
    • Downloading and preparing datasets and model checkpoints.
    • +
    • Customizing the training with composable building blocks that support different model architectures, parameter-efficient fine-tuning (PEFT) techniques, and more.
    • +
    • Logging progress and metrics to gain insight into the training process.
    • +
    • Quantizing the model post-tuning.
    • +
    • Evaluating the fine-tuned model on popular benchmarks.
    • +
    • Running local inference for testing fine-tuned models.
    • +
    • Checkpoint compatibility with popular production inference systems.
    • +
    + +

    To get started, jump right into the code or walk through our many tutorials!

    + +

    Why torchtune?

    + +

    Over the past year there has been an explosion of interest in open LLMs. Fine-tuning these state of the art models has emerged as a critical technique for adapting them to specific use cases. This adaptation can require extensive customization from dataset and model selection all the way through to quantization, evaluation and inference. Moreover, the size of these models poses a significant challenge when trying to fine-tune them on consumer-level GPUs with limited memory.

    + +

    Existing solutions make it hard to add these customizations or optimizations by hiding the necessary pieces behind layers of abstractions. It’s unclear how different components interact with each other and which of these need to be updated to add new functionality. torchtune empowers developers to adapt LLMs to their specific needs and constraints with full control and visibility.

    + +

    torchtune’s Design

    + +

    torchtune was built with the following principles in mind

    + +
      +
    • Easy extensibility - New techniques emerge all the time and everyone’s fine-tuning use case is different. torchtune’s recipes are designed around easily composable components and hackable training loops, with minimal abstraction getting in the way of fine-tuning your fine-tuning. Each recipe is self-contained - no trainers or frameworks, and is designed to be easy to read - less than 600 lines of code!
    • +
    • Democratize fine-tuning - Users, regardless of their level of expertise, should be able to use torchtune. Clone and modify configs, or get your hands dirty with some code! You also don’t need beefy data center GPUs. Our memory efficient recipes have been tested on machines with a single 24GB gaming GPU.
    • +
    • Interoperability with the OSS LLM ecosystem - The open source LLM ecosystem is absolutely thriving, and torchtune takes advantage of this to provide interoperability with a wide range of offerings. This flexibility puts you firmly in control of how you train and use your fine-tuned models.
    • +
    + +

    Over the next year, open LLMs will become even more powerful, with support for more languages (multilingual), more modalities (multimodal) and more tasks. As the complexity of these models increases, we need to pay the same attention to “how” we design our libraries as we do to the features provided or performance of a training run. Flexibility will be key to ensuring the community can maintain the current pace of innovation, and many libraries/tools will need to play well with each other to power the full spectrum of use cases. torchtune is built from the ground up with this future in mind.

    + +

    In the true PyTorch spirit, torchtune makes it easy to get started by providing integrations with some of the most popular tools for working with LLMs.

    + +
      +
    • Hugging Face Hub - Hugging Face provides an expansive repository of open source models and datasets for fine-tuning. torchtune seamlessly integrates through the tune download CLI command so you can get started right away with fine-tuning your first model.
    • +
• PyTorch FSDP - Scale your training using PyTorch FSDP. It is very common for people to invest in machines with multiple consumer-level cards like the NVIDIA 3090/4090. torchtune allows you to take advantage of these setups by providing distributed recipes powered by FSDP.
    • +
    • Weights & Biases - torchtune uses the Weights & Biases AI platform to log metrics and model checkpoints during training. Track your configs, metrics and models from your fine-tuning runs all in one place!
    • +
    • EleutherAI’s LM Evaluation Harness - Evaluating fine-tuned models is critical to understanding whether fine-tuning is giving you the results you need. torchtune includes a simple evaluation recipe powered by EleutherAI’s LM Evaluation Harness to provide easy access to a comprehensive suite of standard LLM benchmarks. Given the importance of evaluation, we will be working with EleutherAI very closely in the next few months to build an even deeper and more “native” integration.
    • +
    • ExecuTorch - Models fine-tuned with torchtune can be easily exported to ExecuTorch, enabling efficient inference to be run on a wide variety of mobile and edge devices.
    • +
    • torchao - Easily and efficiently quantize your fine-tuned models into 4-bit or 8-bit using a simple post-training recipe powered by the quantization APIs from torchao.
    • +
    + +

    What’s Next?

    + +

    This is just the beginning and we’re really excited to put this alpha version in front of a vibrant and energetic community. In the coming weeks, we’ll continue to augment the library with more models, features and fine-tuning techniques. We’d love to hear any feedback, comments or feature requests in the form of GitHub issues on our repository, or on our Discord channel. As always, we’d love any contributions from this awesome community. Happy Tuning!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchvision-mobilenet-v3-implementation/index.html b/blog/torchvision-mobilenet-v3-implementation/index.html new file mode 100644 index 000000000000..6c08e8cff4cc --- /dev/null +++ b/blog/torchvision-mobilenet-v3-implementation/index.html @@ -0,0 +1,1092 @@ + + + + + + + + + + + + + Everything you need to know about TorchVision’s MobileNetV3 implementation | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Vasilis Vryniotis and Francisco Massa + +

    +

    In TorchVision v0.9, we released a series of new mobile-friendly models that can be used for Classification, Object Detection and Semantic Segmentation. In this article, we will dig deep into the code of the models, share notable implementation details, explain how we configured and trained them, and highlight important tradeoffs we made during their tuning. Our goal is to disclose technical details that typically remain undocumented in the original papers and repos of the models.

    + +

    Network Architecture

    + +

    The implementation of the MobileNetV3 architecture follows closely the original paper. It is customizable and offers different configurations for building Classification, Object Detection and Semantic Segmentation backbones. It was designed to follow a similar structure to MobileNetV2 and the two share common building blocks.

    + +

    Off-the-shelf, we offer the two variants described on the paper: the Large and the Small. Both are constructed using the same code with the only difference being their configuration which describes the number of blocks, their sizes, their activation functions etc.

    + +

    Configuration parameters

    + +

Even though one can write a custom InvertedResidual setting and pass it to the MobileNetV3 class directly, for the majority of applications we can adapt the existing configs by passing parameters to the model building methods. Some of the key configuration parameters are the following (see the short example after this list):

    + +
      +
    • +

      The width_mult parameter is a multiplier that affects the number of channels of the model. The default value is 1 and by increasing or decreasing it one can change the number of filters of all convolutions, including the ones of the first and last layers. The implementation ensures that the number of filters is always a multiple of 8. This is a hardware optimization trick which allows for faster vectorization of operations.

      +
    • +
    • +

      The reduced_tail parameter halves the number of channels on the last blocks of the network. This version is used by some Object Detection and Semantic Segmentation models. It’s a speed optimization which is described on the MobileNetV3 paper and reportedly leads to a 15% latency reduction without a significant negative effect on accuracy.

      +
    • +
    • +

      The dilated parameter affects the last 3 InvertedResidual blocks of the model and turns their normal depthwise Convolutions to Atrous Convolutions. This is used to control the output stride of these blocks and has a significant positive effect on the accuracy of Semantic Segmentation models.

      +
    • +
    + +
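To make this concrete, here is a short sketch of building customized variants with these parameters. It mirrors the builder call shown in the Benchmarks section below; the exact way these arguments are exposed can differ between TorchVision versions, so treat this as illustrative rather than definitive:

import torchvision

# Default ImageNet classification configuration (the one the pre-trained weights use).
default = torchvision.models.mobilenet_v3_large(
    pretrained=True, width_mult=1.0, reduced_tail=False, dilated=False
)

# Illustrative non-default variant: halved tail and atrous convolutions, as used by
# some detection/segmentation backbones (no pre-trained weights for this configuration).
custom = torchvision.models.mobilenet_v3_large(
    pretrained=False, reduced_tail=True, dilated=True
)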

    Implementation details

    + +

Below we provide additional information on some notable implementation details of the architecture. The MobileNetV3 class is responsible for building a network out of the provided configuration. Here are some implementation details of the class:

    + +
      +
    • +

      The last convolution block expands the output of the last InvertedResidual block by a factor of 6. The implementation is aligned with the Large and Small configurations described on the paper and can adapt to different values of the multiplier parameter.

      +
    • +
    • +

      Similarly to other models such as MobileNetV2, a dropout layer is placed just before the final Linear layer of the classifier.

      +
    • +
    + +

    The InvertedResidual class is the main building block of the network. Here are some notable implementation details of the block along with its visualization which comes from Figure 4 of the paper:

    + +
      +
    • +

      There is no expansion step if the input channels and the expanded channels are the same. This happens on the first convolution block of the network.

      +
    • +
    • +

      There is always a projection step even when the expanded channels are the same as the output channels.

      +
    • +
    • +

      The activation method of the depthwise block is placed before the Squeeze-and-Excite layer as this improves marginally the accuracy.

      +
    • +
    + +
    + +
    + +

    Classification

    + +

    In this section we provide benchmarks of the pre-trained models and details on how they were configured, trained and quantized.

    + +

    Benchmarks

    + +

    Here is how to initialize the pre-trained models:

    +
    large = torchvision.models.mobilenet_v3_large(pretrained=True, width_mult=1.0,  reduced_tail=False, dilated=False)
    +small = torchvision.models.mobilenet_v3_small(pretrained=True)
    +quantized = torchvision.models.quantization.mobilenet_v3_large(pretrained=True)
    +
    + +

    Below we have the detailed benchmarks between new and selected previous models. As we can see MobileNetV3-Large is a viable replacement of ResNet50 for users who are willing to sacrifice a bit of accuracy for a roughly 6x speed-up:

Model | Acc@1 | Acc@5 | Inference on CPU (sec) | # Params (M)
MobileNetV3-Large | 74.042 | 91.340 | 0.0411 | 5.48
MobileNetV3-Small | 67.668 | 87.402 | 0.0165 | 2.54
Quantized MobileNetV3-Large | 73.004 | 90.858 | 0.0162 | 2.96
MobileNetV2 | 71.880 | 90.290 | 0.0608 | 3.50
ResNet50 | 76.150 | 92.870 | 0.2545 | 25.56
ResNet18 | 69.760 | 89.080 | 0.1032 | 11.69
    + +

    Note that the inference times are measured on CPU. They are not absolute benchmarks, but they allow for relative comparisons between models.

    + +

    Training process

    + +

    All pre-trained models are configured with a width multiplier of 1, have full tails, are non-dilated, and were fitted on ImageNet. Both the Large and Small variants were trained using the same hyper-parameters and scripts which can be found in our references folder. Below we provide details on the most notable aspects of the training process.

    + +

    Achieving fast and stable training

    + +

Configuring RMSProp correctly was crucial to achieve fast training with numerical stability. The authors of the paper used TensorFlow in their experiments and in their runs they reported using quite a high rmsprop_epsilon compared to the default. Typically this hyper-parameter takes small values as it's used to avoid zero denominators, but in this specific model choosing the right value seems important to avoid numerical instabilities in the loss.

    + +

    Another important detail is that though PyTorch’s and TensorFlow’s RMSProp implementations typically behave similarly, there are a few differences with the most notable in our setup being how the epsilon hyperparameter is handled. More specifically, PyTorch adds the epsilon outside of the square root calculation while TensorFlow adds it inside. The result of this implementation detail is that one needs to adjust the epsilon value while porting the hyper parameter of the paper. A reasonable approximation can be taken with the formula PyTorch_eps = sqrt(TF_eps).
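As a hedged illustration of that approximation (the learning rate, momentum, weight decay and TF-style epsilon below are placeholders, not necessarily the exact values of our recipe):

import math
import torch
import torchvision

model = torchvision.models.mobilenet_v3_large()

tf_eps = 1e-3                      # epsilon as it would appear in a TF-style config (placeholder)
pytorch_eps = math.sqrt(tf_eps)    # PyTorch adds eps outside the square root, hence the sqrt

optimizer = torch.optim.RMSprop(
    model.parameters(), lr=0.064, momentum=0.9, weight_decay=1e-5, eps=pytorch_eps
)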

    + +

    Increasing our accuracy by tuning hyperparameters & improving our training recipe

    + +

After configuring the optimizer to achieve fast and stable training, we turned to optimizing the accuracy of the model. There are a few techniques that helped us achieve this. First of all, to avoid overfitting we augmented our data using the AutoAugment algorithm, followed by RandomErasing. Additionally, we tuned parameters such as the weight decay using cross validation. We also found it beneficial to perform weight averaging across different epoch checkpoints after the end of the training. Finally, though not used in our published training recipe, we found that using Label Smoothing, Stochastic Depth and LR noise injection improves the overall accuracy by over 1.5 points.

    + +

The graph and table depict a simplified summary of the most important iterations for improving the accuracy of the MobileNetV3 Large variant. Note that the actual number of iterations done while training the model was significantly larger and that the progress in accuracy was not always monotonically increasing. Also note that the Y-axis of the graph starts from 70% instead of from 0% to make the difference between iterations more visible:

    + +
    + +
Iteration | Acc@1 | Acc@5
Baseline with “MobileNetV2-style” Hyperparams | 71.542 | 90.068
+ RMSProp with default eps | 70.684 | 89.38
+ RMSProp with adjusted eps & LR scheme | 71.764 | 90.178
+ Data Augmentation & Tuned Hyperparams | 73.86 | 91.292
+ Checkpoint Averaging | 74.028 | 91.382
+ Label Smoothing & Stochastic Depth & LR noise | 75.536 | 92.368
    + +

Note that once we achieved an acceptable accuracy, we verified the model performance on the hold-out test dataset, which had not been used before for training or hyper-parameter tuning. This process helps us detect overfitting and is always performed for all pre-trained models prior to their release.

    + +

    Quantization

    + +

We currently offer quantized weights for the QNNPACK backend of the MobileNetV3-Large variant which provides a speed-up of 2.5x. To quantize the model, Quantization Aware Training (QAT) was used. The hyper-parameters and the scripts used to train the model can be found in our references folder.

    + +

Note that QAT allows us to model the effects of quantization and adjust the weights so that we can improve the model accuracy. This translates to an accuracy increase of 1.8 points compared to simple post-training quantization:

Quantization Status | Acc@1 | Acc@5
Non-quantized | 74.042 | 91.340
Quantization Aware Training | 73.004 | 90.858
Post-training Quantization | 71.160 | 89.834
    + +

    Object Detection

    + +

In this section, we will first provide benchmarks of the released models, and then discuss how the MobileNetV3-Large backbone was used in a Feature Pyramid Network along with the FasterRCNN detector to perform Object Detection. We will also explain how the network was trained and tuned, along with any tradeoffs we had to make. We will not cover details about how it was used with SSDlite as this will be discussed in a future article.

    + +

    Benchmarks

    + +

    Here is how the models are initialized:

    +
    high_res = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True) 
    +low_res = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=True)
    +
    + +

Below are some benchmarks between new and selected previous models. As we can see, the high resolution Faster R-CNN with MobileNetV3-Large FPN backbone seems to be a viable replacement of the equivalent ResNet50 model for those users who are willing to sacrifice a few accuracy points for a 5x speed-up:

Model | mAP | Inference on CPU (sec) | # Params (M)
Faster R-CNN MobileNetV3-Large FPN (High-Res) | 32.8 | 0.8409 | 19.39
Faster R-CNN MobileNetV3-Large 320 FPN (Low-Res) | 22.8 | 0.1679 | 19.39
Faster R-CNN ResNet-50 FPN | 37.0 | 4.1514 | 41.76
RetinaNet ResNet-50 FPN | 36.4 | 4.8825 | 34.01
    + +

    Implementation details

    + +

    The Detector uses a FPN-style backbone which extracts features from different convolutions of the MobileNetV3 model. By default the pre-trained model uses the output of the 13th InvertedResidual block and the output of the Convolution prior to the pooling layer but the implementation supports using the outputs of more stages.

    + +

    All feature maps extracted from the network have their output projected down to 256 channels by the FPN block as this greatly improves the speed of the network. These feature maps provided by the FPN backbone are used by the FasterRCNN detector to provide box and class predictions at different scales.

    + +

    Training & Tuning process

    + +

    We currently offer two pre-trained models capable of doing object detection at different resolutions. Both models were trained on the COCO dataset using the same hyper-parameters and scripts which can be found in our references folder.

    + +

The High Resolution detector was trained with images of 800-1333px, while the mobile-friendly Low Resolution detector was trained with images of 320-640px. The reason why we provide two separate sets of pre-trained weights is that training a detector directly on the smaller images leads to a 5 mAP increase in precision compared to passing small images to the pre-trained high-res model. Both backbones were initialized with weights fitted on ImageNet and the last 3 stages of their weights were fine-tuned during the training process.

    + +

    An additional speed optimization can be applied on the mobile-friendly model by tuning the RPN NMS thresholds. By sacrificing only 0.2 mAP of precision we were able to improve the CPU speed of the model by roughly 45%. The details of the optimization can be seen below:

Tuning Status | mAP | Inference on CPU (sec)
Before | 23.0 | 0.2904
After | 22.8 | 0.1679
    + +

    Below we provide some examples of visualizing the predictions of the Faster R-CNN MobileNetV3-Large FPN model:

    + +
    + +
    + +

    Semantic Segmentation

    + +

    In this section we will start by providing some benchmarks of the released pre-trained models. Then we will discuss how a MobileNetV3-Large backbone was combined with segmentation heads such as LR-ASPP, DeepLabV3 and the FCN to conduct Semantic Segmentation. We will also explain how the network was trained and propose a few optional optimization techniques for speed critical applications.

    + +

    Benchmarks

    + +

    This is how to initialize the pre-trained models:

    + +
    lraspp = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True) 
    +deeplabv3 = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True)
    +
    + +

Below are the detailed benchmarks between new and selected existing models. As we can see, the DeepLabV3 with a MobileNetV3-Large backbone is a viable replacement of FCN with ResNet50 for the majority of applications as it achieves similar accuracy with an 8.5x speed-up. We also observe that the LR-ASPP network supersedes the equivalent FCN in all metrics:

Model | mIoU | Global Pixel Acc | Inference on CPU (sec) | # Params (M)
LR-ASPP MobileNetV3-Large | 57.9 | 91.2 | 0.3278 | 3.22
DeepLabV3 MobileNetV3-Large | 60.3 | 91.2 | 0.5869 | 11.03
FCN MobileNetV3-Large (not released) | 57.8 | 90.9 | 0.3702 | 5.05
DeepLabV3 ResNet50 | 66.4 | 92.4 | 6.3531 | 39.64
FCN ResNet50 | 60.5 | 91.4 | 5.0146 | 32.96
    + +

    Implementation details

    + +

    In this section we will discuss important implementation details of tested segmentation heads. Note that all models described in this section use a dilated MobileNetV3-Large backbone.

    + +

    LR-ASPP

    + +

    The LR-ASPP is the Lite variant of the Reduced Atrous Spatial Pyramid Pooling model proposed by the authors of the MobileNetV3 paper. Unlike the other segmentation models in TorchVision, it does not make use of an auxiliary loss. Instead it uses low and high-level features with output strides of 8 and 16 respectively.

    + +

    Unlike the paper where a 49x49 AveragePooling layer with variable strides is used, our implementation uses an AdaptiveAvgPool2d layer to process the global features. This is because the authors of the paper tailored the head to the Cityscapes dataset while our focus is to provide a general purpose implementation that can work on multiple datasets. Finally our implementation always has a bilinear interpolation before returning the output to ensure that the sizes of the input and output images match exactly.

    + +

    DeepLabV3 & FCN

    + +

    The combination of MobileNetV3 with DeepLabV3 and FCN follows closely the ones of other models and the stage estimation for these methods is identical to LR-ASPP. The only notable difference is that instead of using high and low level features, we attach the normal loss to the feature map with output stride 16 and an auxiliary loss on the feature map with output stride 8.

    + +

    Finally we should note that the FCN version of the model was not released because it was completely superseded by the LR-ASPP both in terms of speed and accuracy. The pre-trained weights are still available and can be used with minimal changes to the code.

    + +

    Training & Tuning process

    + +

    We currently offer two MobileNetV3 pre-trained models capable of doing semantic segmentation: the LR-ASPP and the DeepLabV3. The backbones of the models were initialized with ImageNet weights and trained end-to-end. Both architectures were trained on the COCO dataset using the same scripts with similar hyper-parameters. Their details can be found in our references folder.

    + +

    Normally, during inference the images are resized to 520 pixels. An optional speed optimization is to construct a Low Res configuration of the model by using the High-Res pre-trained weights and reducing the inference resizing to 320 pixels. This will improve the CPU execution times by roughly 60% while sacrificing a couple of mIoU points. The detailed numbers of this optimization can be found on the table below:

Low-Res Configuration | mIoU Difference | Speed Improvement | mIoU | Global Pixel Acc | Inference on CPU (sec)
LR-ASPP MobileNetV3-Large | -2.1 | 65.26% | 55.8 | 90.3 | 0.1139
DeepLabV3 MobileNetV3-Large | -3.8 | 63.86% | 56.5 | 90.3 | 0.2121
FCN MobileNetV3-Large (not released) | -3.0 | 57.57% | 54.8 | 90.1 | 0.1571
    + +
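A minimal sketch of this Low-Res configuration, assuming a PIL input image and the standard ImageNet normalization used by TorchVision (the file name is a placeholder):

import torch
import torchvision
from torchvision import transforms as T
from PIL import Image

model = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True).eval()

preprocess = T.Compose([
    T.Resize(320),   # instead of the default 520 used by the High-Res configuration
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

img = Image.open("example.jpg")              # placeholder input image
with torch.no_grad():
    out = model(preprocess(img).unsqueeze(0))["out"]
pred = out.argmax(1)                         # per-pixel class indices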

    Here are some examples of visualizing the predictions of the LR-ASPP MobileNetV3-Large model:

    + +
    + +
    + +

    We hope that you found this article interesting. We are looking forward to your feedback to see if this is the type of content you would like us to publish more often. If the community finds that such posts are useful, we will be happy to publish more articles that cover the implementation details of newly introduced Machine Learning models.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchvision-ssd-implementation/index.html b/blog/torchvision-ssd-implementation/index.html new file mode 100644 index 000000000000..3d0913639eaf --- /dev/null +++ b/blog/torchvision-ssd-implementation/index.html @@ -0,0 +1,802 @@ + + + + + + + + + + + + + Everything You Need To Know About Torchvision’s SSD Implementation | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Vasilis Vryniotis + +

    +

    In TorchVision v0.10, we’ve released two new Object Detection models based on the SSD architecture. Our plan is to cover the key implementation details of the algorithms along with information on how they were trained in a two-part article.

    + +

In part 1 of the series, we will focus on the original implementation of the SSD algorithm as described in the Single Shot MultiBox Detector paper. We will briefly give a high-level description of how the algorithm works, then go through its main components, highlight key parts of its code, and finally discuss how we trained the released model. Our goal is to cover all the necessary details to reproduce the model, including those optimizations which are not covered in the paper but are part of the original implementation.

    + +

    How Does SSD Work?

    + +

    Reading the aforementioned paper is highly recommended but here is a quick oversimplified refresher. Our target is to detect the locations of objects in an image along with their categories. Here is the Figure 5 from the SSD paper with prediction examples of the model:

    + +
    + +
    + +

The SSD algorithm uses a CNN backbone, passes the input image through it and takes the convolutional outputs from different levels of the network. These outputs are called feature maps. They are then passed through the Classification and Regression heads which are responsible for predicting the class and the location of the boxes.

    + +

Since the feature maps of each image contain outputs from different levels of the network, their size varies and thus they can capture objects of different dimensions. On top of each, we tile several default boxes which can be thought of as our rough prior guesses. For each default box, we predict whether there is an object (along with its class) and its offset (correction over the original location). During training time, we need to first match the ground truth to the default boxes and then we use those matches to estimate our loss. During inference, similar prediction boxes are combined to estimate the final predictions.

    + +

    The SSD Network Architecture

    + +

    In this section, we will discuss the key components of SSD. Our code follows closely the paper and makes use of many of the undocumented optimizations included in the official implementation.

    + +

    DefaultBoxGenerator

    + +

    The DefaultBoxGenerator class is responsible for generating the default boxes of SSD and operates similarly to the AnchorGenerator of FasterRCNN (for more info on their differences see pages 4-6 of the paper). It produces a set of predefined boxes of specific width and height which are tiled across the image and serve as the first rough prior guesses of where objects might be located. Here is Figure 1 from the SSD paper with a visualization of ground truths and default boxes:

    + +
    + +
    + +

The class is parameterized by a set of hyperparameters that control their shape and tiling. The implementation automatically provides good guesses with the default parameters for those who want to experiment with new backbones/datasets, but one can also pass optimized custom values.
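For illustration, here is a hedged sketch of constructing the generator by hand; the aspect ratios below are the per-feature-map values used by the SSD300 configuration, while scales and steps are left to the automatic estimation (the import path assumes a recent TorchVision release):

from torchvision.models.detection.anchor_utils import DefaultBoxGenerator

# One list of aspect ratios per feature map (the SSD300 defaults).
anchor_generator = DefaultBoxGenerator(
    aspect_ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]],
)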

    + +

    SSDMatcher

    + +

    The SSDMatcher class extends the standard Matcher used by FasterRCNN and it is responsible for matching the default boxes to the ground truth. After estimating the IoUs of all combinations, we use the matcher to find for each default box the best candidate ground truth with overlap higher than the IoU threshold. The SSD version of the matcher has an extra step to ensure that each ground truth is matched with the default box that has the highest overlap. The results of the matcher are used in the loss estimation during the training process of the model.

    + +

    Classification and Regression Heads

    + +

    The SSDHead class is responsible for initializing the Classification and Regression parts of the network. Here are a few notable details about their code:

    + + + +

    Backbone Feature Extractor

    + +

    The feature extractor reconfigures and enhances a standard VGG backbone with extra layers as depicted on the Figure 2 of the SSD paper:

    + +
    + +
    + +

    The class supports all VGG models of TorchVision and one can create a similar extractor class for other types of CNNs (see this example for ResNet). Here are a few implementation details of the class:

    + +
      +
    • Patching the ceil_mode parameter of the 3rd Maxpool layer is necessary to get the same feature map sizes as the paper. This is due to small differences between PyTorch and the original Caffe implementation of the model.
    • +
• It adds a series of extra feature layers on top of VGG. If the highres parameter is True during its construction, it will append an extra convolution. This is useful for the SSD512 version of the model.
• +
• As discussed in section 3 of the paper, the fully connected layers of the original VGG are converted to convolutions, with the first one using Atrous. Moreover, maxpool5’s stride and kernel size are modified.
    • +
    • As described on section 3.1, L2 normalization is used on the output of conv4_3 and a set of learnable weights are introduced to control its scaling.
    • +
    + +

    SSD Algorithm

    + +

    The final key piece of the implementation is on the SSD class. Here are some notable details:

    + + + +

    Here are the two core methods of the implementation:

    + + + +

    The SSD300 VGG16 Model

    + +

    The SSD is a family of models because it can be configured with different backbones and different Head configurations. In this section, we will focus on the provided SSD pre-trained model. We will discuss the details of its configuration and the training process used to reproduce the reported results.

    + +

    Training process

    + +

    The model was trained using the COCO dataset and all of its hyper-parameters and scripts can be found in our references folder. Below we provide details on the most notable aspects of the training process.

    + +

    Paper Hyperparameters

    + +

In order to achieve the best possible results on COCO, we adopted the hyperparameters described in section 3 of the paper concerning the optimizer configuration, the weight regularization etc. Moreover we found it useful to adopt the optimizations that appear in the official implementation concerning the tiling configuration of the DefaultBox generator. This optimization was not described in the paper but it was crucial for improving the detection precision of smaller objects.

    + +

    Data Augmentation

    + +

Implementing the SSD Data Augmentation strategy as described on pages 6 and 12 of the paper was critical to reproducing the results. More specifically, the use of random “Zoom In” and “Zoom Out” transformations makes the model robust to various input sizes and improves its precision on small and medium objects. Finally, since VGG16 has quite a few parameters, the photometric distortions included in the augmentations have a regularization effect and help avoid overfitting.

    + +

    Weight Initialization & Input Scaling

    + +

    Another aspect that we found beneficial was to follow the weight initialization scheme proposed by the paper. To do that, we had to adapt our input scaling method by undoing the 0-1 scaling performed by ToTensor() and use pre-trained ImageNet weights fitted with this scaling (shoutout to Max deGroot for providing them in his repo). All the weights of new convolutions were initialized using Xavier and their biases were set to zero. After initialization, the network was trained end-to-end.

    + +

    LR Scheme

    + +

As reported in the paper, after applying aggressive data augmentations it’s necessary to train the models for longer. Our experiments confirm this and we had to tweak the learning rate, batch sizes and overall steps to achieve the best results. Our proposed learning scheme is configured to be rather on the safe side; it showed signs of plateauing between the steps, and thus one is likely to be able to train a similar model with only 66% of our epochs.

    + +

    Breakdown of Key Accuracy Improvements

    + +

It is important to note that implementing a model directly from a paper is an iterative process that circles between coding, training, bug fixing and adapting the configuration until we match the accuracies reported in the paper. Quite often it also involves simplifying the training recipe or enhancing it with more recent methodologies. It is definitely not a linear process where incremental accuracy improvements are achieved by improving a single direction at a time, but instead it involves exploring different hypotheses, making incremental improvements in different aspects and doing a lot of backtracking.

    + +

With that in mind, below we try to summarize the optimizations that affected our accuracy the most. We did this by grouping together the various experiments in 4 main groups and attributing the experiment improvements to the closest match. Note that the Y-axis of the graph starts from 18 instead of from 0 to make the difference between optimizations more visible:

    + +
    + +
Model Configuration | mAP delta | mAP
Baseline with “FasterRCNN-style” Hyperparams | - | 19.5
+ Paper Hyperparams | 1.6 | 21.1
+ Data Augmentation | 1.8 | 22.9
+ Weight Initialization & Input Scaling | 1 | 23.9
+ LR scheme | 1.2 | 25.1
    + +

    Our final model achieves an mAP of 25.1 and reproduces exactly the COCO results reported on the paper. Here is a detailed breakdown of the accuracy metrics.

    + +

We hope you found part 1 of the series interesting. In part 2, we will focus on the implementation of SSDlite and discuss its differences from SSD. Until then, we are looking forward to your feedback.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchvision-ssdlite-implementation/index.html b/blog/torchvision-ssdlite-implementation/index.html new file mode 100644 index 000000000000..713e8833159d --- /dev/null +++ b/blog/torchvision-ssdlite-implementation/index.html @@ -0,0 +1,803 @@ + + + + + + + + + + + + + Everything You Need To Know About Torchvision’s SSDlite Implementation | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Vasilis Vryniotis + +

    +

    In the previous article, we’ve discussed how the SSD algorithm works, covered its implementation details and presented its training process. If you have not read the previous blog post, I encourage you to check it out before continuing.

    + +

    In this part 2 of the series, we will focus on the mobile-friendly variant of SSD called SSDlite. Our plan is to first go through the main components of the algorithm highlighting the parts that differ from the original SSD, then discuss how the released model was trained and finally provide detailed benchmarks for all the new Object Detection models that we explored.

    + +

    The SSDlite Network Architecture

    + +

    The SSDlite is an adaptation of SSD which was first briefly introduced on the MobileNetV2 paper and later reused on the MobileNetV3 paper. Because the main focus of the two papers was to introduce novel CNN architectures, most of the implementation details of SSDlite were not clarified. Our code follows all the details presented on the two papers and where necessary fills the gaps from the official implementation.

    + +

As noted before, the SSD is a family of models because one can configure it with different backbones (such as VGG, MobileNetV3 etc.) and different Heads (such as using regular convolutions, separable convolutions etc.). Thus many of the SSD components remain the same in SSDlite. Below we discuss only those that are different.

    + +

    Classification and Regression Heads

    + +

    Following the Section 6.2 of the MobileNetV2 paper, SSDlite replaces the regular convolutions used on the original Heads with separable convolutions. Consequently, our implementation introduces new heads that use 3x3 Depthwise convolutions and 1x1 projections. Since all other components of the SSD method remain the same, to create an SSDlite model our implementation initializes the SSDlite head and passes it directly to the SSD constructor.
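To illustrate the idea, here is a simplified sketch of such a depthwise-separable prediction block in plain PyTorch; it is not the exact TorchVision code, and the channel counts at the end are only an example:

import torch
from torch import nn

def separable_prediction_block(in_channels: int, out_channels: int) -> nn.Sequential:
    # 3x3 depthwise convolution followed by a 1x1 projection, as described above.
    return nn.Sequential(
        nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1,
                  groups=in_channels, bias=False),
        nn.BatchNorm2d(in_channels),
        nn.ReLU6(inplace=True),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
    )

# e.g. a classification branch predicting (num_anchors * num_classes) channels
block = separable_prediction_block(672, 6 * 91)
print(block(torch.randn(1, 672, 10, 10)).shape)   # torch.Size([1, 546, 10, 10])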

    + +

    Backbone Feature Extractor

    + +

    Our implementation introduces a new class for building MobileNet feature extractors. Following the Section 6.3 of the MobileNetV3 paper, the backbone returns the output of the expansion layer of the Inverted Bottleneck block which has an output stride of 16 and the output of the layer just before the pooling which has an output stride of 32. Moreover, all extra blocks of the backbone are replaced with lightweight equivalents which use a 1x1 compression, a separable 3x3 convolution with stride 2 and a 1x1 expansion. Finally to ensure that the heads have enough prediction power even when small width multipliers are used, the minimum depth size of all convolutions is controlled by the min_depth hyperparameter.

    + +

    The SSDlite320 MobileNetV3-Large model

    + +
    + +
    + +

    This section discusses the configuration of the provided SSDlite pre-trained model along with the training processes followed to replicate the paper results as closely as possible.

    + +

    Training process

    + +

    All of the hyperparameters and scripts used to train the model on the COCO dataset can be found in our references folder. Here we discuss the most notable details of the training process.

    + +

    Tuned Hyperparameters

    + +

    Though the papers don’t provide any information on the hyperparameters used for training the models (such as regularization, learning rate and the batch size), the parameters listed in the configuration files on the official repo were good starting points and using cross validation we adjusted them to their optimal values. All the above gave us a significant boost over the baseline SSD configuration.

    + +

    Data Augmentation

    + +

A key difference of SSDlite compared to SSD is that the backbone of the former has only a fraction of the weights of the latter. This is why in SSDlite the Data Augmentation focuses more on making the model robust to objects of variable sizes than on trying to avoid overfitting. Consequently, SSDlite uses only a subset of the SSD transformations and this way it avoids the over-regularization of the model.

    + +

    LR Scheme

    + +

Due to the reliance on Data Augmentation to make the model robust to small and medium sized objects, we found that it is particularly beneficial for the training recipe to use a large number of epochs. More specifically, by using roughly 3x more epochs than SSD we are able to increase our precision by 4.2 mAP points, and by using a 6x multiplier we improve by 4.9 mAP. Increasing the epochs further seems to yield diminishing returns and makes the training too slow and impractical; nevertheless, based on the model configuration it seems that the authors of the paper used an equivalent 16x multiplier.

    + +

    Weight Initialization & Input Scaling & ReLU6

    + +

    A set of final optimizations that brought our implementation very close to the official one and helped us bridge the accuracy gap was training the backbone from scratch instead of initializing from ImageNet, adapting our weight initialization scheme, changing our Input Scaling and replacing all standard ReLUs added on the SSDlite heads with ReLU6. Note that since we trained the model from random weights, we additionally applied the speed optimization described on the paper of using a reduced tail on the backbone.

    + +

    Implementation Differences

    + +

    Comparing the above implementation with the one on the official repo, we’ve identified a few differences. Most of them are minor and they are related to how we initialize the weights (for example Normal initialization vs Truncated Normal), how we parameterize the LR Scheduling (for example smaller vs larger warmup rate, shorter vs longer training) etc. The biggest known difference lies in the way we compute the Classification loss. More specifically the implementation of SSDlite with MobileNetV3 backbone on the official repo doesn’t use the SSD’s Multibox loss but instead uses RetinaNet’s focal loss. This is a rather significant deviation from the paper and since TorchVision already offers a full implementation of RetinaNet, we decided to implement SSDlite using the normal Multi-box SSD loss.

    + +

    Break down of key accuracy improvements

    + +

    As discussed in previous articles, reproducing research papers and porting them to code is not a journey of monotonically increasing accuracies, especially in cases where the full training and implementation details are not known. Typically the process involves lots of backtracking as one needs to identify those implementation details and parameters that have significant impact on the accuracy from those that don’t. Below we try to visualize the most important iterations that improved our accuracy from the baseline:

    + +
    + +
Iteration | mAP
Baseline with “SSD-style” Hyperparams | 10.6
+ Tuned Hyperparams | 14.2
+ SSDlite Data Augmentation | 15.2
+ 3x LR Scheme | 19.4
+ 6x LR Scheme | 20.1
+ Weight Initialization & Input Scaling & ReLU6 | 21.3
    + +

The order of optimizations presented above is accurate, though a bit idealized in some cases. For example, though different schedulers were tested during the Hyperparameter tuning phase, none of them provided significant improvements and thus we maintained the MultiStepLR which was used in the baseline. Nevertheless, while later experimenting with different LR Schemes, we found it beneficial to switch to CosineAnnealingLR, as it required less configuration. Consequently, we believe that the main takeaway from the above summary should be that even by starting with a correct implementation and a set of optimal hyperparams from a model of the same family, there are always accuracy points to be found by optimizing the training recipe and tuning the implementation. Admittedly the above is a rather extreme case where the accuracy doubled, but still in many cases there is a large number of optimizations that can help us push the accuracy significantly.

    + +

    Benchmarks

    + +

    Here is how to initialize the two pre-trained models:

    + +
import torchvision

ssdlite = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
ssd = torchvision.models.detection.ssd300_vgg16(pretrained=True)
    + +
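As a quick usage sketch, inference with the SSDlite model looks roughly like the following; the 320x320 random tensor is only a stand-in for a real image scaled to the [0, 1] range:

import torch
import torchvision

ssdlite = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
ssdlite.eval()

dummy_image = torch.rand(3, 320, 320)  # one image, values in [0, 1]
with torch.no_grad():
    predictions = ssdlite([dummy_image])
# predictions[0] is a dict with the 'boxes', 'labels' and 'scores' of the detections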

    Below are the benchmarks between the new and selected previous detection models:

Model                                            | mAP  | Inference on CPU (sec) | # Params (M)
SSDlite320 MobileNetV3-Large                     | 21.3 | 0.0911                 | 3.44
SSD300 VGG16                                     | 25.1 | 0.8303                 | 35.64
SSD512 VGG16 (not released)                      | 28.8 | 2.2494                 | 37.08
SSD512 ResNet50 (not released)                   | 30.2 | 1.1137                 | 42.70
Faster R-CNN MobileNetV3-Large 320 FPN (Low-Res) | 22.8 | 0.1679                 | 19.39
Faster R-CNN MobileNetV3-Large FPN (High-Res)    | 32.8 | 0.8409                 | 19.39
    + +

    As we can see, the SSDlite320 MobileNetV3-Large model is by far the fastest and smallest model and thus it’s an excellent candidate for real-world mobile applications. Though its accuracy is lower than the pre-trained low-resolution Faster R-CNN equivalent, the SSDlite framework is adaptable and one can boost its accuracy by introducing heavier heads with more convolutions.

    + +

On the other hand, the SSD300 VGG16 model is rather slow and less accurate. This is mainly because of its VGG16 backbone. Though extremely important and influential, the VGG architecture is nowadays quite outdated. Thus, though the specific model has historical and research value and is hence included in TorchVision, we recommend that users who want high-resolution detectors for real-world applications either combine SSD with alternative backbones (see this example on how to create one) or use one of the Faster R-CNN pre-trained models.

    + +

    We hope you enjoyed the 2nd and final part of the SSD series. We are looking forward to your feedback.

    + +
diff --git a/blog/torchvision03/index.html b/blog/torchvision03/index.html

torchvision 0.3: segmentation, detection models, new datasets and more.. | PyTorch

by Francisco Massa

    +

    PyTorch domain libraries like torchvision provide convenient access to common datasets and models that can be used to quickly create a state-of-the-art baseline. Moreover, they also provide common abstractions to reduce boilerplate code that users might have to otherwise repeatedly write. The torchvision 0.3 release brings several new features including models for semantic segmentation, object detection, instance segmentation, and person keypoint detection, as well as custom C++ / CUDA ops specific to computer vision.

    + +
    + +
    + +

    New features include:

    + +

    Reference training / evaluation scripts: torchvision now provides, under the references/ folder, scripts for training and evaluation of the following tasks: classification, semantic segmentation, object detection, instance segmentation and person keypoint detection. These serve as a log of how to train a specific model and provide baseline training and evaluation scripts to quickly bootstrap research.

    + +

torchvision ops: torchvision now contains custom C++ / CUDA operators. These operators are specific to computer vision and make it easier to build object detection models. They do not currently support PyTorch script mode, but support for it is planned for the next release. Some of the supported ops include:

    + +
      +
• roi_pool (and the module version RoIPool)
• roi_align (and the module version RoIAlign)
• nms, for non-maximum suppression of bounding boxes
• box_iou, for computing the intersection over union metric between two sets of bounding boxes
• box_area, for computing the area of a set of bounding boxes
    + +

Here are a few examples of using torchvision ops:

    + +
    import torch
    +import torchvision
    +
    +# create 10 random boxes
    +boxes = torch.rand(10, 4) * 100
    +# they need to be in [x0, y0, x1, y1] format
    +boxes[:, 2:] += boxes[:, :2]
    +# create a random image
    +image = torch.rand(1, 3, 200, 200)
    +# extract regions in `image` defined in `boxes`, rescaling
    +# them to have a size of 3x3
    +pooled_regions = torchvision.ops.roi_align(image, [boxes], output_size=(3, 3))
    +# check the size
    +print(pooled_regions.shape)
    +# torch.Size([10, 3, 3, 3])
    +
    +# or compute the intersection over union between
    +# all pairs of boxes
    +print(torchvision.ops.box_iou(boxes, boxes).shape)
    +# torch.Size([10, 10])
    +
    + +

    New models and datasets: torchvision now adds support for object detection, instance segmentation and person keypoint detection models. In addition, several popular datasets have been added. Note: The API is currently experimental and might change in future versions of torchvision. New models include:

    + +

    Segmentation Models

    + +

The 0.3 release also contains models for dense pixelwise prediction on images. It adds FCN and DeepLabV3 segmentation models, using ResNet50 and ResNet101 backbones. Pre-trained weights for the ResNet101 backbones are available, and have been trained on a subset of COCO train2017 which contains the same 20 categories as those from Pascal VOC.

    + +

The pre-trained models give the following results on the subset of COCO val2017 which contains the same 20 categories as those present in Pascal VOC:

Network             | mean IoU | global pixelwise acc
FCN ResNet101       | 63.7     | 91.9
DeepLabV3 ResNet101 | 67.4     | 92.4
    + +
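As a minimal sketch of running one of the new segmentation models (the random input below stands in for a real image that, in practice, should be normalized with the ImageNet mean and std):

import torch
import torchvision

model = torchvision.models.segmentation.fcn_resnet101(pretrained=True)
model.eval()

batch = torch.rand(1, 3, 520, 520)  # placeholder for a pre-processed image batch
with torch.no_grad():
    out = model(batch)['out']       # shape: [1, 21, 520, 520], one channel per class
pred = out.argmax(1)                # per-pixel class indices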

    Detection Models

Network                                      | box AP | mask AP | keypoint AP
Faster R-CNN ResNet-50 FPN trained on COCO   | 37.0   | -       | -
Mask R-CNN ResNet-50 FPN trained on COCO     | 37.9   | 34.6    | -
Keypoint R-CNN ResNet-50 FPN trained on COCO | 54.6   | -       | 65.0
    + +

The implementations of the models for object detection, instance segmentation and keypoint detection are fast, especially during training.

    + +

    In the following table, we use 8 V100 GPUs, with CUDA 10.0 and CUDNN 7.4 to report the results. During training, we use a batch size of 2 per GPU, and during testing a batch size of 1 is used.

    + +

    For test time, we report the time for the model evaluation and post-processing (including mask pasting in image), but not the time for computing the precision-recall.

Network                     | train time (s / it) | test time (s / it) | memory (GB)
Faster R-CNN ResNet-50 FPN  | 0.2288              | 0.0590             | 5.2
Mask R-CNN ResNet-50 FPN    | 0.2728              | 0.0903             | 5.4
Keypoint R-CNN ResNet-50 FPN| 0.3789              | 0.1242             | 6.8
    + +

You can load and use pre-trained detection and segmentation models with a few lines of code:

    + +
import PIL.Image
import torchvision

model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
# set it to evaluation mode, as the model behaves differently
# during training and during evaluation
model.eval()

image = PIL.Image.open('/path/to/an/image.jpg')
image_tensor = torchvision.transforms.functional.to_tensor(image)

# pass a list of (potentially different sized) tensors
# to the model, in 0-1 range. The model will take care of
# batching them together and normalizing
output = model([image_tensor])
# output is a list of dicts, containing the postprocessed predictions
    + +

    Classification Models

    + +

    The following classification models were added:

    + +
      +
• GoogLeNet (Inception v1)
• MobileNet V2
• ShuffleNet v2
• ResNeXt-50 32x4d and ResNeXt-101 32x8d
    + +

    Datasets

    + +

    The following datasets were added:

    + +
      +
• Caltech101, Caltech256, and CelebA
• ImageNet dataset (improving on ImageFolder, provides class-strings)
• Semantic Boundaries Dataset
• VisionDataset as a base class for all datasets
    + +

    In addition, we’ve added more image transforms, general improvements and bug fixes, as well as improved documentation.

    + +

    See the full release notes here as well as this getting started tutorial on Google Colab here, which describes how to fine tune your own instance segmentation model on a custom dataset.

    + +

    Cheers!

    + +

    Team PyTorch

    + +
diff --git a/blog/towards-reproducible-research-with-pytorch-hub/index.html b/blog/towards-reproducible-research-with-pytorch-hub/index.html

Towards Reproducible Research with PyTorch Hub | PyTorch

by Team PyTorch

    +

Reproducibility is an essential requirement for many fields of research, including those based on machine learning techniques. However, many machine learning publications are either not reproducible or are difficult to reproduce. With the continued growth in the number of research publications, including tens of thousands of papers now hosted on arXiv and submissions to conferences at an all time high, research reproducibility is more important than ever. While many of these publications are accompanied by code as well as trained models, which is helpful, it still leaves a number of steps for users to figure out for themselves.

    + +

    We are excited to announce the availability of PyTorch Hub, a simple API and workflow that provides the basic building blocks for improving machine learning research reproducibility. PyTorch Hub consists of a pre-trained model repository designed specifically to facilitate research reproducibility and enable new research. It also has built-in support for Colab, integration with Papers With Code and currently contains a broad set of models that include Classification and Segmentation, Generative, Transformers, etc.

    + +
    + +
    + +

    [Owner] Publishing models

    + +

PyTorch Hub supports the publication of pre-trained models (model definitions and pre-trained weights) to a GitHub repository by adding a simple hubconf.py file. This provides an enumeration of which models are to be supported and a list of dependencies needed to run the models. Examples can be found in the torchvision, huggingface-bert and gan-model-zoo repositories.

    + +

    Let us look at the simplest case: torchvision’s hubconf.py:

    + +
    # Optional list of dependencies required by the package
    +dependencies = ['torch']
    +
    +from torchvision.models.alexnet import alexnet
    +from torchvision.models.densenet import densenet121, densenet169, densenet201, densenet161
    +from torchvision.models.inception import inception_v3
    +from torchvision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152,\
    +resnext50_32x4d, resnext101_32x8d
    +from torchvision.models.squeezenet import squeezenet1_0, squeezenet1_1
    +from torchvision.models.vgg import vgg11, vgg13, vgg16, vgg19, vgg11_bn, vgg13_bn, vgg16_bn, vgg19_bn
    +from torchvision.models.segmentation import fcn_resnet101, deeplabv3_resnet101
    +from torchvision.models.googlenet import googlenet
    +from torchvision.models.shufflenetv2 import shufflenet_v2_x0_5, shufflenet_v2_x1_0
    +from torchvision.models.mobilenet import mobilenet_v2
    +
    + +

    In torchvision, the models have the following properties:

    +
      +
• Each model file can function and be executed independently
• They don’t require any package other than PyTorch (encoded in hubconf.py as dependencies['torch'])
• They don’t need separate entry-points, because the models work seamlessly out of the box when created
    + +

    Minimizing package dependencies reduces the friction for users to load your model for immediate experimentation.

    + +

    A more involved example is HuggingFace’s BERT models. Here is their hubconf.py

    + +
    dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']
    +
    +from hubconfs.bert_hubconf import (
    +    bertTokenizer,
    +    bertModel,
    +    bertForNextSentencePrediction,
    +    bertForPreTraining,
    +    bertForMaskedLM,
    +    bertForSequenceClassification,
    +    bertForMultipleChoice,
    +    bertForQuestionAnswering,
    +    bertForTokenClassification
    +)
    +
    + +

    Each model then requires an entrypoint to be created. Here is a code snippet to specify an entrypoint of the bertForMaskedLM model, which returns the pre-trained model weights.

    + +
    def bertForMaskedLM(*args, **kwargs):
    +    """
    +    BertForMaskedLM includes the BertModel Transformer followed by the
    +    pre-trained masked language modeling head.
    +    Example:
    +      ...
    +    """
    +    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    +    return model
    +
    + +

    These entry-points can serve as wrappers around complex model factories. They can give a clean and consistent help docstring, have logic to support downloading of pretrained weights (for example via pretrained=True) or have additional hub-specific functionality such as visualization.
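For illustration, a hypothetical entrypoint that follows this pattern might look like the sketch below; the architecture and the weights URL are invented and only show where the pretrained=True logic would live:

# hubconf.py of a hypothetical repository
dependencies = ['torch']

import torch

_WEIGHTS_URL = 'https://example.com/my_model_weights.pth'  # placeholder URL


def my_model(pretrained=False, **kwargs):
    """Builds a toy model and optionally loads pretrained weights."""
    model = torch.nn.Sequential(
        torch.nn.Linear(128, 256),
        torch.nn.ReLU(),
        torch.nn.Linear(256, 10),
    )
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(_WEIGHTS_URL, progress=True)
        model.load_state_dict(state_dict)
    return model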

    + +

With a hubconf.py in place, you can send a pull request based on the template here. Our goal is to curate high-quality, easily-reproducible, maximally-beneficial models for research reproducibility. Hence, we may work with you to refine your pull request and, in some cases, reject low-quality models from being published. Once we accept your pull request, your model will soon appear on the PyTorch Hub webpage for all users to explore.

    + +

    [User] Workflow

    + +

    As a user, PyTorch Hub allows you to follow a few simple steps and do things like: 1) explore available models; 2) load a model; and 3) understand what methods are available for any given model. Let’s walk through some examples of each.

    + +

    Explore available entrypoints.

    + +

    Users can list all available entrypoints in a repo using the torch.hub.list() API.

    + +
    >>> torch.hub.list('pytorch/vision')
    +>>>
    +['alexnet',
    +'deeplabv3_resnet101',
    +'densenet121',
    +...
    +'vgg16',
    +'vgg16_bn',
    +'vgg19',
    + 'vgg19_bn']
    +
    + +

Note that PyTorch Hub also allows auxiliary entrypoints (other than pretrained models), e.g. bertTokenizer for preprocessing in the BERT models, to make the user workflow smoother.

    + +

    Load a model

    + +

    Now that we know which models are available in the Hub, users can load a model entrypoint using the torch.hub.load() API. This only requires a single command without the need to install a wheel. In addition the torch.hub.help() API can provide useful information about how to instantiate the model.

    + +
    print(torch.hub.help('pytorch/vision', 'deeplabv3_resnet101'))
    +model = torch.hub.load('pytorch/vision', 'deeplabv3_resnet101', pretrained=True)
    +
    + +

    It is also common that repo owners will want to continually add bug fixes or performance improvements. PyTorch Hub makes it super simple for users to get the latest update by calling:

    + +
    model = torch.hub.load(..., force_reload=True)
    +
    + +

We believe this will help to alleviate the burden of repetitive package releases by repo owners and instead allow them to focus more on their research. It also ensures that, as a user, you are getting the freshest available models.

    + +

At the same time, stability is important for users. Hence, some model owners serve their models from a specific branch or tag, rather than the master branch, to ensure stability of the code. For example, pytorch_GAN_zoo serves them from the hub branch:

    + +
    model = torch.hub.load('facebookresearch/pytorch_GAN_zoo:hub', 'DCGAN', pretrained=True, useGPU=False)
    +
    + +

    Note that the *args, **kwargs passed to hub.load() are used to instantiate a model. In the above example, pretrained=True and useGPU=False are given to the model’s entrypoint.

    + +

    Explore a loaded model

    + +

Once you have a model from PyTorch Hub loaded, you can use the following workflow to find out the available methods that are supported and to better understand what arguments are required to run it.

    + +

    dir(model) to see all available methods of the model. Let’s take a look at bertForMaskedLM’s available methods.

    + +
    >>> dir(model)
    +>>>
    +['forward'
    +...
    +'to'
    +'state_dict',
    +]
    +
    + +

    help(model.forward) provides a view into what arguments are required to make your loaded model run

    + +
    >>> help(model.forward)
    +>>>
    +Help on method forward in module pytorch_pretrained_bert.modeling:
    +forward(input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None)
    +...
    +
    + +

    Have a closer look at the BERT and DeepLabV3 pages, where you can see how these models can be used once loaded.

    + +

    Other ways to explore

    + +

Models available in PyTorch Hub also support Colab and are directly linked on Papers With Code, so you can get started with a single click. Here is a good example to get started with (shown below).

    + +
    + +
    + +

    Additional resources:

    + + + +

    A BIG thanks to the folks at HuggingFace, the PapersWithCode team, fast.ai and Nvidia as well as Morgane Riviere (FAIR Paris) and lots of others for helping bootstrap this effort!!

    + +

    Cheers!

    + +

    Team PyTorch

    + +

    FAQ:

    + +

    Q: If we would like to contribute a model that is already in the Hub but perhaps mine has better accuracy, should I still contribute?

    + +

    A: Yes!! A next step for Hub is to implement an upvote/downvote system to surface the best models.

    + +

    Q: Who hosts the model weights for PyTorch Hub?

    + +

    A: You, as the contributor, are responsible to host the model weights. You can host your model in your favorite cloud storage or, if it fits within the limits, on GitHub. If it is not within your means to host the weights, check with us via opening an issue on the hub repository.

    + +

    Q: What if my model is trained on private data? Should I still contribute this model?

    + +

    A: No! PyTorch Hub is centered around open research and that extends to the usage of open datasets to train these models on. If a pull request for a proprietary model is submitted, we will kindly ask that you resubmit a model trained on something open and available.

    + +

    Q: Where are my downloaded models saved?

    + +

    A: We follow the XDG Base Directory Specification and adhere to common standards around cached files and directories.

    + +

    The locations are used in the order of:

    + +
      +
• Calling hub.set_dir(<PATH_TO_HUB_DIR>)
• $TORCH_HOME/hub, if environment variable TORCH_HOME is set.
• $XDG_CACHE_HOME/torch/hub, if environment variable XDG_CACHE_HOME is set.
• ~/.cache/torch/hub
    + +
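For example, to redirect downloads to an explicit location (the path below is arbitrary):

import torch

torch.hub.set_dir('/tmp/my_hub_cache')
model = torch.hub.load('pytorch/vision', 'resnet18', pretrained=True)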
diff --git a/blog/trace-analysis-for-masses/index.html b/blog/trace-analysis-for-masses/index.html

    January 09, 2023

    +

    + PyTorch Trace Analysis for the Masses +

    +
    +
    + +
    +
    +
    + +
    +

by Anupam Bhatnagar, Xizhou Feng, Brian Coutinho, Yifan Liu, Sung-Han Lin, Louis Feng, and Yuzhen Huang

    +

    We are excited to announce the public release of Holistic Trace Analysis (HTA), an open source performance analysis and visualization Python library for PyTorch users. HTA takes as input Kineto traces collected by the PyTorch profiler, which are complex and challenging to interpret, and up-levels the performance information contained in these traces. It was initially developed internally at Meta to understand and debug performance problems for large-scale distributed training jobs on GPUs. The multidisciplinary team has made a number of enhancements to HTA’s features and scaled them to support state-of-the-art ML workloads.

    + +

    ML researchers and systems engineers often struggle to computationally scale up their models because they are not aware of the performance bottlenecks in their workloads. The resources requested for a job (e.g. GPUs, memory) are often misaligned with the resources actually required due to lack of visibility “under the hood”. To achieve the best performance from the hardware stack, it is imperative to understand the resource utilization and bottlenecks for distributed training workloads.

    + +

    The initial HTA implementation was specifically targeted at Deep Learning Based Recommendation Models (DLRM). To make the features in HTA generic and applicable to use cases such as analyzing Vision and NLP models, we decided to refactor the HTA codebase and make the library available to the larger community. This new codebase has implemented several important ideas which lead to significant efficiency and performance improvements.

    + +

    In this blog, we present several features implemented in the open source version of HTA, which can be used as a Python script as well as interactively in a Jupyter notebook. HTA provides the following features:

    + +
      +
1. Breakdown by Dimensions
   1. Temporal: Breakdown of GPU time in terms of time spent in computation, communication, memory events, and idle time on a single node and across all ranks.
   2. Idle Time: Breakdown of GPU idle time into waiting for the host, waiting for another kernel or attributed to an unknown cause.
   3. Kernel: Find kernels with the longest duration on each rank.
   4. Communication Computation Overlap: Calculate the percentage of time when communication overlaps computation.
2. Statistical Analysis
   1. Kernel Duration Distribution: Distribution of average time taken by longest kernels across different ranks.
   2. CUDA Kernel Launch: Distributions of GPU kernels with very small duration, large duration, and excessive launch time.
   3. Augmented Counters (Memory bandwidth, Queue length): Augmented trace files which provide insights into memory copy bandwidth and number of outstanding operations on each CUDA stream.
3. Patterns
   1. Frequent CUDA Kernels: Find the CUDA kernels most frequently launched by any given PyTorch or user defined operator.
4. Trace Comparison
   1. Trace Diff: A trace comparison tool to identify and visualize the differences between traces.
    + +

    HTA source code is available to users via Github. Users can request new features or build their own analysis using the core libraries and data structures provided in the codebase in addition to the features mentioned above.

    + +

    GPU Training Performance Debugging 101

    + +

    To understand the GPU performance in distributed training jobs, we consider how the model operators interact with the GPU devices and how such interactions are reflected in certain measurable metrics.

    + +

    At a high level, we can break down the GPU operations in a model execution into three broad categories, henceforth referred to as kernel types:

    +
      +
1. Computation (COMP) - Compute kernels execute compiled routines for matrix multiplication and similar numeric calculations. They are responsible for all of the number-crunching necessary for model execution.
2. Communication (COMM) - Communication kernels are routines which are responsible for exchanging and synchronizing data between different GPU devices in a distributed training job. The NVIDIA Collective Communication Library (NCCL) is a widely used communication library and all its kernels have the prefix “nccl”. Example NCCL kernels include NCCL_AllGather, NCCL_ReduceScatter, NCCL_AllReduce, etc.
3. Memory (MEM) - Memory kernels manage the memory allocations/deallocations on the GPU devices and data movement between the memory space on the host and the GPUs. The memory kernels include Memcpy_H2D, Memcpy_D2H, Memcpy_D2D, Memset, etc. Here, H represents the Host and D represents the GPU Device. Thus, H2D, D2H, D2D stands for Host to Device, Device to Host and Device to Device respectively.
    + +

    Because a modern GPU device like the NVIDIA A100 GPU is a massively parallel device which is capable of running multiple kernels simultaneously, it is possible to overlap the computation, communication, and memory kernels to reduce the model execution time. One common technique to achieve the overlap is to utilize multiple CUDA streams. A CUDA stream is a sequence of operations that execute on a GPU device in the order in which they are issued by the host code. Different CUDA streams can be interleaved and even run concurrently, thus achieving the effect of kernel overlap.

    + +

To help understand the above concepts, Figure 1 provides a timeline of the GPU kernels in a sample distributed training job on 8 GPUs for one iteration. In the figure below, each rank represents one GPU, and the kernels on each GPU run on 6 CUDA streams. In the right column of the figure, you can see the names of the GPU kernels used. In the middle of the figure, you can see the overlap between compute and communication kernels. This figure is created using the plot_timeline example notebook available in HTA.

    + +


    + +

    Figure 1. An example of the execution timeline of GPU Kernels across multiple ranks

    + +

The performance of multi-GPU training jobs is affected by multiple factors. Among these factors, how a model execution creates and orchestrates the GPU kernels plays a critical role. HTA provides insights on how the model execution interacts with the GPU devices and highlights the opportunities for performance improvement.

    + +

    With the features we built in HTA, we aim to provide users insights into “what is happening under the hood in a distributed GPU training?” We briefly describe these features in the next few paragraphs.

    + +

    Features in Holistic Trace Analysis

    + +

    For most users, understanding the performance of GPU training jobs is nontrivial. Thus, we built this library to simplify the task of trace analysis and provide the user useful insights by examining the model execution traces. As the first step, we developed features which are important and generic enough so that most users can benefit from this library.

    + +

    Temporal Breakdown: We begin by asking whether the GPU is spending time on computation, communication, memory events, or is it idle? To answer this question, the temporal breakdown feature presents a breakdown in terms of these categories. To achieve high training efficiency the code should maximize time used by computation kernels and minimize idle time and non-compute time (time used by communication or memory kernels). This is accomplished by implementing concurrent execution of computation kernels with communication or memory kernels. Note that, during concurrent execution of computation kernels with communication/memory kernels the time spent by communication/memory kernels is accounted for under compute time.

    + +


    + +

    Figure 2: Temporal Breakdown across 8 GPUs

    + +

    Kernel Breakdown: It is natural to ask which kernels are taking the most amount of time. The next feature breaks down the time spent within each kernel type (COMM, COMP, MEM) and sorts them by duration. We present this information for each kernel type and for each rank as a pie chart. See figure 3 below.

    + +


    + +

    Figure 3: Pie chart of top computation and communication kernels

    + +

    Kernel Duration Distribution: Subsequently, one can also ask - for any given kernel, what is the distribution of the time spent across the ranks? To answer this, HTA generates bar graphs for the average duration of a given kernel across all ranks. Additionally, the error bars in the bar graphs show the minimum and maximum amount of time taken by a given kernel on a given rank. Figure 4 below shows a discrepancy between average duration on rank 0 as compared to other ranks. This anomalous behavior on rank 0 guides the user on where to look for possible bugs.

    + +


    + +

    Figure 4: Average duration of NCCL AllReduce Kernel across 8 ranks

    + +

    Communication Computation Overlap: In distributed training, a significant amount of time is spent in communication and synchronization events among multiple GPU devices. To achieve high GPU efficiency (i.e. TFLOPS/GPU) it is vital to keep the GPU doing actual computation work. In other words, a GPU should not be blocked because of waiting for data from other GPUs. One way to measure the extent to which computation is blocked by data dependencies is to calculate the computation-communication overlap. Higher GPU efficiency is observed if communication events overlap computation events. Lack of communication and computation overlap will lead to the GPU being idle, thus the efficiency would be low. Thus, the communication computation overlap feature calculates the percentage of time communication and computation overlap in a job for each rank and generates a bar graph representation. See figure below. More precisely, we measure the following ratio

    + +

    (time spent in computation while communicating) / (time spent in communication)

    + +


    + +

    Figure 5: Communication computation overlap

    + +

    Augmented Counters (Queue length, Memory bandwidth): To aid in debugging, HTA calculates the memory bandwidth statistics for D2H, H2D and D2D memory copy (memcpy) and memory set (memset) events. Additionally, HTA also computes the number of outstanding CUDA operations on each CUDA stream. We refer to this as queue length. When the queue length on a stream is 1024 or larger new events cannot be scheduled on that stream and the CPU will stall until the GPU events have processed. Additionally, HTA generates a new trace file containing tracks with the memory bandwidth and queue length time series. See Figure 6 below.

    + +


    + +

    Figure 6: Memory Bandwidth and Queue Length

    + +

    These primary features give us a peek into the system performance and help answer “what is happening in the system?”. As HTA evolves, we hope to address “why is X happening?” and also suggest possible solutions to overcome the bottlenecks.

    + +

    Installation and Usage

    + +

    Installation

    + +

To install HTA, please refer to the README. In brief, the user is required to clone the repo and install the necessary Python packages via pip.

    + +

    Usage

    + +

    This version of Holistic Trace Analysis is currently in beta and we recommend using HTA in a Jupyter notebook. A demo notebook is provided for your convenience. To get started, import the hta package in a Jupyter notebook, create a TraceAnalysis object and off we go in exactly two lines of code.

    + +
from hta.trace_analysis import TraceAnalysis

analyzer = TraceAnalysis(trace_dir="/trace/folder/path")
    + +
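From there, the individual analyses are exposed as methods on the analyzer object. The method names below follow the HTA documentation at the time of writing; treat them as an illustration and check the docs of your installed version for the exact API:

from hta.trace_analysis import TraceAnalysis

analyzer = TraceAnalysis(trace_dir="/trace/folder/path")
# each call returns a DataFrame and, in a notebook, can also render a plot;
# method names are taken from the HTA docs and may differ between versions
temporal_breakdown = analyzer.get_temporal_breakdown()
kernel_breakdown = analyzer.get_gpu_kernel_breakdown()
overlap = analyzer.get_comm_comp_overlap()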

    Requirements

    + +
      +
• All trace files for a training or inference job must be stored in a unique folder.
• Trace files are in json or gzipped json format.
    + +

    FAQ

    + +

    Q. How can I install HTA?

    + +

    Please see the README in the root directory of the repository.

    + +

    Q. Is there any documentation on the features and API in HTA?

    + +

    The documentation and detailed API is available here.

    + +

    Q. Can you implement feature X?

    + +

    Depending on how widely the feature is needed and the level of effort required to implement it we would consider developing the feature. Please open a Github Issue and tag it with the feature-request label.

    + +

    Q. Can I modify the code?

    + +

    Please do and send a PR along the way, if you think it would be useful for others.

    + +

    Q. How can I collect traces in PyTorch?

    + +

    Please refer to this tutorial here.
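As a minimal, single-process sketch of collecting such a trace with the PyTorch profiler (distributed jobs typically write one trace file per rank, and the file name below is arbitrary):

import torch
from torch.profiler import profile, ProfilerActivity

model = torch.nn.Linear(1024, 1024).cuda()
inputs = torch.randn(64, 1024, device="cuda")

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for _ in range(10):
        model(inputs).sum().backward()

# write a Kineto/Chrome trace that tools like HTA can consume
prof.export_chrome_trace("rank_0_trace.json")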

    + +

    Q. Can HTA be used at production scale?

    + +

    Yes, please see a use case study here.

    + +
diff --git a/blog/training-moes/index.html b/blog/training-moes/index.html

    June 23, 2024

    +

    + Training MoEs at Scale with PyTorch +

    +
    +
    + +
    +
    +
    + +
    +

by Brian Chu, Mihir Patel, Less Wright, Vitaliy Chiley, Evan Racah, Wanchao Liang, Iris Zhang, Andrew Gu

    +

    Over the past year, Mixture of Experts (MoE) models have surged in popularity, fueled by powerful open-source models like DBRX, Mixtral, DeepSeek, and many more. At Databricks, we’ve worked closely with the PyTorch team to scale training of MoE models. In this blog post, we’ll talk about how we scale to over three thousand GPUs using PyTorch Distributed and MegaBlocks, an efficient open-source MoE implementation in PyTorch.

    + +

    What is a MoE?

    + +

    A MoE model is a model architecture that uses multiple expert networks to make predictions. A gating network is used to route and combine the outputs of experts, ensuring each expert is trained on a different, specialized distribution of tokens. The architecture of a transformer-based large language model typically consists of an embedding layer that leads into multiple transformer blocks (Figure 1, Subfigure A). Each transformer block contains an attention block and a dense feed forward network (Figure 1, Subfigure B). These transformer blocks are stacked such that the output of one transformer block leads to the input of the next block. The final output goes through a fully connected layer and softmax to obtain probabilities for the next token to output.

    + +

    When using a MoE in LLMs, the dense feed forward layer is replaced by a MoE layer which consists of a gating network and a number of experts (Figure 1, Subfigure D). The gating network, typically a linear feed forward network, takes in each token and produces a set of weights that determine which tokens are routed to which experts. The experts themselves are typically implemented as a feed forward network as well. During training, the gating network adapts to assign inputs to the experts, enabling the model to specialize and improve its performance. The router outputs are then used to weigh expert outputs to give the final output of the MoE layer.

    + +


    + +

    Figure 1: Using Mixture of Experts in a transformer block

    + +

    Compared to dense models, MoEs provide more efficient training for a given compute budget. This is because the gating network only sends tokens to a subset of experts, reducing the computational load. As a result, the capacity of a model (its total number of parameters) can be increased without proportionally increasing the computational requirements. During inference, only some of the experts are used, so a MoE is able to perform faster inference than a dense model. However, the entire model needs to be loaded in memory, not just the experts being used.

    + +

    The sparsity in MoEs that allows for greater computational efficiency comes from the fact that a particular token will only be routed to a subset of experts. The number of experts and how experts are chosen depends on the implementation of the gating network, but a common method is top k. The gating network first predicts a probability value for each expert, then routes the token to the top k experts to obtain the output. However, if all tokens always go to the same subset of experts, training becomes inefficient and the other experts end up undertrained. To alleviate this problem, a load balancing loss is introduced that encourages even routing to all experts.
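To make the routing concrete, here is a minimal top-k gating sketch in plain PyTorch; the dimensions are arbitrary, and real implementations add load-balancing losses, capacity limits and fused kernels on top of this:

import torch
import torch.nn.functional as F

num_experts, top_k, d_model = 8, 2, 512
tokens = torch.randn(16, d_model)              # a batch of 16 tokens
gate = torch.nn.Linear(d_model, num_experts)   # the gating network

probs = F.softmax(gate(tokens), dim=-1)        # probability per expert for each token
topk_probs, topk_experts = probs.topk(top_k, dim=-1)
topk_probs = topk_probs / topk_probs.sum(-1, keepdim=True)  # renormalize routing weights
# topk_experts[i] holds the expert indices token i is routed to,
# topk_probs[i] the weights used to combine those experts' outputs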

    + +

    The number of experts and choosing the top k experts is an important factor in designing MoEs. A higher number of experts allows scaling up to larger models without increasing computational cost. This means that the model has a higher capacity for learning, however, past a certain point the performance gains tend to diminish. The number of experts chosen needs to be balanced with the inference costs of serving the model since the entire model needs to be loaded in memory. Similarly, when choosing top k, a lower top k during training results in smaller matrix multiplications, leaving free computation on the table if communication costs are large enough. During inference, however, a higher top k generally leads to slower inference speed.

    + +

    MegaBlocks

    + +

    MegaBlocks is an efficient MoE implementation that uses sparse matrix multiplication to compute expert outputs in parallel despite uneven token assignment. MegaBlocks implements a dropless MoE that avoids dropping tokens while using GPU kernels that maintain efficient training. Prior to MegaBlocks, dynamic routing formulations forced a tradeoff between model quality and hardware efficiency. Previously, users had to either drop tokens from computation or waste computation and memory on padding. Experts can receive a variable number of tokens and the expert computation can be performed efficiently using block sparse matrix multiplication. We’ve integrated MegaBlocks into LLM Foundry to enable scaling MoE training to thousands of GPUs.

    + +


    + +

    Figure 2: Matrix multiplication for expert computations

    + +

    Expert Parallelism

    + +

    As models scale to larger sizes and fail to fit on a single GPU, we require more advanced forms of parallelism. Expert parallelism is a form of model parallelism where we place different experts on different GPUs for better performance. Instead of expert weights being communicated across all GPUs, tokens are sent to the device that contains the expert. By moving data instead of weights, we can aggregate data across multiple machines for a single expert. The router determines which tokens from the input sequence should be sent to which experts. This is typically done by computing a gating score for each token-expert pair, and then routing each token to the top-scoring experts. Once the token-to-expert assignments are determined, an all-to-all communication step is performed to dispatch the tokens to the devices hosting the relevant experts. This involves each device sending the tokens assigned to experts on other devices, while receiving tokens assigned to its local experts.

    + +

The key advantage of expert parallelism is processing a few, larger matrix multiplications instead of several small matrix multiplications. As each GPU only has a subset of experts, it only has to do computation for those experts. Correspondingly, as we aggregate tokens across multiple GPUs, the size of each matrix is proportionally larger. As GPUs are optimized for large-scale parallel computations, larger operations can better exploit their capabilities, leading to higher utilization and efficiency. A more in-depth explanation of the benefits of larger matrix multiplications can be found here. Once the computation is complete, another all-to-all communication step is performed to send the expert outputs back to their original devices.

    + +


    + +

    Figure 3: Token routing in expert parallelism

    + +

    We leverage PyTorch’s DTensor, a low-level abstraction for describing how tensors are sharded and replicated, to effectively implement expert parallelism. We first manually place experts on different GPUs, typically sharding across a node to ensure we can leverage NVLink for fast GPU communication when we route tokens. We can then build a device mesh on top of this layout, which lets us succinctly describe the parallelism across the entire cluster. We can use this device mesh to easily checkpoint or rearrange experts when we need alternate forms of parallelism.
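A small sketch of what such a device mesh might look like, assuming a recent PyTorch (2.2 or later), a torchrun launch of 16 ranks across 2 nodes with 8 GPUs each, and illustrative dimension names:

from torch.distributed.device_mesh import init_device_mesh

# 2 replica groups x 8 GPUs per node; experts are placed along the intra-node
# dimension so that token routing can use NVLink
mesh = init_device_mesh("cuda", (2, 8), mesh_dim_names=("replicate", "expert"))
expert_mesh = mesh["expert"]  # sub-mesh used to place and route the experts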

    + +

    Scaling ZeRO-3 with PyTorch FSDP

    + +

    In conjunction with expert parallelism, we use data parallelism for all other layers, where each GPU stores a copy of the model and optimizer and processes a different chunk of data. After each GPU has completed a forward and backward pass, gradients are accumulated across GPUs for a global model update.

    + +

    ZeRO-3 is a form of data parallelism where weights and optimizers are sharded across each GPU instead of being replicated. Each GPU now only stores a subset of the full model, dramatically reducing memory pressure. When a part of the model is needed for computation, it is gathered across all the GPUs, and after the computation is complete, the gathered weights are discarded. We use PyTorch’s implementation of ZeRO-3, called Fully Sharded Data Parallel (FSDP).

    + +

    As we scale to thousands of GPUs, the cost of communication across devices increases, slowing down training. Communication increases due to the need to synchronize and share model parameters, gradients, and optimizer states across all GPUs which involves all-gather and reduce-scatter operations. To mitigate this issue while keeping the benefits of FSDP, we utilize Hybrid Sharded Data Parallel (HSDP) to shard the model and optimizer across a set number of GPUs and replicate this multiple times to fully utilize the cluster. With HSDP, an additional all reduce operation is needed in the backward pass to sync gradients across replicas. This approach allows us to balance memory efficiency and communication cost during large scale distributed training. To use HSDP we can extend our previous device mesh from expert parallelism and let PyTorch do the heavy lifting of actually sharding and gathering when needed.
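A minimal sketch of HSDP-style wrapping, assuming torch.distributed has already been initialized (e.g. via torchrun) and that FSDP is left to construct the intra-/inter-node process groups itself; the toy module stands in for the real network:

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy

model = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096),
    torch.nn.GELU(),
    torch.nn.Linear(4096, 1024),
).cuda()

# HYBRID_SHARD: ZeRO-3 style sharding within a group of GPUs, replication across groups
sharded_model = FSDP(model, sharding_strategy=ShardingStrategy.HYBRID_SHARD)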

    + +


    + +

    Figure 4: FSDP and HSDP

    + +

    With PyTorch, we can effectively combine these two types of parallelism, leveraging FSDP’s higher level API while using the lower-level DTensor abstraction when we want to implement something custom like expert parallelism. We now have a 3D device mesh with expert parallel shard dimension, ZeRO-3 shard dimension, and a replicate dimension for pure data parallelism. Together, these techniques deliver near linear scaling across very large clusters, allowing us to achieve MFU numbers over 40%.

    + +

    Elastic Checkpointing with Torch Distributed

    + +

    Fault tolerance is crucial for ensuring that LLMs can be trained reliably over extended periods, especially in distributed environments where node failures are common. To avoid losing progress when jobs inevitably encounter failures, we checkpoint the state of the model, which includes parameters, optimizer states, and other necessary metadata. When a failure occurs, the system can resume from the last saved state rather than starting over. To ensure robustness to failures, we need to checkpoint often and save and load checkpoints in the most performant way possible to minimize downtime. Additionally, if too many GPUs fail, our cluster size may change. Accordingly, we need the ability to elastically resume on a different number of GPUs.

    + +

    PyTorch supports elastic checkpointing through its distributed training framework, which includes utilities for both saving and loading checkpoints across different cluster configurations. PyTorch Distributed Checkpoint ensures the model’s state can be saved and restored accurately across all nodes in the training cluster in parallel, regardless of any changes in the cluster’s composition due to node failures or additions.

    + +

    Additionally, when training very large models, the size of checkpoints may be very large, leading to very slow checkpoint upload and download times. PyTorch Distributed Checkpoint supports sharded checkpoints, which enables each GPU to save and load only its portion of the model. When combining sharded checkpointing with elastic training, each GPU reads the metadata file to determine which shards to download on resumption. The metadata file contains information on what parts of each tensor are stored in each shard. The GPU can then download the shards for its part of the model and load that part of the checkpoint.

    + +


    + +

    Figure 5: Checkpointing saving and resumption resharded on additional GPUs

    + +

    By parallelizing checkpointing across GPUs, we can spread out network load, improving robustness and speed. When training a model with 3000+ GPUs, network bandwidth quickly becomes a bottleneck. We take advantage of the replication in HSDP to first download checkpoints on one replica and then send the necessary shards to other replicas. With our integration in Composer, we can reliably upload checkpoints to cloud storage as frequently as every 30 minutes and automatically resume from the latest checkpoint in the event of a node failure in less than 5 minutes.

    + +

    Conclusion

    + +

We’re very excited to see how PyTorch is enabling training state-of-the-art LLMs with great performance. In our post, we’ve shown how we implemented efficient MoE training through PyTorch Distributed and MegaBlocks on Foundry. Furthermore, PyTorch elastic checkpointing allowed us to quickly resume training on a different number of GPUs when node failures occurred. Using PyTorch HSDP has allowed us to scale training efficiently as well as improve checkpointing resumption times. We look forward to continuing to build on a strong and vibrant open-source community to help bring great AI models to everyone. Come join us in building great models at LLM Foundry and PyTorch.

    + +
diff --git a/blog/training-production-ai-models/index.html b/blog/training-production-ai-models/index.html

Training Production AI Models with PyTorch 2.0 | PyTorch

by CK Luk, Daohang Shi, Yuzhen Huang, Jackie (Jiaqi) Xu, Jade Nie, Zhou Wang, Lu Fang, Flavio Sales Truzzi, Devashish Shankar, Dima Ivashchenko, Chunzhi Yang, Nicolas Macchioni, David Berard, Yu Guo, Xiaodong Wang, Bert Maher, Yanbo Liang, Edward Yang, Brian Hirsh, Michael Voznesensky, Animesh Jain, Michael Anderson

    +

    1. Introduction

    + +

    PyTorch 2.0 (abbreviated as PT2) can significantly improve the training and inference performance of an AI model using a compiler called torch.compile while being 100% backward compatible with PyTorch 1.x. There have been reports on how PT2 improves the performance of common benchmarks (e.g., huggingface’s diffusers). In this blog, we discuss our experiences in applying PT2 to production AI models at Meta.

    + +

    2. Background

    + +

    2.1 Why is automatic performance optimization important for production?

    + +

Performance is particularly important for production; for example, even a 5% reduction in the training time of a heavily used model can translate to substantial savings in GPU cost and data-center power. Another important metric is development efficiency, which measures how many engineer-months are required to bring a model to production. Typically, a significant part of this bring-up effort is spent on manual performance tuning, such as rewriting GPU kernels to improve the training speed. By providing automatic performance optimization, PT2 can improve both cost and development efficiency.

    + +

    2.2 How PT2 improves performance

    + +

    As a compiler, PT2 can view multiple operations in the training graph captured from a model (unlike in PT1.x, where only one operation is executed at a time). Consequently, PT2 can exploit a number of performance optimization opportunities, including:

    + +
      +
    • Fusing multiple operations into a single GPU kernel: +
        +
      • A typical type of performance overhead in running a GPU program is the CPU overhead of launching small GPU kernels. By fusing multiple operations into a single GPU kernel, PT2 can significantly reduce the kernel-launching overhead on the CPU. For instance, consider the PyTorch program in Figure 1(a). When it is executed on GPU with PT1, it has three GPU kernels (two for the two sin() ops and one for the addition op). With PT2, there is only one kernel generated, which fuses all three ops.
      • +
      • After fusing some operations, certain operations in the graph may become dead and hence can be optimized away. This can save both compute and memory bandwidth on the GPU. For instance, in Figure 1(b), one of the duplicated sin() ops can be optimized away.
      • +
      • In addition, fusion can also reduce GPU device memory reads/writes (by composing pointwise kernels) and help improve hardware utilization.
      • +
      +
    • +
    + +


    + +

    Fig. 1: How PT2 improves performance with fusion and dead-code elimination.

    + +
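A small sketch in the spirit of Figure 1(a) (the exact code in the figure is not reproduced here); with the default inductor backend, the two sin() ops and the addition can end up fused into a single generated kernel:

import torch

def f(x):
    return torch.sin(x) + torch.sin(x)

compiled_f = torch.compile(f)
x = torch.randn(1024, device="cuda")
y = compiled_f(x)  # first call triggers compilation; later calls reuse the fused kernel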
      +
    • Reducing the type conversion overhead for using lower-precision data types: +
        +
      • PyTorch 1.x supports Automatic Mixed Precision (AMP). While AMP can reduce the compute time of an op, it introduces type conversion overhead before and after the op. PT2 can increase AMP performance by optimizing away unnecessary type conversion code, significantly reducing its overhead. As an example, Figure 2(a) converts three 32-bit input tensors (a32, b32, c32) to bf16 before doing the matrix multiplications. Nevertheless, in this example, a32 and c32 are actually the same tensor (a_float32). So, there is no need to convert a_float32 twice, as shown in the code generated by torch.compile in Figure 2(b). Note that while both this example and the previous one optimize away redundant computations, they are different in the sense that the type conversion code in this example is implicit via torch.autocast, unlike in the previous example where the torch.sin(x).cuda() is explicit in user code.
      • +
      +
    • +
    + +


    + +

    Fig. 2: How PT2 reduces type conversion overhead when using AMP.

    + +
      +
    • Reusing buffers on the GPU: +
        +
      • With a global view, the scheduler in torch.compile can reuse buffers on the GPU, thereby reducing both memory allocation time and memory consumption. Figure 3 shows the driver program that calls the Triton kernels generated for the program in Figure 2(a). We can see that buf1 is reused as buf4.
      • +
      +
    • +
    + +


    + +

    Fig. 3: Reuse of buffers.

    + +
      +
    • Autotuning: +
        +
      • PT2 has options to enable autotuning (via Triton) on matrix-multiply ops, pointwise ops, and reduction ops. Tunable parameters include block size, number of stages, and number of warps. With autotuning, the most performant implementation of an op can be found empirically.
      • +
      +
    • +
    + +

    3. Production environment considerations

    + +

    In this section, we describe a number of important considerations in applying PT2 to production.

    + +

    3.1 Ensuring no model quality degradation with torch.compile

    + +

Applying torch.compile to a model will cause numerical changes because of (1) reordering of floating-point ops during optimizations such as fusion and (2) use of lower-precision data types like bf16 if AMP is enabled. Therefore, 100% bitwise compatibility with PT 1.x is not expected. Nevertheless, we still need to make sure that the model quality (measured in some form of numeric score) is preserved after applying torch.compile. Typically, each production model has its own range of acceptable scores (e.g., the percentage change must be within 0.01%).
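In practice, this check can be as simple as comparing the score of the compiled run against the eager baseline and asserting that the relative change stays within the model's tolerance (a minimal sketch with hypothetical score values):

def check_model_quality(baseline_score: float, compiled_score: float, tol_pct: float = 0.01) -> None:
    # Fail if the compiled run's score drifts more than tol_pct percent from the baseline.
    rel_change_pct = abs(compiled_score - baseline_score) / abs(baseline_score) * 100.0
    if rel_change_pct > tol_pct:
        raise ValueError(f"Quality drifted by {rel_change_pct:.4f}% (limit {tol_pct}%)")

# Hypothetical scores from a baseline (eager) run and a torch.compile run.
check_model_quality(baseline_score=0.7512, compiled_score=0.7513)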

    + +

If torch.compile causes a model-quality drop, we need to do a deep-dive debugging exercise.

    + +

    One useful technique for debugging a torch.compile-related numeric issue is to apply torch.compile with different backends, in particular “eager” and “aot_eager”, in addition to “inductor”:

    + +
      +
    • If the numeric issue happens with the “eager” backend, then the forward graph constructed by torch.compile is likely incorrect;
    • +
    • If the numeric issue doesn’t happen with “eager” but happens with “aot_eager”, then the backward graph constructed by torch.compile is likely incorrect;
    • +
    • If the numeric issue doesn’t happen with either “eager” or “aot_eager” but happens with “inductor”, then the code generation inside the inductor is likely incorrect.
    • +
    + +
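A minimal sketch of this backend sweep (assuming a generic model and input; the backend strings are exactly the ones listed above):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU()).cuda()
x = torch.randn(8, 16, device="cuda")
reference = model(x)  # plain eager PyTorch result

# Try progressively deeper backends to localize where the numerics diverge.
for backend in ("eager", "aot_eager", "inductor"):
    compiled = torch.compile(model, backend=backend)
    out = compiled(x)
    max_diff = (out - reference).abs().max().item()
    print(f"backend={backend}: max abs diff vs eager = {max_diff:.3e}")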

    3.2 Autotuning in production

    + +

By default, the autotuning in torch.inductor is done online while the model is executed. For some production models, we find that the autotuning can take several hours, which is not acceptable for production. Therefore, we add offline autotuning, which works as depicted in Figure 4. The very first time a model is run, the details (e.g., input tensor shape, data type) of all ops that require tuning are logged to a database. A tuning process for these ops is then run overnight to search for the most performant implementation of each op, and the search results are written to a persistent cache (implemented as a source file in torch.inductor). The next time the model is run, the tuned implementation of each op is found in the cache and chosen for execution.

    + +


    + +

    Fig. 4: The offline autotuning used in production.

    + +

    3.3 Profiling support for torch.compile

    + +

As we previously discussed in this blog, a profiler is essential for debugging the performance of production models. We have enhanced the profiler to display torch.compile-related events on the timeline. The most useful events mark which parts of the model are running compiled code, so that we can quickly validate whether the parts of the model that are supposed to be compiled are actually compiled by torch.compile. For example, the trace in Figure 5 has two compiled regions (with the label “CompiledFunction”). Other useful events report the time spent on compilation and on accessing the compiler’s code cache.

    + +


    + +

    Fig. 5: A trace with two compiled regions.
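A minimal sketch of collecting such a trace around a compiled model (illustrative; in a real job the profiler wraps the training loop as usual):

import torch
import torch.nn as nn

model = torch.compile(nn.Sequential(nn.Linear(1024, 1024), nn.ReLU()).cuda())
x = torch.randn(64, 1024, device="cuda")

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
) as prof:
    model(x)  # compiled regions show up as dedicated events on the timeline

prof.export_chrome_trace("compiled_model_trace.json")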

    + +

    3.4 Controlling just-in-time compilation time

    + +

torch.compile uses just-in-time compilation: the compilation happens when the first batch of data is trained. In our production setting, there is an upper limit on how much time a training job is allowed to take to reach its first batch, aka Time-To-First-Batch (TTFB). We need to make sure that enabling torch.compile does not push TTFB over this limit. This can be challenging because production models are large and torch.compile can take substantial compilation time. We enable parallel compilation to keep the compile time under control (this is controlled by the global variable compile_threads inside torch/_inductor/config.py, which is already set to the CPU count on OSS Linux). A model is decomposed into one or more computational graphs; each graph is decomposed into multiple Triton kernels. If parallel compilation is enabled, all the Triton kernels in the same graph can be compiled simultaneously (kernels from different graphs, however, are still compiled serially). Figure 6 illustrates how parallel compilation helps.
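As a reference point, the knob mentioned above can also be inspected or set programmatically (a minimal sketch; on OSS Linux it already defaults to the CPU count):

import os
import torch._inductor.config as inductor_config

# Number of workers used to compile the Triton kernels of a graph in parallel.
print("compile_threads =", inductor_config.compile_threads)
inductor_config.compile_threads = os.cpu_count()  # set explicitly, for illustration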

    + +


    + +

    Fig. 6: Using parallel compilation in production.

    + +

    4. Results

    + +

In this section, we use three production models to evaluate PT2. First, we show the training-time speedups with PT2 using different optimization configs. Second, we show the impact of parallel compilation on the compilation time.

    + +

    4.1 Training-time speedup with torch.compile

    + +

Figure 7 reports the training-time speedup with PT2. For each model, we show four cases: (i) no-compile with bf16, (ii) compile with fp32, (iii) compile with bf16, and (iv) compile with bf16 and autotuning. The y-axis is the speedup over the baseline, which is no-compile with fp32. Note that no-compile with bf16 is actually slower than no-compile with fp32, due to the type conversion overhead. In contrast, compiling with bf16 achieves much larger speedups by reducing much of this overhead. Overall, given that these models are already heavily optimized by hand, we are excited to see that torch.compile can still provide a 1.14-1.24x speedup.

    + +


    + +

    Fig. 7: Training-time speedup with torch.compile (note: the baseline, no-compile/fp32, is omitted in this figure).

    + +

    4.2 Compilation-time reduction with parallel compilation

    + +

    Figure 8 shows the compilation time with and without parallel compilation. While there is still room for improvement on the serial compilation time, parallel compilation has reduced the compilation overhead on TTFB to an acceptable level. Models B and C benefit more from parallel compilation than Model A does because they have more distinct Triton kernels per graph.

    + +


    + +

    Fig. 8: PT2 compilation time.

    + +

    5. Concluding Remarks

    + +

    In this blog, we demonstrate that PT2 can significantly accelerate the training of large and complex production AI models with reasonable compilation time. In our next blog, we will discuss how PT2 can do general graph transformations.

    + +

    6. Acknowledgements

    + +

    Many thanks to Mark Saroufim, Adnan Aziz, and Gregory Chanan for their detailed and insightful reviews.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/training-using-float8-fsdp2/index.html b/blog/training-using-float8-fsdp2/index.html new file mode 100644 index 000000000000..a22b48d16dc2 --- /dev/null +++ b/blog/training-using-float8-fsdp2/index.html @@ -0,0 +1,859 @@ + + + + + + + + + + + + + Supercharging Training using float8 and FSDP2 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    November 25, 2024

    +

    + Supercharging Training using float8 and FSDP2 +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + IBM and Meta + +

    +

IBM: Tuan Hoang Trong, Alexei Karve, Yan Koyfman, Linsong Chu, Divya Kumari, Shweta Salaria, Robert Walkup, Praneet Adusumilli, Nirmit Desai, Raghu Ganti, Seetharami Seelam
Meta: Less Wright, Wei Feng, Vasiliy Kuznetsov, Driss Guesseous

    + +

In this blog, we will demonstrate how we achieve up to 50% throughput speedup over FSDP1 bf16 training while maintaining loss and evaluation benchmark parity. We achieve this speedup by leveraging FSDP2, DTensor, and torch.compile with torchao’s float8 via linear layer updates (compute), and float8 all_gathers for weight communication. We showcase these improvements across a spectrum of Meta LLaMa model architecture sizes, ranging from a small 1.8B model all the way to a 405B model, making training faster than ever.

    + +

We demonstrate these improvements using the Meta Llama3 architecture, and then perform model quality studies at two scales: 100B tokens at 8B model size, and 50B tokens at 70B model size, which provide an exact comparison of float8 and bf16 training loss curves. We show that these float8 training runs converge to the same loss as their bf16 counterparts. Further, we train a 3B model to 1T tokens using the FineWeb-edu dataset and run standard evaluation benchmarks to ensure that the model quality is intact and comparable to a bf16 run.

    + +

    At IBM Research, we plan to adopt these capabilities for our data ablations to improve the number of experiments we can perform in a given GPU budget. Longer term, we will follow up with a larger scale model run to demonstrate the end-to-end feasibility of float8 training.

    + +

    What is Float8?

    + +

    The float8 format for training models was introduced by NVIDIA, ARM, and Intel in a 2022 paper which demonstrated the feasibility of training using lower precision float8, without sacrificing model quality. With the introduction of newer GPUs like the NVIDIA Hopper series, FP8 training became feasible with the potential of more than 2x improvement in training throughput due to native float8 tensor core support. There are a few challenges to realize this promise:
(i) Enable the core model operations like matmul and attention in float8,
(ii) Enable float8 training in a distributed framework, and
(iii) Enable weight communication between GPUs in float8.
While the float8 matmul was enabled by NVIDIA libraries, the latter two were provided in recent updates to FSDP2 and torchao.

    + +

    In this blog, we are using torchtitan as the entry point for training, IBM’s deterministic data loader, the float8 linear layer implementation from torchao, and the float8 all gather from the latest PyTorch nightlies in conjunction with FSDP2. For this training, we are using the float8 per tensor (tensorwise) scaling granularity rather than rowwise. We leverage torch.compile to ensure that we get maximum performance gains. We are computing attention in bf16 using SDPA and are currently working on moving this to float8 as well.
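As a rough sketch of how the compute-side pieces fit together on a single GPU (the torchao and FSDP2 entry points have evolved across releases, so treat the import below as an assumption and consult torchao/torchtitan for the current recipe):

import torch
import torch.nn as nn
# Assumed torchao API for swapping nn.Linear modules to float8 training linears;
# check the torchao float8 docs for the exact name and configuration options.
from torchao.float8 import convert_to_float8_training

model = nn.Sequential(nn.Linear(4096, 4096), nn.Linear(4096, 4096)).to(
    device="cuda", dtype=torch.bfloat16
)
convert_to_float8_training(model)  # float8 compute in the linear layers
model = torch.compile(model)       # compile to get the full performance gain

out = model(torch.randn(16, 4096, device="cuda", dtype=torch.bfloat16))
# For multi-GPU runs, FSDP2 sharding (and float8 all_gather) is then applied on
# top of this, as done in torchtitan.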

    + +

    Experiments

    + +

We perform various experiments to demonstrate the benefits of float8 training. The first is to ensure that model quality is not sacrificed. To verify this, we train an 8B model and a 70B model for a few thousand steps and compare the loss curves between the float8 and bf16 training runs. Our experiments are performed on three different H100 clusters with 128, 256, and 512 H100 GPU configurations in very different environments to demonstrate reproducibility. The first is a customized cluster built on Meta's Grand Teton platform with a 400Gbps custom interconnect, the second is an IBM research cluster with a 3.2Tbps Infiniband interconnect, and the third is an IBM Cloud cluster with a 3.2Tbps RoCE interconnect for GPU-to-GPU communication.

    + +

    First, we plot the loss curve comparisons for both these models in the below figures to demonstrate loss parity for a few thousand steps.

    + +


    + +

    Figure 1: (a) 8B model loss parity for 2k steps, (b) 70B loss parity for 1k steps

    + +

    We observe that across these different models and in different environments, we obtain loss parity for the small scale of tokens. Next, we characterize the throughput gains for four different model sizes ranging from 1.8B to 405B. We explored the best batch size and activation checkpointing schemes for both the float8 and bf16 training runs to determine the tokens/sec/GPU (wps) metric and report the performance gain. For the 405B model, we leveraged DTensor for tensor parallel training with FSDP2. We use a sequence length of 8K for all our measurements.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Model size | wps (bf16) | wps (float8) | Percent gain
1.8B | 29K | 35K | 18%
8B | 8K | 10K | 28%
70B | 956 | 1430 | 50%
405B (TP4) | 149 | 227 | 52%
    + +

    Table 1: Performance gains over bf16 (both bf16 and float8 use torch.compile)

    + +

We observe from Table 1 that the gains for the larger models (70B and 405B) reach up to 50%, while the smaller models see gains of roughly 20-30%. In further experiments, we observed that the addition of float8 all_gather enables a boost of ~5% beyond the float8 compute itself, which is in line with the observations in this blog.

    + +

Second, to demonstrate the effectiveness of an FP8 model, we trained a 3B model following the Llama3 architecture for 1T tokens using the FineWeb-edu dataset from Hugging Face. We performed evaluations using the lm-eval-harness framework and present a small portion of these results in the table below. We observe that the bf16 performance is marginally better than the float8 scores (by about one percent). While some scores are significantly better with bf16 (e.g., MMLU is 3 pts higher), we expect these gaps to vanish when the right hyperparameters are chosen and across larger scale training runs (e.g., the bf16 run had half the batch size, and it is well known that smaller batch size runs can improve evaluation scores).

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Benchmark | Score (float8) | Score (bf16)
MMLU (5-shot) | 0.26 | 0.29
ARC-e | 0.73 | 0.73
ARC-c | 0.43 | 0.46
Hellaswag | 0.65 | 0.67
sciq | 0.89 | 0.88
OpenBook QA | 0.43 | 0.43
PIQA | 0.76 | 0.76
Winogrande | 0.60 | 0.65
Average | 0.59 | 0.60
    + +

    Table 2: Benchmark scores for float8 trained model running in FP16 for eval (at 1T tokens of FineWeb pre-training).

    + +

    Finally, we scale our experiments to 512 H100 GPUs on the IBM Cloud cluster. We were able to recreate the results and speedups that we observed even at 512 GPU scale. We summarize these results only for the large models in the below table (70B and 405B).

    + + + + + + + + + + + + + + + + + + + + +
Model size | wps (bf16) | wps (float8) | Percent gain
70B | 960 | 1448 | 51%
405B (TP4) | 152 | 217 | 43%
    + +

    Table 3: Performance gains over bf16 (both bf16 and float8 use torch.compile) for 512 GPU scale

    + +

    Future work

    + +

We are also working on evaluating other forms of parallelism, such as Context Parallelism. We plan to evaluate all of these features to demonstrate their composability and the ability to make choices when training large-scale models.

    + +

    Acknowledgements

    + +

We thank Davis Wertheimer from IBM Research for enabling the data loader for torchtitan runs, allowing us to replay data in the same order across multiple runs. We also thank IBM Cloud for providing early test access to the H100 cluster.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/triton-kernel-compilation-stages/index.html b/blog/triton-kernel-compilation-stages/index.html new file mode 100644 index 000000000000..32f7cbc39b0d --- /dev/null +++ b/blog/triton-kernel-compilation-stages/index.html @@ -0,0 +1,820 @@ + + + + + + + + + + + + + Triton Kernel Compilation Stages | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    October 30, 2024

    +

    + Triton Kernel Compilation Stages +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Sara Kokkila-Schumacher*, Brian Vaughan*, Raghu Ganti*, and Less Wright+ (*IBM Research, +Meta) + +

    +

The Triton open-source programming language and compiler offers a high-level, Python-based approach to create efficient GPU code. In this blog, we highlight the underlying details of how a Triton program is compiled and its intermediate representations. For an introduction to Triton, we refer readers to this blog.

    + +

    Triton Language and Compilation

    + +

The Triton programming language supports different types of modern GPUs and follows a blocked programming approach. As an example, we will follow the Triton vector add tutorial with minor modifications. The vector addition kernel and helper function are defined as:

    + +
    import torch
    +import triton
    +import triton.language as tl
    +
    +@triton.jit
    +def add_kernel(x_ptr,  # *Pointer* to first input vector.
    +               y_ptr,  # *Pointer* to second input vector.
    +               output_ptr,  # *Pointer* to output vector.
    +               n_elements, 
    +               BLOCK_SIZE: tl.constexpr, 
    +               ):
    +  
    +    pid = tl.program_id(axis=0) 
    +    block_start = pid * BLOCK_SIZE
    +    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    + 
    +    mask = offsets < n_elements
    +
    +    x = tl.load(x_ptr + offsets, mask=mask)
    +    y = tl.load(y_ptr + offsets, mask=mask)
    +    output = x + y
    +    tl.store(output_ptr + offsets, output, mask=mask)
    + 
    +def add(x: torch.Tensor, y: torch.Tensor):
    +    output = torch.empty_like(x)
    +    assert x.is_cuda and y.is_cuda and output.is_cuda
    +    n_elements = output.numel()
    +
    +    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
    +    triton_kernel=add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    +    torch.cuda.synchronize()
    +
    +    # Save compilation stages - some of the stages identified here are specific to NVIDIA devices:
    +    with open('triton_IR.txt', 'w') as f:
    +        print(triton_kernel.asm['ttir'], file=f)
    +    with open('triton_TTGIR.txt', 'w') as f:
    +        print(triton_kernel.asm['ttgir'], file=f)
    +    with open('triton_LLVMIR.txt', 'w') as f:
    +        print(triton_kernel.asm['llir'], file=f)
    +    with open('triton_PTX.ptx', 'w') as f:
    +        print(triton_kernel.asm['ptx'], file=f)
    +    with open('triton_cubin.txt', 'w') as f:
    +        print(triton_kernel.asm['cubin'], file=f)
    +
    +    return output
    +
    +torch.manual_seed(0)
    +size = 98432
    +x = torch.rand(size, device='cuda')
    +y = torch.rand(size, device='cuda')
    +output_torch = x + y
    +output_triton = add(x, y)
    +print(output_torch)
    +print(output_triton)
    +print(f'The maximum difference between torch and triton is '
    +      f'{torch.max(torch.abs(output_torch - output_triton))}')    
    +
    + +

    The Triton vector add kernel includes the @triton.jit decorator. The Triton compiler will compile functions marked by @triton.jit, which lowers the function through multiple compilation stages. The helper function add allocates the output tensor, computes the appropriate GPU grid size, and additionally saves the intermediate compilation stages.

    + +

    Focusing on the compilation process, the Triton kernel is lowered to device specific assembly through a series of stages outlined in the following figure.

    + +

    compilation process

    + +

The kernel is compiled by first walking the abstract syntax tree (AST) of the decorated Python function to create the Triton Intermediate Representation (Triton-IR). The Triton-IR is an unoptimized, machine-independent intermediate representation. It introduces tile-level programming requirements and is based on the open-source LLVM compiler project. Next, the Triton compiler optimizes and converts the Triton-IR, first to the Triton-GPU IR (Triton-TTGIR) stage and then to LLVM-IR. Both the Triton-IR and Triton-GPUIR representations are written as MLIR dialects, where MLIR is a subproject of LLVM that aims to improve compilation for heterogeneous hardware.

    + +

    For the Triton vector add tutorial kernel, the example Triton IR snippet is:

    + +
    module {
    +  tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg3: i32 {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0)) attributes {noinline = false} {
    +    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
    +    %0 = tt.get_program_id x : i32 loc(#loc2)
    +    %1 = arith.muli %0, %c1024_i32 : i32 loc(#loc3)
    +    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc4)
    +    %3 = tt.splat %1 : i32 -> tensor<1024xi32> loc(#loc5)
    +    %4 = arith.addi %3, %2 : tensor<1024xi32> loc(#loc5)
    +    %5 = tt.splat %arg3 : i32 -> tensor<1024xi32> loc(#loc6)
    +    %6 = arith.cmpi slt, %4, %5 : tensor<1024xi32> loc(#loc6)
    +    %7 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc7)
    +    %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc7)
    +    %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc8)
    +    %10 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc9)
    +    %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc9)
    +    %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc10)
    +    %13 = arith.addf %9, %12 : tensor<1024xf32> loc(#loc11)
    +    %14 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc12)
    +    %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc12)
    +    tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc13)
    +    tt.return loc(#loc14)
    +  } loc(#loc)
    +} loc(#loc)
    +
    + +

    Notice that the main functions in the Triton kernel are now represented as:

    + + + + + + + + + + + + + + + + + + + + + + +
Triton kernel | Triton IR
x = tl.load(x_ptr + offsets, mask=mask) | %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc8)
y = tl.load(y_ptr + offsets, mask=mask) | %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc10)
output = x + y | %13 = arith.addf %9, %12 : tensor<1024xf32> loc(#loc11)
tl.store(output_ptr + offsets, output, mask=mask) | tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc13)
    + +

At the Triton IR stage, the %arg0: !tt.ptr<f32> and the following tensor references show that the intermediate representation is already specialized by the data type.

    + +

    We ran this example on a Tesla V100-SXM2-32GB GPU with CUDA Version 12.2, Python version 3.11.9, and PyTorch 2.4.1 with the default version of Triton that is installed with PyTorch. On this device, the simple vector addition has the following Triton GPU IR snippet with lines omitted for clarity:

    + +
    #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
    +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:70", "triton_gpu.threads-per-warp" = 32 : i32} {
    +  tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}
    +    ⋮
    +    %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc8)
    +    ⋮
    +    %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc10)
    +    %13 = arith.addf %9, %12 : tensor<1024xf32, #blocked> loc(#loc11)
    +    ⋮
    +    tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc13)
    +    ⋮
    +  } loc(#loc)
    +} loc(#loc)
    +
    + +

At this stage, some of the hardware-specific information is included. For example, the compute capability is included along with details on how the tensors are distributed to cores and warps (or, for AMD GPUs, to wavefronts). In this example, the tensors are represented as a #blocked layout. In this encoding, each warp owns a contiguous portion of the tensor. Currently, other possible memory optimizations include layouts such as slice (restructures and distributes a tensor along a dimension), dot_op (optimized layout for block matrix product), shared (indicates GPU shared memory), nvidia_mma (produced by NVIDIA tensor cores), amd_mfma (produced by AMD MFMA matrix core), and amd_wmma (produced by AMD WMMA matrix core). As announced at the recent Triton conference, this layout representation will transition to a new linear layout to unify layouts within and across backends. The stage from Triton-GPUIR to LLVM-IR converts the Triton-GPUIR to LLVM’s representation. At this time, Triton has third-party backend support for NVIDIA and AMD devices, but other device support is under active development by the open-source community.

    + +

A small subset of the LLVM-IR vector add arguments is shown below for illustration:

    + +
      %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !16
    +  %39 = extractvalue { i32, i32, i32, i32 } %38, 0, !dbg !18
    +  %23 = bitcast i32 %19 to float, !dbg !16
    +  %43 = bitcast i32 %39 to float, !dbg !18
    +  %56 = fadd float %23, %43, !dbg !19
    +
    + +

    After some pointer arithmetic and an inline assembly call to retrieve the data from global memory, the vector elements are extracted and cast to the correct type. Finally they are added together and later written to global memory through an inline assembly expression.

    + +

The final stages of the Triton compilation process lower the LLVM-IR to a device-specific binary. For the example vector add, on an NVIDIA GPU, the next intermediate is PTX (Parallel Thread Execution). The low-level PTX syntax specifies the execution at the thread level of NVIDIA devices, starting with the CUDA 1.0 release. For an in-depth guide on PTX, see NVIDIA’s documentation. In the vector add, the kernel parameters are passed from the host to the kernel, addresses are assigned, and mov instructions facilitate the thread-level data access; the element addition is ultimately represented with add.f32 calls such as the example below:

    + +
	add.f32 	%f17, %f1, %f9  // add type float32, output register, input register for x, input register for y
    +
    + +

    The Triton compiler orchestrates the final stage with different hardware backends managing how the assembly code is compiled into binary. The Triton kernel is now ready for use.

    + +

    Summary

    + +

    Triton provides a high-level abstraction to program and compile kernels for different types of hardware. In this post, we highlight the different stages of the Triton code representations and Triton compiler. For details on including custom Triton kernels or accelerating different workloads with Triton kernels, check out the PyTorch Triton tutorial, the blog posts on Triton GPTQ kernels, Llama3 FP8 Inference with Triton, and CUDA-Free Inference for LLMs, or the PyTorch 2.2 Section on Triton code generation.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/understanding-gpu-memory-1/index.html b/blog/understanding-gpu-memory-1/index.html new file mode 100644 index 000000000000..4d341f15c3fc --- /dev/null +++ b/blog/understanding-gpu-memory-1/index.html @@ -0,0 +1,989 @@ + + + + + + + + + + + + + Understanding GPU Memory 1: Visualizing All Allocations over Time | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Aaron Shi, Zachary DeVito + +

    +

    During your time with PyTorch on GPUs, you may be familiar with this common error message:

    + +
    torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 79.32 GiB of which 401.56 MiB is free.
    +
    + +

    In this series, we show how to use memory tooling, including the Memory Snapshot, the Memory Profiler, and the Reference Cycle Detector to debug out of memory errors and improve memory usage.

    + +

    Memory Timeline

    + +

    The Memory Snapshot tool provides a fine-grained GPU memory visualization for debugging GPU OOMs. Captured memory snapshots will show memory events including allocations, frees and OOMs, along with their stack traces.

    + +

    In a snapshot, each tensor’s memory allocation is color coded separately. The x axis is over time, and the y axis is the amount of GPU memory in MB. The snapshot is interactive, so we can observe the stack trace for any allocation by mousing over. Try it yourself at https://github.com/pytorch/pytorch.github.io/blob/site/assets/images/understanding-gpu-memory-1/snapshot.html.

    + +

In this snapshot, there are 3 peaks showing the memory allocations over 3 training iterations (this is configurable). When looking at the peaks, it is easy to see the rise of memory in the forward pass and the fall during the backward pass as the gradients are computed. It is also possible to see that the program has the same pattern of memory use from iteration to iteration. One thing that stands out is the many tiny spikes in memory; by mousing over them, we see that they are buffers used temporarily by convolution operators.

    + +

    Capturing Memory Snapshots

    + +

    The API to capture memory snapshots is fairly simple and available in torch.cuda.memory:

    + +
      +
    • Start: torch.cuda.memory._record_memory_history(max_entries=100000)
    • +
    • Save: torch.cuda.memory._dump_snapshot(file_name)
    • +
    • Stop: torch.cuda.memory._record_memory_history(enabled=None)
    • +
    + +

    Code Snippet (for full code sample, see Appendix A):

    + +
       # Start recording memory snapshot history, initialized with a buffer
    +   # capacity of 100,000 memory events, via the `max_entries` field.
    +   torch.cuda.memory._record_memory_history(
    +       max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
    +   )
    +
    +   # Run your PyTorch Model.
    +   # At any point in time, save a snapshot to file for later.
    +   for _ in range(5):
    +       pred = model(inputs)
    +       loss_fn(pred, labels).backward()
    +       optimizer.step()
    +       optimizer.zero_grad(set_to_none=True)
    +
    +   # In this sample, we save the snapshot after running 5 iterations.
    +   #   - Save as many snapshots as you'd like.
    +   #   - Snapshots will save last `max_entries` number of memory events
    +   #     (100,000 in this example).
    +   try:
    +       torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle")
    +   except Exception as e:
    +       logger.error(f"Failed to capture memory snapshot {e}")
    +
    +   # Stop recording memory snapshot history.
    +   torch.cuda.memory._record_memory_history(enabled=None)
    +
    + +

    To visualize the snapshot file, we have a tool hosted at https://pytorch.org/memory_viz. There, you can drag and drop your saved snapshot file and it will plot each allocation over time. Privacy Note: The tool will not save your snapshot.

    + +

    Memory Timeline

    + +

Alternatively, you can generate an HTML file from a .pickle by using the script at pytorch/torch/cuda/_memory_viz.py; here is an example:

    + +
    python torch/cuda/_memory_viz.py trace_plot snapshot.pickle -o snapshot.html
    +
    + +

    Debugging CUDA OOMs

    + +

    Let’s look at how we can use the memory snapshot tool to answer:

    + +
      +
    1. Why did a CUDA OOM happen?
    2. +
    3. Where is the GPU Memory being used?
    4. +
    + +

    ResNet50 with a bug

    + +

We’ve taken a look at a properly working model in the first snapshot. Now, let’s take a look at a training example with a bug; see the snapshot below:

    + +

    Memory Timeline

    + +

    Notice how the second iteration uses far more memory than the first iteration. If this model were much larger, it could have CUDA OOM’d in the second iteration without much more insight into why.

    + +

    Memory Timeline

    + +

    When examining this snapshot further, we can clearly see that several tensors are staying alive from the first iteration to the second and later iterations. If we mouse over one of these tensors, it would show a stack trace suggesting that these were gradient tensors.

    + +

And indeed, if we go to the code, we can see that it doesn’t clear the gradient tensors when it could have cleared them before the forward pass.

    + +

    Before:

    +
            for _ in range(num_iters):
    +          pred = model(inputs)
    +          loss_fn(pred, labels).backward()
    +          optimizer.step()
    +
    + +

    After:

    +
            for _ in range(num_iters):
    +          pred = model(inputs)
    +          loss_fn(pred, labels).backward()
    +          optimizer.step()
    +          # Add this line to clear grad tensors
    +          optimizer.zero_grad(set_to_none=True)
    +
    + +

    We can simply add an optimizer.zero_grad(set_to_none=True) instruction to clear the gradient tensors from iteration to iteration (more details about why we need to zero the gradients here: https://pytorch.org/tutorials/recipes/recipes/zeroing_out_gradients.html).

    + +

    This is a simplification of a bug we’ve found in more complicated programs using this tool. We encourage you to try out the Memory Snapshot on your GPU memory problems and let us know how it goes.

    + +

    ResNet50 after bug fix

    + +

After applying the fix, the snapshot shows the gradients being cleared from iteration to iteration.

    + +

    Memory Timeline

    + +

    We now have the snapshot of a properly working ResNet50 model. Try out the code yourself (see code sample in Appendix A).

    + +

    But you may be wondering, why is there still an increase in memory after the first iteration? To answer this, let’s visit the Memory Profiler in the next section.

    + +

    Categorized Memory Usage

    + +

    The Memory Profiler is an added feature of the PyTorch Profiler that categorizes memory usage over time. We still rely on the Memory Snapshot for stack traces for deep dives into memory allocations.

    + +

    To generate a memory timeline, here is a code snippet (full code sample in Appendix B):

    + +
       # Initialize the profiler context with record_shapes, profile_memory,
    +   # and with_stack set to True.
    +   with torch.profiler.profile(
    +       activities=[
    +           torch.profiler.ProfilerActivity.CPU,
    +           torch.profiler.ProfilerActivity.CUDA,
    +       ],
    +       schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1),
    +       record_shapes=True,
    +       profile_memory=True,
    +       with_stack=True,
    +       on_trace_ready=trace_handler,
    +   ) as prof:
    +       # Run the PyTorch Model inside the profile context.
    +       for _ in range(5):
    +           prof.step()
    +           with record_function("## forward ##"):
    +               pred = model(inputs)
    +
    +           with record_function("## backward ##"):
    +               loss_fn(pred, labels).backward()
    +
    +           with record_function("## optimizer ##"):
    +               optimizer.step()
    +               optimizer.zero_grad(set_to_none=True)
    +
    +   # Construct the memory timeline HTML plot.
    +   prof.export_memory_timeline(f"{file_prefix}.html", device="cuda:0")
    +
    + +

    For further reference, see https://pytorch.org/docs/main/profiler.html.

    + +

    The Memory Profiler automatically generates categories based on the graph of tensor operations recorded during profiling.

    + +

    Memory Timeline

    + +

    In this Memory Timeline collected using the Memory Profiler, we have the same training example as before. We can observe the gradients in blue are now being cleared from iteration to iteration. We can also notice that the optimizer state in yellow is allocated after the first iteration, and is kept constant for the rest of the job.

    + +

    This optimizer state is the reason behind the increase of GPU memory from the first iteration to the second. Try out the code yourself (see code sample in Appendix B). The Memory Profiler helps to improve training memory understanding so that model authors can figure out which categories are using the most GPU memory.

    + +

    Where can I find these tools?

    + +

    We hope that these tools will greatly improve your ability to debug CUDA OOMs and to understand your memory usage by category.

    + +

    The Memory Snapshot and the Memory Profiler are available in the v2.1 release of PyTorch as experimental features.

    + + + +

    Feedback

    + +

    We look forward to hearing from you about any enhancements, bugs or memory stories that our tools helped to solve! As always, please feel free to open new issues on PyTorch’s Github page.

    + +

    We are also open to contributions from the OSS community, feel free to tag Aaron Shi and Zachary DeVito in any Github PRs for reviews.

    + +

    Acknowledgements

    + +

    Really appreciate the content reviewers, Mark Saroufim and Gregory Chanan, for reviewing this post and improving its readability.

    + +

    Really appreciate the code reviews and feedback from Adnan Aziz and Lei Tian.

    + +

    Appendix

    + +

    Appendix A - ResNet50 Memory Snapshot Code Example

    + +
    # (c) Meta Platforms, Inc. and affiliates. 
    +import logging
    +import socket
    +from datetime import datetime, timedelta
    +
    +import torch
    +
    +from torchvision import models
    +
    +logging.basicConfig(
    +   format="%(levelname)s:%(asctime)s %(message)s",
    +   level=logging.INFO,
    +   datefmt="%Y-%m-%d %H:%M:%S",
    +)
    +logger: logging.Logger = logging.getLogger(__name__)
    +logger.setLevel(level=logging.INFO)
    +
    +TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S"
    +
    +# Keep a max of 100,000 alloc/free events in the recorded history
    +# leading up to the snapshot.
    +MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000
    +
    +def start_record_memory_history() -> None:
    +   if not torch.cuda.is_available():
    +       logger.info("CUDA unavailable. Not recording memory history")
    +       return
    +
    +   logger.info("Starting snapshot record_memory_history")
    +   torch.cuda.memory._record_memory_history(
    +       max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
    +   )
    +
    +def stop_record_memory_history() -> None:
    +   if not torch.cuda.is_available():
    +       logger.info("CUDA unavailable. Not recording memory history")
    +       return
    +
    +   logger.info("Stopping snapshot record_memory_history")
    +   torch.cuda.memory._record_memory_history(enabled=None)
    +
    +def export_memory_snapshot() -> None:
    +   if not torch.cuda.is_available():
    +       logger.info("CUDA unavailable. Not exporting memory snapshot")
    +       return
    +
    +   # Prefix for file names.
    +   host_name = socket.gethostname()
    +   timestamp = datetime.now().strftime(TIME_FORMAT_STR)
    +   file_prefix = f"{host_name}_{timestamp}"
    +
    +   try:
    +       logger.info(f"Saving snapshot to local file: {file_prefix}.pickle")
    +       torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle")
    +   except Exception as e:
    +       logger.error(f"Failed to capture memory snapshot {e}")
    +       return
    +
    +# Simple Resnet50 example to demonstrate how to capture memory visuals.
    +def run_resnet50(num_iters=5, device="cuda:0"):
    +   model = models.resnet50().to(device=device)
    +   inputs = torch.randn(1, 3, 224, 224, device=device)
    +   labels = torch.rand_like(model(inputs))
    +   optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    +   loss_fn = torch.nn.CrossEntropyLoss()
    +
    +   # Start recording memory snapshot history
    +   start_record_memory_history()
    +
    +   for _ in range(num_iters):
    +       pred = model(inputs)
    +       loss_fn(pred, labels).backward()
    +       optimizer.step()
    +       optimizer.zero_grad(set_to_none=True)
    +
    +   # Create the memory snapshot file
    +   export_memory_snapshot()
    +
    +   # Stop recording memory snapshot history
    +   stop_record_memory_history()
    +
    +if __name__ == "__main__":
    +    # Run the resnet50 model
    +    run_resnet50()
    +
    + +

    Appendix B - ResNet50 Memory Profiler Code Example

    + +
    # (c) Meta Platforms, Inc. and affiliates. 
    +import logging
    +import socket
    +from datetime import datetime, timedelta
    +
    +import torch
    +
    +from torch.autograd.profiler import record_function
    +from torchvision import models
    +
    +logging.basicConfig(
    +   format="%(levelname)s:%(asctime)s %(message)s",
    +   level=logging.INFO,
    +   datefmt="%Y-%m-%d %H:%M:%S",
    +)
    +logger: logging.Logger = logging.getLogger(__name__)
    +logger.setLevel(level=logging.INFO)
    +
    +TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S"
    +
    +def trace_handler(prof: torch.profiler.profile):
    +   # Prefix for file names.
    +   host_name = socket.gethostname()
    +   timestamp = datetime.now().strftime(TIME_FORMAT_STR)
    +   file_prefix = f"{host_name}_{timestamp}"
    +
    +   # Construct the trace file.
    +   prof.export_chrome_trace(f"{file_prefix}.json.gz")
    +
    +   # Construct the memory timeline file.
    +   prof.export_memory_timeline(f"{file_prefix}.html", device="cuda:0")
    +
    +def run_resnet50(num_iters=5, device="cuda:0"):
    +   model = models.resnet50().to(device=device)
    +   inputs = torch.randn(1, 3, 224, 224, device=device)
    +   labels = torch.rand_like(model(inputs))
    +   optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    +   loss_fn = torch.nn.CrossEntropyLoss()
    +
    +   with torch.profiler.profile(
    +       activities=[
    +           torch.profiler.ProfilerActivity.CPU,
    +           torch.profiler.ProfilerActivity.CUDA,
    +       ],
    +       schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1),
    +       record_shapes=True,
    +       profile_memory=True,
    +       with_stack=True,
    +       on_trace_ready=trace_handler,
    +   ) as prof:
    +       for _ in range(num_iters):
    +           prof.step()
    +           with record_function("## forward ##"):
    +               pred = model(inputs)
    +
    +           with record_function("## backward ##"):
    +               loss_fn(pred, labels).backward()
    +
    +           with record_function("## optimizer ##"):
    +               optimizer.step()
    +               optimizer.zero_grad(set_to_none=True)
    +
    +if __name__ == "__main__":
    +    # Warm up
    +    run_resnet50()
    +    # Run the resnet50 model
    +    run_resnet50()
    +
    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/understanding-gpu-memory-2/index.html b/blog/understanding-gpu-memory-2/index.html new file mode 100644 index 000000000000..7b303408c05b --- /dev/null +++ b/blog/understanding-gpu-memory-2/index.html @@ -0,0 +1,985 @@ + + + + + + + + + + + + + Understanding GPU Memory 2: Finding and Removing Reference Cycles | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Aaron Shi, Zachary DeVito + +

    +

    This is part 2 of the Understanding GPU Memory blog series. Our first post Understanding GPU Memory 1: Visualizing All Allocations over Time shows how to use the memory snapshot tool. In this part, we will use the Memory Snapshot to visualize a GPU memory leak caused by reference cycles, and then locate and remove them in our code using the Reference Cycle Detector.

    + +

    Sometimes when we were using the Memory Snapshot, we saw plots of GPU memory that looked similar to this.

    + +

    GPU memory

    + +

    In this snapshot, each peak shows GPU tensors building up over time and then several tensors getting released at once. In addition, a CUDA OOM happens on the right side causing all the tensors to be released. Seeing the tensors accumulate like this is a clear indication of a problem, but it doesn’t immediately suggest why.

    + +

    Tensors in Reference Cycles

    + +

During early debugging, we dug in further and found that this pattern happens a lot when your Python code has objects with reference cycles. Python will clean up non-cyclic objects immediately using reference counting. However, objects in reference cycles are only cleaned up later by the cycle collector. If these cycles refer to a GPU tensor, the GPU tensor will stay alive until that cycle collector runs and removes the reference cycle. Let’s take a look at a simplified example.

    + +

    Simple reference cycle

    + +

    Code Snippet behind the snapshot (full code in Appendix A):

    + +
        def leak(tensor_size, num_iter=100000, device="cuda:0"):
    +      class Node:
    +        def __init__(self, T):
    +          self.tensor = T
    +          self.link = None
    +
    +      for _ in range(num_iter):
    +        A = torch.zeros(tensor_size, device=device)
    +        B = torch.zeros(tensor_size, device=device)
    +        a, b = Node(A), Node(B)
    +
    +        # A reference cycle will force refcounts to be non-zero.
    +        a.link, b.link = b, a
    +        # Python will eventually garbage collect a & b, but will
    +        # OOM on the GPU before that happens (since python
    +        # runtime doesn't know about CUDA memory usage).
    +
    + +

    In this code example, the tensors A and B are created, where A has a link to B and vice versa. This forces a non-zero reference count when A and B go out of scope. When we run this for 100,000 iterations, we expect the automatic garbage collection to free the reference cycles when going out of scope. However, this will actually CUDA OOM.

    + +

    Why doesn’t automatic garbage collection work?

    + +

Automatic garbage collection works well when there is a lot of extra memory, as is common on CPUs, because it amortizes the expensive collection work by using generational garbage collection. But to amortize the collection work, it defers some memory cleanup, making the maximum memory usage higher, which is less suited to memory-constrained environments. The Python runtime also has no insight into CUDA memory usage, so it cannot be triggered on high memory pressure either. This is even more challenging because GPU training is almost always memory constrained, since we often raise the batch size to use any additional free memory.

    + +

CPython’s garbage collector frees unreachable objects held in reference cycles via mark-and-sweep. The garbage collection is automatically run when the number of objects exceeds certain thresholds. There are 3 generations of thresholds to help amortize the expensive cost of running garbage collection on every object; the later generations are run less frequently. This explains why the automatic collections only clear several tensors on each peak; however, there are still tensors that leak, resulting in the CUDA OOM. Those tensors were held by reference cycles in later generations.

    + +

    Explicitly calling gc.collect()

    + +

    One way to fix this is by explicitly calling the garbage collector frequently. Here we can see that the GPU memory for tensors out of scope gets cleaned up when we explicitly call the garbage collector every 100 iterations. This also controls the maximum GPU peak memory held by leaking tensors.
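In code, this workaround is just a periodic explicit collection inside the training loop (a minimal sketch mirroring the gc_interval logic in Appendix A; the training step is a hypothetical stand-in):

import gc

GC_INTERVAL = 100      # collect every 100 iterations, as in the plot above
num_iters = 30000

def train_one_iteration() -> None:
    # Hypothetical stand-in for the forward/backward/optimizer step.
    pass

for i in range(num_iters):
    train_one_iteration()
    if i % GC_INTERVAL == 0:
        gc.collect()   # frees GPU tensors that are only kept alive by reference cycles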

    + +

    memory leak

    + +

Although this works and fixes the CUDA OOM issue, calling gc.collect() too frequently can cause other problems, including QPS regressions. Therefore, we cannot simply increase the frequency of garbage collection on every training job. It’s best to avoid creating reference cycles in the first place. More on this in the Reference Cycle Detector section below.

    + +

    Sneaky Memory Leak in Callback

    + +

    Real examples are more complicated, so let’s look at a more realistic example that has a similar behavior. In this snapshot, we can observe the same behavior of tensors being accumulated and freed during automatic garbage collection, until we hit a CUDA OOM.

    + +

    memory leak

    + +

    Code Snippet behind this snapshot (full code sample in Appendix A):

    + +
        class AwaitableTensor:
    +      def __init__(self, tensor_size):
    +        self._tensor_size = tensor_size
    +        self._tensor = None
    +
    +      def wait(self):
    +        self._tensor = torch.zeros(self._tensor_size, device="cuda:0")
    +        return self._tensor
    +
    +    class AwaitableTensorWithViewCallback:
    +      def __init__(self, tensor_awaitable, view_dim):
    +        self._tensor_awaitable = tensor_awaitable
    +        self._view_dim = view_dim
    +        # Add a view filter callback to the tensor.
    +        self._callback = lambda ret: ret.view(-1, self._view_dim)
    +
    +      def wait(self):
    +        return self._callback(self._tensor_awaitable.wait())
    +
    +    async def awaitable_leak(
    +      tensor_size=2**27, num_iter=100000,
    +    ):
    +      for _ in range(num_iter):
    +        A = AwaitableTensor(tensor_size)
+        AwaitableTensorWithViewCallback(A, 4).wait()
    +
    + +

In this code, we define two classes. The class AwaitableTensor will create a tensor when waited upon. Another class, AwaitableTensorWithViewCallback, will apply a view filter on the AwaitableTensor via a callback lambda.

    + +

    When running awaitable_leak, which creates tensor A (512 MB) and applies a view filter for 100,000 iterations, we expect that A should be reclaimed each time it goes out of scope because the reference count should reach 0. However, this will actually OOM!

    + +

    While we know there is a reference cycle here, it isn’t clear from the code where the cycle is created. To help with these situations, we have created a tool to locate and report these cycles.

    + +

    Reference Cycle Detector

    + +

    Introducing the Reference Cycle Detector, which helps us find reference cycles keeping GPU tensors alive. The API is fairly simple:

    + +
      +
    • During model initialization: +
        +
      • Import: from torch.utils.viz._cycles import warn_tensor_cycles
      • +
      • Start: warn_tensor_cycles()
      • +
      +
    • +
    + +
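For example, enabling the detector is just these two lines at the start of the training script (a minimal sketch; the rest of the setup is unchanged):

import torch
from torch.utils.viz._cycles import warn_tensor_cycles

# Enable once, before model initialization and the training loop. A warning with
# an object graph is emitted whenever the cycle collector frees a CUDA tensor.
warn_tensor_cycles()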

    The Reference Cycle Detector will issue warnings every time that the cycle collector runs and finds a CUDA tensor that gets freed. The warning provides an object graph showing how the reference cycle refers to the GPU tensor.

    + +

    object graph

    + +

For instance, in this object graph we can easily observe that there is a circular dependency on the outer circle of the graph, and the GPU tensor kept alive by it is highlighted in red.

    + +

    Most cycles are pretty easy to fix once they are discovered. For instance here we can remove the reference to self created by self._view_dim in the callback.

    + +

    code snippet

    + +

    We’ve spent some time fixing cycles in existing models using these tools. For example in TorchRec, we’ve found and removed a reference cycle in PR#1226.

    + +

    code snippet

    + +

    Once we’ve removed the reference cycles, the code will no longer issue a CUDA OOM nor show any memory leaks in their snapshots.

    + +

    What are the other benefits of using the Reference Cycle Detector?

    + +

    Removing these cycles will also directly lower the maximum GPU memory usage as well as make it less likely for memory to fragment because the allocator returns to the same state after each iteration.

    + +

    Where can I find these tools?

    + +

We hope that the Reference Cycle Detector will greatly improve your ability to find and remove memory leaks caused by reference cycles. The Reference Cycle Detector is available in the v2.1 release of PyTorch as an experimental feature. More information about the Reference Cycle Detector can be found in the PyTorch Memory docs here.

    + +

    Feedback

    + +

    We look forward to hearing from you about any enhancements, bugs or memory stories that our tools helped to solve! As always, please feel free to open new issues on PyTorch’s Github page.

    + +

    We are also open to contributions from the OSS community, feel free to tag Aaron Shi and Zachary DeVito in any Github PRs for reviews.

    + +

    Acknowledgements

    + +

    Really appreciate the content reviewers, Mark Saroufim, Gregory Chanan, and Adnan Aziz for reviewing this post and improving its readability.

    + +

    Appendix

    + +

    Appendix A - Code Sample

    + +

    This code snippet was used to generate the plots and examples shown. Here are the arguments to reproduce the sections:

    + +
      +
    • Introduction: python sample.py
    • +
    • Explicitly calling gc.collect(): python sample.py --gc_collect_interval=100
    • +
    • Sneaky Memory Leak in Callback: python sample.py --workload=awaitable
    • +
    • Ref Cycle Detector: python sample.py --workload=awaitable --warn_tensor_cycles
    • +
    + +

    sample.py:

    + +
    # (c) Meta Platforms, Inc. and affiliates. 
    +import argparse
    +import asyncio
    +import gc
    +import logging
    +import socket
    +from datetime import datetime, timedelta
    +
    +import torch
    +
    +logging.basicConfig(
    +   format="%(levelname)s:%(asctime)s %(message)s",
    +   level=logging.INFO,
    +   datefmt="%Y-%m-%d %H:%M:%S",
    +)
    +logger: logging.Logger = logging.getLogger(__name__)
    +logger.setLevel(level=logging.INFO)
    +
    +TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S"
    +
    +# Keep a max of 100,000 alloc/free events in the recorded history
    +# leading up to the snapshot.
    +MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000
    +
    +def start_record_memory_history() -> None:
    +   if not torch.cuda.is_available():
    +       logger.info("CUDA unavailable. Not recording memory history")
    +       return
    +
    +   logger.info("Starting snapshot record_memory_history")
    +   torch.cuda.memory._record_memory_history(
    +       max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
    +   )
    +
    +def stop_record_memory_history() -> None:
    +   if not torch.cuda.is_available():
    +       logger.info("CUDA unavailable. Not recording memory history")
    +       return
    +
    +   logger.info("Stopping snapshot record_memory_history")
    +   torch.cuda.memory._record_memory_history(enabled=None)
    +
    +def export_memory_snapshot() -> None:
    +   if not torch.cuda.is_available():
    +       logger.info("CUDA unavailable. Not exporting memory snapshot")
    +       return
    +
    +   # Prefix for file names.
    +   host_name = socket.gethostname()
    +   timestamp = datetime.now().strftime(TIME_FORMAT_STR)
    +   file_prefix = f"{host_name}_{timestamp}"
    +
    +   try:
    +       logger.info(f"Saving snapshot to local file: {file_prefix}.pickle")
    +       torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle")
    +   except Exception as e:
    +       logger.error(f"Failed to capture memory snapshot {e}")
    +       return
    +
    +# This function will leak tensors due to the reference cycles.
    +def simple_leak(tensor_size, gc_interval=None, num_iter=30000, device="cuda:0"):
    +    class Node:
    +        def __init__(self, T):
    +            self.tensor = T
    +            self.link = None
    +
    +    for i in range(num_iter):
    +        A = torch.zeros(tensor_size, device=device)
    +        B = torch.zeros(tensor_size, device=device)
    +        a, b = Node(A), Node(B)
    +        # A reference cycle will force refcounts to be non-zero, when
    +        # a and b go out of scope.
    +        a.link, b.link = b, a
    +        # Python will eventually gc a and b, but may OOM on the CUDA
    +        # device before that happens (since python runtime doesn't
    +        # know about CUDA memory usage).
    +
    +        # Since implicit gc is not called frequently enough due to
    +        # generational gc, adding an explicit gc is necessary as Python
    +        # runtime does not know about CUDA memory pressure.
    +        # https://en.wikipedia.org/wiki/Tracing_garbage_collection#Generational_GC_(ephemeral_GC)
    +        if gc_interval and i % int(gc_interval) == 0:
    +            gc.collect()
    +
    +async def awaitable_leak(
    +    tensor_size, gc_interval=None, num_iter=100000, device="cuda:0"
    +):
    +    class AwaitableTensor:
    +        def __init__(self, tensor_size, device) -> None:
    +            self._tensor_size = tensor_size
    +            self._device = device
    +            self._tensor = None
    +
    +        def wait(self) -> torch.Tensor:
    +            self._tensor = torch.zeros(self._tensor_size, device=self._device)
    +            return self._tensor
    +
    +    class AwaitableTensorWithViewCallBack:
    +        def __init__(
    +            self,
    +            tensor_awaitable: AwaitableTensor,
    +            view_dim: int,
    +        ) -> None:
    +            self._tensor_awaitable = tensor_awaitable
    +            self._view_dim = view_dim
    +            # Add a view filter callback to the tensor.
    +            self._callback = lambda ret: ret.view(-1, self._view_dim)
    +
    +        def wait(self) -> torch.Tensor:
    +            return self._callback(self._tensor_awaitable.wait())
    +
    +    for i in range(num_iter):
    +        # Create an awaitable tensor
    +        a_tensor = AwaitableTensor(tensor_size, device)
    +
    +        # Apply a view filter callback on the awaitable tensor.
    +        AwaitableTensorWithViewCallBack(a_tensor, 4).wait()
    +
    +        # a_tensor will go out of scope.
    +
    +        if gc_interval and i % int(gc_interval) == 0:
    +            gc.collect()
    +
    +if __name__ == "__main__":
    +    parser = argparse.ArgumentParser(description="A memory_leak binary instance")
    +    parser.add_argument(
    +        "--gc_collect_interval",
    +        default=None,
    +        help="Explicitly call GC every given interval. Default is off.",
    +    )
    +    parser.add_argument(
    +        "--workload",
    +        default="simple",
    +        help="Toggle which memory leak workload to run. Options are simple, awaitable.",
    +    )
    +    parser.add_argument(
    +        "--warn_tensor_cycles",
    +        action="store_true",
    +        default=False,
    +        help="Toggle whether to enable reference cycle detector.",
    +    )
    +    args = parser.parse_args()
    +
    +    if args.warn_tensor_cycles:
    +        from tempfile import NamedTemporaryFile
    +
    +        from torch.utils.viz._cycles import observe_tensor_cycles
    +
    +        logger.info("Enabling warning for Python reference cycles for CUDA Tensors.")
    +
    +        def write_and_log(html):
    +            with NamedTemporaryFile("w", suffix=".html", delete=False) as f:
    +                f.write(html)
    +                logger.warning(
    +                    "Reference cycle includes a CUDA Tensor see visualization of cycle %s",
    +                    f.name,
    +                )
    +
    +        observe_tensor_cycles(write_and_log)
    +    else:
    +        # Start recording memory snapshot history
    +        start_record_memory_history()
    +
    +    # Run the workload with a larger tensor size.
    +    # For smaller sizes, we will not CUDA OOM as gc will kick in often enough
    +    # to reclaim reference cycles before an OOM occurs.
    +    size = 2**26  # 256 MB
    +    try:
    +        if args.workload == "awaitable":
    +            size *= 2
    +            logger.info(f"Running tensor_size: {size*4/1024/1024} MB")
    +            asyncio.run(
    +                awaitable_leak(tensor_size=size, gc_interval=args.gc_collect_interval)
    +            )
    +        elif args.workload == "simple":
    +            logger.info(f"Running tensor_size: {size*4/1024/1024} MB")
    +            simple_leak(tensor_size=size, gc_interval=args.gc_collect_interval)
    +        else:
    +            raise Exception("Unknown workload.")
    +    except Exception:
    +        logger.exception(f"Failed to allocate {size*4/1024/1024} MB")
    +
    +    # Create the memory snapshot file
    +    export_memory_snapshot()
    +
    +    # Stop recording memory snapshot history
    +    stop_record_memory_history()
    +
    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu/index.html b/blog/understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu/index.html new file mode 100644 index 000000000000..5fcb321944ba --- /dev/null +++ b/blog/understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu/index.html @@ -0,0 +1,823 @@ + + + + + + + + + + + + + Understanding LazyTensor System Performance with PyTorch/XLA on Cloud TPU | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Vaibhav Singh + +

    +

    Introduction

    + +

    Ease of use, expressivity, and debuggability are among the core principles of PyTorch. One of the key drivers of this ease of use is that PyTorch execution is “eager” by default, i.e. op-by-op execution preserves the imperative nature of the program. However, eager execution does not offer compiler-based optimizations, for example those available when the computation can be expressed as a graph.

    + +

    LazyTensor [1], first introduced with PyTorch/XLA, helps combine these seemingly disparate approaches. While PyTorch eager execution is widely used, intuitive, and well understood, lazy execution is not as prevalent yet.

    + +

    In this post we will explore some of the basic concepts of the LazyTensor System with the goal of applying these concepts to understand and debug performance of LazyTensor based implementations in PyTorch. Although we will use PyTorch/XLA on Cloud TPU as the vehicle for exploring these concepts, we hope that these ideas will be useful to understand other system(s) built on LazyTensors.

    + +

    LazyTensor

    + +

    Any operation performed on a PyTorch tensor is by default dispatched as a kernel or a composition of kernels to the underlying hardware. These kernels are executed asynchronously on the underlying hardware; program execution is only blocked when the value of a tensor is fetched. This approach scales extremely well with massively parallel hardware such as GPUs.

    + +

    The starting point of a LazyTensor system is a custom tensor type. In PyTorch/XLA, this type is called XLA tensor. In contrast to PyTorch’s native tensor type, operations performed on XLA tensors are recorded into an IR graph. Let’s examine an example that sums the product of two tensors:

    + +
    import torch
    +import torch_xla
    +import torch_xla.core.xla_model as xm
    +
    +dev = xm.xla_device()
    +
    +x1 = torch.rand((3, 3)).to(dev)
    +x2 = torch.rand((3, 8)).to(dev)
    +
    +y1 = torch.einsum('bs,st->bt', x1, x2)
    +print(torch_xla._XLAC._get_xla_tensors_text([y1]))
    +
    + +

    You can execute this colab notebook to examine the resulting graph for y1. Notice that no computation has been performed yet.

    + +
    y1 = y1 + x2
    +print(torch_xla._XLAC._get_xla_tensors_text([y1]))
    +
    + +

    The operations will continue to be recorded until PyTorch/XLA encounters a barrier. This barrier can either be a mark_step() API call or any other event which forces the execution of the graph recorded so far.

    + +
    xm.mark_step()
    +print(torch_xla._XLAC._get_xla_tensors_text([y1]))
    +
    + +

    Once mark_step() is called, the graph is compiled and then executed on the TPU, i.e. the tensors have been materialized. Therefore, the graph is now reduced to a single y1 tensor which holds the result of the computation.

    + +

    Compile Once, Execute Often

    + +

    XLA compilation passes offer optimizations (e.g. op-fusion, which reduces HBM pressure by using scratch-pad memory for multiple ops, ref ) and leverages lower level XLA infrastructure to optimally use the underlying hardware. However, there is one caveat, compilation passes are expensive, i.e. can add to the training step time. Therefore, this approach scales well if and only if we can compile once and execute often (compilation cache helps, such that the same graph is not compiled more than once).

    + +

    In the following example, we create a small computation graph and time the execution:

    + +
    y1 = torch.rand((3, 8)).to(dev)
    +def dummy_step() :
    +  y1 = torch.einsum('bs,st->bt', y1, x)
    +  xm.mark_step()
    +  return y1
    +
    + +
    %timeit dummy_step
    +
    + +
    The slowest run took 29.74 times longer than the fastest. This could mean that an intermediate result is being cached.
    +10000000 loops, best of 5: 34.2 ns per loop
    +
    + +

    Notice that the slowest run is considerably longer than the fastest. This is because of the graph compilation overhead, which is incurred only once for a given graph shape, input shape, and output shape. Subsequent steps are faster because no graph compilation is necessary.

    + +

    This also implies that we expect to see performance cliffs when the “compile once and execute often” assumption breaks. Understanding when this assumption breaks is the key to understanding and optimizing the performance of a LazyTensor system. Let’s examine what triggers the compilation.

    + +

    Graph Compilation and Execution and LazyTensor Barrier

    + +

    We saw that the computation graph is compiled and executed when a LazyTensor barrier is encountered. There are three scenarios in which the LazyTensor barrier is automatically or manually introduced. The first is an explicit call to the mark_step() API, as shown in the preceding example. mark_step() is also called implicitly at every step when you wrap your dataloader with MpDeviceLoader (highly recommended to overlap compute and data upload to the TPU device). The optimizer_step method of xla_model can also call mark_step implicitly (when you set barrier=True).
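    The two implicit forms look roughly like the sketch below. The module paths follow the PyTorch/XLA API, while the model, optimizer, loss_fn, and data_loader are assumed to already exist.

    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl

    device = xm.xla_device()
    train_loader = pl.MpDeviceLoader(data_loader, device)  # inserts mark_step() each step

    for data, target in train_loader:
        optimizer.zero_grad()
        loss = loss_fn(model(data), target)
        loss.backward()
        # barrier=True makes optimizer_step call mark_step; it is not strictly needed
        # when MpDeviceLoader is used, and is shown here only for illustration.
        xm.optimizer_step(optimizer, barrier=True)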

    + +

    The second scenario where a barrier is introduced is when PyTorch/XLA finds an op with no mapping (lowering) to equivalent XLA HLO ops. PyTorch has 2000+ operations. Although most of these operations are composite (i.e. can be expressed in terms of other fundamental operations), some of these operations do not have corresponding lowering in XLA.

    + +

    + +

    + +

    What happens when an op with no XLA lowering is used? PyTorch/XLA stops the operation recording and cuts the graph(s) leading to the input(s) of the unlowered op. This cut graph is then compiled and dispatched for execution. The results (materialized tensors) of execution are sent back from device to host, the unlowered op is then executed on the host (CPU), and downstream LazyTensor operations then create new graph(s) until a barrier is encountered again.

    + +

    The third and final scenario which results in a LazyTensor barrier is when there is a control structure/statement or another method which requires the value of a tensor. This statement would at a minimum cause the execution of the computation graph leading to the tensor (if the graph has already been seen), or cause both compilation and execution of it.

    + +

    Other examples of such methods include .item(), isEqual(). In general, any operation that maps Tensor -> Scalar will cause this behavior.
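    For example, a sketch reusing the y1 tensor from earlier: the call to .item() maps a tensor to a Python scalar, so it forces compilation and execution of the pending graph before the comparison can run.

    norm = (y1 * y1).sum()
    if norm.item() > 10.0:   # Tensor -> Scalar: acts as a LazyTensor barrier
        print("norm is large")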

    + +

    Dynamic Graph

    + +

    As illustrated in the preceding section, graph compilation cost is amortized if the same graph shape is executed many times. This is because the compiled graph is cached with a hash derived from the graph shape, input shape, and output shape. If these shapes change, compilation is triggered again, and too frequent compilation will result in training time degradation.

    + +

    Let’s consider the following example:

    + +
    def dummy_step(x, y, loss, acc=False):
    +  z = torch.einsum('bs,st->bt', y, x)
    +  step_loss = z.sum().view(1,)
    +  if acc:
    +    loss = torch.cat((loss, step_loss))
    +  else:
    +    loss = step_loss
    +  xm.mark_step()
    +  return loss
    +
    +
    +import time
    +def measure_time(acc=False):
    +  exec_times = []
    +  iter_count = 100
    +  x = torch.rand((512, 8)).to(dev)
    +  y = torch.rand((512, 512)).to(dev)
    +  loss = torch.zeros(1).to(dev)
    +  for i in range(iter_count):
    +    tic = time.time()
    +    loss = dummy_step(x, y, loss, acc=acc)
    +    toc = time.time()
    +    exec_times.append(toc - tic)
    +  return exec_times
    +
    +dyn = measure_time(acc=True) # acc= True Results in dynamic graph
    +st = measure_time(acc=False) # Static graph, computation shape, inputs and output shapes don't change
    +
    +import matplotlib.pyplot as plt
    +plt.plot(st, label = 'static graph')
    +plt.plot(dyn, label = 'dynamic graph')
    +plt.legend()
    +plt.title('Execution time in seconds')
    +
    + +

    + +

    + +

    Note that the static and dynamic cases perform the same computation, but the dynamic graph compiles every time, leading to a higher overall run-time. In practice, a training step with recompilation can sometimes be an order of magnitude slower or more. In the next section we discuss some of the PyTorch/XLA tools to debug training degradation.

    + +

    Profiling Training Performance with PyTorch/XLA

    + +

    PyTorch/XLA profiling consists of two major components. The first is client-side profiling. This feature is turned on by simply setting the environment variable PT_XLA_DEBUG to 1. Client-side profiling points to unlowered ops or device-to-host transfers in your source code. Client-side profiling also reports if there are too frequent compilations happening during training. You can explore some metrics and counters provided by PyTorch/XLA in conjunction with the profiler in this notebook.
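    For example, the environment variable can be set in the shell when launching the script, or from Python before torch_xla starts executing (a minimal sketch; train.py is a placeholder name):

    # Shell: PT_XLA_DEBUG=1 python train.py
    # Or from Python, before torch_xla is imported:
    import os
    os.environ["PT_XLA_DEBUG"] = "1"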

    + +

    The second component offered by PyTorch/XLA profiler is the inline trace annotation. For example:

    + +
    import torch_xla.debug.profiler as xp
    +
    +def train_imagenet():
    +  print('==> Preparing data..')
    +  img_dim = get_model_property('img_dim')
    +  ....
    +  server = xp.start_server(3294)
    +  def train_loop_fn(loader, epoch):
    +    ....
    +    model.train()
    +    for step, (data, target) in enumerate(loader):
    +      with xp.StepTrace('Train_Step', step_num=step):
    +        ....
    +        if FLAGS.amp:
    +        ....
    +        else:
    +          with xp.Trace('build_graph'):
    +            output = model(data)
    +            loss = loss_fn(output, target)
    +            loss.backward()
    +          xm.optimizer_step(optimizer)
    +
    + +

    Notice the start_server API call. The port number that you have used here is the same port number you will use with the TensorBoard profiler in order to view the op trace, similar to:

    + +

    + +

    + +

    The op trace, along with the client-side debugging features, is a powerful set of tools to debug and optimize your training performance with PyTorch/XLA. For more detailed instructions on the profiler usage, the reader is encouraged to explore blogs part-1, part-2, and part-3 of the blog series on PyTorch/XLA performance debugging.

    + +

    Summary

    + +

    In this article we have reviewed the fundamentals of the LazyTensor system. We built on those fundamentals with PyTorch/XLA to understand the potential causes of training performance degradation. We discussed why “compile once and execute often” helps to get the best performance on LazyTensor systems, and why training slows down when this assumption breaks.

    + +

    We hope that PyTorch users will find these insights helpful for their novel works with LazyTensor systems.

    + +

    Acknowledgements

    + +

    A big thank you to my outstanding colleagues Jack Cao, Milad Mohammedi, Karl Weinmeister, Rajesh Thallam, Jordan Tottan (Google) and Geeta Chauhan (Meta) for their meticulous reviews and feedback. And thanks to the extended PyTorch/XLA development team from Google, Meta, and the open source community to make PyTorch possible on TPUs. And finally, thanks to the authors of the LazyTensor paper not only for developing LazyTensor but also for writing such an accessible paper.

    + +

    References

    + +

    [1] LazyTensor: combining eager execution with domain-specific compilers

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/unleashing-ai-mobile/index.html b/blog/unleashing-ai-mobile/index.html new file mode 100644 index 000000000000..f868f4a19cbd --- /dev/null +++ b/blog/unleashing-ai-mobile/index.html @@ -0,0 +1,754 @@ + + + + + + + + + + + + + Unleashing the Power of AI on Mobile: LLM Inference for Llama 3.2 Quantized Models with ExecuTorch and KleidiAI | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + Gian Marco Iodice, Arm and Digant Desai, Meta + +

    +

    Introduction

    + +

    At the recent PyTorch Conference, Arm highlighted the widespread impact of its technology, spanning from cloud to edge, emphasizing its commitment to delivering its advanced AI computing capabilities seamlessly to millions of developers worldwide.

    + +

    key stats

    + +

    During the presentation, it was emphasized that Arm bears the immense responsibility of equipping 20+ million developers and billions of users with advanced AI computing features without friction. Achieving this requires crucial software collaborations across a vast ecosystem of software and hardware partners.

    + +

    Just a few months ago, Arm launched Arm Kleidi, developer enablement technologies and resources to drive technical collaboration and innovation across the ML stack. This includes the KleidiAI software library providing optimized software routines, which when integrated into key frameworks such as XNNPACK enable automatic AI acceleration for developers on Arm Cortex-A CPUs.

    + +

    Today, we’re excited to announce a new milestone for the AI open-source community that brings Arm even closer to realizing this vision: the integration of KleidiAI into ExecuTorch via XNNPACK, boosting AI workload performance on Arm mobile CPUs!

    + +

    Thanks to the collaborative efforts of the engineering teams at Arm and Meta, AI developers can now deploy quantized Llama models which run up to 20% faster on Arm Cortex-A v9 CPUs with the i8mm ISA extension.

    + +

    And there’s more exciting news - the ExecuTorch team has officially launched the Beta release!

    + +

    This marks an important milestone in our partnership. In this blog, we are eager to share more details about ExecuTorch capabilities, the new Meta Llama 3.2 models, the integer 4-bit with per-block quantization, and the impressive performance recorded on certain Arm CPUs. Notably, we have achieved speeds of over 350 tokens per second in the prefill stage with the quantized Llama 3.2 1B model on a Samsung S24+ device, as shown in the following screenshots.

    + +

    mobile app screenshots

    + +

    Now, let’s dive into the key components that enabled the demo creation presented in the preceding images. First up: new Llama 3.2 models!

    + +

    Meta Llama 3.2

    + +

    Meta recently announced the first lightweight quantized Llama models, which are designed to run on popular mobile devices. Meta used two techniques for quantizing Llama 3.2 1B and 3B models: Quantization-Aware Training (QAT) with LoRA adaptors (QLoRA), and SpinQuant, a state-of-the-art post-training quantization method. The quantized models were evaluated using PyTorch’s ExecuTorch framework as the inference engine, with the Arm CPU as a backend.

    + +

    These instruction-tuned models retain the quality and safety of the original 1B and 3B models while achieving a 2-4x speedup and reducing model size by 56% on average and memory footprint by 41% on average compared to the original BF16 format.

    + +

    In this blog post, we will demonstrate the performance improvements we observed in our experiments.

    + +

    ExecuTorch

    + +

    ExecuTorch is a PyTorch-native framework specifically designed for deploying AI models on-device, enhancing privacy and reducing latency. It supports the deployment of cutting-edge open-source AI models, including the Llama family of models and vision and speech models like Segment Anything and Seamless.

    + +

    This unlocks new possibilities for edge devices such as mobile phones, smart glasses, VR headsets, and smart home cameras. Traditionally, deploying PyTorch-trained AI models to resource-limited edge devices has been challenging and time-consuming, often requiring conversion to other formats which could lead to errors and suboptimal performance. The varied toolchains across the hardware and edge ecosystem have also degraded the developer experience, making a universal solution impractical.

    + +

    ExecuTorch addresses these issues by providing composable components that include core runtime, operator library, and delegation interface that allows for portability as well extensibility. Models can be exported using torch.export(), producing a graph that is natively compatible with the ExecuTorch runtime, capable of running on most edge devices with CPUs, and extendable to specialized hardware like GPUs and NPUs for enhanced performance.
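    As a rough sketch of that export flow (the API names follow the ExecuTorch documentation; the tiny module and output file name are placeholders, not a production recipe):

    import torch
    from executorch.exir import to_edge

    class TinyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(4, 2)

        def forward(self, x):
            return torch.relu(self.linear(x))

    model = TinyModel().eval()
    example_inputs = (torch.randn(1, 4),)

    exported = torch.export.export(model, example_inputs)   # standard torch.export graph
    edge = to_edge(exported)                                 # lower to the Edge dialect
    et_program = edge.to_executorch()                        # ready for the ExecuTorch runtime

    with open("tiny_model.pte", "wb") as f:
        f.write(et_program.buffer)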

    + +

    Working with Arm, ExecuTorch now leverages the optimized low-bit matrix multiplication kernels from the Arm KleidiAI library to improve on-device Large Language Model (LLM) inference performance via XNNPACK. We also thank the XNNPACK team at Google for supporting this effort.

    + +

    In this post, we will focus on this integration, now available in ExecuTorch.

    + +

    Evolving the architecture for AI workloads

    + +

    At Arm, we have been deeply committed to investing in open-source projects and advancing new technologies in our processors since the early days of the deep learning wave, focusing on making AI workloads high-performing and more power-efficient.

    + +

    For instance, Arm introduced the SDOT instruction, starting with the Armv8.2-A architecture, to accelerate dot product arithmetic between 8-bit integer vectors. This feature, now widely available in mobile devices, significantly speeds up the computation of quantized 8-bit models. After the SDOT instruction, Arm introduced the BF16 data type and the MMLA instruction to further enhance the floating-point and integer matrix multiplication performance on CPUs and, most recently, announced the Scalable Matrix Extension (SME), marking a significant leap forward in machine learning capabilities.

    + +

    The following image shows a few examples of Arm CPU’s continuous innovations in the AI space over the last decade:

    + +

    line chart

    + +

    Given the widespread use of Arm CPUs, AI frameworks need to take full advantage of these technologies in key operators to maximize performance. Recognizing this, we saw the need for an open-source library to share these optimized software routines. However, we were mindful of the challenges in integrating a new library into AI frameworks, such as concerns about library size, dependencies, and documentation and the need to avoid adding extra burdens for developers. So, we took extra steps to gather feedback from our partners and ensure a smooth integration process that does not require additional dependencies for AI developers. This effort led to KleidiAI, an open-source library that provides optimized performance-critical routines for artificial intelligence (AI) workloads tailored for Arm CPUs. You can learn more about KleidiAI here.

    + +

    Working with the ExecuTorch team at Meta, Arm provided the software optimizations for their novel 4-bit with per-block quantization schema, which is used to accelerate the matrix multiplication kernel in the Transformer layer’s torch.nn.linear operator for Llama 3.2 quantized models. This flexible 4-bit quantization schema from ExecuTorch strikes a balance between model accuracy and low-bit matrix multiplication performance targeting on-device LLMs.

    + +

    The integer 4-bit with per-block quantization

    + +

    In KleidiAI, we introduced micro-kernels optimized for this new 4-bit integer quantization scheme (matmul_clamp_f32_qai8dxp_qsi4c32p).

    + +

    As shown in the following image, this 4-bit quantization uses a per-block strategy for weight (RHS matrix) quantization and an 8-bit per-row quantization for activations (LHS matrix):

    + +

    arch diagram

    + +

    As you can see in the preceding image, each output feature map (OFM) in the weight matrix is divided into equally sized blocks (group size), with each block having a scale factor stored in BF16 format. BF16 is advantageous because it maintains the dynamic range of 32-bit floating-point (FP32) format with half the bit size, and it’s easy to convert to and from FP32 using a simple shift operation. This makes BF16 ideal for saving model space, preserving accuracy, and ensuring backward compatibility with devices that lack BF16 hardware acceleration. You can learn more about the BF16 format in this Arm Community blog post.
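    To make the scheme concrete, here is an illustrative sketch of per-block weight quantization with BF16 scales. The group size, the symmetric int4 range, and the rounding choice are assumptions for illustration, not the exact KleidiAI/ExecuTorch implementation.

    import torch

    def quantize_weights_per_block(w: torch.Tensor, group_size: int = 32):
        # w: [out_features, in_features]; assumes in_features is divisible by group_size.
        blocks = w.reshape(w.shape[0], -1, group_size)
        scales = (blocks.abs().amax(dim=-1, keepdim=True) / 7.0).clamp(min=1e-8)
        q = torch.clamp(torch.round(blocks / scales), -8, 7).to(torch.int8)  # int4 range [-8, 7]
        return q, scales.squeeze(-1).to(torch.bfloat16)                       # one BF16 scale per block

    def dequantize_weights(q: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
        return (q.float() * scales.unsqueeze(-1).float()).reshape(q.shape[0], -1)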

    + +

    For completeness, this 4-bit quantization scheme and our implementation in KleidiAI allow users to configure group size for the linear weights (RHS), allowing them to trade-off between model size, model accuracy, and model performance if the model is quantized by the user.

    + +

    At this point, we are ready to unveil the incredible performance recorded on Arm CPUs with ExecuTorch when running Llama 3.2 1B and Llama 3.2 3B. Let’s first go over metrics we will use to evaluate the performance of LLM inference.

    + +

    Metrics for LLM Inference

    + +

    Typically, performance metrics used to evaluate LLM performance during inference include:

    + +
      +
    • Time To First Token (TTFT): This measures the time it takes to produce the first output token after a prompt is provided by the user. This latency or response time is important for a good user experience, especially on a phone. TTFT is also a function of the length of the prompt or prompt tokens. To make this metric independent of the prompt length, we use Prefill tokens/second as a proxy here. The relationship between these is inverse: lower TTFT corresponds to higher Prefill tokens/second.
    • +
    • Decode Performance: This is the average number of output tokens generated per second, thus reported in Tokens/Second. It is independent of the total number of tokens generated. For on-device inference, it is important to keep this higher than a user’s average reading speed.
    • +
    • Peak Runtime Memory: This metric reflects the amount of RAM, typically reported in MegaBytes (MiB), needed to run the model with expected performance measured using the metrics above. Given the limited amount of RAM available on Android and iOS devices, this is one of the key metrics for on-device LLM deployment. It dictates the type of models that can be deployed on a device.
    • +
    + +

    Results

    + +

    The quantized Llama 3.2 1B models, both SpinQuant and QLoRA, are designed to run efficiently on a wide range of phones with limited RAM. In this section, we demonstrate that the quantized Llama 3.2 1B models can achieve over 350 tokens per second in the prefill phase and over 40 tokens per second in the decode stage. This level of performance is sufficient to enable on-device text summarization with a reasonable user experience using only Arm CPUs. To put this into perspective, on average, 50 unread messages contain about 600 tokens. With this performance, the response time (the time it takes for the first generated word to appear on the screen) is approximately two seconds.
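    The back-of-the-envelope arithmetic behind that estimate:

    prefill_tokens_per_second = 350      # measured prefill rate quoted above
    prompt_tokens = 600                  # roughly 50 unread messages

    time_to_first_token = prompt_tokens / prefill_tokens_per_second
    print(f"{time_to_first_token:.1f} s")   # ~1.7 s, i.e. about two seconds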

    + +

    We present measurements from a Samsung S24+ running vanilla Android. We used Llama 3.2 1B parameter models for these experiments. Although we only demonstrate using 1B models, similar performance gains can be expected for the 3B parameter models. The experiment setup involves a single warmup run, a sequence length of 128, a prompt length of 64, using 6 out of the 8 available CPUs, and measuring results over adb.

    + +

    Using the ExecuTorch main branch from GitHub, we first generated the ExecuTorch PTE binary files for each model using the published checkpoints. Then, using the same repository, we generated the ExecuTorch runtime binary for Armv8. In the rest of the section, we will compare the performance of different quantized 1B models against the BF16 model using the binary built with KleidiAI. We will also compare the performance gains for quantized models between the binary with KleidiAI and the one without KleidiAI to distill the impact from KleidiAI.

    + +

    Quantized Model Performance

    + +

    Llama 3.2 quantized models both SpinQuant and QLoRA perform significantly better on prompt prefill and text generation (decode) compared to the baseline BF16. We observed a >2x improvement in decode and a >5x improvement in prefill performance.

    + +

    Furthermore, the quantized model size (the PTE file size in bytes) is less than half that of the BF16 model: 1.1 GiB vs. 2.3 GiB. Although int4 is a quarter the size of BF16, some layers in the model are quantized with int8, making the PTE file size ratio larger. We observed a runtime peak memory footprint reduction of almost 40%, from 3.1 GiB for the BF16 model to 1.9 GiB for the SpinQuant model, measured in Resident Set Size (RSS) for a maximum sequence length of 2048.

    + +

    With all-around improvements, the new quantized Llama 3.2 models are ideal for on-device deployment targeting Arm CPUs. For more information on accuracy, check out the Meta Llama 3.2 blog.

    + +

    bar graph

    + +

    KleidiAI Impact

    + +

    ExecuTorch relies on the Arm KleidiAI library to provide low-bit performant matrix multiplication kernels for the latest Arm CPUs with advanced Armv8/9 ISA features. These kernels are utilized for on-device quantized Llama 3.2 model inference in ExecuTorch. As depicted in the graph below, ExecuTorch achieves an average of >20% better prefill performance on S24+ with KleidiAI compared to non-KleidiAI kernels, while maintaining the same accuracy. This performance advantage is not limited to specific models or devices, and is expected to benefit all ExecuTorch models using low-bit quantized matrix multiplication on Arm CPUs.

    + +

    To assess the impact of Kleidi, we generated two ExecuTorch runtime binaries targeting Arm Cortex-A CPUs and compared their performance.

    + +
      +
    1. The first ExecuTorch runtime binary built with the Arm KleidiAI library through the XNNPACK library.
    2. +
    3. The second binary was built without the Arm KleidiAI repository, using native kernels from the XNNPACK library.
    4. +
    + +

    bar chart

    + +

    Try it yourself!

    + +

    Ready to experience the performance improvements firsthand? To try out ExecuTorch with the KleidiAI optimizations in your own projects, here is a link to the learning path from Arm to start developing your own LLM application using ExecuTorch and KleidiAI.

    + +

    We look forward to hearing your feedback!

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/unlocking-pt-2-6-intel/index.html b/blog/unlocking-pt-2-6-intel/index.html new file mode 100644 index 000000000000..b727a7fbb6fa --- /dev/null +++ b/blog/unlocking-pt-2-6-intel/index.html @@ -0,0 +1,717 @@ + + + + + + + + + + + + + Unlocking the Latest Features in PyTorch 2.6 for Intel Platforms | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + the Intel PyTorch Team + +

    +

    PyTorch* 2.6 has just been released with a set of exciting new features including torch.compile compatibility with Python 3.13, new security and performance enhancements, and a change in the default parameter for torch.load. PyTorch also announced the deprecation of its official Anaconda channel.

    + +

    Among the performance features are three that enhance developer productivity on Intel platforms:

    + +
      +
    1. Improved Intel GPU availability
    2. +
    3. FlexAttention optimization on x86 CPU for LLM
    4. +
    5. FP16 on x86 CPU support for eager and Inductor modes
    6. +
    + +

    Improved Intel GPU Availability

    + +

    To provide developers working in artificial intelligence (AI) with better support for Intel GPUs, the PyTorch user experience on these GPUs has been enhanced. This improvement includes simplified installation steps, a Windows* release binary distribution, and expanded coverage of supported GPU models, including the latest Intel® Arc™ B-Series discrete graphics.

    + +

    These new features help promote accelerated machine learning workflows within the PyTorch ecosystem, providing a consistent developer experience and support. Application developers and researchers seeking to fine-tune, perform inference, and develop with PyTorch models on Intel® Core™ Ultra AI PCs  and Intel® Arc™ discrete graphics will now be able to install PyTorch directly with binary releases for Windows, Linux*, and Windows Subsystem for Linux 2.

    + +

    The new features include:

    + +
      +
    • Simplified Intel GPU software stack setup to enable one-click installation of the torch-xpu PIP wheels to run deep learning workloads in a ready-to-use fashion, thus eliminating the complexity of installing and activating Intel GPU development software bundles. 
    • +
    • Windows binary releases for torch core, torchvision and torchaudio have been made available for Intel GPUs, expanding from Intel® Core™ Ultra Series 2 with Intel® Arc™ Graphics and Intel® Arc™ A-Series graphics to the latest GPU hardware Intel® Arc™ B-Series graphics support. 
    • +
    • Further enhanced coverage of Aten operators on Intel GPUs with SYCL* kernels for smooth eager mode execution, as well as bug fixes and performance optimizations for torch.compile on Intel GPUs. 
    • +
    + +

    Get a tour of new environment setup, PIP wheels installation, and examples on Intel® Client GPUs and Intel® Data Center GPU Max Series in the Getting Started Guide.
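    Once the wheels are installed, a quick smoke test from Python looks like the following sketch (it assumes a supported Intel GPU and an XPU-enabled PyTorch 2.6 build):

    import torch

    if torch.xpu.is_available():
        x = torch.randn(1024, 1024, device="xpu")
        y = (x @ x).sum()
        print("XPU result:", y.item())
    else:
        print("No Intel GPU (XPU) device detected.")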

    + +

    FlexAttention Optimization on X86 CPU for LLM

    + +

    FlexAttention was first introduced in PyTorch 2.5, to address the need to support various Attentions or even combinations of them. This PyTorch API leverages torch.compile to generate a fused FlashAttention kernel, which eliminates extra memory allocation and achieves performance comparable to handwritten implementations.

    + +

    Previously, FlexAttention was implemented for CUDA* devices based on the Triton backend. Since PyTorch 2.6, X86 CPU support of FlexAttention was added through TorchInductor CPP backend. This new feature leverages and extends current CPP template abilities to support broad attention variants (e.g., PageAttention, which is critical for LLMs inference) based on the existing FlexAttention API, and brings optimized performance on x86 CPUs. With this feature, user can easily use FlexAttention API to compose their Attention solutions on CPU platforms and achieve good performance.
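    For example, here is a minimal sketch of composing a causal attention variant with the FlexAttention API on CPU; the tensor shapes and the causal score_mod are illustrative only.

    import torch
    from torch.nn.attention.flex_attention import flex_attention

    def causal(score, b, h, q_idx, kv_idx):
        # Mask out future positions by sending their scores to -inf.
        return torch.where(q_idx >= kv_idx, score, -float("inf"))

    q = torch.randn(1, 8, 128, 64)   # [batch, heads, seq_len, head_dim] on CPU
    k = torch.randn(1, 8, 128, 64)
    v = torch.randn(1, 8, 128, 64)

    compiled_attention = torch.compile(flex_attention)   # TorchInductor CPP backend on x86
    out = compiled_attention(q, k, v, score_mod=causal)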

    + +

    Typically, FlexAttention is utilized by popular LLM ecosystem projects, such as Hugging Face transformers and vLLM, in their LLM-related modeling (e.g., PagedAttention) to achieve better out-of-the-box performance. Before official adoption happens, this enabling PR in Hugging Face can help us demonstrate the performance benefits that FlexAttention can bring on x86 CPU platforms.

    + +

    The graph below shows the performance comparison of PyTorch 2.6 (with this feature) and PyTorch 2.5 (without this feature) on typical Llama models. For real-time mode (Batch Size = 1), there is about a 1.13x-1.42x performance improvement for next-token latency across different input token lengths. As for best throughput under a typical SLA (P99 token latency <= 50 ms), PyTorch 2.6 achieves more than 7.83x the performance of PyTorch 2.5, because PyTorch 2.6 can run 8 inputs (Batch Size = 8) together and still keep the SLA while PyTorch 2.5 can only run 1 input; the FlexAttention-based PagedAttention in PyTorch 2.6 provides more efficiency in multiple-batch-size scenarios.

    + +

    Figure 1. Performance comparison of PyTorch 2.6 and PyTorch 2.5 on Typical Llama Models

    + +

    Figure 1. Performance comparison of PyTorch 2.6 and PyTorch 2.5 on Typical Llama Models

    + +

    FP16 on X86 CPU Support for Eager and Inductor Modes

    + +

    Float16 is a commonly used reduced-precision floating-point type that improves performance in neural network inference and training. CPUs like the recently launched Intel® Xeon® 6 with P-Cores support the Float16 datatype with the native accelerator AMX, which greatly improves Float16 performance. Float16 support on x86 CPU was first introduced in PyTorch 2.5 as a prototype feature. Now it has been further improved for both eager mode and torch.compile + Inductor mode, and has been promoted to Beta level for broader adoption. This helps deployment on the CPU side without the need to modify the model weights when the model is pre-trained with mixed precision of Float16/Float32. On platforms that support AMX Float16 (i.e., the Intel® Xeon® 6 processors with P-cores), Float16 has the same pass rate as Bfloat16 across the typical PyTorch benchmark suites: TorchBench, Hugging Face, and TIMM. It also shows performance comparable to the 16-bit Bfloat16 datatype.
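    A minimal sketch of exercising Float16 on CPU in both modes follows; the toy model is illustrative, and meaningful speedups assume hardware with AMX FP16 such as Intel® Xeon® 6 with P-cores.

    import torch

    model = torch.nn.Sequential(
        torch.nn.Linear(1024, 1024),
        torch.nn.ReLU(),
        torch.nn.Linear(1024, 1024),
    ).eval()
    x = torch.randn(8, 1024)

    with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.float16):
        eager_out = model(x)                      # eager mode
        compiled_model = torch.compile(model)     # torch.compile + Inductor mode
        inductor_out = compiled_model(x)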

    + +

    Summary

    + +

    In this blog, we discussed three features to enhance developer productivity on Intel platforms in PyTorch 2.6. These three features are designed to improve Intel GPU availability, optimize FlexAttention for x86 CPUs tailored for large language models (LLMs), and support FP16 on x86 CPUs in both eager and Inductor modes. Get PyTorch 2.6 and try them for yourself or you can access PyTorch 2.6 on the Intel® Tiber™ AI Cloud to take advantage of hosted notebooks that are optimized for Intel hardware and software.

    + +

    Acknowledgements

    + +

    The release of PyTorch 2.6 is an exciting milestone for Intel platforms, and it would not have been possible without the deep collaboration and contributions from the community. We extend our heartfelt thanks to Alban, Andrey, Bin, Jason, Jerry and Nikita for sharing their invaluable ideas, meticulously reviewing PRs, and providing insightful feedback on RFCs. Their dedication has driven continuous improvements and pushed the ecosystem forward for Intel platforms.

    + +

    References

    + + + +

    Product and Performance Information

    + +

    Measurement on AWS EC2 m7i.metal-48xl using: 2x Intel® Xeon® Platinum 8488C, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB [8], DSA [8], IAA[8], QAT[on CPU, 8], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4400 MT/s]), BIOS Amazon EC2 1.0, microcode 0x2b000603, 1x Elastic Network Adapter (ENA) 1x Amazon Elastic Block Store 800G, Ubuntu 24.04.1 LTS 6.8.0-1018-aws Test by Intel on Jan 15th 2025.

    + +

    Notices and Disclaimers

    + +

    Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

    + +

    Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

    + +

    AI disclaimer:

    + +

    AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at www.intel.com/AIPC. Results may vary.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/updates-improvements-to-pytorch-tutorials/index.html b/blog/updates-improvements-to-pytorch-tutorials/index.html new file mode 100644 index 000000000000..785d2c22f44c --- /dev/null +++ b/blog/updates-improvements-to-pytorch-tutorials/index.html @@ -0,0 +1,722 @@ + + + + + + + + + + + + + Updates & Improvements to PyTorch Tutorials | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    + +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    PyTorch.org provides researchers and developers with documentation, installation instructions, latest news, community projects, tutorials, and more. Today, we are introducing usability and content improvements including tutorials in additional categories, a new recipe format for quickly referencing common topics, sorting using tags, and an updated homepage.

    + +

    Let’s take a look at them in detail.

    + +

    TUTORIALS HOME PAGE UPDATE

    +

    The tutorials home page now provides clear actions that developers can take. For new PyTorch users, there is an easy-to-discover button to take them directly to “A 60 Minute Blitz”. Right next to it, there is a button to view all recipes which are designed to teach specific features quickly with examples.

    + +
    + +
    + +

    In addition to the existing left navigation bar, tutorials can now be quickly filtered by multi-select tags. Let’s say you want to view all tutorials related to “Production” and “Quantization”. You can select the “Production” and “Quantization” filters as shown in the image shown below:

    + +
    + +
    + +

    The following additional resources can also be found at the bottom of the Tutorials homepage:

    + + +

    PYTORCH RECIPES

    +

    Recipes are new bite-sized, actionable examples designed to teach researchers and developers how to use specific PyTorch features. Some notable new recipes include:

    + + +

    View the full recipes here.

    + +

    LEARNING PYTORCH

    +

    This section includes tutorials designed for users new to PyTorch. Based on community feedback, we have made updates to the current Deep Learning with PyTorch: A 60 Minute Blitz tutorial, one of our most popular tutorials for beginners. Upon completion, one can understand what PyTorch and neural networks are, and be able to build and train a simple image classification network. Updates include adding explanations to clarify output meanings and linking back to where users can read more in the docs, cleaning up confusing syntax errors, and reconstructing and explaining new concepts for easier readability.

    + +

    DEPLOYING MODELS IN PRODUCTION

    +

    This section includes tutorials for developers looking to take their PyTorch models to production. The tutorials include:

    + + +

    FRONTEND APIS

    +

    PyTorch provides a number of frontend API features that can help developers to code, debug, and validate their models more efficiently. This section includes tutorials that teach what these features are and how to use them. Some tutorials to highlight:

    + + +

    MODEL OPTIMIZATION

    +

    Deep learning models often consume large amounts of memory, power, and compute due to their complexity. This section provides tutorials for model optimization:

    + + +

    PARALLEL AND DISTRIBUTED TRAINING

    +

    PyTorch provides features that can accelerate performance in research and production such as native support for asynchronous execution of collective operations and peer-to-peer communication that is accessible from Python and C++. This section includes tutorials on parallel and distributed training:

    + + +

    Making these improvements are just the first step of improving PyTorch.org for the community. Please submit your suggestions here.

    + +

    Cheers,

    + +

    Team PyTorch

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/vllm-joins-pytorch/index.html b/blog/vllm-joins-pytorch/index.html new file mode 100644 index 000000000000..7c1324ff4398 --- /dev/null +++ b/blog/vllm-joins-pytorch/index.html @@ -0,0 +1,705 @@ + + + + + + + + + + + + + vLLM Joins PyTorch Ecosystem: Easy, Fast, and Cheap LLM Serving for Everyone | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + + +
    +
    +
    + +
    +

    + by + + vLLM Team + +

    +

    vllm logo

    + +

    We’re thrilled to announce that the vLLM project has become a PyTorch ecosystem project, and joined the PyTorch ecosystem family!

    + +

    For more information on what it means to be a PyTorch ecosystem project, see the PyTorch Ecosystem Tools page.

    + +

    Running large language models (LLMs) is both resource-intensive and complex, especially as these models scale to hundreds of billions of parameters. That’s where vLLM comes in — a high-throughput, memory-efficient inference and serving engine designed for LLMs.

    + +

    Originally built around the innovative PagedAttention algorithm, vLLM has grown into a comprehensive, state-of-the-art inference engine. A thriving community is also continuously adding new features and optimizations to vLLM, including pipeline parallelism, chunked prefill, speculative decoding, and disaggregated serving.

    + +

    Since its release, vLLM has garnered significant attention, achieving over 31,000 GitHub stars—a testament to its popularity and thriving community. This milestone marks an exciting chapter for vLLM as we continue to empower developers and researchers with cutting-edge tools for efficient and scalable AI deployment. Welcome to the next era of LLM inference!

    + +

    vLLM has always had a strong connection with the PyTorch project. It is deeply integrated into PyTorch, leveraging it as a unified interface to support a wide array of hardware backends. These include NVIDIA GPUs, AMD GPUs, Google Cloud TPUs, Intel GPUs, Intel CPUs, Intel Gaudi HPUs, and AWS Neuron, among others. This tight coupling with PyTorch ensures seamless compatibility and performance optimization across diverse hardware platforms.

    + +

    Did you know you can experience the power of vLLM right from your phone? During this year’s Amazon Prime Day, vLLM played a crucial role in delivering lightning-fast responses to millions of users. Across three regions, over 80,000 Trainium and Inferentia chips powered an average of 3 million tokens per minute, all while maintaining a P99 latency of less than 1 second for the first response. That means when customers opened the Amazon app and chatted with Rufus, they were seamlessly interacting with vLLM in action!

    + +

    vLLM also collaborates tightly with leading model vendors to ensure support for popular models. This includes tight integration with Meta LLAMA, Mistral, QWen, and DeepSeek models, plus many others. One particularly memorable milestone was the release of LLAMA 3.1 (405B). As the launching partner, vLLM was the first to enable running this very large model, showcasing vLLM’s capability to handle the most complex and resource-intensive language models.

    + +

    To install vLLM, simply run:

    + +
    pip install vllm
    +
    + +

    vLLM is designed for both researchers and production-grade serving.

    + +

    To run vLLM as an OpenAI API compatible server, just use the Huggingface model ID:

    + +
    vllm serve meta-llama/Llama-3.1-8B
    +
    + +
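    With that server running, any OpenAI-compatible client can query it. For example, a sketch using the openai Python package; the default local port 8000 and the placeholder API key are assumptions about your setup.

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    response = client.completions.create(
        model="meta-llama/Llama-3.1-8B",
        prompt="San Francisco is a",
        max_tokens=32,
    )
    print(response.choices[0].text)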

    To run vLLM as a simple function:

    + +
    from vllm import LLM, SamplingParams
    +
    +# Sample prompts.
    +prompts = [
    +   "Hello, my name is",
    +   "The president of the United States is",
    +   "The capital of France is",
    +   "The future of AI is",
    +]
    +# Create a sampling params object.
    +sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    +
    +# Create an LLM.
    +llm = LLM(model="meta-llama/Llama-3.1-8B")
    +# Generate texts from the prompts. The output is a list of RequestOutput objects
    +# that contain the prompt, generated text, and other information.
    +outputs = llm.generate(prompts, sampling_params)
    +# Print the outputs.
    +for output in outputs:
    +   prompt = output.prompt
    +   generated_text = output.outputs[0].text
    +   print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    +
    + +

    Open-source innovation is part of the vLLM’s DNA. Born out of a Berkeley academic project, it follows the legacy of other pioneering open-source initiatives such as BSD, which revolutionized operating systems in the 1980s. Other innovations from the same organization include Apache Spark and Ray, now the standard for big data and AI systems. In the Gen AI era, vLLM serves as a platform dedicated to democratizing AI inference.

    + +

    The vLLM team remains steadfast in its mission to keep the project “of the community, by the community, and for the community.” Collaboration and inclusivity lie at the heart of everything we do.

    + +

    If you have collaboration requests or inquiries, feel free to reach out at vllm-questions@lists.berkeley.edu. To join the active and growing vLLM community, explore our GitHub repository or connect with us on the vLLM Slack. Together, we can push the boundaries of AI innovation and make it accessible to all.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/warp-specialization/index.html b/blog/warp-specialization/index.html new file mode 100644 index 000000000000..412bb7e7a3c8 --- /dev/null +++ b/blog/warp-specialization/index.html @@ -0,0 +1,741 @@ + + + + + + + + + + + + + Enabling advanced GPU features in PyTorch - Warp Specialization | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + by + + Meta and NVIDIA + +

    +

    Meta: Hongtao Yu, Manman Ren, Bert Maher, Shane Nay
    +NVIDIA: Gustav Zhu, Shuhao Jiang

    + +

Over the past few months, we have been working on enabling advanced GPU features for PyTorch and Triton users through the Triton compiler. One of our key goals has been to introduce warp specialization support on NVIDIA Hopper GPUs. Today, we are thrilled to announce that our efforts have resulted in the rollout of fully automated Triton warp specialization, now available to users in the upcoming release of Triton 3.2, which will ship with PyTorch 2.6. PyTorch users can leverage this feature by implementing user-defined Triton kernels. This work builds on an initial implementation of warp specialization in Triton by NVIDIA, and we look forward to further development with the community in the future.

    + +

    Warp specialization (WS) is a GPU programming technique where warps (a group of 32 threads on NVIDIA GPUs) within a threadblock are assigned distinct roles or tasks. This approach optimizes performance by enabling efficient execution of workloads that require task differentiation or cooperative processing. It enhances kernel performance by leveraging an asynchronous execution model, where different parts of the kernel are managed by separate hardware units. Data communication between these units, facilitated via shared memory on the NVIDIA H100, is highly efficient. Compared to a uniform warp approach, warp specialization allows the hardware multitasking warp scheduler to operate more effectively, maximizing resource utilization and overall performance.

    + +

    Using GEMM as an example, a typical uniform warp approach on the H100 GPU involves 8 warps per thread block collectively computing a tile of the output tensor. These 8 warps are divided into two warp groups (WG), with each group cooperatively computing half of the tile using efficient warp-group-level MMA (WGMMA) instructions, as illustrated in Figure 1.

    + +


    + +

    Figure 1. GEMM K-loop Body with Uniform Warps

    + +

    The implementation is clean, easy to understand, and generally performs well, thanks to an elegant software pipeliner. The pipeliner’s purpose is to enhance instruction-level parallelism by executing non-dependent operations on different hardware units. For instance, load operations from the next loop iteration can be executed simultaneously with WGMMA operations in the current iteration. However, this approach relies heavily on the compiler to craft an instruction sequence that ensures load and WGMMA instructions are issued at precisely the right time. While this is relatively straightforward for GEMM, which involves a limited number of operations, it becomes significantly more challenging for more complex kernels, such as flash attention.

    + +

    On the other hand, warp specialization addresses programming challenges by separating operations intended to run simultaneously on different hardware units into distinct warps, synchronizing them efficiently using low-cost barriers in shared memory. This allows each warp to have its own instruction sequence, enabling instructions to be issued and executed continuously without being interrupted by other operations, thanks to the multi-way warp scheduler. An illustration of a warp-specialized GEMM can be seen in Figure 2.

    + +


    + +

    Figure 2. GEMM K-loop Body with Specialized Warps

    + +

    How to enable WS

    + +

    To enable warp specialization, users simply need to specify two autotune flags: num_consumer_groups and num_buffers_warp_spec. For example, a warp-specialized GEMM implementation might look as shown below. Users can enable warp specialization by setting a non-zero value for num_consumer_groups, which defines the number of consumer warp groups. There is no corresponding flag to set the number of producer warp groups, as currently only one producer is supported. The num_buffers_warp_spec flag specifies the number of buffers the producer warp group will use to communicate with the consumer warp groups. A working example of a warp-specialized kernel is provided in the persistent GEMM tutorial.

    + +
    @triton.autotune(
    +    configs=[
    +        triton.Config(
    +            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 256,
+                "BLOCK_K": 64,
    +            },
    +            num_stages=2,
    +            num_warps=4,
    +            num_consumer_groups=2,
    +            num_buffers_warp_spec=3,
    +        ),
    +    ],
    +    key=["M", "N", "K"],
    +)
    +@triton.jit
    +def matmul_persistent_ws_kernel(
    +   a_ptr, b_ptr, c_ptr, M, N, K,
    +   stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
    +   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
    +):
    +   pid = tl.program_id(axis=0)
    +   num_pid_m = tl.cdiv(M, BLOCK_M)
    +   num_pid_n = tl.cdiv(N, BLOCK_N)
+   pid_m = pid // num_pid_n
    +   pid_n = pid % num_pid_n
    +   offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    +   offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    +   offs_k = tl.arange(0, BLOCK_K)
    +   a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    +   b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
    +   acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    +   for k in range(0, tl.cdiv(K, BLOCK_K)):
    +       a = tl.load(a_ptrs)
    +       b = tl.load(b_ptrs)
    +       acc += tl.dot(a, b)
    +       a_ptrs += BLOCK_K * stride_ak
    +       b_ptrs += BLOCK_K * stride_bk
    +   c = acc.to(tl.float16)
    +   c_ptrs = c_ptr + stride_cm * offs_m[:, None] + stride_cn * offs_n[None, :]
    +   tl.store(c_ptrs, c)
    +
    + +
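For completeness, here is a minimal host-side launch sketch (the wrapper name and the divisibility assumption are ours, not from the original example). It launches one program per output tile and relies on the autotuner to supply the block sizes declared in the config above:

import torch
import triton

def matmul_ws(a, b):
    # a: (M, K) and b: (K, N) float16 CUDA tensors. The kernel above does no
    # boundary masking, so M, N, K are assumed to be multiples of the block sizes.
    M, K = a.shape
    _, N = b.shape
    c = torch.empty((M, N), device=a.device, dtype=torch.float16)
    # One program per output tile; BLOCK_M/BLOCK_N come from the autotuned config.
    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
    matmul_persistent_ws_kernel[grid](
        a, b, c, M, N, K,
        a.stride(0), a.stride(1),
        b.stride(0), b.stride(1),
        c.stride(0), c.stride(1),
    )
    return c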

    Under the Hood

    + +

    Warp specialization uses a set of hierarchical compiler transformations and IR changes to translate a user’s non-warp-specialized kernel into warp-specialized machine code. These include:

    + +
      +
    • Task Partitioning: The entire kernel is automatically divided into asynchronous tasks based on predefined heuristics. The compiler determines how to utilize one producer warp group and a user-specified number of consumer warp groups to execute the kernel. It assigns task IDs to specific anchor operations, which then influence the task assignments for remaining operations through asynchronous task ID propagation and dependency analysis. Since shared memory is the most efficient method for data transfer between warp groups across all supported platforms, the compiler optimizes task partitions to minimize register spills to shared memory, ensuring efficient execution.
    • +
    • Data Partitioning for Multiple Consumer Groups: Efficiently partitioning data among multiple consumer groups is key to optimizing workload distribution. On the H100 GPU, the compiler, by default, attempts to partition the input tensor A along the M dimension, allowing each consumer group to compute half of the output tensor independently. This strategy, known as cooperative partitioning, maximizes efficiency under most conditions. However, if this split leads to inefficiencies—such as producing a workload smaller than the native WGMMA instruction size—the compiler dynamically adjusts and partitions along the N dimension instead.
    • +
• Dataflow Pipelining: The compiler creates cyclic shared memory buffers to pipeline dataflows across multi-dimensional loops. Warp-specialized pipelining supports complex control flow. For example, our warp-specialized persistent GEMM kernel uses a doubly-nested loop, allowing the producer to begin fetching data for the next output tile while the consumer is finishing the computation for the prior tile.
    • +
    • Communication Operations: We introduced four high-level Triton GPU IR (TTGIR) communication operations—ProducerAcquireOp, ProducerCommitOp, ConsumerWaitOp, and ConsumerReleaseOp—to manage pipelined dataflows. These support both TMA and non-TMA memory operations.
    • +
    • Code Partitioning: Each async task is outlined into its own standalone code region, guarded by warp group ID checks. Control dependencies are duplicated as needed.
    • +
    • TTGIR to LLVM/PTX Materialization: TTGIR communication operations are materialized into corresponding LLVM/PTX barrier operations.
    • +
    + +

    Performance

    + +

    The warp specialization release introduces a range of Triton compiler transformations that collectively convert user code into warp-specialized kernels. This feature has been applied to several key kernels, including Flash Attention and FP8 row-wise GEMM, resulting in significant performance gains of 10% to 15%. Below, we highlight the latest performance metrics for these high-impact kernels.

    + +

    bar chart

    + +

    bar chart

    + +

    Future Work

    + +

Looking ahead, we plan to further enhance Triton’s warp specialization support by introducing new features such as Ping-Pong scheduling, expanded buffer sharing support, improved transparent handling for TMA, and refined partitioning heuristics for upcoming NVIDIA hardware.

    + +
diff --git a/blog/what-every-user-should-know-about-mixed-precision-training-in-pytorch/index.html b/blog/what-every-user-should-know-about-mixed-precision-training-in-pytorch/index.html new file mode 100644 index 000000000000..ff39048f2482
What Every User Should Know About Mixed Precision Training in PyTorch | PyTorch

    + by + + Syed Ahmed, Christian Sarofeen, Mike Ruberry, Eddie Yan, Natalia Gimelshein, Michael Carilli, Szymon Migacz, Piotr Bialecki, Paulius Micikevicius, Dusan Stosic, Dong Yang, and Naoya Maruyama + +

    +

Efficient training of modern neural networks often relies on using lower precision data types. Peak float16 matrix multiplication and convolution performance is 16x faster than peak float32 performance on A100 GPUs. And since the float16 and bfloat16 data types are only half the size of float32, they can double the performance of bandwidth-bound kernels and reduce the memory required to train a network, allowing for larger models, larger batches, or larger inputs. Using a module like torch.amp (short for "Automatic Mixed Precision") makes it easy to get the speed and memory usage benefits of lower precision data types while preserving convergence behavior.

    + +

    Going faster and using less memory is always advantageous – deep learning practitioners can test more model architectures and hyperparameters, and larger, more powerful models can be trained. Training very large models like those described in Narayanan et al. and Brown et al. (which take thousands of GPUs months to train even with expert handwritten optimizations) is infeasible without using mixed precision.

    + +

    We’ve talked about mixed precision techniques before (here, here, and here), and this blog post is a summary of those techniques and an introduction if you’re new to mixed precision.

    + +

    Mixed Precision Training in Practice

    + +

    Mixed precision training techniques – the use of the lower precision float16 or bfloat16 data types alongside the float32 data type – are broadly applicable and effective. See Figure 1 for a sampling of models successfully trained with mixed precision, and Figures 2 and 3 for example speedups using torch.amp.

    + +

    + +

    + +

    + Figure 1: Sampling of DL Workloads Successfully Trained with float16 (Source). +

    + +

    + +

    + +

    + Figure 2: Performance of mixed precision training using torch.amp on NVIDIA 8xV100 vs. float32 training on 8xV100 GPU. Bars represent the speedup factor of torch.amp over float32. +(Higher is better.) (Source). +

    + +

    + +

    + +

    + Figure 3. Performance of mixed precision training using torch.amp on NVIDIA 8xA100 vs. 8xV100 GPU. Bars represent the speedup factor of A100 over V100. +(Higher is Better.) (Source). +

    + +

    See the NVIDIA Deep Learning Examples repository for more sample mixed precision workloads.

    + +

    Similar performance charts can be seen in 3D medical image analysis, gaze estimation, video synthesis, conditional GANs, and convolutional LSTMs. Huang et al. showed that mixed precision training is 1.5x to 5.5x faster over float32 on V100 GPUs, and an additional 1.3x to 2.5x faster on A100 GPUs on a variety of networks. On very large networks the need for mixed precision is even more evident. Narayanan et al. reports that it would take 34 days to train GPT-3 175B on 1024 A100 GPUs (with a batch size of 1536), but it’s estimated it would take over a year using float32!

    + +

    Getting Started With Mixed Precision Using torch.amp

    + +

    torch.amp, introduced in PyTorch 1.6, makes it easy to leverage mixed precision training using the float16 or bfloat16 dtypes. See this blog post, tutorial, and documentation for more details. Figure 4 shows an example of applying AMP with grad scaling to a network.

    + +
    import torch
    +# Creates once at the beginning of training
    +scaler = torch.cuda.amp.GradScaler()
    +
    +for data, label in data_iter:
    +   optimizer.zero_grad()
    +   # Casts operations to mixed precision
+   with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
    +      loss = model(data)
    +
    +   # Scales the loss, and calls backward()
    +   # to create scaled gradients
    +   scaler.scale(loss).backward()
    +
    +   # Unscales gradients and calls
    +   # or skips optimizer.step()
    +   scaler.step(optimizer)
    +
    +   # Updates the scale for next iteration
    +   scaler.update()
    +
    + +

    + Figure 4: AMP recipe +

    + +

    Picking The Right Approach

    + +

    Out-of-the-box mixed precision training with either float16 or bfloat16 is effective at speeding up the convergence of many deep learning models, but some models may require more careful numerical accuracy management. Here are some options:

    + +
      +
    • Full float32 precision. Floating point tensors and modules are created in float32 precision by default in PyTorch, but this is a historic artifact not representative of training most modern deep learning networks. It’s rare that networks need this much numerical accuracy.
    • +
• Enabling TensorFloat32 (TF32) mode. On Ampere and later CUDA devices, matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode for faster but slightly less accurate computations. See the Accelerating AI Training with NVIDIA TF32 Tensor Cores blog post for more details. By default PyTorch enables TF32 mode for convolutions but not matrix multiplications, and unless a network requires full float32 precision we recommend enabling this setting for matrix multiplications, too (see the documentation here for how to do so, and the short snippet after this list). It can significantly speed up computations with typically negligible loss of numerical accuracy.
    • +
• Using torch.amp with bfloat16 or float16. Both of these low precision floating point data types are usually comparably fast, but some networks may only converge with one of them. If a network requires more precision it may need to use float16, and if a network requires more dynamic range it may need to use bfloat16, whose dynamic range is equal to that of float32. If overflows are observed, for example, then we suggest trying bfloat16.
    • +
    + +
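For reference, enabling TF32 for matrix multiplications is a one-line change; this is a minimal sketch of the global flags (the convolution switch is shown as well, although it is already on by default):

import torch

# Allow TF32 on matrix multiplications (off by default on Ampere and later GPUs).
torch.backends.cuda.matmul.allow_tf32 = True
# TF32 for cuDNN convolutions is already enabled by default; shown for completeness.
torch.backends.cudnn.allow_tf32 = True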

    There are even more advanced options than those presented here, like using torch.amp’s autocasting for only parts of a model, or managing mixed precision directly. These topics are largely beyond the scope of this blog post, but see the “Best Practices” section below.
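As a rough sketch of what selective autocasting can look like (the module, tensors, and norm computation below are illustrative, not from the original post), autocast regions can be nested and locally disabled around numerically sensitive operations:

import torch

model = torch.nn.Linear(64, 64).cuda()
x = torch.randn(16, 64, device="cuda")

with torch.autocast(device_type="cuda", dtype=torch.float16):
    y = model(x)  # runs in float16 where eligible
    # Locally disable autocast for a precision-sensitive computation,
    # casting its inputs back to float32 as recommended.
    with torch.autocast(device_type="cuda", enabled=False):
        norm = torch.linalg.vector_norm(y.float())  # computed in float32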

    + +

    Best Practices

    + +

    We strongly recommend using mixed precision with torch.amp or the TF32 mode (on Ampere and later CUDA devices) whenever possible when training a network. If one of those approaches doesn’t work, however, we recommend the following:

    + +
      +
    • High Performance Computing (HPC) applications, regression tasks, and generative networks may simply require full float32 IEEE precision to converge as expected.
    • +
    • Try selectively applying torch.amp. In particular we recommend first disabling it on regions performing operations from the torch.linalg module or when doing pre- or post-processing. These operations are often especially sensitive. Note that TF32 mode is a global switch and can’t be used selectively on regions of a network. Enable TF32 first to check if a network’s operators are sensitive to the mode, otherwise disable it.
    • +
    • If you encounter type mismatches while using torch.amp we don’t suggest inserting manual casts to start. This error is indicative of something being off with the network, and it’s usually worth investigating first.
    • +
    • Figure out by experimentation if your network is sensitive to range and/or precision of a format. For example fine-tuning bfloat16-pretrained models in float16 can easily run into range issues in float16 because of the potentially large range from training in bfloat16, so users should stick with bfloat16 fine-tuning if the model was trained in bfloat16.
    • +
    • The performance gain of mixed precision training can depend on multiple factors (e.g. compute-bound vs memory-bound problems) and users should use the tuning guide to remove other bottlenecks in their training scripts. Although having similar theoretical performance benefits, BF16 and FP16 can have different speeds in practice. It’s recommended to try the mentioned formats and use the one with best speed while maintaining the desired numeric behavior.
    • +
    + +

    For more details, refer to the AMP Tutorial, Training Neural Networks with Tensor Cores, and see the post “More In-Depth Details of Floating Point Precision” on PyTorch Dev Discussion.

    + +

    Conclusion

    + +

    Mixed precision training is an essential tool for training deep learning models on modern hardware, and it will become even more important in the future as the performance gap between lower precision operations and float32 continues to grow on newer hardware, as reflected in Figure 5.

    + +

    + +

    + +

    +Figure 5: Relative peak throughput of float16 (FP16) vs float32 matrix multiplications on Volta and Ampere GPUs. On Ampere relative peak throughput for the TensorFloat32 (TF32) mode and bfloat16 matrix multiplications are shown, too. The relative peak throughput of low precision data types like float16 and bfloat16 vs. float32 matrix multiplications is expected to grow as new hardware is released. +

    + +

    PyTorch’s torch.amp module makes it easy to get started with mixed precision, and we highly recommend using it to train faster and reduce memory usage. torch.amp supports both float16 and bfloat16 mixed precision.

    + +

    There are still some networks that are tricky to train with mixed precision, and for these networks we recommend trying TF32 accelerated matrix multiplications on Ampere and later CUDA hardware. Networks are rarely so precision sensitive that they require full float32 precision for every operation.

    + +

    If you have questions or suggestions for torch.amp or mixed precision support in PyTorch then let us know by posting to the mixed precision category on the PyTorch Forums or filing an issue on the PyTorch GitHub page.

    + +
diff --git a/blog/zeus/index.html b/blog/zeus/index.html new file mode 100644 index 000000000000..98ea60e63b08
Deep Learning Energy Measurement and Optimization | PyTorch

    + by + + Jae-Won Chung + +

    +

    Zeus logo

    + +

    This post is authored by Jae-Won Chung, a PhD student at the University of Michigan and the lead of the ML.ENERGY Initiative.

    + +

    Deep learning consumes quite a bit of energy. For instance, training a single 200B LLM on AWS p4d instances consumed around 11.9 GWh (source: CIDR 2024 keynote), which is an amount that can single-handedly power more than a thousand average US households for a year.

    + +

    Zeus is an open-source toolbox for measuring and optimizing the energy consumption of deep learning workloads. Our goal is to make energy optimization based on accurate measurements as easy as possible for diverse deep learning workloads and setups by offering composable tools with minimal assumptions.

    + +

    Zeus largely provides two types of tools:

    + +
      +
    1. Programmatic and command line GPU energy measurement tools
    2. +
    3. Several energy optimization tools that find the best ML and/or GPU configurations
    4. +
    + +

    Zeus can benefit those who would like to

    + +
      +
    • measure and optimize their electricity cost
    • +
    • reduce heat dissipation from their GPUs (by lowering power draw)
    • +
    • report energy usage from research and development
    • +
    • reduce carbon footprint from electricity usage
    • +
    + +

    Part 1: Measuring Energy

    + +

    Just like performance optimization, accurate measurement is the basis of effective energy optimization. Popular proxies for estimating power consumption like the maximum power draw of the hardware can sometimes be vastly off compared to actual measurement.

    + +

    To make energy measurement as easy and transparent as possible, the core utility Zeus offers is the ZeusMonitor class. Let’s take a look at the actual snippet:

    + +
    from zeus.monitor import ZeusMonitor
    +
    +# All four GPUs are measured simultaneously.
    +monitor = ZeusMonitor(gpu_indices=[0,1,2,3])
    +
    +# Measure total time and energy within the window.
    +monitor.begin_window("training")
    +for e in range(100):
    +
    +    # Measurement windows can arbitrarily be overlapped.
    +    monitor.begin_window("epoch")
    +    for x, y in train_dataloader:
    +        y_hat = model(x)
    +        loss = criterion(y, y_hat)
    +        loss.backward()
    +        optim.step()
    +    measurement = monitor.end_window("epoch")
    +    print(f"Epoch {e}: {measurement.time} s, {measurement.total_energy} J")
    +
    +measurement = monitor.end_window("training")
    +print(f"Entire training: {measurement.time} s, {measurement.total_energy} J")
    +
    + +

    What you see above is a typical PyTorch training loop which uses four GPUs for data parallel training. Inside, we created an instance of ZeusMonitor and passed in a list of GPU indices to monitor. Then, using the monitor, we can measure the time and energy consumption of arbitrary execution windows within the training script by pairing calls to begin_window and end_window. Multiple windows can overlap and nest in arbitrary ways without affecting the measurement of each, as long as their names are different.

    + +

    ZeusMonitor adds very little overhead – typically single digit milliseconds – around the window. This allows ZeusMonitor to be used in various applications. For instance:

    + +
      +
    • The ML.ENERGY Leaderboard: The first open-source benchmark on how much energy LLM text generation consumes.
    • +
    • The ML.ENERGY Colosseum: An online service that lets users compare LLM responses side-by-side based on response quality and energy consumption.
    • +
    + +

    See our blog post for a deeper technical dive into accurate GPU energy measurement.

    + +

    Part 2: Optimizing Energy

    + +

    Let me introduce you to two of the energy optimizers provided by Zeus.

    + +

    GlobalPowerLimitOptimizer

    + +

GPUs allow users to configure their maximum power draw, called the power limit. Typically, as you lower the GPU’s power limit from the default maximum, computation may get slightly slower, but you’ll save disproportionately more energy. The GlobalPowerLimitOptimizer in Zeus automatically finds the optimal GPU power limit globally across all GPUs.

    + +
    from zeus.monitor import ZeusMonitor
    +from zeus.optimizer.power_limit import GlobalPowerLimitOptimizer
    +
    +# The optimizer measures time and energy through the ZeusMonitor.
    +monitor = ZeusMonitor(gpu_indices=[0,1,2,3])
    +plo = GlobalPowerLimitOptimizer(monitor)
    +
    +for e in range(100):
    +    plo.on_epoch_begin()
    +    for x, y in train_dataloader:
    +        plo.on_step_begin()
    +
    +        y_hat = model(x)
    +        loss = criterion(y, y_hat)
    +        loss.backward()
    +        optim.step()
    +
    +        plo.on_step_end()
    +    plo.on_epoch_end()
    +
    + +

    In our familiar PyTorch training loop, we have instantiated GlobalPowerLimitOptimizer and passed it an instance of the ZeusMonitor, through which the optimizer sees the GPUs. Then, we just need to let the optimizer know about training progress (step and epoch boundaries), and the optimizer will transparently do all the necessary profiling and converge to the optimal power limit.

    + +

    If you’re using the HuggingFace Trainer or SFTTrainer, integration is even easier:

    + +
    from zeus.monitor import ZeusMonitor
    +from zeus.optimizer.power_limit import HFGlobalPowerLimitOptimizer
    +
    +# ZeusMonitor actually auto-detects CUDA_VISIBLE_DEVICES.
    +monitor = ZeusMonitor()
    +pl_optimizer = HFGlobalPowerLimitOptimizer(monitor)
    +
    +# Pass in the optimizer as a Trainer callback. Also works for SFTTrainer.
    +trainer = Trainer(
    +    model=model,
    +    train_dataset=train_dataset,
    +    ...,
    +    callbacks=[pl_optimizer],
    +)
    +
    + +

    The HFGlobalPowerLimitOptimizer wraps GlobalPowerLimitOptimizer so that it automatically detects step and epoch boundaries. We have example integrations here, including running Gemma 7B supervised fine-tuning with QLoRA.

    + +

    Now, we know how to integrate the optimizer, but what is the optimal power limit? We know different users can have different preferences regarding trading off time and energy, so we allow users to specify an OptimumSelector (basically the Strategy Pattern) to express their needs.

    + +
    # Built-in strategies for selecting the optimal power limit.
    +from zeus.optimizer.power_limit import (
    +    GlobalPowerLimitOptimizer,
    +    Time,
    +    Energy,
    +    MaxSlowdownConstraint,
    +)
    +
    +# Minimize energy while tolerating at most 10% slowdown.
    +plo = GlobalPowerLimitOptimizer(
    +    monitor,
    +    MaxSlowdownConstraint(factor=1.1),
    +)
    +
    +
    + +

    Some of the built-in strategies include “Minimize time” (Time, this might still reduce the power limit from the default since some workloads exhibit almost no slowdown even on lower power limits), “Minimize energy” (Energy), “Somewhere in between” (ZeusCost), and “Minimize energy given maximum slowdown” (MaxSlowdownConstraint). Users can also create their own optimum selectors as needed.

    + +

    PipelineFrequencyOptimizer

    + +

    The pipeline frequency optimizer, based on our research paper Perseus, is our latest work on energy optimization for large model training, like GPT-3. Perseus can reduce the energy consumption of large model training with no or negligible training throughput degradation. We’ll briefly talk about how.

    + +

    one iteration of training with four stage pipeline parallelism

    + +

    The above is a visualization of one iteration of training with four stage pipeline parallelism running with the 1F1B schedule. Each box is either a forward or a backward computation, and is colored with its power consumption.

    + +

The key observation here is that when models are partitioned into pipeline stages, it’s very difficult to slice them into perfectly equal sizes. This leads to forward/backward boxes of varying widths and therefore computation idle time between boxes. Notice that the smaller boxes, which sit off the critical path, can be run slightly slower than they are today while the overall critical path (blue line) does not change at all.

    + +

    one iteration of training with four stage pipeline parallelism

    + +

    That’s what Perseus automatically does. Based on profiling, it identifies computation boxes that are not on the critical path and figures out the precise amount of slowdown for each box that minimizes energy consumption. When done correctly, computations we slowed down will consume less power & energy, but the overall iteration time of the pipeline does not change.

    + +

    See our guide to get started with Perseus!

    + +

    Final Words

    + +

    For users who run their own on-premise compute, energy consumption and the resulting electricity bill is not something that can be easily overlooked. On a larger scale, energy consumption is not just about electricity bills, but also about data center power delivery. With thousands of GPUs running in clusters, finding stable, affordable, and sustainable electricity sources to power data centers is becoming increasingly challenging. Finding ways to reduce energy disproportionately more than slowdown leads to lower average power consumption, which can help with the power delivery challenge.

    + +

    With Zeus, we hope to take the first step towards deep learning energy measurement and optimization.

    + +

    Wondering where to go from here? Here are a couple helpful links:

    + + + +
diff --git a/board_info/advanced-micro-devices.html b/board_info/advanced-micro-devices.html new file mode 100644
diff --git a/board_info/arm.html b/board_info/arm.html new file mode 100644
diff --git a/board_info/aws.html b/board_info/aws.html new file mode 100644
diff --git a/board_info/google-cloud.html b/board_info/google-cloud.html new file mode 100644
diff --git a/board_info/huawei.html b/board_info/huawei.html new file mode 100644
diff --git a/board_info/hugging-face.html b/board_info/hugging-face.html new file mode 100644
diff --git a/board_info/ibm.html b/board_info/ibm.html new file mode 100644
diff --git a/board_info/intel.html b/board_info/intel.html new file mode 100644
diff --git a/board_info/lightning.html b/board_info/lightning.html new file mode 100644
diff --git a/board_info/meta.html b/board_info/meta.html new file mode 100644
diff --git a/board_info/microsoft-corporation.html b/board_info/microsoft-corporation.html new file mode 100644
diff --git a/board_info/nvidia-corporation.html b/board_info/nvidia-corporation.html new file mode 100644
diff --git a/case_studies/amazon-ads.html b/case_studies/amazon-ads.html new file mode 100644 index 000000000000..588ea386377c
Amazon Ads | PyTorch

    June 04, 2025

    +

    + Amazon Ads +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Reduce inference costs by 71% and drive scale out using PyTorch, TorchServe, and AWS Inferentia.

    + +
diff --git a/case_studies/salesforce.html b/case_studies/salesforce.html new file mode 100644 index 000000000000..c21adc40c3fe
Salesforce | PyTorch

    June 04, 2025

    +

    + Salesforce +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Pushing the state of the art in NLP and Multi-task learning.

    + +
diff --git a/case_studies/stanford-university.html b/case_studies/stanford-university.html new file mode 100644 index 000000000000..230a34ef5bc5
Stanford University | PyTorch

    June 04, 2025

    +

    + Stanford University +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    Using PyTorch’s flexibility to efficiently research new algorithmic approaches.

    + +
diff --git a/code-of-conduct.html b/code-of-conduct.html new file mode 100644 index 000000000000..149dd98924a3
PyTorch Foundation Code of Conduct | PyTorch

    PyTorch Foundation
    Code of Conduct

    +
    +
    +
    + +
    +
    +
    +
    + + +

    Our Commitment

    + + +

    + The PyTorch Foundation is committed to fostering an inclusive, welcoming, and safe environment for everyone involved in the PyTorch Foundation community. This commitment extends across all Foundation activities, including but not limited to our technical projects, events, communication channels, and social media presence. We pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. +

    +

    Scope

    + + +

    + This code of conduct applies to Governing Board meetings, Technical Advisory Council meetings and outreach programs (such as the Ambassador Program) of the PyTorch Foundation and any other activity of the PyTorch Foundation that is not otherwise covered by a code of conduct of either The Linux Foundation or an applicable technical project. +

    +

    PyTorch Foundation Events

    +

    + PyTorch Foundation events that are produced by the Linux Foundation with professional events staff are governed by the Linux Foundation Events Code of Conduct available on the event page, which is designed to be used in conjunction with this PyTorch Foundation Code of Conduct. +

    +

    Technical Projects in the PyTorch Foundation Umbrella

    +

    + Technical projects supported by the PyTorch Foundation are organized as separate projects and each maintains a code of conduct that applies to participants in those projects. +

    +

    Expected Behavior

    + + +

    + Community members are expected to: +

    +
      + +
    • Use welcoming and inclusive language
    • + +
    • Respect differing viewpoints and experiences
    • + +
    • Accept constructive criticism gracefully
    • + +
    • Prioritize what benefits the community as a whole
    • + +
    • Show empathy and kindness toward others
    • + +
    • Be professional and responsible in all interactions
    • + +
    • Follow health and safety requirements at in-person events
    • + +
    • Exercise consideration and respect in speech and actions
    • + +
    • Collaborate with other community members in a constructive manner
    • +
    +

    Unacceptable Behavior

    + + +

    + The following behaviors are considered unacceptable within our community: +

    +

    Harassment and Discrimination

    + + +
      + +
    • Harassment of any kind, whether verbal, physical, or visual
    • + +
    • Discrimination based on protected characteristics
    • + +
    • Sexual harassment or unwelcome sexual attention
    • + +
    • Deliberate intimidation, stalking, or following
    • + +
    • Sustained disruption of talks, events, or online discussions
    • + +
    • Inappropriate physical contact
    • +
    +

    Communication and Content

    + + +
      + +
    • Use of sexualized language or imagery
    • + +
    • Violent or threatening language or imagery
    • + +
    • Trolling, insulting/derogatory comments, or personal attacks
    • + +
    • Public or private harassment
    • + +
    • Publishing others’ private information without permission
    • + +
    • Using Foundation platforms for political campaigning or promotion of political causes that are unrelated to technology
    • + +
    • Other conduct which could reasonably be considered inappropriate in a professional setting
    • +
    +

    Online and Social Media Behavior

    + + +
      + +
    • Harassment or bullying through social media platforms
    • + +
    • Spreading misinformation about the Foundation or its members
    • + +
    • Using Foundation channels for commercial promotion without permission
    • + +
    • Creating multiple accounts to evade moderation
    • + +
    • Impersonating Foundation members or officials
    • +
    +

    Behavior During Investigations

    + + +
      + +
    • Providing knowingly false or misleading information in connection with a Code of Conduct investigation or otherwise intentionally tampering with an investigation.
    • + +
    • Retaliating against a person because they reported an incident or provided information about an incident as a witness.
    • +
    +

    Enforcement

    + + +

    Reporting Violations

    + + +

    + Violations can be reported to conduct@pytorch.org. All reports will be: +

    +
      + +
    • Reviewed promptly and thoroughly
    • + +
    • Treated with strict confidentiality
    • + +
    • Investigated and addressed appropriately
    • + +
    • Documented for future reference
    • +
    +

    Consequences

    + + +

    + Violations may result in: +

    +
      + +
    • Warning to the offending individual
    • + +
    • Temporary or permanent ban from Foundation spaces
    • + +
    • Removal from leadership or contributory roles
    • + +
    • Expulsion from events without refund
    • + +
    • Reporting to appropriate authorities if necessary
    • + +
    • Other consequences
    • +
    +

    Appeals Process

    + + +
      + +
    • Individuals may appeal enforcement decisions
    • + +
    • Appeals must be submitted in writing within 30 days to the PyTorch Foundation via email to conduct@pytorch.org
    • + +
    • Decisions on appeals are final
    • +
    +

    Pre-Event Concerns

    + + +

    + If you have concerns about attending an upcoming event where specific individuals may be present: +

    +
      + +
    • Contact conduct@pytorch.org in advance
    • + +
    • Arrangements can be made for your safety and comfort
    • + +
    • Precautions may include providing security escorts and notifying staff
    • +
    +

    Amendments

    + + +

    + This Code of Conduct may be amended by the PyTorch Foundation as needed. Changes will be communicated to the community, and continued participation in the community indicates agreement to the current version. +

    +

    Questions and Reporting - Contact

    + + +

    + For questions, concerns, or reports: +
    + Email: conduct@pytorch.org +

    +

    ​​Acknowledgements

    +

    + This Code of Conduct is adapted from the Contributor Covenant, version 2.0 available here. +

diff --git a/community-blog.html b/community-blog.html new file mode 100644 index 000000000000..2f8fccf5b337
Community Blog | PyTorch

    Community Blog

    +

    Stories from the PyTorch Ecosystem

    + +

    Also see the PyTorch Blog

    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    +

    March 19, 2025

    +

    + SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine +

    +

    We’re thrilled to announce that the SGLang project has been integrated into the PyTorch ecosystem! This integration ensures that SGLang aligns with PyTorch’s standards and practices, providing developers with a reliable and community-supported framework for fast and flexible serving of LLMs. +

    +
    + + Read More + +
    + +
    +
    +

    March 16, 2025

    +

    + PyTorch at GTC 2025 +

    +

    GTC is coming back to San Jose on March 17–21, 2025. Join PyTorch Foundation members Arm, AWS, Google Cloud, IBM, Lightning AI, Meta, Microsoft Azure, Snowflake, and thousands of developers as we celebrate PyTorch. Together learn how AI & accelerated computing are helping humanity solve our most complex challenges. +

    +
    + + Read More + +
    + +
    +
    +

    March 07, 2025

    +

    + Powering AI with PyTorch, Fedora, and Open Source Communities +

    +

    At DevConf.IN 2025 in Pune, I had the opportunity to host a PyTorch Meetup on February 28th. The session, titled “Powering AI with PyTorch, Fedora, and Open Source Communities” was aimed at introducing PyTorch to students and professionals, explaining why PyTorch+Fedora form an ideal AI development platform. The other key aspect I covered was collaboration between open source communities. +

    +
    + + Read More + +
    + +
    +
    +

    February 19, 2025

    +

    + Optimize LLMs for Efficiency & Sustainability +

    +

    The rapid growth of large language model (LLM) applications is linked to rapid growth in energy demand. According to the International Energy Agency (IEA), data center electricity consumption is projected to roughly double by 2026 primarily driven by AI. This is due to the energy-intensive training requirements for massive LLMs – however, the increase in AI Inferencing workloads also plays a role. For example, compared with traditional search queries, a single AI inference can consume about 1...

    +
    + + Read More + +
    + +
    +
    +

    February 12, 2025

    +

    + Solve Real-Word AI Challenges with PyTorch at Datathon 2025: DataOrbit +

    +

    We’re excited to have PyTorch sponsor Datathon 2025: DataOrbit, a place where students can collaborate with a team to solve problems using real-world datasets! This event, hosted by Data Science UCSB in collaboration with Gaucho Sports Analytics and ACM@UCSB, will take place on February 22–23rd, 2025 at UC Santa Barbara, with the incredible opportunity to present your project to a panel of corporate and faculty judges – including the executive director of Pytorch! – for a chance to win prizes...

    +
    + + Read More + +
    + +
    +
    +

    January 22, 2025

    +

    + Bringing the PyTorch Community Together +

    +

    As we step into a new year, it’s a great moment to reflect on the incredible community events that made 2024 a memorable year for the PyTorch Foundation. Global meetups, events, and conferences brought the community together to learn, connect, and grow. Here’s a quick recap of the year’s highlights and what to expect in 2025. +

    +
    + + Read More + +
    + +
    +
    +

    January 15, 2025

    +

    + MLOps Workflow Simplified for PyTorch with Arm and GitHub Collaboration +

    +

    PyTorch is one of the most widely used and most powerful deep learning frameworks for training and deploying complex neural networks. It has never been easier to train and deploy AI applications, and low-cost, high-performance, energy-efficient hardware, tools, and technology for creating optimized workflows are more accessible than ever. But data science, machine learning, and devops can be deep topics unto themselves, and it can be overwhelming for developers with one specialty to see how ...

    +
    + + Read More + +
    + +
    +
    +

    December 18, 2024

    +

    + docTR joins PyTorch Ecosystem: From Pixels to Data, Building a Recognition Pipeline with PyTorch and docTR +

    +

    We’re thrilled to announce that the docTR project has been integrated into the PyTorch ecosystem! This integration ensures that docTR aligns with PyTorch’s standards and practices, giving developers a reliable, community-backed solution for powerful OCR workflows. +

    +
    + + Read More + +
    + +
    +
    +

    December 09, 2024

    +

    + vLLM Joins PyTorch Ecosystem: Easy, Fast, and Cheap LLM Serving for Everyone +

    +

    We’re thrilled to announce that the vLLM project has become a PyTorch ecosystem project, and joined the PyTorch ecosystem family! + +

    +
    + + Read More + +
    + +
    +
    +

    September 08, 2024

    +

    + PyTorch Shanghai Meetup Notes +

    +

    We are honored to successfully host the PyTorch Shanghai Meetup on August 15, 2024. This Meetup has received great attention from the industry. We invited senior PyTorch developers from Intel and Huawei as guest speakers, who shared their valuable experience and the latest technical trends. In addition, this event also attracted PyTorch enthusiasts from many technology companies and well-known universities. A total of more than 40 participants gathered together to discuss and exchange the lat...

    +
    + + Read More + +
    + +
    +
    +

    May 12, 2024

    +

    + Enhancing Deep Learning Workflows: PyTorch Ecosystem Tools +

    +

    Welcome to the thriving PyTorch ecosystem, where a wealth of tools and libraries await, purpose-built to elevate your experience in deep learning as a developer or researcher. The Ecosystem Tools pages host many projects from experts spanning academia, industry, application development, and machine learning. +

    +
    + + Read More + +
    + +
    +
    +

    May 11, 2024

    +

    + Deep Learning Energy Measurement and Optimization +

    +

    Zeus is an open-source toolbox for measuring and optimizing the energy consumption of deep learning workloads. Our goal is to make energy optimization based on accurate measurements as easy as possible for diverse deep learning workloads and setups by offering composable tools with minimal assumptions. +

    +
    + + Read More + +
    + +
    +
    +

    May 11, 2024

    +

    + Introducing depyf: mastering torch.compile with ease +

    +

    We are thrilled to introduce depyf, a new project to the PyTorch ecosystem designed to help users understand, learn, and adapt to torch.compile! +

    +
    + + Read More + +
    + +
    +
    +

    February 15, 2024

    +

    + Exploring scientific machine learning pipelines through the SimulAI toolkit +

    +

    SciML, short for Scientific Machine Learning, encompasses work that merges quantitative sciences with machine learning. It has gained significant traction over the past decade, driven by the widespread availability of specialized hardware (such as GPUs and TPUs) and datasets. Additionally, it has been propelled by the overarching influence of the machine learning wave, now ingrained in the zeitgeist of our times. In this context, we’d like to introduce SimulAI, an open-source toolkit under th...

    +
    + + Read More + +
    + +
    +
    +

    January 29, 2024

    +

    + Colossal-LLaMA-2: Low Cost and High-quality Domain-specific LLM Solution Using LLaMA and Colossal-AI +

    +

    The most prominent distinction between LLaMA-1 and LLaMA-2 lies in the incorporation of higher-quality corpora, a pivotal factor contributing to significant performance enhancements in LLaMA-2. This, coupled with its commercial availability, extends the potential for creative applications of large models within the open-source community. +

    +
    + + Read More + +
    + +
    +
    +

    January 25, 2024

    +

    + 3D rotations and spatial transformations made easy with RoMa +

    +

Struggling with quaternions, rotation vectors, right-hand rules and all that stuff? Try RoMa: an easy-to-use, stable and efficient library to deal with rotations and spatial transformations in PyTorch. +

    +
    + + Read More + +
    + +
    +
    +

    January 04, 2024

    +

    + torchdistill — a modular, configuration-driven framework for reproducible deep learning and knowledge distillation experiments +

    +

    This article summarizes key features and concepts of torchdistill (v1.0.0). Refer to the official documentation for its APIs and research projects. +

    +
    + + Read More + +
    + +
    +
    +

    December 06, 2023

    +

    + PyPose: A Library for Robot Learning with Physics-based Optimization +

    +

    We are excited to share our new open-source library PyPose. It is a PyTorch-based robotics-oriented library that provides a set of tools and algorithms for connecting deep learning with physics-based optimization. +

    +
    + + Read More + +
    + +
    +
    +

    November 09, 2023

    +

    + How Activation Checkpointing enables scaling up training deep learning models +

    +

    Activation checkpointing is a technique used for reducing the memory footprint at the cost of more compute. It utilizes the simple observation that we can avoid saving intermediate tensors necessary for backward computation if we just recompute them on demand instead. +

    +
    + + Read More + +
    + +
    +
    +

    October 26, 2023

    +

    + torch.compile, explained +

    +

    Have you ever felt overwhelmed by the complexities of torch.compile? Diving into its workings can feel like black magic, with bytecode and Python internal details that many users fail to understand, hindering them from understanding and debugging torch.compile. +

    +
    + + Read More + +
    + +
    +
    +

    July 06, 2023

    +

    + Unveiling the Power of Semi-Supervised Learning: The Unified Semi-Supervised Learning Benchmark +

    +

    Machine Learning models thrive on high-quality, fully-annotated data. The traditional supervised learning approach typically requires data on the scale of millions, or even billions, to train large foundational models. However, obtaining such a vast amount of labeled data is often tedious and labor-intensive. As an alternative, semi-supervised learning (SSL) aims to enhance model generalization with only a fraction of labeled data, complemented by a considerable amount of unlabeled data. This...

    +
    + + Read More + +
    + +
    +
    +

    June 29, 2023

    +

    + Introducing TorchOpt: A High-Performance Differentiable Optimization Library for PyTorch +

    +

Explore TorchOpt, a PyTorch-based library that revolutionizes differentiable optimization with its unified programming abstraction, high-performance distributed execution runtime, and support for various differentiation modes. +

    +
    + + Read More + +
    + +
    +
    +

    April 04, 2023

    +

    + Profiling PyTorch language models with octoml-profile +

    +

    The recent launch of PyTorch 2.0 makes it clear that the community is heavily investing in a compiler-powered future for machine learning. The new OctoML Profiler can help any user realize the full potential of these shifts in the ML landscape. +

    +
    + + Read More + +
    + +
    +
    +

    February 10, 2023

    +

    + How FASHABLE achieves SoA realistic AI generated images using PyTorch and Azure Machine Learning +

    +

Fashable is a company born at XNFY Lab (a joint initiative with Microsoft). The company’s main goal is to revolutionize the world of fashion with ethical Artificial Intelligence (AI) technologies built on the PyTorch framework. Fashable is focused on developing AI models that generate synthetic content for the global fashion industry. The fashion industry has been criticized in recent years because it generates a lot of waste and is responsible for up to 10% of global carbon dioxide output. Fas...

    +
    + + Read More + +
    + +
    +
    +

    January 31, 2023

    +

    + Latest Colossal-AI boasts novel automatic parallelism and offers savings up to 46x for Stable Diffusion 2 +

    +

As a new PyTorch Ecosystem Partner, we at HPC-AI Tech look forward to working with the PyTorch community to advance AI technologies through our open source project, Colossal-AI. We are excited to join forces with the community in this effort. +

    +
    + + Read More + +
    + +
    +
    +

    January 06, 2023

    +

    + Distributed training with PyTorch and Azure ML +

    +

    Suppose you have a very large PyTorch model, and you’ve already tried many common tricks to speed up training: you optimized your code, you moved training to the cloud and selected a fast GPU VM, you installed software packages that improve training performance (for example, by using the ACPT curated environment on Azure ML). And yet, you still wish your model could train faster. Maybe it’s time to give distributed training a try! Continue reading to learn the simplest way to do distributed t...
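As a generic illustration of what distributed training looks like in code (not the article's Azure ML setup), here is a minimal DistributedDataParallel sketch; the toy model, dummy batch, and torchrun launch line are illustrative assumptions.

```python
# Minimal DistributedDataParallel sketch (generic; not the article's Azure ML setup).
# Launch with: torchrun --nproc_per_node=<num_gpus> train.py
import os
import torch
import torch.distributed as dist
from torch import nn
from torch.nn.parallel import DistributedDataParallel as DDP

def main():
    dist.init_process_group(backend="nccl")     # rank/world size come from torchrun env vars
    local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
    torch.cuda.set_device(local_rank)

    model = DDP(nn.Linear(32, 2).cuda(local_rank), device_ids=[local_rank])  # toy model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    x = torch.randn(64, 32).cuda(local_rank)            # dummy batch; use a DistributedSampler in practice
    y = torch.randint(0, 2, (64,)).cuda(local_rank)
    loss = nn.functional.cross_entropy(model(x), y)     # gradients are synced across ranks in backward()
    loss.backward()
    optimizer.step()

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```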

    +
    + + Read More + +
    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + diff --git a/community-stories.html b/community-stories.html new file mode 100644 index 000000000000..c156e8bc15c7 --- /dev/null +++ b/community-stories.html @@ -0,0 +1,1839 @@ + + + + + + + + + + + + + Community Stories | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + +
    +
    +

    Community Stories

    +

    Read case studies on how our community solves real, everyday machine learning problems with PyTorch

    +
    +
    + +
    +
    +
    +
    + + + +
    +
    +

    May 01, 2025

    +

    + How IBM Research Uses PyTorch and TerraTorch to Make Geospatial Computer Vision Accessible for Everyone +

    + +

Geospatial computer vision is essential for understanding our planet — from monitoring deforestation to tracking urban development and analyzing the impacts of climate change. However, the coding and deep learning skills for applying AI models to satellite imagery and earth observation data have traditionally been a major barrier for many practitioners. +

    +
    + + Read More + +
    + +
    +
    +

    January 24, 2025

    +

    + How Intel Uses PyTorch to Empower Generative AI through Intel Arc GPUs +

    + +

    Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads. +

    +
    + + Read More + +
    + +
    +
    +

    September 27, 2024

    +

    + Using PyTorch for Monocular Depth Estimation Webinar +

    + +

    In this webinar, Bob Chesebrough of Intel guides you through the steps he took to create a clipped image with background clutter removed from the image. He accomplished this using monocular depth estimation with PyTorch. This could potentially be used to automate structure from motion and other image-related tasks where you want to highlight or focus on a single portion of an image, particularly for identifying parts of the image that were closest to the camera. Specifically, he used depth es...

    +
    + + Read More + +
    + +
    +
    +

    May 25, 2024

    +

    + AI Helps Duolingo Personalize Language Learning +

    + +

    Learning a foreign language was probably one of your goals last year. And the year before, and the year before that. Like gym memberships, our best intentions often don’t survive very long. Aside from the time required to achieve proficiency with a new language, most people struggle with traditional approaches to learning. Even many web-based language tools can be monotonous and cumbersome. +

    +
    + + Read More + +
    + +
    +
    +

    October 11, 2023

    +

    + ML Model Server Resource Saving - Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance +

    + +

Here, we will share our experience of moving AI workloads from our GPU servers to our Intel CPU servers without any performance or quality degradation, saving annual costs of approximately 340 thousand U.S. dollars (refer to the Conclusion) in the process. +

    +
    + + Read More + +
    + +
    +
    +

    March 09, 2023

    +

    + Axon offers technology boost for public safety with in-car Automated License Plate Recognition on Azure +

    + +

    Axon, a technology leader in public safety, developed AI technology to add cutting-edge license plate recognition capabilities to its in-car camera products, which now can identify plates for vehicles of interest and provide law enforcement with proactive notifications and alerts. Axon AI scientists and engineers chose Microsoft Azure infrastructure as a scalable, cost-efficient, and feature-rich environment where they can develop and test AI models. With Azure compute, storage, and PyTorch a...

    +
    + + Read More + +
    + +
    +
    +

    February 21, 2023

    +

    + HippoScreen Improves AI Performance by 2.4x with oneAPI Tools +

    + +

The Taiwan-based neurotechnology startup used tools and frameworks in the Intel® oneAPI Base and AI Analytics Toolkits to improve the efficiency and build times of deep-learning models used in its Brain Waves AI system. As a result, HippoScreen is able to broaden the system’s applications to a wider range of psychiatric conditions and diseases. +

    +
    + + Read More + +
    + +
    +
    +

    February 02, 2023

    +

    + NASA and IBM to Speed AI Creation with New Foundation Models +

    + +

    NASA and IBM are working together to create foundation models based on NASA’s data sets — including geospatial data — with the goal of accelerating the creation of AI models. + +

    +
    + + Read More + +
    + +
    +
    +

    January 23, 2023

    +

    + Search Model Serving Using PyTorch and TorchServe +

    + +

    Walmart Search has embarked on the journey of adopting Deep Learning in the search ecosystem to improve search relevance. For our pilot use case, we served the computationally intensive Bert Base model at runtime with an objective to achieve low latency and high throughput. +

    +
    + + Read More + +
    + +
    +
    +

    December 30, 2022

    +

    + Extracting value from siloed healthcare data using federated learning with Azure Machine Learning +

    + +

    Sensitive information such as healthcare data is often siloed within health organization boundaries. This has posed a challenge to machine learning models used by the health and life sciences industry that require data for training purposes. To improve patient care and accelerate health industry progression, the Microsoft Health & Life Sciences AI group used a federated learning setup to train their biomedical natural language processing service, Text Analytics for Health, while preservin...

    +
    + + Read More + +
    + +
    +
    +

    December 02, 2022

    +

    + How PyTorch is bringing the power of AI to computers and smartphones +

    + +

    Many of the experiences people enjoy on Facebook and Instagram are powered by artificial intelligence (AI). A number of them, like Assistant, Avatars, and AR effects, cannot be powered by server-side AI due to latency, network bandwidth, and other constraints. Running AI on-device —that is, directly on a phone, tablet, or even a pair of smart glasses — offers huge advantages over constantly sending data back to a server. It’s faster, and it creates a privacy-enhancing experience for people wh...

    +
    + + Read More + +
    + +
    +
    +

    November 17, 2022

    +

    + IBM Research: Bringing massive AI models to any cloud +

    + +

    The field of AI is in the middle of a revolution. In recent years, AI models have made images, songs, or even websites out of simple text prompts. These types of models with billions of parameters, called foundation models, can with little fine-tuning be repurposed from one task to another, removing countless hours of training and labelling, and refitting a model to take on a new task. +

    +
    + + Read More + +
    + +
    +
    +

    October 25, 2022

    +

    + Run inference at scale for OpenFold, a PyTorch-based protein folding ML model, using Amazon EKS +

    + +

    In drug discovery, understanding the 3D structure of proteins is key to assessing the ability of a drug to bind to it, directly impacting its efficacy. Predicting the 3D protein form, however, is very complex, challenging, expensive, and time consuming, and can take years when using traditional methods such as X-ray diffraction. Applying machine learning (ML) to predict these structures can significantly accelerate the time to predict protein structures—from years to hours. Several high-profi...

    +
    + + Read More + +
    + +
    +
    +

    October 04, 2022

    +

    + Optimize Protein Folding Costs with OpenFold on AWS Batch +

    + +

    Knowing the physical structure of proteins is an important part of the drug discovery process. Machine learning (ML) algorithms like AlphaFold v2.0 significantly reduce the cost and time needed to generate usable protein structures. These projects have also inspired development of AI-driven workflows for de novo protein design and protein-ligand interaction analysis. +

    +
    + + Read More + +
    + +
    +
    +

    June 28, 2022

    +

    + Crayon boosts speed, accuracy of healthcare auditing process using Azure Machine Learning and PyTorch +

    + +

    Healthcare providers need to be able to verify that they’re maintaining the highest operating safety and efficacy standards. Those standards are set by a national accreditation organization whose surveyors, often healthcare professionals themselves, regularly visit facilities and document situations that might need to be corrected or brought back in line with the latest rules and policies. That assessment and accreditation process generates a huge amount of data, and even the most experienced...

    +
    + + Read More + +
    + +
    +
    +

    May 25, 2022

    +

    + Wayve’s AV2.0 builds a brighter future with Azure Machine Learning and PyTorch +

    + +

    Wayve wants to accelerate and scale autonomous vehicle (AV) development by using vision-based machine learning for rapid prototyping and quick iteration. So, it developed a platform that uses the open-source machine learning framework PyTorch with Microsoft Azure Machine Learning to gather, manage, and process millions of hours of driving data per year—petabytes of data—consisting of images, GPS data, and data from other sensors. Wayve now has the scalable capacity to build and iterate drivin...

    +
    + + Read More + +
    + +
    +
    +

    May 12, 2022

    +

    + Ambient Clinical Intelligence: Generating Medical Reports with PyTorch +

    + +

    Complete and accurate clinical documentation is an essential tool for tracking patient care. It allows for treatment plans to be shared among care teams to aid in continuity of care and ensures a transparent and effective process for reimbursement. +

    +
    + + Read More + +
    + +
    +
    +

    March 16, 2022

    +

    + Bentley Systems creates breakthrough framework, drastically speeds up AI development with Azure Machine Learning +

    + +

    Software innovator Bentley Systems offers a broad portfolio of solutions to help the organizations that design, build, and operate the world’s infrastructure assets. The company uses machine learning in its flagship product to read disparate paper-based asset data and transform it into consolidated digital data. To speed up and formalize this process, Bentley created a machine learning operations framework using Microsoft Azure Machine Learning and PyTorch. Developers’ speed and job satisfact...

    +
    + + Read More + +
    + +
    +
    +

    March 14, 2022

    +

    + Solliance makes headlines with cryptocurrency news analysis platform powered by Azure Machine Learning, PyTorch +

    + +

    Solliance delivers cutting-edge solutions that fill gaps across a wide variety of industries. Through its recent collaboration with Baseline, Solliance revolutionizes the cryptocurrency trading experience, extracting news insights from more than 150,000 global sources in near real time. To manage Baseline workloads, Solliance brought Microsoft Azure Machine Learning and PyTorch together for maximum processing power and deep learning capabilities. The result: investors can get under the headli...

    +
    + + Read More + +
    + +
    +
    +

    March 02, 2022

    +

    + Create a Wine Recommender Using NLP on AWS +

    + +

    In this tutorial, we’ll build a simple machine learning pipeline using a BERT word embedding model and the Nearest Neighbor algorithm to recommend wines based on user inputted preferences. To create and power this recommendation engine, we’ll leverage AWS’s SageMaker platform, which provides a fully managed way for us to train and deploy our service. +
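As a rough sketch of the nearest-neighbor lookup step described above (not the tutorial's SageMaker code), the snippet below pairs scikit-learn's NearestNeighbors with a hypothetical embed() stand-in for the BERT embedding model; the wine notes, query, and embedding dimension are made up.

```python
# Sketch of the nearest-neighbor recommendation step only; embed() is a
# hypothetical stand-in for a BERT sentence-embedding model, and the wine
# notes below are invented examples.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def embed(texts):
    # Placeholder: in the tutorial this would call a deployed BERT embedding model.
    rng = np.random.default_rng(0)
    return rng.normal(size=(len(texts), 768))

wine_notes = ["dry, citrus, mineral", "jammy, oak, full-bodied", "floral, off-dry, crisp"]
index = NearestNeighbors(n_neighbors=2, metric="cosine").fit(embed(wine_notes))

query = embed(["I like crisp, citrusy white wines"])
distances, neighbors = index.kneighbors(query)
print([wine_notes[i] for i in neighbors[0]])  # the recommended wines
```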

    +
    + + Read More + +
    + +
    +
    +

    February 24, 2022

    +

    + Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing +

    + +

    Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out. Amazon Ads helps companies build their brand and connect with shoppers through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses and brands of all sizes, including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies can upload their own ad...

    +
    + + Read More + +
    + +
    +
    +

    February 10, 2022

    +

    + ChemicalX: A Deep Learning Library for Drug Pair Scoring +

    + +

In this paper, we introduce ChemicalX, a PyTorch-based deep learning library designed for providing a range of state of the art models to solve the drug pair scoring task. The primary objective of the library is to make deep drug pair scoring models accessible to machine learning researchers and practitioners in a streamlined framework. The design of ChemicalX reuses existing high level model training utilities, geometric deep learning, and deep chemistry layers from the PyTorch ecosystem. Our...

    +
    + + Read More + +
    + +
    +
    +

    January 04, 2022

    +

    + The Why and How of Scaling Large Language Models +

    + +

    Anthropic is an AI safety and research company that’s working to build reliable, interpretable, and steerable AI systems. Over the past decade, the amount of compute used for the largest training runs has increased at an exponential pace. We’ve also seen in many domains that larger models are able to attain better performance following precise scaling laws. The compute needed to train these models can only be attained using many coordinated machines that are communicating data between them. I...

    +
    + + Read More + +
    + +
    +
    +

    November 21, 2021

    +

    + Running BERT model inference on AWS Inf1: From model compilation to speed comparison +

    + +

    In this tech blog, we will compare the speed and cost of Inferentia, GPU, and CPU for a BERT sequence labeling example. We also provide a helpful tutorial on the steps for model compilation and inference on Inf1 instances. +

    +
    + + Read More + +
    + +
    +
    +

    November 09, 2021

    +

    + SearchSage: Learning Search Query Representations at Pinterest +

    + +

    Pinterest surfaces billions of ideas to people every day, and the neural modeling of embeddings for content, users, and search queries are key in the constant improvement of these machine learning-powered recommendations. Good embeddings — representations of discrete entities as vectors of numbers — enable fast candidate generation and are strong signals to models that classify, retrieve and rank relevant content. +

    +
    + + Read More + +
    + +
    +
    +

    October 18, 2021

    +

    + How We Built: An Early-Stage Recommender System +

    + +

    Personalization is ubiquitous on most platforms today. Supercharged by connectivity, and scaled by machine learning, most experiences on the internet are tailored to our personal tastes. Peloton classes offer a diversity of instructors, languages, fitness disciplines, durations and intensity. Each Member has specific fitness goals, schedule, fitness equipment, and level of skill or strength. This diversity of content and individuality of Member needs at massive scale creates the opportunity f...

    +
    + + Read More + +
    + +
    +
    +

    September 07, 2021

    +

    + Using a Grapheme to Phoneme Model in Cisco’s Webex Assistant +

    + +

    Grapheme to Phoneme (G2P) is a function that generates pronunciations (phonemes) for words based on their written form (graphemes). It has an important role in automatic speech recognition systems, natural language processing, and text-to-speech engines. In Cisco’s Webex Assistant, we use G2P modelling to assist in resolving person names from voice. See here for further details of various techniques we use to build robust voice assistants. +

    +
    + + Read More + +
    + +
    +
    +

    September 07, 2021

    +

    + How AI is Helping Vets to Help our Pets +

    + +

    1 in 4 dogs, and 1 in 5 cats, will develop cancer at some point in their lives. Pets today have a better chance of being successfully treated than ever, thanks to advances in early recognition, diagnosis and treatment. +

    +
    + + Read More + +
    + +
    +
    +

    August 10, 2021

    +

    + University of Pécs enables text and speech processing in Hungarian, builds the BERT-large model with just 1,000 euro with Azure +

    + +

    Everyone prefers to use their mother tongue when communicating with chat agents and other automated services. However, for languages like Hungarian—spoken by only 15 million people—the market size will often be viewed as too small for large companies to create software, tools or applications that can process Hungarian text as input. Recognizing this need, the Applied Data Science and Artificial Intelligence team from University of Pécs decided to step up. Using Microsoft AI Solutions and ONNX...

    +
    + + Read More + +
    + +
    +
    +

    June 17, 2021

    +

    + How 3DFY.ai Built a Multi-Cloud, Distributed Training Platform Over Spot Instances with TorchElastic and Kubernetes +

    + +

    Deep Learning development is becoming more and more about minimizing the time from idea to trained model. To shorten this lead time, researchers need access to a training environment that supports running multiple experiments concurrently, each utilizing several GPUs. +

    +
    + + Read More + +
    + +
    +
    +

    June 07, 2021

    +

    + AI21 Labs Trains 178-Billion-Parameter Language Model Using Amazon EC2 P4d Instances, PyTorch +

    + +

    AI21 Labs uses machine learning to develop language models focused on understanding meaning, and in 2021 it set a goal to train the recently released Jurassic-1 Jumbo, an autoregressive language model with 178 billion parameters. Developers who register for beta testing will get access to Jurassic-1 Jumbo and can immediately start to customize the model for their use case. The software startup wanted to train the model efficiently, so it looked to Amazon Web Services (AWS) and built a solutio...

    +
    + + Read More + +
    + +
    +
    +

    June 02, 2021

    +

    + PyTorch Community Voices +

    + +

    Join us for an interview with star PyTorch community members Alexander O’Connor and Binghui Ouyang from AutoDesk as we learn how they used PyTorch and AWS Inferentia to deploy production-scale models in chatbot intent classification. +

    +
    + + Read More + +
    + +
    +
    +

    May 14, 2021

    +

    + How Outreach Productionizes PyTorch-based Hugging Face Transformers for NLP +

    + +

    At Outreach, a leading sales engagement platform, our data science team is a driving force behind our innovative product portfolio largely driven by deep learning and AI. We recently announced enhancements to the Outreach Insights feature, which is powered by the proprietary Buyer Sentiment deep learning model developed by the Outreach Data Science team. This model allows sales teams to deepen their understanding of customer sentiment through the analysis of email reply content, moving from j...

    +
    + + Read More + +
    + +
    +
    +

    April 29, 2021

    +

    + Automated Background Removal in E-commerce Fashion Image Processing Using PyTorch on Databricks +

    + +

    Wehkamp is one of the biggest e-commerce companies in the Netherlands, with more than 500,000 daily visitors on their website. A wide variety of products offered on the Wehkamp site aims to meet its customers’ many needs. An important aspect of any customer visit on an e-commerce website is a qualitative and accurate visual experience of the products. At a large scale, this is no easy task, with thousands of product photos processed in a local photo studio. +

    +
    + + Read More + +
    + +
    +
    +

    April 27, 2021

    +

    + Disney's Creative Genome by Miquel Farré +

    + +

    Miquel Farré is a senior technology manager at Disney, taking the lead on projects at the intersection of video technology, machine learning and web applications. Metadata that drives content searchability is most often indexed at the title level, with limited governance and high ambiguity; at best, keyword metadata has been added to a title as a layer of enrichment. +

    +
    + + Read More + +
    + +
    +
    +

    April 07, 2021

    +

    + How We Used AWS Inferentia to Boost PyTorch NLP Model Performance by 4.9x for the Autodesk Ava Chatbot +

    + +

Autodesk is a multinational software company with world-renowned products in areas such as Architecture, Engineering, & Construction, Manufacturing, and Media & Entertainment. Amongst Autodesk’s best-known products are AutoCAD, Revit, Maya, and Fusion 360. The company has millions of customers around the world, and many of them need support to make the best use of its products. +

    +
    + + Read More + +
    + +
    +
    +

    February 25, 2021

    +

    + Machine Learning at Tubi: Powering Free Movies, TV and News for All +

    + +

    In this blog series, our aim is to highlight the nuances of Machine Learning in Tubi’s Ad-based Video on Demand (AVOD) space as practiced at Tubi. Machine Learning helps solve myriad problems involving recommendations, content understanding and ads. We extensively use PyTorch for several of these use cases as it provides us the flexibility, computational speed and ease of implementation to train large scale deep neural networks using GPUs. +

    +
    + + Read More + +
    + +
    +
    +

    January 27, 2021

    +

    + Deepset achieves a 3.9x speedup and 12.8x cost reduction for training NLP models by working with AWS and NVIDIA +

    + +

    At deepset, we’re building the next-level search engine for business documents. Our core product, Haystack, is an open-source framework that enables developers to utilize the latest NLP models for semantic search and question answering at scale. Our software as a service (SaaS) platform, Haystack Hub, is used by developers from various industries, including finance, legal, and automotive, to find answers in all kinds of text documents. You can use these answers to improve the search experienc...

    +
    + + Read More + +
    + +
    +
    +

    December 17, 2020

    +

    + Using PyTorch to streamline machine-learning projects +

    + +

    For many surgeons, the possibility of going back into the operating room to review the actions they carried out on a patient could provide invaluable medical insights. +

    +
    + + Read More + +
    + +
    +
    +

    December 17, 2020

    +

    + How theator Built a Continuous Training Framework To Scale up Its Surgical Intelligence Platform +

    + +

    Performing surgery is largely about decision making. As Dr. Frank Spencer put it in 1978, “A skillfully performed operation is about 75% decision making and 25% dexterity”. Five decades later, and the surgical field is finally — albeit gradually — implementing advances in data science and AI to enhance surgeons’ ability to make the best decisions in the operating room. That’s where theator comes in: the company is re-imagining surgery with a Surgical Intelligence platform that leverages highl...

    +
    + + Read More + +
    + +
    +
    +

    December 02, 2020

    +

    + Graph Convolutional Operators in the PyTorch JIT +

    + +

    In this talk, scientist Lindsey Gray and Ph.D. student Matthias Fey co-examine how the challenges of High Energy Particle Physics are driving the need for more efficient research and development pipelines in neural network development. In particular, they look at the additions made to PyTorch Geometric, which allow Graph Neural Network models to be compiled by the PyTorch JIT, significantly easing the process of deploying such networks at scale. +
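As a generic illustration of the compilation step mentioned above (not the PyTorch Geometric-specific operators from the talk), here is a minimal example of compiling a module with torch.jit.script; the TinyMLP module is an illustrative assumption.

```python
# Generic TorchScript illustration (not the PyTorch Geometric-specific work):
# torch.jit.script compiles a module into a serializable graph that can be
# deployed without a Python runtime.
import torch
from torch import nn

class TinyMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(torch.relu(self.fc1(x)))

scripted = torch.jit.script(TinyMLP())   # compile to TorchScript
scripted.save("tiny_mlp.pt")             # deployable artifact, loadable from C++ or Python
print(scripted(torch.randn(4, 8)).shape)
```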

    +
    + + Read More + +
    + +
    +
    +

    October 22, 2020

    +

    + How Wadhwani AI Uses PyTorch To Empower Cotton Farmers +

    + +

Cotton is a major fibre crop across the world, cultivated in over 80 countries, with nearly 100 million families worldwide relying on cotton farming for their livelihood. With such importance placed on many farmers’ crops, cotton’s particular vulnerability to pest infestations has been troubling to many. However, pest infestation is also one of the most significant and preventable problems that farmers face, with 55% of all pesticide usage in India being devoted to cotton far...

    +
    + + Read More + +
    + +
    +
    +

    October 07, 2020

    +

    + How Lyft Uses PyTorch to Power Machine Learning for Their Self-Driving Cars +

    + +

    Lyft’s mission is to improve people’s lives with the world’s best transportation. We believe in a future where self-driving cars make transportation safer and more accessible for everyone. That’s why Level 5, Lyft’s self-driving division, is developing a complete autonomous system for the Lyft network to provide riders’ access to the benefits of this technology. However, this is an incredibly complex task. +

    +
    + + Read More + +
    + +
    +
    +

    September 30, 2020

    +

    + Speeding up drug discovery with advanced machine learning +

    + +

    Whatever our job title happens to be at AstraZeneca, we’re seekers. I’m part of the Biological Insights Knowledge Graph (BIKG) team. We help scientists comb through massive amounts of data in our quest to find the information we need to help us deliver life-changing medicines. +

    +
    + + Read More + +
    + +
    +
    +

    September 30, 2020

    +

    + AstraZeneca is using PyTorch-powered algorithms to discover new drugs +

    + +

    Since it launched in 2017, Facebook’s machine-learning framework PyTorch has been put to good use, with applications ranging from powering Elon Musk’s autonomous cars to driving robot-farming projects. Now pharmaceutical firm AstraZeneca has revealed how its in-house team of engineers are tapping PyTorch too, and for equally as important endeavors: to simplify and speed up drug discovery. +

    +
    + + Read More + +
    + +
    +
    +

    August 06, 2020

    +

    + AI for AG: Production machine learning for agriculture +

    + +

    How did farming affect your day today? If you live in a city, you might feel disconnected from the farms and fields that produce your food. Agriculture is a core piece of our lives, but we often take it for granted. +

    +
    + + Read More + +
    + +
    +
    +

    July 17, 2020

    +

    + How Pixar uses AI and GANs to create high-resolution content +

    + +

    As digital animators continue to push the boundaries of technology and creativity, the technical teams that support them are turning to artificial intelligence and machine learning to deliver the tools they need. That’s the case at Pixar, where the company has made new machine learning breakthroughs it hopes will both improve quality and reduce costs. +

    +
    + + Read More + +
    + +
    +
    +

    July 16, 2020

    +

    + How Disney uses PyTorch for animated character recognition +

    + +

    The long and incremental evolution of the media industry, from a traditional broadcast and home video model, to a more mixed model with increasingly digitally-accessible content, has accelerated the use of machine learning and artificial intelligence (AI). Advancing the implementation of these technologies is critical for a company like Disney that has produced nearly a century of content, as it allows for new consumer experiences and enables new applications for illustrators and writers to c...

    +
    + + Read More + +
    + +
    +
    +

    June 16, 2020

    +

    + How Trigo built a scalable AI development & deployment pipeline for Frictionless Retail +

    + +

    Trigo is a provider of AI & computer vision based checkout-free systems for the retail market, enabling frictionless checkout and a range of other in-store operational and marketing solutions such as predictive inventory management, security and fraud prevention, pricing optimization and event-driven marketing. +

    +
    + + Read More + +
    + +
    +
    +

    June 09, 2020

    +

    + How Datarock is using PyTorch for more intelligent mining decision making +

    + +

    The mining industry is currently going through a digital revolution as it looks for new and innovative ways to explore and extract mineral resources. This has largely been driven by a need to reduce costs in a competitive global industry that’s experiencing declining ore grades and fewer new discoveries. +

    +
    + + Read More + +
    + +
    +
    +

    April 25, 2020

    +

+ Deploying huggingface’s BERT to production with pytorch/serve +

    + +

TL;DR: pytorch/serve is an awesome new framework for serving torch models in production. This story teaches you how to use it for huggingface/transformers models like BERT. +

    +
    + + Read More + +
    + +
    +
    +

    November 14, 2019

    +

    + Using deep learning and PyTorch to power next gen aircraft at Caltech +

    + +

Learn how Caltech’s Center for Autonomous Systems and Technologies (CAST) uses PyTorch to build deep learning systems that can understand the aerodynamics of how aircraft interact with the ground to enable much smoother and safer landings. +

    +
    + + Read More + +
    + +
    +
    +

    November 06, 2019

    +

    + PyTorch at Dolby Labs +

    + +

    Hear how Dolby Labs is using PyTorch to develop deep learning for audio, and learn about the challenges that audio AI presents and the breakthroughs and applications they’ve built at Dolby to push the field forward. +

    +
    + + Read More + +
    + +
    +
    +

    August 20, 2019

    +

    + Dialogue Assistance for Customer Service at Airbnb +

    + +

    Businesses are using PyTorch, an open source machine learning framework, to seamlessly build, train, and deploy AI models in production across their products and services. Hear how industry leaders leverage PyTorch to help power everything from ubiquitous productivity software used across the world to enabling advances in medicine for fighting cancer. +

    +
    + + Read More + +
    + +
    +
    +

    July 23, 2019

    +

    + Mapillary Research: Seamless Scene Segmentation and In-Place Activated BatchNorm +

    + +

    With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry. +

    +
    + + Read More + +
    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + diff --git a/community-stories/1/index.html b/community-stories/1/index.html new file mode 100644 index 000000000000..cfe97e93ab4c --- /dev/null +++ b/community-stories/1/index.html @@ -0,0 +1 @@ +

    At Outreach, a leading sales engagement platform, our data science team is a driving force behind our innovative product portfolio largely driven by deep learning and AI. We recently announced enhancements to the Outreach Insights feature, which is powered by the proprietary Buyer Sentiment deep learning model developed by the Outreach Data Science team. This model allows sales teams to deepen their understanding of customer sentiment through the analysis of email reply content, moving from just counting the reply rate to classification of the replier’s intent.

    diff --git a/community-stories/10/index.html b/community-stories/10/index.html new file mode 100644 index 000000000000..a9c0fc3c1270 --- /dev/null +++ b/community-stories/10/index.html @@ -0,0 +1 @@ +

    Solliance delivers cutting-edge solutions that fill gaps across a wide variety of industries. Through its recent collaboration with Baseline, Solliance revolutionizes the cryptocurrency trading experience, extracting news insights from more than 150,000 global sources in near real time. To manage Baseline workloads, Solliance brought Microsoft Azure Machine Learning and PyTorch together for maximum processing power and deep learning capabilities. The result: investors can get under the headlines and see which specific news metrics are moving the volatile crypto market to make more informed trading decisions, while Baseline can release new features in weeks instead of months.

    diff --git a/community-stories/11/index.html b/community-stories/11/index.html new file mode 100644 index 000000000000..728a64d17b29 --- /dev/null +++ b/community-stories/11/index.html @@ -0,0 +1 @@ +

    In this tutorial, we’ll build a simple machine learning pipeline using a BERT word embedding model and the Nearest Neighbor algorithm to recommend wines based on user inputted preferences. To create and power this recommendation engine, we’ll leverage AWS’s SageMaker platform, which provides a fully managed way for us to train and deploy our service.

    diff --git a/community-stories/12/index.html b/community-stories/12/index.html new file mode 100644 index 000000000000..20e17bac23cf --- /dev/null +++ b/community-stories/12/index.html @@ -0,0 +1 @@ +

    Healthcare providers need to be able to verify that they’re maintaining the highest operating safety and efficacy standards. Those standards are set by a national accreditation organization whose surveyors, often healthcare professionals themselves, regularly visit facilities and document situations that might need to be corrected or brought back in line with the latest rules and policies. That assessment and accreditation process generates a huge amount of data, and even the most experienced surveyors struggle to keep ahead of the ongoing development of thousands of policy rules that might be relevant in any particular scenario. Vaagan and his team took on the task of fixing the issue by building a machine learning solution that could ingest text from those reports and return a top ten list of the latest associated rules with unprecedented accuracy. They used Azure technology, development tools, and services to bring that solution to fruition. Crayon customers report clear time savings with the new healthcare solution. Just as important, the solution provides consistent responses that aren’t subject to the vagaries of individual interpretation or potentially out-of-date data.

    diff --git a/community-stories/13/index.html b/community-stories/13/index.html new file mode 100644 index 000000000000..19261c8f2a75 --- /dev/null +++ b/community-stories/13/index.html @@ -0,0 +1 @@ +

    Sensitive information such as healthcare data is often siloed within health organization boundaries. This has posed a challenge to machine learning models used by the health and life sciences industry that require data for training purposes. To improve patient care and accelerate health industry progression, the Microsoft Health & Life Sciences AI group used a federated learning setup to train their biomedical natural language processing service, Text Analytics for Health, while preserving the trust boundaries of siloed data. The federated learning framework was built using Microsoft Azure Machine Learning and open-source technologies to help organizations analyze siloed data and build new applications without compromising data privacy.

    diff --git a/community-stories/14/index.html b/community-stories/14/index.html new file mode 100644 index 000000000000..88bca8c17c55 --- /dev/null +++ b/community-stories/14/index.html @@ -0,0 +1 @@ +

The Taiwan-based neurotechnology startup used tools and frameworks in the Intel® oneAPI Base and AI Analytics Toolkits to improve the efficiency and build times of deep-learning models used in its Brain Waves AI system. As a result, HippoScreen is able to broaden the system’s applications to a wider range of psychiatric conditions and diseases.

    diff --git a/community-stories/16/index.html b/community-stories/16/index.html new file mode 100644 index 000000000000..1430e12feb24 --- /dev/null +++ b/community-stories/16/index.html @@ -0,0 +1 @@ +

    Miquel Farré is a senior technology manager at Disney, taking the lead on projects at the intersection of video technology, machine learning and web applications. Metadata that drives content searchability is most often indexed at the title level, with limited governance and high ambiguity; at best, keyword metadata has been added to a title as a layer of enrichment.

    diff --git a/community-stories/17/index.html b/community-stories/17/index.html new file mode 100644 index 000000000000..57e31ab4d9f3 --- /dev/null +++ b/community-stories/17/index.html @@ -0,0 +1 @@ +

    The long and incremental evolution of the media industry, from a traditional broadcast and home video model, to a more mixed model with increasingly digitally-accessible content, has accelerated the use of machine learning and artificial intelligence (AI). Advancing the implementation of these technologies is critical for a company like Disney that has produced nearly a century of content, as it allows for new consumer experiences and enables new applications for illustrators and writers to create the highest-quality content.

    diff --git a/community-stories/18/index.html b/community-stories/18/index.html new file mode 100644 index 000000000000..892a30dd97e3 --- /dev/null +++ b/community-stories/18/index.html @@ -0,0 +1 @@ +

    In this blog series, our aim is to highlight the nuances of Machine Learning in Tubi’s Ad-based Video on Demand (AVOD) space as practiced at Tubi. Machine Learning helps solve myriad problems involving recommendations, content understanding and ads. We extensively use PyTorch for several of these use cases as it provides us the flexibility, computational speed and ease of implementation to train large scale deep neural networks using GPUs.

    diff --git a/community-stories/19/index.html b/community-stories/19/index.html new file mode 100644 index 000000000000..fee7d95008c0 --- /dev/null +++ b/community-stories/19/index.html @@ -0,0 +1 @@ +

    As digital animators continue to push the boundaries of technology and creativity, the technical teams that support them are turning to artificial intelligence and machine learning to deliver the tools they need. That’s the case at Pixar, where the company has made new machine learning breakthroughs it hopes will both improve quality and reduce costs.

    diff --git a/community-stories/2/index.html b/community-stories/2/index.html new file mode 100644 index 000000000000..27b9b1224b6f --- /dev/null +++ b/community-stories/2/index.html @@ -0,0 +1 @@ +

    Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out. Amazon Ads helps companies build their brand and connect with shoppers through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses and brands of all sizes, including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies can upload their own ad creatives, which can include images, video, audio, and, of course, products sold on Amazon.

    diff --git a/community-stories/20/index.html b/community-stories/20/index.html new file mode 100644 index 000000000000..88ad90797b78 --- /dev/null +++ b/community-stories/20/index.html @@ -0,0 +1 @@ +

    In this tech blog, we will compare the speed and cost of Inferentia, GPU, and CPU for a BERT sequence labeling example. We also provide a helpful tutorial on the steps for model compilation and inference on Inf1 instances.

    diff --git a/community-stories/21/index.html b/community-stories/21/index.html new file mode 100644 index 000000000000..7d51b399621a --- /dev/null +++ b/community-stories/21/index.html @@ -0,0 +1 @@ +

    Complete and accurate clinical documentation is an essential tool for tracking patient care. It allows for treatment plans to be shared among care teams to aid in continuity of care and ensures a transparent and effective process for reimbursement.

    diff --git a/community-stories/22/index.html b/community-stories/22/index.html new file mode 100644 index 000000000000..28e02017dd25 --- /dev/null +++ b/community-stories/22/index.html @@ -0,0 +1 @@ +

    Since it launched in 2017, Facebook’s machine-learning framework PyTorch has been put to good use, with applications ranging from powering Elon Musk’s autonomous cars to driving robot-farming projects. Now pharmaceutical firm AstraZeneca has revealed how its in-house team of engineers are tapping PyTorch too, and for equally as important endeavors: to simplify and speed up drug discovery.

    diff --git a/community-stories/23/index.html b/community-stories/23/index.html new file mode 100644 index 000000000000..c5a733be0486 --- /dev/null +++ b/community-stories/23/index.html @@ -0,0 +1 @@ +

TL;DR: pytorch/serve is an awesome new framework for serving torch models in production. This story teaches you how to use it for huggingface/transformers models like BERT.

    diff --git a/community-stories/24/index.html b/community-stories/24/index.html new file mode 100644 index 000000000000..93c138da5885 --- /dev/null +++ b/community-stories/24/index.html @@ -0,0 +1 @@ +

    1 in 4 dogs, and 1 in 5 cats, will develop cancer at some point in their lives. Pets today have a better chance of being successfully treated than ever, thanks to advances in early recognition, diagnosis and treatment.

    diff --git a/community-stories/25/index.html b/community-stories/25/index.html new file mode 100644 index 000000000000..0e6e8f14e830 --- /dev/null +++ b/community-stories/25/index.html @@ -0,0 +1 @@ +

    Performing surgery is largely about decision making. As Dr. Frank Spencer put it in 1978, “A skillfully performed operation is about 75% decision making and 25% dexterity”. Five decades later, and the surgical field is finally — albeit gradually — implementing advances in data science and AI to enhance surgeons’ ability to make the best decisions in the operating room. That’s where theator comes in: the company is re-imagining surgery with a Surgical Intelligence platform that leverages highly advanced AI, specifically machine learning and computer vision technology, to analyze every step, event, milestone, and critical junction of surgical procedures — significantly boosting surgeons’ overall performance.

    diff --git a/community-stories/26/index.html b/community-stories/26/index.html new file mode 100644 index 000000000000..db073271a4d6 --- /dev/null +++ b/community-stories/26/index.html @@ -0,0 +1 @@ +

    Whatever our job title happens to be at AstraZeneca, we’re seekers. I’m part of the Biological Insights Knowledge Graph (BIKG) team. We help scientists comb through massive amounts of data in our quest to find the information we need to help us deliver life-changing medicines.

    diff --git a/community-stories/27/index.html b/community-stories/27/index.html new file mode 100644 index 000000000000..1a498999036d --- /dev/null +++ b/community-stories/27/index.html @@ -0,0 +1 @@ +

    For many surgeons, the possibility of going back into the operating room to review the actions they carried out on a patient could provide invaluable medical insights.

    diff --git a/community-stories/28/index.html b/community-stories/28/index.html new file mode 100644 index 000000000000..0a0738494ff2 --- /dev/null +++ b/community-stories/28/index.html @@ -0,0 +1 @@ +

    In drug discovery, understanding the 3D structure of proteins is key to assessing the ability of a drug to bind to it, directly impacting its efficacy. Predicting the 3D protein form, however, is very complex, challenging, expensive, and time consuming, and can take years when using traditional methods such as X-ray diffraction. Applying machine learning (ML) to predict these structures can significantly accelerate the time to predict protein structures—from years to hours. Several high-profile research teams have released algorithms such as AlphaFold2 (AF2), RoseTTAFold, and others. These algorithms were recognized by Science magazine as the 2021 Breakthrough of the Year.

    diff --git a/community-stories/29/index.html b/community-stories/29/index.html new file mode 100644 index 000000000000..d6abcbf4e966 --- /dev/null +++ b/community-stories/29/index.html @@ -0,0 +1 @@ +

    Knowing the physical structure of proteins is an important part of the drug discovery process. Machine learning (ML) algorithms like AlphaFold v2.0 significantly reduce the cost and time needed to generate usable protein structures. These projects have also inspired development of AI-driven workflows for de novo protein design and protein-ligand interaction analysis.

    diff --git a/community-stories/3/index.html b/community-stories/3/index.html new file mode 100644 index 000000000000..8b03718d850f --- /dev/null +++ b/community-stories/3/index.html @@ -0,0 +1,3 @@ +

    NASA and IBM are working together to create foundation models based on NASA’s data sets — including geospatial data — with the goal of accelerating the creation of AI models.

    + +

Foundation models are trained on large, broad datasets, then used to train other AI models by using targeted and smaller datasets. Foundation models can be used for different tasks and can apply information about one situation to another. One real-world example of a foundation model at work is ChatGPT, which was built on the foundation model GPT-3.

    diff --git a/community-stories/30/index.html b/community-stories/30/index.html new file mode 100644 index 000000000000..54bc97d7e118 --- /dev/null +++ b/community-stories/30/index.html @@ -0,0 +1 @@ +

    The mining industry is currently going through a digital revolution as it looks for new and innovative ways to explore and extract mineral resources. This has largely been driven by a need to reduce costs in a competitive global industry that’s experiencing declining ore grades and fewer new discoveries.

    diff --git a/community-stories/32/index.html b/community-stories/32/index.html new file mode 100644 index 000000000000..8f2902f2a44a --- /dev/null +++ b/community-stories/32/index.html @@ -0,0 +1 @@ +

    Trigo is a provider of AI & computer vision based checkout-free systems for the retail market, enabling frictionless checkout and a range of other in-store operational and marketing solutions such as predictive inventory management, security and fraud prevention, pricing optimization and event-driven marketing.

    diff --git a/community-stories/33/index.html b/community-stories/33/index.html new file mode 100644 index 000000000000..189445f7c4b1 --- /dev/null +++ b/community-stories/33/index.html @@ -0,0 +1 @@ +

    Personalization is ubiquitous on most platforms today. Supercharged by connectivity, and scaled by machine learning, most experiences on the internet are tailored to our personal tastes. Peloton classes offer a diversity of instructors, languages, fitness disciplines, durations and intensity. Each Member has specific fitness goals, schedule, fitness equipment, and level of skill or strength. This diversity of content and individuality of Member needs at massive scale creates the opportunity for a recommender system to create a personalized experience on the Peloton platform.

    diff --git a/community-stories/34/index.html b/community-stories/34/index.html new file mode 100644 index 000000000000..8b4c0b6ee44b --- /dev/null +++ b/community-stories/34/index.html @@ -0,0 +1 @@ +

    Wehkamp is one of the biggest e-commerce companies in the Netherlands, with more than 500,000 daily visitors on their website. A wide variety of products offered on the Wehkamp site aims to meet its customers’ many needs. An important aspect of any customer visit on an e-commerce website is a qualitative and accurate visual experience of the products. At a large scale, this is no easy task, with thousands of product photos processed in a local photo studio.

    diff --git a/community-stories/35/index.html b/community-stories/35/index.html new file mode 100644 index 000000000000..c9c9dc75f45e --- /dev/null +++ b/community-stories/35/index.html @@ -0,0 +1,2 @@ +

    Walmart Search has embarked on the journey of adopting Deep Learning in the search ecosystem to improve search relevance. For our pilot use case, we served the computationally intensive Bert Base model at runtime with an objective to achieve low latency and high throughput.

    + diff --git a/community-stories/36/index.html b/community-stories/36/index.html new file mode 100644 index 000000000000..e00333fdcdb5 --- /dev/null +++ b/community-stories/36/index.html @@ -0,0 +1,2 @@ +

Autodesk is a multinational software company with world-renowned products in areas such as Architecture, Engineering, & Construction, Manufacturing, and Media & Entertainment. Amongst Autodesk’s best-known products are AutoCAD, Revit, Maya, and Fusion 360. The company has millions of customers around the world, and many of them need support to make the best use of its products.

    + diff --git a/community-stories/37/index.html b/community-stories/37/index.html new file mode 100644 index 000000000000..dd9bfb7823e9 --- /dev/null +++ b/community-stories/37/index.html @@ -0,0 +1 @@ +

    Software innovator Bentley Systems offers a broad portfolio of solutions to help the organizations that design, build, and operate the world’s infrastructure assets. The company uses machine learning in its flagship product to read disparate paper-based asset data and transform it into consolidated digital data. To speed up and formalize this process, Bentley created a machine learning operations framework using Microsoft Azure Machine Learning and PyTorch. Developers’ speed and job satisfaction have shot up since they began using this stable, reproducible framework, which easily gets their code into the cloud, accelerating delivery by three to five times and significantly increasing efficiency.

    diff --git a/community-stories/38/index.html b/community-stories/38/index.html new file mode 100644 index 000000000000..3ff272c45bf4 --- /dev/null +++ b/community-stories/38/index.html @@ -0,0 +1 @@ +

    Join us for an interview with star PyTorch community members Alexander O’Connor and Binghui Ouyang from AutoDesk as we learn how they used PyTorch and AWS Inferentia to deploy production-scale models in chatbot intent classification.

    diff --git a/community-stories/39/index.html b/community-stories/39/index.html new file mode 100644 index 000000000000..7422af52d2c6 --- /dev/null +++ b/community-stories/39/index.html @@ -0,0 +1 @@ +

    Many of the experiences people enjoy on Facebook and Instagram are powered by artificial intelligence (AI). A number of them, like Assistant, Avatars, and AR effects, cannot be powered by server-side AI due to latency, network bandwidth, and other constraints. Running AI on-device —that is, directly on a phone, tablet, or even a pair of smart glasses — offers huge advantages over constantly sending data back to a server. It’s faster, and it creates a privacy-enhancing experience for people who use our platforms. However, on-device AI presents new challenges, since it requires coping with devices that have a small battery, far less powerful processors, and less memory than a server in a data center.

    diff --git a/community-stories/4/index.html b/community-stories/4/index.html new file mode 100644 index 000000000000..c3601e6d5dec --- /dev/null +++ b/community-stories/4/index.html @@ -0,0 +1,2 @@ +

    How did farming affect your day today? If you live in a city, you might feel disconnected from the farms and fields that produce your food. Agriculture is a core piece of our lives, but we often take it for granted.

    + diff --git a/community-stories/40/index.html b/community-stories/40/index.html new file mode 100644 index 000000000000..509a7cad16f2 --- /dev/null +++ b/community-stories/40/index.html @@ -0,0 +1 @@ +

    Axon, a technology leader in public safety, developed AI technology to add cutting-edge license plate recognition capabilities to its in-car camera products, which now can identify plates for vehicles of interest and provide law enforcement with proactive notifications and alerts. Axon AI scientists and engineers chose Microsoft Azure infrastructure as a scalable, cost-efficient, and feature-rich environment where they can develop and test AI models. With Azure compute, storage, and PyTorch and machine learning resources, Axon can easily take advantage of the latest software and hardware technology to develop best-in-class AI solutions for its customers.

    diff --git a/community-stories/41/index.html b/community-stories/41/index.html new file mode 100644 index 000000000000..f886e1e59c98 --- /dev/null +++ b/community-stories/41/index.html @@ -0,0 +1 @@ +

Here, we share our experience moving AI workloads from our GPU servers to our Intel CPU servers without any performance or quality degradation, saving approximately 340 thousand U.S. dollars in annual costs (see the Conclusion) in the process.

    diff --git a/community-stories/42/index.html b/community-stories/42/index.html new file mode 100644 index 000000000000..f8b8c8804732 --- /dev/null +++ b/community-stories/42/index.html @@ -0,0 +1 @@ +

    Businesses are using PyTorch, an open source machine learning framework, to seamlessly build, train, and deploy AI models in production across their products and services. Hear how industry leaders leverage PyTorch to help power everything from ubiquitous productivity software used across the world to enabling advances in medicine for fighting cancer.

    diff --git a/community-stories/43/index.html b/community-stories/43/index.html new file mode 100644 index 000000000000..8b1873748546 --- /dev/null +++ b/community-stories/43/index.html @@ -0,0 +1 @@ +

Learn how Caltech’s Center for Autonomous Systems and Technologies (CAST) uses PyTorch to build deep learning systems that can understand the aerodynamics of how aircraft interact with the ground to enable much smoother and safer landings.

    diff --git a/community-stories/44/index.html b/community-stories/44/index.html new file mode 100644 index 000000000000..2f5c7fedcdb4 --- /dev/null +++ b/community-stories/44/index.html @@ -0,0 +1 @@ +

    At deepset, we’re building the next-level search engine for business documents. Our core product, Haystack, is an open-source framework that enables developers to utilize the latest NLP models for semantic search and question answering at scale. Our software as a service (SaaS) platform, Haystack Hub, is used by developers from various industries, including finance, legal, and automotive, to find answers in all kinds of text documents. You can use these answers to improve the search experience, cover the long-tail of chat bot queries, extract structured data from documents, or automate invoicing processes.

    diff --git a/community-stories/45/index.html b/community-stories/45/index.html new file mode 100644 index 000000000000..c6edca44f515 --- /dev/null +++ b/community-stories/45/index.html @@ -0,0 +1 @@ +

    Hear how Dolby Labs is using PyTorch to develop deep learning for audio, and learn about the challenges that audio AI presents and the breakthroughs and applications they’ve built at Dolby to push the field forward.

    diff --git a/community-stories/46/index.html b/community-stories/46/index.html new file mode 100644 index 000000000000..9047ef314c73 --- /dev/null +++ b/community-stories/46/index.html @@ -0,0 +1 @@ +

    Grapheme to Phoneme (G2P) is a function that generates pronunciations (phonemes) for words based on their written form (graphemes). It has an important role in automatic speech recognition systems, natural language processing, and text-to-speech engines. In Cisco’s Webex Assistant, we use G2P modelling to assist in resolving person names from voice. See here for further details of various techniques we use to build robust voice assistants.
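    As a toy illustration of the task’s input and output (not Cisco’s system, which relies on neural sequence models), the sketch below looks up phonemes for a word in a tiny hand-written lexicon using ARPAbet-style symbols; the lexicon entries are assumptions for illustration only.

    # Toy grapheme-to-phoneme lookup: real G2P systems replace the lexicon
    # with a neural sequence model that generalizes to unseen words.
    LEXICON = {
        "torch": ["T", "AO1", "R", "CH"],
        "assistant": ["AH0", "S", "IH1", "S", "T", "AH0", "N", "T"],
    }

    def g2p(word: str) -> list[str]:
        phonemes = LEXICON.get(word.lower())
        if phonemes is None:
            raise KeyError(f"'{word}' not in lexicon; a neural G2P model would predict it")
        return phonemes

    print(g2p("assistant"))   # ['AH0', 'S', 'IH1', 'S', 'T', 'AH0', 'N', 'T']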

    diff --git a/community-stories/47/index.html b/community-stories/47/index.html new file mode 100644 index 000000000000..e86fbad663b2 --- /dev/null +++ b/community-stories/47/index.html @@ -0,0 +1 @@ +

    AI21 Labs uses machine learning to develop language models focused on understanding meaning, and in 2021 it set a goal to train the recently released Jurassic-1 Jumbo, an autoregressive language model with 178 billion parameters. Developers who register for beta testing will get access to Jurassic-1 Jumbo and can immediately start to customize the model for their use case. The software startup wanted to train the model efficiently, so it looked to Amazon Web Services (AWS) and built a solution using Amazon Elastic Compute Cloud (Amazon EC2), a web service that provides secure, resizable compute capacity in the cloud. Choosing Amazon EC2 gave the company control over the training process, including node allocation.

    diff --git a/community-stories/48/index.html b/community-stories/48/index.html new file mode 100644 index 000000000000..352973490e86 --- /dev/null +++ b/community-stories/48/index.html @@ -0,0 +1 @@ +

    Anthropic is an AI safety and research company that’s working to build reliable, interpretable, and steerable AI systems. Over the past decade, the amount of compute used for the largest training runs has increased at an exponential pace. We’ve also seen in many domains that larger models are able to attain better performance following precise scaling laws. The compute needed to train these models can only be attained using many coordinated machines that are communicating data between them. In this talk, Nicholas Joseph (Technical Staff, Anthropic) goes through why and how they can scale up training runs to use these machines efficiently.

    diff --git a/community-stories/49/index.html b/community-stories/49/index.html new file mode 100644 index 000000000000..fea2e24a5a05 --- /dev/null +++ b/community-stories/49/index.html @@ -0,0 +1 @@ +

    Everyone prefers to use their mother tongue when communicating with chat agents and other automated services. However, for languages like Hungarian, spoken by only 15 million people, the market is often viewed as too small for large companies to create software, tools, or applications that can process Hungarian text as input. Recognizing this need, the Applied Data Science and Artificial Intelligence team from the University of Pécs decided to step up. Using Microsoft AI and ONNX Runtime solutions, it built and trained its own BERT-large model in native Hungarian in under 200 hours, at a total build cost of 1,000 euros.

    diff --git a/community-stories/5/index.html b/community-stories/5/index.html new file mode 100644 index 000000000000..0d0f5135f201 --- /dev/null +++ b/community-stories/5/index.html @@ -0,0 +1 @@ +

    In this webinar, Bob Chesebrough of Intel guides you through the steps he took to create a clipped image with background clutter removed from the image. He accomplished this using monocular depth estimation with PyTorch. This could potentially be used to automate structure from motion and other image-related tasks where you want to highlight or focus on a single portion of an image, particularly for identifying parts of the image that were closest to the camera. Specifically, he used depth estimation on a couple of images that he took at a natural history museum to capture just the dinosaur in the foreground, eliminating the background murals, lights, and building structure. The cool thing about this algorithm is that it creates a depth estimate from a single image!
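    As a rough sketch of the approach described above (not the presenter’s actual code), the snippet below loads the publicly available MiDaS small model from torch.hub, estimates inverse depth for a single photo, and keeps only the pixels closest to the camera. The file names and the 30% closeness threshold are assumptions for illustration.

    import torch
    import numpy as np
    from PIL import Image

    # Load a small monocular depth model and its preprocessing transforms
    # from torch.hub (assumes the intel-isl/MiDaS hub entry is available).
    midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
    midas.eval()
    transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
    transform = transforms.small_transform

    img = np.array(Image.open("dinosaur.jpg").convert("RGB"))   # hypothetical input photo
    batch = transform(img)

    with torch.no_grad():
        depth = midas(batch)
        # Resize the prediction back to the original image resolution.
        depth = torch.nn.functional.interpolate(
            depth.unsqueeze(1), size=img.shape[:2], mode="bicubic", align_corners=False
        ).squeeze().numpy()

    # MiDaS outputs inverse depth, so larger values mean closer to the camera.
    # Keep roughly the closest 30% of the scene and black out the rest.
    threshold = np.quantile(depth, 0.70)
    mask = depth >= threshold
    clipped = (img * mask[..., None]).astype(np.uint8)
    Image.fromarray(clipped).save("dinosaur_clipped.png")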

    diff --git a/community-stories/50/index.html b/community-stories/50/index.html new file mode 100644 index 000000000000..57fd76786bb4 --- /dev/null +++ b/community-stories/50/index.html @@ -0,0 +1 @@ +

    With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry.

    diff --git a/community-stories/51/index.html b/community-stories/51/index.html new file mode 100644 index 000000000000..dfd47f994995 --- /dev/null +++ b/community-stories/51/index.html @@ -0,0 +1 @@ +

    Deep Learning development is becoming more and more about minimizing the time from idea to trained model. To shorten this lead time, researchers need access to a training environment that supports running multiple experiments concurrently, each utilizing several GPUs.

    diff --git a/community-stories/52/index.html b/community-stories/52/index.html new file mode 100644 index 000000000000..25a9447da531 --- /dev/null +++ b/community-stories/52/index.html @@ -0,0 +1 @@ +

    Pinterest surfaces billions of ideas to people every day, and the neural modeling of embeddings for content, users, and search queries is key to the constant improvement of these machine learning-powered recommendations. Good embeddings — representations of discrete entities as vectors of numbers — enable fast candidate generation and provide strong signals to models that classify, retrieve, and rank relevant content.
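    As a generic illustration of the idea (not Pinterest’s production models), the sketch below represents discrete item IDs as vectors with torch.nn.Embedding and ranks candidates by cosine similarity; the catalogue size, embedding width, and query ID are made-up values.

    import torch
    import torch.nn.functional as F

    num_items, dim = 10_000, 64                     # hypothetical catalogue size and embedding width
    item_embeddings = torch.nn.Embedding(num_items, dim)

    # Embed a query item and the full catalogue, then rank by cosine similarity.
    query = item_embeddings(torch.tensor([42]))     # shape (1, dim)
    catalogue = item_embeddings.weight              # shape (num_items, dim)
    scores = F.cosine_similarity(query, catalogue)  # shape (num_items,)
    top_candidates = scores.topk(10).indices        # the 10 closest items (the query itself ranks first)
    print(top_candidates)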

    diff --git a/community-stories/53/index.html b/community-stories/53/index.html new file mode 100644 index 000000000000..243c15d06967 --- /dev/null +++ b/community-stories/53/index.html @@ -0,0 +1 @@ +

    The field of AI is in the middle of a revolution. In recent years, AI models have made images, songs, or even websites out of simple text prompts. These models with billions of parameters, called foundation models, can be repurposed from one task to another with little fine-tuning, removing the countless hours of training, labelling, and refitting otherwise needed to adapt a model to a new task.

    diff --git a/community-stories/54/index.html b/community-stories/54/index.html new file mode 100644 index 000000000000..632fee728703 --- /dev/null +++ b/community-stories/54/index.html @@ -0,0 +1 @@ +

    In this paper, we introduce ChemicalX, a PyTorch-based deep learning library designed to provide a range of state-of-the-art models for the drug pair scoring task. The primary objective of the library is to make deep drug pair scoring models accessible to machine learning researchers and practitioners in a streamlined manner. The design of ChemicalX reuses existing high-level model training utilities, geometric deep learning, and deep chemistry layers from the PyTorch ecosystem. Our system provides neural network layers, custom pair scoring architectures, data loaders, and batch iterators for end users. We showcase these features with example code snippets and case studies to highlight the characteristics of ChemicalX. A range of experiments on real-world drug-drug interaction, polypharmacy side effect, and combination synergy prediction tasks demonstrate that the models available in ChemicalX are effective at solving the pair scoring task. Finally, we show that ChemicalX can be used to train and score machine learning models on large drug pair datasets with hundreds of thousands of compounds on commodity hardware.

    diff --git a/community-stories/55/index.html b/community-stories/55/index.html new file mode 100644 index 000000000000..caa6849eb657 --- /dev/null +++ b/community-stories/55/index.html @@ -0,0 +1 @@ +

    In this talk, scientist Lindsey Gray and Ph.D. student Matthias Fey co-examine how the challenges of High Energy Particle Physics are driving the need for more efficient research and development pipelines in neural network development. In particular, they look at the additions made to PyTorch Geometric, which allow Graph Neural Network models to be compiled by the PyTorch JIT, significantly easing the process of deploying such networks at scale.

    diff --git a/community-stories/56/index.html b/community-stories/56/index.html new file mode 100644 index 000000000000..26bae87a28eb --- /dev/null +++ b/community-stories/56/index.html @@ -0,0 +1,2 @@ +

    Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads.

    + diff --git a/community-stories/57/index.html b/community-stories/57/index.html new file mode 100644 index 000000000000..4940f1df3660 --- /dev/null +++ b/community-stories/57/index.html @@ -0,0 +1 @@ +

    Geospatial computer vision is essential for understanding our planet — from monitoring deforestation to tracking urban development and analyzing the impacts of climate change. However, the coding and deep learning skills required to apply AI models to satellite imagery and earth observation data have traditionally been a major barrier for many practitioners.

    diff --git a/community-stories/6/index.html b/community-stories/6/index.html new file mode 100644 index 000000000000..b0658a98dffb --- /dev/null +++ b/community-stories/6/index.html @@ -0,0 +1 @@ +

    Cotton is a major fibre crop, cultivated in over 80 countries, with nearly 100 million families across the world relying on cotton farming for their livelihood. Given how much farmers depend on the crop, cotton’s particular vulnerability to pest infestations is especially troubling. Yet pest infestation is also one of the most significant and most preventable problems farmers face: 55% of all pesticide usage in India is devoted to cotton farming.

    diff --git a/community-stories/7/index.html b/community-stories/7/index.html new file mode 100644 index 000000000000..7c23debd8c62 --- /dev/null +++ b/community-stories/7/index.html @@ -0,0 +1 @@ +

    Lyft’s mission is to improve people’s lives with the world’s best transportation. We believe in a future where self-driving cars make transportation safer and more accessible for everyone. That’s why Level 5, Lyft’s self-driving division, is developing a complete autonomous system for the Lyft network to give riders access to the benefits of this technology. However, this is an incredibly complex task.

    diff --git a/community-stories/8/index.html b/community-stories/8/index.html new file mode 100644 index 000000000000..425e24461aa0 --- /dev/null +++ b/community-stories/8/index.html @@ -0,0 +1 @@ +

    Wayve wants to accelerate and scale autonomous vehicle (AV) development by using vision-based machine learning for rapid prototyping and quick iteration. So, it developed a platform that uses the open-source machine learning framework PyTorch with Microsoft Azure Machine Learning to gather, manage, and process millions of hours of driving data per year—petabytes of data—consisting of images, GPS data, and data from other sensors. Wayve now has the scalable capacity to build and iterate driving models for complex urban environments, adjust models more nimbly, and adapt to new environments more readily.

    diff --git a/community-stories/9/index.html b/community-stories/9/index.html new file mode 100644 index 000000000000..0bed358bf0f5 --- /dev/null +++ b/community-stories/9/index.html @@ -0,0 +1,2 @@ +

    Learning a foreign language was probably one of your goals last year. And the year before, and the year before that. Like gym memberships, our best intentions often don’t survive very long. Aside from the time required to achieve proficiency with a new language, most people struggle with traditional approaches to learning. Even many web-based language tools can be monotonous and cumbersome.

    + diff --git a/community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.html b/community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.html new file mode 100644 index 000000000000..86fb96e996e9 --- /dev/null +++ b/community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.html @@ -0,0 +1 @@ +

    Struggling with quaternions, rotation vectors, right-hand rules, and all that? Try RoMa: an easy-to-use, stable, and efficient library for dealing with rotations and spatial transformations in PyTorch.
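    For readers unfamiliar with these conversions, here is a plain-PyTorch sketch of one of them, turning a batch of rotation vectors into rotation matrices via Rodrigues’ formula. It illustrates the kind of utility a library like RoMa packages up; it is not RoMa’s own API.

    import torch

    def rotvec_to_rotmat(rotvec: torch.Tensor) -> torch.Tensor:
        """Convert a batch of rotation vectors (..., 3) to rotation matrices (..., 3, 3)."""
        theta = rotvec.norm(dim=-1, keepdim=True).clamp_min(1e-8)   # rotation angle
        axis = rotvec / theta                                       # unit rotation axis
        x, y, z = axis.unbind(-1)
        zero = torch.zeros_like(x)
        # Skew-symmetric cross-product matrix K of the axis.
        K = torch.stack([zero, -z, y, z, zero, -x, -y, x, zero], dim=-1).reshape(*x.shape, 3, 3)
        eye = torch.eye(3, dtype=rotvec.dtype, device=rotvec.device).expand(*x.shape, 3, 3)
        sin, cos = theta.sin()[..., None], theta.cos()[..., None]
        # Rodrigues' formula: R = I + sin(theta) K + (1 - cos(theta)) K^2
        return eye + sin * K + (1.0 - cos) * (K @ K)

    R = rotvec_to_rotmat(torch.randn(5, 3))
    # Rotation matrices are orthonormal, so R @ R^T should be close to the identity.
    print(torch.allclose(R @ R.transpose(-1, -2), torch.eye(3).expand(5, 3, 3), atol=1e-4))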

    diff --git a/community_blog/bringing-the-pytorch-community-together.html b/community_blog/bringing-the-pytorch-community-together.html new file mode 100644 index 000000000000..3d8829afc1b3 --- /dev/null +++ b/community_blog/bringing-the-pytorch-community-together.html @@ -0,0 +1 @@ +

    As we step into a new year, it’s a great moment to reflect on the incredible community events that made 2024 a memorable year for the PyTorch Foundation. Global meetups, events, and conferences brought the community together to learn, connect, and grow. Here’s a quick recap of the year’s highlights and what to expect in 2025.

    diff --git a/community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.html b/community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.html new file mode 100644 index 000000000000..9f7e844ec19f --- /dev/null +++ b/community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.html @@ -0,0 +1 @@ +

    The most prominent distinction between LLaMA-1 and LLaMA-2 lies in the incorporation of higher-quality corpora, a pivotal factor contributing to significant performance enhancements in LLaMA-2. This, coupled with its commercial availability, extends the potential for creative applications of large models within the open-source community.

    diff --git a/community_blog/datathon-2025.html b/community_blog/datathon-2025.html new file mode 100644 index 000000000000..582d28a09270 --- /dev/null +++ b/community_blog/datathon-2025.html @@ -0,0 +1 @@ +

    We’re excited to have PyTorch sponsor Datathon 2025: DataOrbit, a place where students can collaborate with a team to solve problems using real-world datasets! This event, hosted by Data Science UCSB in collaboration with Gaucho Sports Analytics and ACM@UCSB, will take place on February 22–23, 2025 at UC Santa Barbara, with the incredible opportunity to present your project to a panel of corporate and faculty judges – including the executive director of PyTorch! – for a chance to win prizes of up to $3,000.

    diff --git a/community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.html b/community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.html new file mode 100644 index 000000000000..e42bac8a407d --- /dev/null +++ b/community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.html @@ -0,0 +1 @@ +

    Suppose you have a very large PyTorch model, and you’ve already tried many common tricks to speed up training: you optimized your code, you moved training to the cloud and selected a fast GPU VM, you installed software packages that improve training performance (for example, by using the ACPT curated environment on Azure ML). And yet, you still wish your model could train faster. Maybe it’s time to give distributed training a try! Continue reading to learn the simplest way to do distributed training with PyTorch and Azure ML.
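    For readers who have not tried it yet, the minimal sketch below shows the core PyTorch pieces such a setup builds on: a process group plus DistributedDataParallel. The Azure ML-specific job configuration is omitted, and the script is assumed to be launched with torchrun so the usual environment variables are set; the model and data here are stand-ins.

    import os
    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    def main():
        # torchrun sets RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT per process.
        dist.init_process_group(backend="nccl" if torch.cuda.is_available() else "gloo")
        local_rank = int(os.environ.get("LOCAL_RANK", 0))
        device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")

        model = torch.nn.Linear(32, 2).to(device)        # stand-in for a large model
        model = DDP(model, device_ids=[local_rank] if device.type == "cuda" else None)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

        for step in range(10):                           # stand-in for a real data loader
            x = torch.randn(64, 32, device=device)
            y = torch.randint(0, 2, (64,), device=device)
            loss = torch.nn.functional.cross_entropy(model(x), y)
            optimizer.zero_grad()
            loss.backward()                              # gradients are all-reduced across workers
            optimizer.step()

        dist.destroy_process_group()

    if __name__ == "__main__":
        main()   # e.g. torchrun --nproc_per_node=2 train.py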

    diff --git a/community_blog/doctr-joins-pytorch-ecosystem.html b/community_blog/doctr-joins-pytorch-ecosystem.html new file mode 100644 index 000000000000..25b5830a11a4 --- /dev/null +++ b/community_blog/doctr-joins-pytorch-ecosystem.html @@ -0,0 +1 @@ +

    We’re thrilled to announce that the docTR project has been integrated into the PyTorch ecosystem! This integration ensures that docTR aligns with PyTorch’s standards and practices, giving developers a reliable, community-backed solution for powerful OCR workflows.

    diff --git a/community_blog/enhancing-deep-learning.html b/community_blog/enhancing-deep-learning.html new file mode 100644 index 000000000000..5d90ad45ab17 --- /dev/null +++ b/community_blog/enhancing-deep-learning.html @@ -0,0 +1 @@ +

    Welcome to the thriving PyTorch ecosystem, where a wealth of tools and libraries await, purpose-built to elevate your experience in deep learning as a developer or researcher. The Ecosystem Tools pages host many projects from experts spanning academia, industry, application development, and machine learning.

    diff --git a/community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.html b/community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.html new file mode 100644 index 000000000000..0afbab8d074e --- /dev/null +++ b/community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.html @@ -0,0 +1 @@ +

    SciML, short for Scientific Machine Learning, encompasses work that merges quantitative sciences with machine learning. It has gained significant traction over the past decade, driven by the widespread availability of specialized hardware (such as GPUs and TPUs) and datasets. Additionally, it has been propelled by the overarching influence of the machine learning wave, now ingrained in the zeitgeist of our times. In this context, we’d like to introduce SimulAI, an open-source toolkit under the Apache 2.0 license. SimulAI is designed to be user-friendly, providing a high-level Python interface for managing scientific machine learning pipelines. This article aims to showcase its current workflow and utility in constructing scientific experiments. We encourage feedback and potential contributions from the interested community, with plans to delve into more advanced topics in future articles.

    diff --git a/community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.html b/community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.html new file mode 100644 index 000000000000..d06738e1ba72 --- /dev/null +++ b/community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.html @@ -0,0 +1 @@ +

    Activation checkpointing is a technique used for reducing the memory footprint at the cost of more compute. It utilizes the simple observation that we can avoid saving intermediate tensors necessary for backward computation if we just recompute them on demand instead.
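    A minimal sketch of that observation using torch.utils.checkpoint: activations inside the wrapped block are discarded after the forward pass and recomputed during backward, trading extra compute for a smaller memory footprint. The toy layer sizes are arbitrary.

    import torch
    from torch.utils.checkpoint import checkpoint

    block = torch.nn.Sequential(
        torch.nn.Linear(1024, 4096), torch.nn.ReLU(),
        torch.nn.Linear(4096, 1024), torch.nn.ReLU(),
    )
    head = torch.nn.Linear(1024, 10)

    x = torch.randn(256, 1024, requires_grad=True)

    # Intermediate activations inside `block` are not stored; they are
    # recomputed on demand when backward() reaches this segment.
    hidden = checkpoint(block, x, use_reentrant=False)
    loss = head(hidden).sum()
    loss.backward()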

    diff --git a/community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.html b/community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.html new file mode 100644 index 000000000000..635264cf35fb --- /dev/null +++ b/community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.html @@ -0,0 +1,2 @@ +

    Fashable is a company born at XNFY Lab (a joint initiative with Microsoft). The company’s main goal is to revolutionize the world of fashion with ethical Artificial Intelligence (AI) technologies built on the PyTorch framework. Fashable is focused on developing AI models that generate synthetic content for the global fashion industry. The fashion industry has been criticized in recent years because it generates a lot of waste and is responsible for up to 10% of global carbon dioxide output. Fashable has stepped up to address this issue by introducing multiple AI solutions that generate realistic, personalized consumer garments without actually producing them. This helps fashion brands make informed decisions without investing in experimental products, while reducing the industry’s carbon footprint globally. To solve these problems, Fashable’s proprietary models draw on modern approaches such as Generative Adversarial Networks (GANs), best-seller analysis, and custom dataset creation.

    + diff --git a/community_blog/introducing-depyf.html b/community_blog/introducing-depyf.html new file mode 100644 index 000000000000..8d56edceed89 --- /dev/null +++ b/community_blog/introducing-depyf.html @@ -0,0 +1 @@ +

    We are thrilled to introduce depyf, a new project to the PyTorch ecosystem designed to help users understand, learn, and adapt to torch.compile!

    diff --git a/community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.html b/community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.html new file mode 100644 index 000000000000..e239d2b0c617 --- /dev/null +++ b/community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.html @@ -0,0 +1 @@ +

    Explore TorchOpt, a PyTorch-based library that revolutionizes differentiable optimization with its unified programming abstraction, high-performance distributed execution runtime, and support for various differentiation modes.

    diff --git a/community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.html b/community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.html new file mode 100644 index 000000000000..1d4f1a9b5175 --- /dev/null +++ b/community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.html @@ -0,0 +1 @@ +

    As a new PyTorch Ecosystem Partner, we at HPC-AI Tech look forward to working with the PyTorch community to advance AI technologies through our open source project, Colossal-AI. We are excited to join forces with the PyTorch community in this effort.

    diff --git a/community_blog/mlops-workflow.html b/community_blog/mlops-workflow.html new file mode 100644 index 000000000000..1e9e964ef6a1 --- /dev/null +++ b/community_blog/mlops-workflow.html @@ -0,0 +1 @@ +

    PyTorch is one of the most widely used and most powerful deep learning frameworks for training and deploying complex neural networks. It has never been easier to train and deploy AI applications, and low-cost, high-performance, energy-efficient hardware, tools, and technology for creating optimized workflows are more accessible than ever. But data science, machine learning, and DevOps can be deep topics unto themselves, and it can be overwhelming for developers with one specialty to see how they all come together in the real world, or even to know where to get started.

    diff --git a/community_blog/optimize-llms.html b/community_blog/optimize-llms.html new file mode 100644 index 000000000000..b7eb36968f4d --- /dev/null +++ b/community_blog/optimize-llms.html @@ -0,0 +1 @@ +

    The rapid growth of large language model (LLM) applications is linked to rapid growth in energy demand. According to the International Energy Agency (IEA), data center electricity consumption is projected to roughly double by 2026, driven primarily by AI. This is due to the energy-intensive training requirements for massive LLMs, though the increase in AI inference workloads also plays a role. For example, compared with a traditional search query, a single AI inference can consume about 10x more energy.

    diff --git a/community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.html b/community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.html new file mode 100644 index 000000000000..d5a6630672d2 --- /dev/null +++ b/community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.html @@ -0,0 +1 @@ +

    The recent launch of PyTorch 2.0 makes it clear that the community is heavily investing in a compiler-powered future for machine learning. The new OctoML Profiler can help any user realize the full potential of these shifts in the ML landscape.

    diff --git a/community_blog/pt-fedora-os-communities.html b/community_blog/pt-fedora-os-communities.html new file mode 100644 index 000000000000..de170ab8cc3b --- /dev/null +++ b/community_blog/pt-fedora-os-communities.html @@ -0,0 +1,2 @@ +

    At DevConf.IN 2025 in Pune, I had the opportunity to host a PyTorch Meetup on February 28th. The session, titled “Powering AI with PyTorch, Fedora, and Open Source Communities” was aimed at introducing PyTorch to students and professionals, explaining why PyTorch+Fedora form an ideal AI development platform. The other key aspect I covered was collaboration between open source communities.

    + diff --git a/community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.html b/community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.html new file mode 100644 index 000000000000..7c05468c0c4a --- /dev/null +++ b/community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.html @@ -0,0 +1 @@ +

    We are excited to share our new open-source library PyPose. It is a PyTorch-based robotics-oriented library that provides a set of tools and algorithms for connecting deep learning with physics-based optimization.

    diff --git a/community_blog/pytorch-at-gtc.html b/community_blog/pytorch-at-gtc.html new file mode 100644 index 000000000000..3d1786d203a1 --- /dev/null +++ b/community_blog/pytorch-at-gtc.html @@ -0,0 +1 @@ +

    GTC is coming back to San Jose on March 17–21, 2025. Join PyTorch Foundation members Arm, AWS, Google Cloud, IBM, Lightning AI, Meta, Microsoft Azure, Snowflake, and thousands of developers as we celebrate PyTorch. Together, learn how AI and accelerated computing are helping humanity solve our most complex challenges.

    diff --git a/community_blog/pytorch-shanghai-notes.html b/community_blog/pytorch-shanghai-notes.html new file mode 100644 index 000000000000..a8e0b3ecef39 --- /dev/null +++ b/community_blog/pytorch-shanghai-notes.html @@ -0,0 +1,679 @@ + + + + + + + + + + + + + PyTorch Shanghai Meetup Notes | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    + +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + +
    +
    +

    September 08, 2024

    +

    + PyTorch Shanghai Meetup Notes +

    +
    +
    + +
    +
    +
    + +
    +

    + by + + Team PyTorch + +

    +

    We were honored to host the PyTorch Shanghai Meetup on August 15, 2024. The meetup received great attention from the industry. We invited senior PyTorch developers from Intel and Huawei as guest speakers, who shared their valuable experience and the latest technical trends. The event also attracted PyTorch enthusiasts from many technology companies and well-known universities. In total, more than 40 participants gathered to discuss and exchange the latest applications and technological advances of PyTorch.

    + +

    The meetup not only strengthened connections among PyTorch community members, but also gave local AI technology enthusiasts a platform to learn, communicate, and grow. We look forward to the next gathering and to continuing to promote the development of PyTorch technology in the region.

    + +

    1. PyTorch Foundation Updates

    + +

    man instructing students

    + +

    PyTorch Board member Fred Li shared the latest updates from the PyTorch community. He reviewed the community’s development history, explained in detail the growth path for community developers, encouraged everyone to delve deeper into the technology, and introduced the upcoming PyTorch Conference 2024.

    + +

    2. Intel’s Journey with PyTorch: Democratizing AI with Ubiquitous Hardware and Open Software

    + +

    PyTorch CPU module maintainer Jiong Gong presented Intel’s six years of technical contributions to PyTorch and its ecosystem, and explored the remarkable advancements Intel has made in both software and hardware to democratize AI, ensuring accessibility and optimizing performance across a diverse range of Intel hardware platforms.

    + +

    man instructing students

    + +

    3. Exploring Multi-Backend Support in PyTorch Ecosystem: A Case Study of Ascend

    + +

    man instructing students

    + +

    Fengchun Hua, a PyTorch contributor from Huawei, used the Huawei Ascend NPU as an example to demonstrate the latest achievements in multi-backend support for PyTorch applications. He introduced the hardware features of the Huawei Ascend NPU and the infrastructure of CANN (Compute Architecture for Neural Networks), and explained the key achievements and innovations in the native support work. He also shared the current challenges and the plan for future work.

    + +

    Yuanhao Ji, another PyTorch contributor from Huawei, then presented the Autoload Device Extension proposal, explained its implementation details and its value in improving the scalability of PyTorch, and shared the latest progress of the PyTorch Chinese community.

    + +

    4. Intel XPU Backend for Inductor

    + +

    man instructing students

    + +

    Eikan is a PyTorch contributor from Intel who focuses on the torch.compile stack for both Intel CPUs and GPUs. In this session, he presented Intel’s efforts on torch.compile for Intel GPUs, providing updates on the current status of Intel GPUs within PyTorch in terms of both functionality and performance. He also used the Intel GPU as a case study to demonstrate how to integrate a new backend into Inductor using Triton.
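    As context for readers new to this stack, the minimal example below shows the torch.compile entry point that Inductor sits behind; backends such as Intel’s GPU support plug in underneath this same API. The snippet is deliberately kept device-agnostic (CPU tensors) so it runs anywhere, and the toy function is an arbitrary example.

    import torch

    def layer(x: torch.Tensor) -> torch.Tensor:
        # A small pointwise chain that Inductor can fuse into a single kernel.
        return torch.nn.functional.gelu(x * 2.0 + 1.0)

    compiled_layer = torch.compile(layer)   # defaults to the Inductor backend

    x = torch.randn(1024, 1024)
    out = compiled_layer(x)                 # the first call triggers compilation
    print(torch.allclose(out, layer(x), atol=1e-5))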

    + +

    5. PyTorch PrivateUse1 Evolution: Approaches and Insights

    + +

    man instructing students

    + +

    Jiawei Li, a PyTorch collaborator from Huawei, introduced PyTorch’s Dispatch mechanism and emphasized the limitations of DispatchKey. Using the Huawei Ascend NPU as an example, he shared best practices for the PyTorch PrivateUse1 mechanism. He noted that while adopting PrivateUse1, Huawei also submitted many improvements and bug fixes for the mechanism back to the PyTorch community. He also pointed out that, because upstream CI does not cover out-of-tree devices, changes in upstream code can affect their stability and quality, an insight that resonated with the audience.

    + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/community_blog/sglang-joins-pytorch.html b/community_blog/sglang-joins-pytorch.html new file mode 100644 index 000000000000..f12a8fe637e4 --- /dev/null +++ b/community_blog/sglang-joins-pytorch.html @@ -0,0 +1 @@ +

    We’re thrilled to announce that the SGLang project has been integrated into the PyTorch ecosystem! This integration ensures that SGLang aligns with PyTorch’s standards and practices, providing developers with a reliable and community-supported framework for fast and flexible serving of LLMs.

    diff --git a/community_blog/torch-compile-explained-ae0def293084.html b/community_blog/torch-compile-explained-ae0def293084.html new file mode 100644 index 000000000000..b3268ed0be8d --- /dev/null +++ b/community_blog/torch-compile-explained-ae0def293084.html @@ -0,0 +1 @@ +

    Have you ever felt overwhelmed by the complexities of torch.compile? Diving into its workings can feel like black magic, full of bytecode and Python internals that most users never see, which makes torch.compile hard to understand and debug.

    diff --git a/community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.html b/community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.html new file mode 100644 index 000000000000..85cdacb8e4cf --- /dev/null +++ b/community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.html @@ -0,0 +1 @@ +

    This article summarizes key features and concepts of torchdistill (v1.0.0). Refer to the official documentation for its APIs and research projects.

    diff --git a/community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.html b/community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.html new file mode 100644 index 000000000000..0b061be998ea --- /dev/null +++ b/community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.html @@ -0,0 +1 @@ +

    Machine Learning models thrive on high-quality, fully-annotated data. The traditional supervised learning approach typically requires data on the scale of millions, or even billions, to train large foundational models. However, obtaining such a vast amount of labeled data is often tedious and labor-intensive. As an alternative, semi-supervised learning (SSL) aims to enhance model generalization with only a fraction of labeled data, complemented by a considerable amount of unlabeled data. This blog introduces USB — the Unified Semi-Supervised Learning Framework and Benchmark, covering multi-modalities and various SSL scenarios.

    diff --git a/community_blog/vllm-joins-pytorch.html b/community_blog/vllm-joins-pytorch.html new file mode 100644 index 000000000000..92b038ca93e1 --- /dev/null +++ b/community_blog/vllm-joins-pytorch.html @@ -0,0 +1,3 @@ +

    We’re thrilled to announce that the vLLM project has become a PyTorch ecosystem project, and joined the PyTorch ecosystem family!

    + +

    Running large language models (LLMs) is both resource-intensive and complex, especially as these models scale to hundreds of billions of parameters. That’s where vLLM comes in — a high-throughput, memory-efficient inference and serving engine designed for LLMs.
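    For a feel of the project, the sketch below follows vLLM’s basic offline generation interface (LLM plus SamplingParams); the model checkpoint named here is just a small example and an assumption on our part, so swap in whichever model you actually serve.

    from vllm import LLM, SamplingParams

    # Load a small example checkpoint and generate completions for a batch of prompts.
    llm = LLM(model="facebook/opt-125m")                 # example model, not a recommendation
    params = SamplingParams(temperature=0.8, max_tokens=64)

    outputs = llm.generate(["PyTorch is", "vLLM makes serving"], params)
    for out in outputs:
        print(out.prompt, "->", out.outputs[0].text)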

    diff --git a/community_blog/zeus.html b/community_blog/zeus.html new file mode 100644 index 000000000000..48ceabdbdea7 --- /dev/null +++ b/community_blog/zeus.html @@ -0,0 +1 @@ +

    Zeus is an open-source toolbox for measuring and optimizing the energy consumption of deep learning workloads. Our goal is to make energy optimization based on accurate measurements as easy as possible for diverse deep learning workloads and setups by offering composable tools with minimal assumptions.

    diff --git a/contact-us.html b/contact-us.html new file mode 100644 index 000000000000..06925803eac4 --- /dev/null +++ b/contact-us.html @@ -0,0 +1,639 @@ + + + + + + + + + + + + + Contact Us | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + +
    +
    +
    +

    Contact Us

    +
    +
    +
    + +
    +
    +
    +
    +

    Get in Touch

    + +

    The success of PyTorch is only possible with the contributions and support of our developer community and member companies. If you would like to learn how you can collaborate with your peers in the PyTorch Foundation, and would like to have a conversation with a PyTorch Foundation representative, please fill out this form.

    + +

    Note: For all PyTorch technical questions, please visit discuss.pytorch.org.

    + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + diff --git a/credits.html b/credits.html new file mode 100644 index 000000000000..a0fa4bf72d4b --- /dev/null +++ b/credits.html @@ -0,0 +1,807 @@ + + + + + + + + + + + + + PyTorch Cloud Credit Program | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + +
    +
    +
    +

    PyTorch Cloud
           Credit Program

    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    We believe providing public, self-service, and automated access to cloud infrastructure is essential for every project's incubation, growth, and success. +

    + To support this, PyTorch has established a program that enables organizations to contribute either cloud credits or financial donations directly towards maintaining and expanding our Continuous Integration (CI) infrastructure and other foundation-hosted project infrastructure. Organizations such as AWS have already contributed cloud credits, demonstrating a clear commitment to the success and sustainability of the PyTorch Foundation's hosted projects. Many organizations continue to sponsor PyTorch projects, recognizing that supporting foundational infrastructure contributes directly to their own business growth and success. +

    +
    +
    + Men and woman at a conference +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +

    Sponsors

    +

    Organizations can get started sponsoring cloud credits today.

    + + Apply + +
    +
    +
    +
    + +
    +
    +
    +
    +

    Advance the ecosystem

    +

    Join the PyTorch Credits program and benefit in the following ways: +

    +
      +
    • Highlight your company participation in the ecosystem
    • +
    • Help shape the future of cloud native
    • +
    • Establish your company as a thought leader in the space
    • +
    • Collaborate with various companies and organizations — improving open source technologies
    • +
    + +

    Cloud credit levels + benefits

    + +
    + + +
    +
    +
    +

    Supporter

    +
    + +
    +

    Starting benefits

    +
      +
    • Coordinated PyTorch blog +
    • +
    • Appropriate placement on + the PyTorch website
    • +
    +
    + +
    +

    $250k+

    +
    +
    +
    + + +
    +
    + +
    +

    Advocate

    +
    + +
    +

    Everything included + in Supporter benefits, plus:

    +
      +
    • Bonus online program slot + (live webinar, live stream, or on-demand webinar - your + choice) to highlight your participation in the program
    • + +
    +
    + +
    +

    $500k+

    +
    +
    +
    + + +
    +
    + +
    +

    Champion

    +
    + +
    +

    Everything included + in Advocate benefits, plus:

    +
      +
    • Top placement on the PyTorch + website with an explanation of the offering
    • +
    • PyTorch marketing campaign to + drive awareness towards the donation
      + - Coordinated blog post and media outreach
      + - Social media shout out to thank your company for their + contribution
      + - Mention in the PyTorch keynote at the next PyTorch Conference
      +
    • +
    • PyTorch will set up resource + pooling and CI capacity for self-service of your offering +
    • +
    +
    + +
    +

    $1m+

    +
    +
    +
    + +
    + +
    +
    +
    +
    + +
    +
    +
    +
    +

    How it works

    +
    +
    +
    + Cloud ecosystem + +

    Step 1

    +

    Decide how to donate

    +

    Cloud credits can be donated via cash or credit:

    +
    • Credit, for products a sponsor usually sells as a service to the public
    • +
    • Cash, to be earmarked for paying for services when we run out of donated credits
    +
    +
    + +
    +
    + Person in a magnifying glass + +

    Step 2

    +

    Determine the technical points of contact

    +

    Projects often need help utilizing credits. We ask that you provide a technical support resource with a defined SLA.

    +
    +
    + +
    +
    + cogs +

    Step 3

    +

    Projects benefit from your donation

    +

    After getting set up with PyTorch, projects can access these credits via a curated self-service portal, managed by PyTorch.

    +
    +
    +
    + + + Join the PyTorch Cloud Credits Program + + + +
    +
    +
    +
    + + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + diff --git a/credits/sponsor.html b/credits/sponsor.html new file mode 100644 index 000000000000..516c548caced --- /dev/null +++ b/credits/sponsor.html @@ -0,0 +1,638 @@ + + + + + + + + + + + + + Sponsor Cloud Credits | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
    +
    +
    +
    + + +
    + + + + + + + + +
    + +
    +
    + + +
    + + + +
    +
    +
    +

    PyTorch Cloud
           Credit Program

    +
    +
    +
    + +
    +
    +
    +
    +

    Sponsor cloud credits and support PyTorch. Please fill in the form and we will be in touch. +

    + + +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Docs

    +

    Access comprehensive developer documentation for PyTorch

    + View Docs +
    + +
    +

    Tutorials

    +

    Get in-depth tutorials for beginners and advanced developers

    + View Tutorials +
    + +
    +

    Resources

    +

    Find development resources and get your questions answered

    + View Resources +
    +
    +
    +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs.html b/docs.html deleted file mode 100644 index 58fe2e65f749..000000000000 --- a/docs.html +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: PyTorch | Documentation -id: docs -permalink: /docs/ -layout: default ---- - - - \ No newline at end of file diff --git a/docs/.buildinfo b/docs/.buildinfo deleted file mode 100644 index c2b7f345f3ae..000000000000 --- a/docs/.buildinfo +++ /dev/null @@ -1,4 +0,0 @@ -# Sphinx build info version 1 -# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 378a0880d9ea0ae502c65857a1c2e50a -tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_modules/index.html b/docs/_modules/index.html deleted file mode 100644 index 8dedb2cab28c..000000000000 --- a/docs/_modules/index.html +++ /dev/null @@ -1,623 +0,0 @@ - - - - - - - - - - - Overview: module code — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - - -
    - - - - - - - - -
    - -
    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_modules/torch.html b/docs/_modules/torch.html deleted file mode 100644 index aef9fc4134a5..000000000000 --- a/docs/_modules/torch.html +++ /dev/null @@ -1,912 +0,0 @@ - - - - - - - - - - - torch — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - - -
    - - - - - - -
    -
    - - - - - - - - - - - - - - - - -
    - - - - -
    -
    -
    -
    - -

    Source code for torch

    -"""
    -The torch package contains data structures for multi-dimensional
    -tensors and mathematical operations over these are defined.
    -Additionally, it provides many utilities for efficient serializing of
    -Tensors and arbitrary types, and other useful utilities.
    -
    -It has a CUDA counterpart, that enables you to run your tensor computations
    -on an NVIDIA GPU with compute capability >= 2.0.
    -"""
    -
    -import sys
    -from ._utils import _import_dotted_name
    -from .version import __version__
    -
    -__all__ = [
    -    'typename', 'is_tensor', 'is_storage', 'set_default_tensor_type',
    -    'set_rng_state', 'get_rng_state', 'manual_seed', 'initial_seed',
    -    'save', 'load', 'set_printoptions', 'chunk', 'split', 'stack',
    -    'DoubleStorage', 'FloatStorage', 'LongStorage', 'IntStorage',
    -    'ShortStorage', 'CharStorage', 'ByteStorage',
    -    'DoubleTensor', 'FloatTensor', 'LongTensor', 'IntTensor',
    -    'ShortTensor', 'CharTensor', 'ByteTensor',
    -]
    -
    -################################################################################
    -# Load the extension module
    -################################################################################
    -
    -# Loading the extension with RTLD_GLOBAL option allows to not link extension
    -# modules against the _C shared object. Their missing THP symbols will be
    -# automatically filled by the dynamic loader.
    -import os as _dl_flags
    -
    -# if we have numpy, it *must* be imported before the call to setdlopenflags()
    -# or there is risk that later c modules will segfault when importing numpy
    -try:
    -    import numpy as np
    -except:
    -    pass
    -
    -# first check if the os package has the required flags
    -if not hasattr(_dl_flags, 'RTLD_GLOBAL') or not hasattr(_dl_flags, 'RTLD_NOW'):
    -    try:
    -        # next try if DLFCN exists
    -        import DLFCN as _dl_flags
    -    except ImportError:
    -        # as a last attempt, use compile-time constants
    -        import torch._dl as _dl_flags
    -
    -old_flags = sys.getdlopenflags()
    -sys.setdlopenflags(_dl_flags.RTLD_GLOBAL | _dl_flags.RTLD_NOW)
    -
    -from torch._C import *
    -
    -__all__ += [name for name in dir(_C)
    -            if name[0] != '_' and
    -            not name.endswith('Base')]
    -
    -sys.setdlopenflags(old_flags)
    -del _dl_flags
    -del old_flags
    -
    -################################################################################
    -# Define basic utilities
    -################################################################################
    -
    -
    -def typename(o):
    -    module = ''
    -    class_name = ''
    -    if hasattr(o, '__module__') and o.__module__ != 'builtins' \
    -            and o.__module__ != '__builtin__' and o.__module__ is not None:
    -        module = o.__module__ + '.'
    -
    -    if hasattr(o, '__qualname__'):
    -        class_name = o.__qualname__
    -    elif hasattr(o, '__name__'):
    -        class_name = o.__name__
    -    else:
    -        class_name = o.__class__.__name__
    -
    -    return module + class_name
    -
    -
    -
    [docs]def is_tensor(obj): - r"""Returns True if `obj` is a pytorch tensor. - - Args: - obj (Object): Object to test - """ - return obj.__class__ in _tensor_classes
    - - -
    [docs]def is_storage(obj): - r"""Returns True if `obj` is a pytorch storage object. - - Args: - obj (Object): Object to test - """ - return obj.__class__ in _storage_classes
    - - -
    [docs]def set_default_tensor_type(t): - global Tensor - global Storage - Tensor = _import_dotted_name(t) - Storage = _import_dotted_name(t.replace('Tensor', 'Storage')) - _C._set_default_tensor_type(Tensor)
    - - -
    [docs]def set_rng_state(new_state): - r"""Sets the random number generator state. - - Args: - new_state (torch.ByteTensor): The desired state - """ - default_generator.set_state(new_state)
    - - -
    [docs]def get_rng_state(): - r"""Returns the random number generator state as a ByteTensor.""" - return default_generator.get_state()
    - - -
    [docs]def manual_seed(seed): - r"""Sets the seed for generating random numbers. And returns a - `torch._C.Generator` object. - - Args: - seed (int or long): The desired seed. - """ - return default_generator.manual_seed(seed)
    - - -
    [docs]def initial_seed(): - r"""Returns the initial seed for generating random numbers as a - python `long`. - """ - return default_generator.initial_seed()
    - - -from .serialization import save, load -from ._tensor_str import set_printoptions - -################################################################################ -# Define Storage and Tensor classes -################################################################################ - -from .storage import _StorageBase -from .tensor import _TensorBase - - -class DoubleStorage(_C.DoubleStorageBase, _StorageBase): - pass - - -
    [docs]class FloatStorage(_C.FloatStorageBase, _StorageBase): - pass
    - - -class HalfStorage(_C.HalfStorageBase, _StorageBase): - pass - - -class LongStorage(_C.LongStorageBase, _StorageBase): - pass - - -class IntStorage(_C.IntStorageBase, _StorageBase): - pass - - -class ShortStorage(_C.ShortStorageBase, _StorageBase): - pass - - -class CharStorage(_C.CharStorageBase, _StorageBase): - pass - - -class ByteStorage(_C.ByteStorageBase, _StorageBase): - pass - - -class DoubleTensor(_C.DoubleTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return DoubleStorage - - -class FloatTensor(_C.FloatTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return FloatStorage - - -class HalfTensor(_C.HalfTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return HalfStorage - - -class LongTensor(_C.LongTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return LongStorage - - -class IntTensor(_C.IntTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return IntStorage - - -class ShortTensor(_C.ShortTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return ShortStorage - - -class CharTensor(_C.CharTensorBase, _TensorBase): - - def is_signed(self): - # TODO - return False - - @classmethod - def storage_type(cls): - return CharStorage - - -class ByteTensor(_C.ByteTensorBase, _TensorBase): - - def is_signed(self): - return False - - @classmethod - def storage_type(cls): - return ByteStorage - - -_storage_classes = { - DoubleStorage, FloatStorage, LongStorage, IntStorage, ShortStorage, - CharStorage, ByteStorage, -} - -_tensor_classes = { - DoubleTensor, FloatTensor, LongTensor, IntTensor, ShortTensor, - CharTensor, ByteTensor, -} - - -set_default_tensor_type('torch.FloatTensor') - -################################################################################ -# Import interface functions defined in Python -################################################################################ - -from .functional import * - - -################################################################################ -# Initialize extension -################################################################################ - -def manager_path(): - import os - path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'lib', 'torch_shm_manager') - if not os.path.exists(path): - raise RuntimeError("Unable to find torch_shm_manager at " + path) - return path.encode('utf-8') - - -# Shared memory manager needs to know the exact location of manager executable -_C._initExtension(manager_path()) -del manager_path - -################################################################################ -# Remove unnecessary members -################################################################################ - -del DoubleStorageBase -del FloatStorageBase -del LongStorageBase -del IntStorageBase -del ShortStorageBase -del CharStorageBase -del ByteStorageBase -del DoubleTensorBase -del FloatTensorBase -del LongTensorBase -del IntTensorBase -del ShortTensorBase -del CharTensorBase -del ByteTensorBase - -del SparseDoubleTensorBase -del SparseFloatTensorBase -del SparseLongTensorBase -del SparseIntTensorBase -del SparseShortTensorBase -del SparseCharTensorBase -del SparseByteTensorBase - 
-################################################################################ -# Import most common subpackages -################################################################################ - -import torch.cuda -import torch.autograd -import torch.nn -import torch.optim -import torch.multiprocessing - -# attach docstrings to torch and tensor functions -from . import _torch_docs, _tensor_docs -del _torch_docs, _tensor_docs -
\ No newline at end of file
diff --git a/docs/_modules/torch/_tensor_str.html b/docs/_modules/torch/_tensor_str.html deleted file mode 100644 index 4db73203c377..000000000000 --- a/docs/_modules/torch/_tensor_str.html +++ /dev/null @@ -1,870 +0,0 @@
-    torch._tensor_str — PyTorch 0.1.11 documentation

    Source code for torch._tensor_str

    -import math
    -import torch
    -from functools import reduce
    -from ._utils import _range
    -
    -
    -class __PrinterOptions(object):
    -    precision = 4
    -    threshold = 1000
    -    edgeitems = 3
    -    linewidth = 80
    -
    -
    -PRINT_OPTS = __PrinterOptions()
    -SCALE_FORMAT = '{:.5e} *\n'
    -
    -
    -# We could use **kwargs, but this will give better docs
    -
[docs]def set_printoptions( - precision=None, - threshold=None, - edgeitems=None, - linewidth=None, - profile=None, -): - """Set options for printing. Items shamelessly taken from NumPy. - - Args: - precision: Number of digits of precision for floating point output - (default 4). - threshold: Total number of array elements which trigger summarization - rather than full repr (default 1000). - edgeitems: Number of array items in summary at beginning and end of - each dimension (default 3). - linewidth: The number of characters per line for the purpose of - inserting line breaks (default 80). Thresholded matrices will - ignore this parameter. - profile: Sane defaults for pretty printing. Can be overridden with any of - the above options. (default, short, full) - """ - if profile is not None: - if profile == "default": - PRINT_OPTS.precision = 4 - PRINT_OPTS.threshold = 1000 - PRINT_OPTS.edgeitems = 3 - PRINT_OPTS.linewidth = 80 - elif profile == "short": - PRINT_OPTS.precision = 2 - PRINT_OPTS.threshold = 1000 - PRINT_OPTS.edgeitems = 2 - PRINT_OPTS.linewidth = 80 - elif profile == "full": - PRINT_OPTS.precision = 4 - PRINT_OPTS.threshold = float('inf') - PRINT_OPTS.edgeitems = 3 - PRINT_OPTS.linewidth = 80 - - if precision is not None: - PRINT_OPTS.precision = precision - if threshold is not None: - PRINT_OPTS.threshold = threshold - if edgeitems is not None: - PRINT_OPTS.edgeitems = edgeitems - if linewidth is not None: - PRINT_OPTS.linewidth = linewidth
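A quick, hedged usage sketch of the options above (assuming the function is exposed as torch.set_printoptions, as in the published docs for this release):

import torch

torch.set_printoptions(precision=2, threshold=5, edgeitems=2)
print(torch.randn(10))                     # summarized with '...' once numel >= threshold
torch.set_printoptions(profile="default")  # restore the defaults listed above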
    - - -def _number_format(tensor, min_sz=-1): - min_sz = max(min_sz, 2) - tensor = torch.DoubleTensor(tensor.nelement()).copy_(tensor).abs_() - - pos_inf_mask = tensor.eq(float('inf')) - neg_inf_mask = tensor.eq(float('-inf')) - nan_mask = tensor.ne(tensor) - invalid_value_mask = pos_inf_mask + neg_inf_mask + nan_mask - if invalid_value_mask.all(): - example_value = 0 - else: - example_value = tensor[invalid_value_mask.eq(0)][0] - tensor[invalid_value_mask] = example_value - if invalid_value_mask.any(): - min_sz = max(min_sz, 3) - - int_mode = True - # TODO: use fmod? - for value in tensor: - if value != math.ceil(value): - int_mode = False - break - - exp_min = tensor.min() - if exp_min != 0: - exp_min = math.floor(math.log10(exp_min)) + 1 - else: - exp_min = 1 - exp_max = tensor.max() - if exp_max != 0: - exp_max = math.floor(math.log10(exp_max)) + 1 - else: - exp_max = 1 - - scale = 1 - exp_max = int(exp_max) - prec = PRINT_OPTS.precision - if int_mode: - if exp_max > prec + 1: - format = '{{:11.{}e}}'.format(prec) - sz = max(min_sz, 7 + prec) - else: - sz = max(min_sz, exp_max + 1) - format = '{:' + str(sz) + '.0f}' - else: - if exp_max - exp_min > prec: - sz = 7 + prec - if abs(exp_max) > 99 or abs(exp_min) > 99: - sz = sz + 1 - sz = max(min_sz, sz) - format = '{{:{}.{}e}}'.format(sz, prec) - else: - if exp_max > prec + 1 or exp_max < 0: - sz = max(min_sz, 7) - scale = math.pow(10, exp_max - 1) - else: - if exp_max == 0: - sz = 7 - else: - sz = exp_max + 6 - sz = max(min_sz, sz) - format = '{{:{}.{}f}}'.format(sz, prec) - return format, scale, sz - - -def _tensor_str(self): - n = PRINT_OPTS.edgeitems - has_hdots = self.size()[-1] > 2 * n - has_vdots = self.size()[-2] > 2 * n - print_full_mat = not has_hdots and not has_vdots - formatter = _number_format(self, min_sz=3 if not print_full_mat else 0) - print_dots = self.numel() >= PRINT_OPTS.threshold - - dim_sz = max(2, max(len(str(x)) for x in self.size())) - dim_fmt = "{:^" + str(dim_sz) + "}" - dot_fmt = u"{:^" + str(dim_sz + 1) + "}" - - counter_dim = self.ndimension() - 2 - counter = torch.LongStorage(counter_dim).fill_(0) - counter[counter.size() - 1] = -1 - finished = False - strt = '' - while True: - nrestarted = [False for i in counter] - nskipped = [False for i in counter] - for i in _range(counter_dim - 1, -1, -1): - counter[i] += 1 - if print_dots and counter[i] == n and self.size(i) > 2 * n: - counter[i] = self.size(i) - n - nskipped[i] = True - if counter[i] == self.size(i): - if i == 0: - finished = True - counter[i] = 0 - nrestarted[i] = True - else: - break - if finished: - break - elif print_dots: - if any(nskipped): - for hdot in nskipped: - strt += dot_fmt.format('...') if hdot \ - else dot_fmt.format('') - strt += '\n' - if any(nrestarted): - strt += ' ' - for vdot in nrestarted: - strt += dot_fmt.format(u'\u22EE' if vdot else '') - strt += '\n' - if strt != '': - strt += '\n' - strt += '({},.,.) 
= \n'.format( - ','.join(dim_fmt.format(i) for i in counter)) - submatrix = reduce(lambda t, i: t.select(0, i), counter, self) - strt += _matrix_str(submatrix, ' ', formatter, print_dots) - return strt - - -def __repr_row(row, indent, fmt, scale, sz, truncate=None): - if truncate is not None: - dotfmt = " {:^5} " - return (indent + - ' '.join(fmt.format(val / scale) for val in row[:truncate]) + - dotfmt.format('...') + - ' '.join(fmt.format(val / scale) for val in row[-truncate:]) + - '\n') - else: - return indent + ' '.join(fmt.format(val / scale) for val in row) + '\n' - - -def _matrix_str(self, indent='', formatter=None, force_truncate=False): - n = PRINT_OPTS.edgeitems - has_hdots = self.size(1) > 2 * n - has_vdots = self.size(0) > 2 * n - print_full_mat = not has_hdots and not has_vdots - - if formatter is None: - fmt, scale, sz = _number_format(self, - min_sz=5 if not print_full_mat else 0) - else: - fmt, scale, sz = formatter - nColumnPerLine = int(math.floor((PRINT_OPTS.linewidth - len(indent)) / (sz + 1))) - strt = '' - firstColumn = 0 - - if not force_truncate and \ - (self.numel() < PRINT_OPTS.threshold or print_full_mat): - while firstColumn < self.size(1): - lastColumn = min(firstColumn + nColumnPerLine - 1, self.size(1) - 1) - if nColumnPerLine < self.size(1): - strt += '\n' if firstColumn != 1 else '' - strt += 'Columns {} to {} \n{}'.format( - firstColumn, lastColumn, indent) - if scale != 1: - strt += SCALE_FORMAT.format(scale) - for l in _range(self.size(0)): - strt += indent + (' ' if scale != 1 else '') - row_slice = self[l, firstColumn:lastColumn + 1] - strt += ' '.join(fmt.format(val / scale) for val in row_slice) - strt += '\n' - firstColumn = lastColumn + 1 - else: - if scale != 1: - strt += SCALE_FORMAT.format(scale) - if has_vdots and has_hdots: - vdotfmt = "{:^" + str((sz + 1) * n - 1) + "}" - ddotfmt = u"{:^5}" - for row in self[:n]: - strt += __repr_row(row, indent, fmt, scale, sz, n) - strt += indent + ' '.join([vdotfmt.format('...'), - ddotfmt.format(u'\u22F1'), - vdotfmt.format('...')]) + "\n" - for row in self[-n:]: - strt += __repr_row(row, indent, fmt, scale, sz, n) - elif not has_vdots and has_hdots: - for row in self: - strt += __repr_row(row, indent, fmt, scale, sz, n) - elif has_vdots and not has_hdots: - vdotfmt = u"{:^" + \ - str(len(__repr_row(self[0], '', fmt, scale, sz))) + \ - "}\n" - for row in self[:n]: - strt += __repr_row(row, indent, fmt, scale, sz) - strt += vdotfmt.format(u'\u22EE') - for row in self[-n:]: - strt += __repr_row(row, indent, fmt, scale, sz) - else: - for row in self: - strt += __repr_row(row, indent, fmt, scale, sz) - return strt - - -def _vector_str(self): - fmt, scale, sz = _number_format(self) - strt = '' - ident = '' - n = PRINT_OPTS.edgeitems - dotfmt = u"{:^" + str(sz) + "}\n" - if scale != 1: - strt += SCALE_FORMAT.format(scale) - ident = ' ' - if self.numel() < PRINT_OPTS.threshold: - return (strt + - '\n'.join(ident + fmt.format(val / scale) for val in self) + - '\n') - else: - return (strt + - '\n'.join(ident + fmt.format(val / scale) for val in self[:n]) + - '\n' + (ident + dotfmt.format(u"\u22EE")) + - '\n'.join(ident + fmt.format(val / scale) for val in self[-n:]) + - '\n') - - -def _str(self): - if self.ndimension() == 0: - return '[{} with no dimension]\n'.format(torch.typename(self)) - elif self.ndimension() == 1: - strt = _vector_str(self) - elif self.ndimension() == 2: - strt = _matrix_str(self) - else: - strt = _tensor_str(self) - - size_str = 'x'.join(str(size) for size in self.size()) - device_str = '' 
if not self.is_cuda else \ - ' (GPU {})'.format(self.get_device()) - strt += '[{} of size {}{}]\n'.format(torch.typename(self), - size_str, device_str) - return '\n' + strt -
\ No newline at end of file
diff --git a/docs/_modules/torch/_utils.html b/docs/_modules/torch/_utils.html deleted file mode 100644 index 03694cf41744..000000000000 --- a/docs/_modules/torch/_utils.html +++ /dev/null @@ -1,673 +0,0 @@
-    torch._utils — PyTorch 0.1.11 documentation

    Source code for torch._utils

    -import torch
    -import importlib
    -
    -
    -def _type(self, new_type=None, async=False):
    -    """Casts this object to the specified type.
    -
    -    If this is already of the correct type, no copy is performed and the
    -    original object is returned.
    -
    -    Args:
    -        new_type (type or string): The desired type
    -        async (bool): If True, and the source is in pinned memory and
    -                      destination is on the GPU or vice versa, the copy is
    -                      performed asynchronously with respect to the host.
    -                      Otherwise, the argument has no effect.
    -    """
    -    if new_type is None:
    -        return self.__module__ + '.' + self.__class__.__name__
    -
    -    if isinstance(new_type, str):
    -        new_type = _import_dotted_name(new_type)
    -    if new_type == type(self):
    -        return self
    -    if self.is_sparse:
    -        if not new_type.is_sparse:
    -            raise RuntimeError("Cannot cast sparse tensor to dense tensor")
    -        new_type_name = new_type.__module__ + '.' + new_type.__name__
    -        new_values_type_name = new_type_name.replace('.sparse', '')
    -        new_values = self.values().type(new_values_type_name, async)
    -        return new_type(self.indices(), new_values, self.size())
    -    if new_type.is_sparse:
    -        raise RuntimeError("Cannot cast dense tensor to sparse tensor")
    -    return new_type(self.size()).copy_(self, async)
    -
    -
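Illustrative use of the casting helper above, assuming it is bound elsewhere in the package as the Tensor.type() method (its self-based signature suggests so):

import torch

x = torch.FloatTensor(3).fill_(1)
print(x.type())                    # 'torch.FloatTensor' when called with no argument
d = x.type('torch.DoubleTensor')   # cast by dotted type name; returns a new tensor
same = x.type(torch.FloatTensor)   # already the right type: returns x itself, no copy
assert same is x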
    -def _cuda(self, device=None, async=False):
    -    """Returns a copy of this object in CUDA memory.
    -
    -    If this object is already in CUDA memory and on the correct device, then
    -    no copy is performed and the original object is returned.
    -
    -    Args:
    -        device (int): The destination GPU id. Defaults to the current device.
    -        async (bool): If True and the source is in pinned memory, the copy will
    -                      be asynchronous with respect to the host. Otherwise, the
    -                      argument has no effect.
    -    """
    -    if self.is_cuda:
    -        if device is None:
    -            device = torch.cuda.current_device()
    -        if self.get_device() == device:
    -            return self
    -    else:
    -        if device is None:
    -            device = -1
    -    with torch.cuda.device(device):
    -        if self.is_sparse:
    -            new_type = getattr(torch.cuda.sparse, self.__class__.__name__)
    -            indices = self.indices().cuda(device, async)
    -            values = self.values().cuda(device, async)
    -            return new_type(indices, values, self.size())
    -        else:
    -            new_type = getattr(torch.cuda, self.__class__.__name__)
    -            return new_type(self.size()).copy_(self, async)
    -
    -
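And a hedged sketch of the CUDA transfer helper above, assuming it is bound as Tensor.cuda():

import torch

if torch.cuda.is_available():
    x = torch.FloatTensor(4).fill_(1)
    x_gpu = x.cuda()               # copy to the currently selected device
    assert x_gpu.cuda() is x_gpu   # already there: no copy, same object returned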
    -def _rebuild_tensor(storage, storage_offset, size, stride):
    -    class_name = storage.__class__.__name__.replace('Storage', 'Tensor')
    -    module = importlib.import_module(storage.__module__)
    -    tensor_class = getattr(module, class_name)
    -    return tensor_class().set_(storage, storage_offset, size, stride)
    -
    -
    -def _range(*args, **kwargs):
    -    return __builtins__['range'](*args, **kwargs)
    -
    -
    -def _import_dotted_name(name):
    -    components = name.split('.')
    -    obj = __import__(components[0])
    -    for component in components[1:]:
    -        obj = getattr(obj, component)
    -    return obj
    -
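For illustration, the dotted-name helper above simply imports the root package and chains getattr calls; a sketch (internal API, subject to change):

from torch._utils import _import_dotted_name
import torch

cls = _import_dotted_name('torch.FloatTensor')
assert cls is torch.FloatTensor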
    -
    -# Taken from python 3.5 docs
    -def _accumulate(iterable, fn=lambda x, y: x + y):
    -    'Return running totals'
    -    # _accumulate([1,2,3,4,5]) --> 1 3 6 10 15
    -    # _accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
    -    it = iter(iterable)
    -    try:
    -        total = next(it)
    -    except StopIteration:
    -        return
    -    yield total
    -    for element in it:
    -        total = fn(total, element)
    -        yield total
    -
\ No newline at end of file
diff --git a/docs/_modules/torch/autograd.html b/docs/_modules/torch/autograd.html deleted file mode 100644 index eefe240660fa..000000000000 --- a/docs/_modules/torch/autograd.html +++ /dev/null @@ -1,617 +0,0 @@
-    torch.autograd — PyTorch 0.1.11 documentation

    Source code for torch.autograd

    -"""
    -torch.autograd provides classes and functions implementing automatic
    -differentiation of arbitrary scalar valued functions. It requires minimal
    -changes to the existing code - you only need to wrap all tensors in
    -:class:`.Variable` objects.
    -"""
    -import torch
    -
    -from .variable import Variable
    -from .function import Function, NestedIOFunction
    -from .stochastic_function import StochasticFunction
    -from .gradcheck import gradcheck
    -
    -__all__ = ['Variable', 'Function', 'StochasticFunction', 'backward']
    -
    -
    -
[docs]def backward(variables, grad_variables, retain_variables=False): - """Computes the sum of gradients of given variables w.r.t. graph leaves. - - The graph is differentiated using the chain rule. If any of ``variables`` - are non-scalar (i.e. their data has more than one element) and require - gradient, the function additionally requires specifying ``grad_variables``. - It should be a sequence of matching length that contains the gradient of - the differentiated function w.r.t. the corresponding variables (``None`` is an - acceptable value for all variables that don't need gradient tensors). - - This function accumulates gradients in the leaves - you might need to zero - them before calling it. - - Arguments: - variables (sequence of Variable): Variables of which the derivative will be - computed. - grad_variables (sequence of Tensor): Gradients w.r.t. each element of - corresponding variables. Required only for non-scalar variables that - require gradient. - retain_variables (bool): If ``True``, buffers necessary for computing - gradients won't be freed after use. It is only necessary to - specify ``True`` if you want to differentiate some subgraph multiple - times. - """ - Variable._execution_engine.run_backward( - tuple(variables), tuple(grad_variables), retain_variables)
    - -assert torch._C._autograd_init() -
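A minimal sketch of the Variable-era backward pass described above (scalar output, so no grad_variables are needed):

import torch
from torch.autograd import Variable

x = Variable(torch.ones(2, 2), requires_grad=True)
y = (x * 3).sum()      # scalar result
y.backward()           # accumulates into the leaf's .grad
print(x.grad)          # gradient of y w.r.t. x: all 3s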
\ No newline at end of file
diff --git a/docs/_modules/torch/autograd/function.html b/docs/_modules/torch/autograd/function.html deleted file mode 100644 index ecf32ad4e98e..000000000000 --- a/docs/_modules/torch/autograd/function.html +++ /dev/null @@ -1,823 +0,0 @@
-    torch.autograd.function — PyTorch 0.1.11 documentation

    Source code for torch.autograd.function

    -import torch
    -import torch._C as _C
    -import torch.utils.hooks as hooks
    -from collections import OrderedDict
    -
    -
    -
    [docs]class Function(_C._FunctionBase): - """Records operation history and defines formulas for differentiating ops. - - Every operation performed on :class:`Variable` s creates a new function - object, that performs the computation, and records that it happened. - The history is retained in the form of a DAG of functions, with edges - denoting data dependencies (``input <- output``). Then, when backward is - called, the graph is processed in the topological ordering, by calling - :func:`backward` methods of each :class:`Function` object, and passing - returned gradients on to next :class:`Function` s. - - Normally, the only way users interact with functions is by creating - subclasses and defining new operations. This is a recommended way of - extending torch.autograd. - - Since Function logic is a hotspot in most scripts, almost all of it - was moved to our C backend, to ensure that the framework overhead is - minimal. - - Each function is meant to be used only once (in the forward pass). - - Attributes: - saved_tensors: Tuple of Tensors that were saved in the call to - :func:`forward`. - needs_input_grad: Tuple of booleans of length :attr:`num_inputs`, - indicating whether a given input requires gradient. This can be - used to optimize buffers saved for backward, and ignoring gradient - computation in :func:`~Function.backward`. - num_inputs: Number of inputs given to :func:`forward`. - num_outputs: Number of tensors returned by :func:`forward`. - requires_grad: Boolean indicating whether the :func:`backward` will - ever need to be called. - previous_functions: Tuple of (int, Function) pairs of length - :attr:`num_inputs`. Each entry contains a reference to a - :class:`Function` that created corresponding input, and an index - of the previous function output that's been used. - """ - __call__ = _C._FunctionBase._do_forward - -
    [docs] def save_for_backward(self, *tensors): - """Saves given tensors for a future call to :func:`~Function.backward`. - - **This should be called at most once, and only from inside the** - :func:`forward` **method.** - - Later, saved tensors can be accessed through the :attr:`saved_tensors` - attribute. Before returning them to the user, a check is made, to - ensure they weren't used in any in-place operation that modified - their content. - - Arguments can also be ``None``. - """ - self.to_save = tensors
    - -
[docs] def mark_dirty(self, *args): - """Marks given tensors as modified in an in-place operation. - - **This should be called at most once, only from inside the** - :func:`forward` **method, and all arguments should be inputs.** - - Every tensor that's been modified in-place in a call to :func:`forward` - should be given to this function, to ensure correctness of our checks. - It doesn't matter whether the function is called before or after - modification. - """ - self.dirty_tensors = args
    - -
    [docs] def mark_shared_storage(self, *pairs): - """Marks that given pairs of distinct tensors are sharing storage. - - **This should be called at most once, only from inside the** - :func:`forward` **method, and all arguments should be pairs of - (input, output).** - - If some of the outputs are going to be tensors sharing storage with - some of the inputs, all pairs of (input_arg, output_arg) should be - given to this function, to ensure correctness checking of in-place - modification. The only exception is when an output is exactly the same - tensor as input (e.g. in-place ops). In such case it's easy to conclude - that they're sharing data, so we don't require specifying such - dependencies. - - This function is not needed in most functions. It's primarily used in - indexing and transpose ops. - """ - self.shared_pairs = pairs
    - -
    [docs] def mark_non_differentiable(self, *args): - """Marks outputs as non-differentiable. - - **This should be called at most once, only from inside the** - :func:`forward` **method, and all arguments should be outputs.** - - This will mark outputs as not requiring gradients, increasing the - efficiency of backward computation. You still need to accept a gradient - for each output in :meth:`~Function.backward`, but it's always going to - be ``None``. - - This is used e.g. for indices returned from a max :class:`Function`. - """ - self.non_differentiable = args
    - - @staticmethod - def _register_hook(backward_hooks, hook): - if backward_hooks is None: - backward_hooks = OrderedDict() - handle = hooks.RemovableHandle(backward_hooks) - backward_hooks[handle.id] = hook - return backward_hooks, handle - -
[docs] def forward(self, *input): - """Performs the operation. - - This function is to be overridden by all subclasses. - - It can take and return an arbitrary number of tensors. - """ - raise NotImplementedError
    - -
[docs] def backward(self, *grad_output): - """Defines a formula for differentiating the operation. - - This function is to be overridden by all subclasses. - - All arguments are tensors. It must accept exactly as many arguments - as :func:`forward` returned outputs, and it should return as - many tensors as there were inputs to :func:`forward`. Each argument - is the gradient w.r.t. the given output, and each returned value should - be the gradient w.r.t. the corresponding input. - """ - raise NotImplementedError
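A sketch of the subclassing pattern described above, using this version's non-static Function API (forward and backward as instance methods, with save_for_backward for the saved tensors):

import torch
from torch.autograd import Function, Variable

class Exp(Function):
    def forward(self, input):
        output = input.exp()
        self.save_for_backward(output)    # stash the output for the backward pass
        return output

    def backward(self, grad_output):
        output, = self.saved_tensors
        return grad_output * output       # d/dx exp(x) = exp(x)

x = Variable(torch.randn(3), requires_grad=True)
y = Exp()(x)                  # each Function instance is used once, as noted above
y.backward(torch.ones(3))     # non-scalar output, so a gradient must be passed in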
    - - -class InplaceFunction(Function): - - def __init__(self, inplace=False): - super(InplaceFunction, self).__init__() - self.inplace = inplace - - -def _nested_map(condition, fn): - def _map(obj): - if condition(obj): - return fn(obj) - elif obj is None: - return None - elif isinstance(obj, (list, tuple)): - return type(obj)(_map(x) for x in obj) - else: - raise ValueError("NestedIOFunction doesn't know how to process " - "an input object of type " + torch.typename(obj)) - return _map - - -def _iter_filter(condition): - def _iter(obj): - if condition(obj): - yield obj - elif obj is None: - return - elif isinstance(obj, (list, tuple)): - for o in obj: - for var in _iter(o): - yield var - else: - raise ValueError("NestedIOFunction doesn't know how to process " - "an input object of type " + torch.typename(obj)) - return _iter - - -def _unflatten(input, proto): - # unflatten a list or tuple input into a nested list/tuple structure - # specified by proto - def unflatten_helper(input, proto): - res = [] - if not isinstance(proto, (list, tuple)): - return input[0], input[1:] - for e in proto: - res_e, input = unflatten_helper(input, e) - res.append(res_e) - return type(proto)(res), input - - return unflatten_helper(input, proto)[0] - -_iter_variables = _iter_filter(lambda o: isinstance(o, torch.autograd.Variable)) -_iter_tensors = _iter_filter(torch.is_tensor) -_iter_None_tensors = _iter_filter(lambda o: o is None or torch.is_tensor(o)) -_map_variable_tensor = _nested_map(lambda o: isinstance(o, torch.autograd.Variable), lambda o: o.data) - - -class NestedIOFunction(Function): - - def _do_forward(self, *input): - self._nested_input = input - flat_input = tuple(_iter_variables(input)) - flat_output = super(NestedIOFunction, self)._do_forward(*flat_input) - nested_output = self._nested_output - nested_variables = _unflatten(flat_output, self._nested_output) - return nested_variables - - def _do_backward(self, gradients, retain_variables): - self.retain_variables = retain_variables - result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables) - if not retain_variables: - del self._nested_output - del self._to_save_nested - return result - - def backward(self, *gradients): - nested_gradients = _unflatten(gradients, self._nested_output) - result = self.backward_extended(*nested_gradients) - return tuple(_iter_None_tensors(result)) - - __call__ = _do_forward - - def forward(self, *args): - nested_tensors = _map_variable_tensor(self._nested_input) - result = self.forward_extended(*nested_tensors) - del self._nested_input - self._nested_output = result - return tuple(_iter_tensors(result)) - - def save_for_backward(self, *args): - self.to_save = tuple(_iter_tensors(args)) - self._to_save_nested = args - - @property - def saved_tensors(self): - flat_tensors = super(NestedIOFunction, self).saved_tensors - return _unflatten(flat_tensors, self._to_save_nested) - - def mark_dirty(self, *args, **kwargs): - self.dirty_tensors = tuple(_iter_tensors((args, kwargs))) - - def mark_non_differentiable(self, *args, **kwargs): - self.non_differentiable = tuple(_iter_tensors((args, kwargs))) - - def forward_extended(self, *input): - raise NotImplementedError - - def backward_extended(self, *grad_output): - raise NotImplementedError -
\ No newline at end of file
diff --git a/docs/_modules/torch/autograd/variable.html b/docs/_modules/torch/autograd/variable.html deleted file mode 100644 index c903ec74e760..000000000000 --- a/docs/_modules/torch/autograd/variable.html +++ /dev/null @@ -1,1474 +0,0 @@
-    torch.autograd.variable — PyTorch 0.1.11 documentation

    Source code for torch.autograd.variable

    -import sys
    -import torch._C as _C
    -from collections import OrderedDict
    -import torch.sparse as sparse
    -import torch.utils.hooks as hooks
    -
    -from ._functions import *
    -
    -
    -
    [docs]class Variable(_C._VariableBase): - """Wraps a tensor and records the operations applied to it. - - Variable is a thin wrapper around a Tensor object, that also holds - the gradient w.r.t. to it, and a reference to a function that created it. - This reference allows retracing the whole chain of operations that - created the data. If the Variable has been created by the user, its creator - will be ``None`` and we call such objects *leaf* Variables. - - Since autograd only supports scalar valued function differentiation, grad - size always matches the data size. Also, grad is normally only allocated - for leaf variables, and will be always zero otherwise. - - Attributes: - data: Wrapped tensor of any type. - grad: Variable holding the gradient of type and location matching - the ``.data``. This attribute is lazily allocated and can't - be reassigned. - requires_grad: Boolean indicating whether the Variable has been - created by a subgraph containing any Variable, that requires it. - See :ref:`excluding-subgraphs` for more details. - Can be changed only on leaf Variables. - volatile: Boolean indicating that the Variable should be used in - inference mode, i.e. don't save the history. See - :ref:`excluding-subgraphs` for more details. - Can be changed only on leaf Variables. - creator: Function of which the variable was an output. For leaf - (user created) variables it's ``None``. Read-only attribute. - - Parameters: - data (any tensor class): Tensor to wrap. - requires_grad (bool): Value of the requires_grad flag. **Keyword only.** - volatile (bool): Value of the volatile flag. **Keyword only.** - """ - - _fallthrough_methods = { - 'size', - 'stride', - 'nelement', - 'ndimension', - 'element_size', - 'is_contiguous', - 'is_set_to', - 'is_signed', - 'numel', - 'dim', - 'get_device', - 'is_cuda', - } - - def __getattr__(self, name): - if name in self._fallthrough_methods: - return getattr(self.data, name) - raise AttributeError(name) - - def __getitem__(self, key): - if (isinstance(key, Variable) and - type(key.data).__name__ == 'ByteTensor'): - return MaskedSelect()(self, key) - return Index(key)(self) - - def __setitem__(self, key, value): - if (isinstance(key, Variable) and - type(key.data).__name__ == 'ByteTensor'): - if isinstance(value, Variable): - return MaskedCopy(inplace=True)(self, key, value) - else: - return MaskedFill(value, inplace=True)(self, key) - else: - if isinstance(value, Variable): - return SetItem(key)(self, value) - else: - return SetItem(key, value)(self) - - def __deepcopy__(self, memo): - if self.creator is not None: - raise RuntimeError("Only Variables created explicitly by the user " - "(graph leaves) support the deepcopy protocol at the moment") - result = type(self)(self.data.clone()) - result.requires_grad = self.requires_grad - result.volatile = self.volatile - memo[id(self)] = result - return result - - def __reduce_ex__(self, proto): - state = (self.requires_grad, self.volatile, self._backward_hooks) - if proto > 1: - return type(self), (self.data,), state - if sys.version_info[0] == 2: - from copy_reg import __newobj__ - else: - from copyreg import __newobj__ - return __newobj__, (type(self), self.data), state - - def __setstate__(self, state): - if len(state) == 5: - # legacy serialization of Variable - self.data = state[0] - state = (state[3], state[4], state[2]) - if self.creator is not None: - raise RuntimeError('__setstate__ can be only called on leaf variables') - self.requires_grad, self.volatile, self._backward_hooks = state - - def 
__repr__(self): - return 'Variable containing:' + self.data.__repr__() - -
[docs] def backward(self, gradient=None, retain_variables=False): - """Computes the gradient of current variable w.r.t. graph leaves. - - The graph is differentiated using the chain rule. If the variable is - non-scalar (i.e. its data has more than one element) and requires - gradient, the function additionally requires specifying ``gradient``. - It should be a tensor of matching type and location, that contains - the gradient of the differentiated function w.r.t. ``self``. - - This function accumulates gradients in the leaves - you might need to zero - them before calling it. - - Arguments: - gradient (Tensor): Gradient of the differentiated function - w.r.t. the data. Required only if the data has more than one - element. Type and location should match those of ``self.data``. - retain_variables (bool): If ``True``, buffers necessary for computing - gradients won't be freed after use. It is only necessary to - specify ``True`` if you want to differentiate some subgraph multiple - times (in some cases it will be much more efficient to use - `autograd.backward`). - """ - if self.volatile: - raise RuntimeError('calling backward on a volatile variable') - if gradient is None and self.requires_grad: - if self.data.numel() != 1: - raise RuntimeError( - 'backward should be called only on a scalar (i.e. 1-element tensor) ' - 'or with gradient w.r.t. the variable') - gradient = self.data.new().resize_as_(self.data).fill_(1) - self._execution_engine.run_backward((self,), (gradient,), retain_variables)
    - -
    [docs] def register_hook(self, hook): - """Registers a backward hook. - - The hook will be called every time a gradient with respect to the - variable is computed. The hook should have the following signature:: - - hook(grad) -> Variable or None - - The hook should not modify its argument, but it can optionally return - a new gradient which will be used in place of :attr:`grad`. - - This function returns a handle with a method ``handle.remove()`` - that removes the hook from the module. - - Example: - >>> v = Variable(torch.Tensor([0, 0, 0]), requires_grad=True) - >>> h = v.register_hook(lambda grad: grad * 2) # double the gradient - >>> v.backward(torch.Tensor([1, 1, 1])) - >>> v.grad.data - 2 - 2 - 2 - [torch.FloatTensor of size 3] - >>> h.remove() # removes the hook - """ - if self.volatile: - raise RuntimeError("cannot register a hook on a volatile variable") - if not self.requires_grad: - raise RuntimeError("cannot register a hook on a variable that " - "doesn't require gradient") - if self._backward_hooks is None: - self._backward_hooks = OrderedDict() - if self.creator is not None: - self.creator._register_hook_dict(self) - handle = hooks.RemovableHandle(self._backward_hooks) - self._backward_hooks[handle.id] = hook - return handle
    - -
[docs] def reinforce(self, reward): - """Registers a reward obtained as a result of a stochastic process. - - Differentiating stochastic nodes requires providing them with a reward - value. If your graph contains any stochastic operations, you should - call this function on their outputs. Otherwise an error will be raised. - - Parameters: - reward (Tensor): Tensor with per-element rewards. It has to match - the device location and shape of the Variable's data. - """ - if not isinstance(self.creator, StochasticFunction): - raise RuntimeError("reinforce() can be only called on outputs " - "of stochastic functions") - self.creator._reinforce(reward)
    - -
    [docs] def detach(self): - """Returns a new Variable, detached from the current graph. - - Result will never require gradient. If the input is volatile, the output - will be volatile too. - - .. note:: - - Returned Variable uses the same data tensor, as the original one, and - in-place modifications on either of them will be seen, and may trigger - errors in correctness checks. - """ - result = NoGrad()(self) # this is needed, because it merges version counters - result._creator = None - return result
    - -
    [docs] def detach_(self): - """Detaches the Variable from the graph that created it, making it a leaf.""" - self._creator = None - self.requires_grad = False
    - - def contiguous(self): - self.data = self.data.contiguous() - return self - - def clone(self): - return Clone()(self) - - def type(self, t): - if t != type(self.data): - return Type(t)(self) - return self - - def type_as(self, t): - return self.type(type(t.data)) - - def _get_type(self, name): - module = torch._import_dotted_name(self.data.__module__) - return getattr(module, name) - - def cuda(self, device_id=None, async=False): - return CudaTransfer(device_id, async)(self) - - def cpu(self): - return self.type(getattr(torch, type(self.data).__name__)) - - def double(self): - return self.type(self._get_type('DoubleTensor')) - - def float(self): - return self.type(self._get_type('FloatTensor')) - - def half(self): - return self.type(self._get_type('HalfTensor')) - - def long(self): - return self.type(self._get_type('LongTensor')) - - def int(self): - return self.type(self._get_type('IntTensor')) - - def short(self): - return self.type(self._get_type('ShortTensor')) - - def char(self): - return self.type(self._get_type('CharTensor')) - - def byte(self): - return self.type(self._get_type('ByteTensor')) - - def is_same_size(self, other_var): - return self.data.is_same_size(other_var.data) - - def _add(self, other, inplace): - if isinstance(other, Variable): - return Add(inplace)(self, other) - else: - assert not torch.is_tensor(other) - return AddConstant(other, inplace)(self) - - def add(self, other): - return self._add(other, False) - - def add_(self, other): - return self._add(other, True) - - def _sub(self, other, inplace): - if isinstance(other, Variable): - return Sub(inplace=inplace)(self, other) - else: - assert not torch.is_tensor(other) - return SubConstant(other, inplace=inplace)(self) - - def sub(self, other): - return self._sub(other, False) - - def sub_(self, other): - return self._sub(other, True) - - def mul(self, other): - if isinstance(other, Variable): - return Mul()(self, other) - else: - assert not torch.is_tensor(other) - return MulConstant(other)(self) - - def mul_(self, other): - if not isinstance(other, Variable) and not torch.is_tensor(other): - return MulConstant(other, inplace=True)(self) - raise RuntimeError("mul_ only supports scalar multiplication") - - def div(self, other): - if isinstance(other, Variable): - return Div()(self, other) - else: - assert not torch.is_tensor(other) - return DivConstant(other)(self) - - def div_(self, other): - if not isinstance(other, Variable) and not torch.is_tensor(other): - return DivConstant(other, inplace=True)(self) - raise RuntimeError("div_ only supports scalar multiplication") - - def pow(self, other): - if isinstance(other, Variable): - return Pow()(self, other) - else: - assert not torch.is_tensor(other) - return PowConstant(other)(self) - - def exp(self): - return Exp()(self) - - def exp_(self): - return Exp(inplace=True)(self) - - def log(self): - return Log()(self) - - def log1p(self): - return Log1p()(self) - - def neg(self): - return Negate()(self) - - def neg_(self): - return Negate(inplace=True)(self) - - def tanh(self): - return Tanh()(self) - - def tanh_(self): - return Tanh(True)(self) - - def sigmoid(self): - return Sigmoid()(self) - - def sigmoid_(self): - return Sigmoid(True)(self) - - def sin(self): - return Sin()(self) - - def cos(self): - return Cos()(self) - - def tan(self): - return Tan()(self) - - def asin(self): - return Asin()(self) - - def acos(self): - return Acos()(self) - - def atan(self): - return Atan()(self) - - def sinh(self): - return Sinh()(self) - - def cosh(self): - return 
Cosh()(self) - - def abs(self): - return Abs()(self) - - def clamp(self, min=None, max=None): - if min is None and max is None: - raise ValueError("clamp requires specifying at least one of " - "min and max arguments") - elif min is None and max is not None: - return CminConstant(max)(self) - elif min is not None and max is None: - return CmaxConstant(min)(self) - else: - return Clamp(min, max)(self) - - def reciprocal(self): - return Reciprocal()(self) - - def floor(self): - return Floor()(self) - - def ceil(self): - return Ceil()(self) - - def frac(self): - return Frac()(self) - - def sqrt(self): - return Sqrt()(self) - - def round(self): - return Round()(self) - - def sign(self): - return Sign()(self) - - def trunc(self): - return Trunc()(self) - - def fmod(self, value): - return Fmod(value)(self) - - def remainder(self, value): - return Remainder(value)(self) - - def lerp(self, tensor, weight): - return Lerp(weight)(self, tensor) - - def rsqrt(self): - return Rsqrt()(self) - - def sum(self, dim=None): - return Sum(dim)(self) - - def prod(self, dim=None): - return Prod(dim)(self) - - def mean(self, dim=None): - return Mean(dim)(self) - - def max(self, dim=None): - if isinstance(dim, Variable): - return Cmax()(self, dim) - return Max(dim)(self) - - def min(self, dim=None): - if isinstance(dim, Variable): - return Cmin()(self, dim) - return Min(dim)(self) - - def mode(self, dim): - return Mode(dim)(self) - - def median(self, dim): - return Median(dim)(self) - - def kthvalue(self, dim): - return Kthvalue(dim)(self) - - def sort(self, dim=None, descending=False): - return Sort(dim, descending)(self) - - def topk(self, k, dim=None, largest=True, sorted=True): - return Topk(k, dim, largest, sorted)(self) - - def view(self, *sizes): - return View(*sizes)(self) - - def view_as(self, tensor): - return View(*tensor.size())(self) - - def split(self, split_size, dim=0): - return torch.split(self, split_size, dim) - - def repeat(self, *repeats): - if len(repeats) == 1 and isinstance(repeats[0], torch.Size): - repeats = repeats[0] - else: - repeats = torch.Size(repeats) - return Repeat(repeats)(self) - - def cumsum(self, dim): - return Cumsum(dim)(self) - - def var(self, dim=None, unbiased=True): - mean = self.mean(dim) - if dim is None: - mean = mean.view(*(1 for s in self.size())) - mean_expanded = mean.expand_as(self) - zero_centered = self.sub(mean_expanded) - var = zero_centered.mul(zero_centered).sum(dim) - numel = self.numel() if dim is None else self.size(dim) - return var.div(numel - int(unbiased)) - - def std(self, dim=None, unbiased=True): - return self.var(dim, unbiased).sqrt() - - def renorm(self, p, dim, maxnorm): - t = self.transpose(dim, 0) - flat = t.contiguous().view(self.size(0), -1) - norms = flat.norm(p, 1) - norms = norms.clamp(max=maxnorm).div(norms.add(1e-7)) - flat_out = flat.mul(norms.expand_as(flat)) - return flat_out.view(t.size()).transpose(dim, 0) - - @staticmethod - def _static_blas(cls, args, inplace): - num_args = len(args) - alpha = beta = 1 - if num_args > 5: - raise RuntimeError("too many args") - if num_args == 5: - alpha, beta = args[1:3] - if num_args == 4: - alpha = args[1] - return cls(alpha, beta, inplace)(*(args[:1] + args[-2:])) - - def _blas(self, cls, args, inplace): - return self._static_blas(cls, (self,) + args, inplace) - - def mm(self, matrix): - output = Variable(self.data.new(self.data.size(0), matrix.data.size(1))) - return self._static_blas(Addmm, (output, 0, 1, self, matrix), False) - - def bmm(self, batch): - output = 
Variable(self.data.new(self.data.size(0), self.data.size(1), - batch.data.size(2))) - return self._static_blas(Baddbmm, (output, 0, 1, self, batch), False) - - def mv(self, vector): - output = Variable(self.data.new(self.data.size(0))) - return self._static_blas(Addmv, (output, 0, 1, self, vector), False) - - def ger(self, vector): - output = Variable(self.data.new(self.data.size(0), vector.data.size(0))) - return self._static_blas(Addr, (output, 0, 1, self, vector), False) - - def resize(self, *sizes): - return Resize(*sizes)(self) - - def resize_as(self, variable): - return Resize(*variable.size())(self) - - def addmm(self, *args): - return self._blas(Addmm, args, False) - - def addmm_(self, *args): - return self._blas(Addmm, args, True) - - def addbmm(self, *args): - return self._blas(Addbmm, args, False) - - def addbmm_(self, *args): - return self._blas(Addbmm, args, True) - - def baddbmm(self, *args): - return self._blas(Baddbmm, args, False) - - def baddbmm_(self, *args): - return self._blas(Baddbmm, args, True) - - def addmv(self, *args): - return self._blas(Addmv, args, False) - - def addmv_(self, *args): - return self._blas(Addmv, args, True) - - def addr(self, *args): - return self._blas(Addr, args, False) - - def addr_(self, *args): - return self._blas(Addr, args, True) - - def dot(self, other): - return Dot()(self, other) - - def _addcop(self, op, args): - if len(args) == 3: - # scale, tensor1, tensor2 - return op(args[0])(self, *args[1:]) - else: - # tensor1, tensor2 - return op()(self, *args) - - def addcmul(self, *args): - return self._addcop(Addcmul, args) - - def addcdiv(self, *args): - return self._addcop(Addcdiv, args) - - def norm(self, p=2, dim=None): - return Norm(p, dim)(self) - - def dist(self, tensor, p=2): - return Norm(p)(self - tensor) - - def index_add(self, dim, index, tensor): - return IndexAdd(dim)(self, index, tensor) - - def index_add_(self, dim, index, tensor): - return IndexAdd(dim, True)(self, index, tensor) - - def index_copy(self, dim, index, tensor): - return IndexCopy(dim)(self, index, tensor) - - def index_copy_(self, dim, index, tensor): - return IndexCopy(dim, True)(self, index, tensor) - - def index_fill(self, dim, index, value): - return IndexFill(dim, value)(self, index) - - def index_fill_(self, dim, index, value): - return IndexFill(dim, value, True)(self, index) - - def index_select(self, dim, index): - return IndexSelect(dim)(self, index) - - def gather(self, dim, index): - return Gather(dim)(self, index) - - def scatter(self, dim, index, source): - return Scatter(dim)(self, index, source) - - def scatter_(self, dim, index, source): - return Scatter(dim, True)(self, index, source) - - def masked_copy(self, mask, variable): - return MaskedCopy()(self, mask, variable) - - def masked_copy_(self, mask, variable): - return MaskedCopy(True)(self, mask, variable) - - def masked_fill(self, mask, value): - return MaskedFill(value)(self, mask) - - def masked_fill_(self, mask, value): - return MaskedFill(value, True)(self, mask) - - def masked_select(self, mask): - return MaskedSelect()(self, mask) - - def expand(self, *sizes): - if isinstance(sizes[0], torch.Size): - if len(sizes) > 1: - raise ValueError("expand expects a several ints or a single " - "torch.Size argument") - sizes = sizes[0] - return Expand(sizes)(self) - - def expand_as(self, tensor): - return Expand(tensor.size())(self) - - def t(self): - return Transpose(0, 1)(self) - - def transpose(self, dim1, dim2): - return Transpose(dim1, dim2)(self) - - def select(self, dim, _index): - dim 
= dim if dim >= 0 else dim + self.dim() - index = tuple(slice(None, None) for _ in range(dim)) + (_index,) - return Index(index)(self) - - def narrow(self, dim, start_index, length): - dim = dim if dim >= 0 else dim + self.dim() - index = tuple(slice(None, None) for _ in range(dim)) + \ - (slice(start_index, start_index + length),) - - return Index(index)(self) - - def chunk(self, num_chunks, dim=0): - return Chunk(num_chunks, dim)(self) - - def squeeze(self, dim=None): - return Squeeze(dim)(self) - - def unsqueeze(self, dim): - return Unsqueeze(dim)(self) - - def permute(self, *permutation): - return Permute(permutation)(self) - - def diag(self, diagonal_idx=0): - return Diag(diagonal_idx)(self) - - def tril(self, diagonal_idx=0): - return Tril(diagonal_idx)(self) - - def triu(self, diagonal_idx=0): - return Triu(diagonal_idx)(self) - - def trace(self): - return Trace()(self) - - def cross(self, other, dim=-1): - return Cross(dim)(self, other) - - def multinomial(self, num_samples=1, with_replacement=False): - return Multinomial(num_samples, with_replacement)(self) - - def bernoulli(self): - return Bernoulli()(self) - - def eq(self, other): - if isinstance(other, Variable): - return Eq()(self, other) - assert not torch.is_tensor(other), "can't compare Variable and tensor" - return Eq(other)(self) - - def ne(self, other): - if isinstance(other, Variable): - return Ne()(self, other) - assert not torch.is_tensor(other), "can't compare Variable and tensor" - return Ne(other)(self) - - def gt(self, other): - if isinstance(other, Variable): - return Gt()(self, other) - assert not torch.is_tensor(other), "can't compare Variable and tensor" - return Gt(other)(self) - - def ge(self, other): - if isinstance(other, Variable): - return Ge()(self, other) - assert not torch.is_tensor(other), "can't compare Variable and tensor" - return Ge(other)(self) - - def lt(self, other): - if isinstance(other, Variable): - return Lt()(self, other) - assert not torch.is_tensor(other), "can't compare Variable and tensor" - return Lt(other)(self) - - def le(self, other): - if isinstance(other, Variable): - return Le()(self, other) - assert not torch.is_tensor(other), "can't compare Variable and tensor" - return Le(other)(self) - - def __add__(self, other): - return self.add(other) - __radd__ = __add__ - - def __iadd__(self, other): - return self.add_(other) - - def __sub__(self, other): - return self.sub(other) - - def __isub__(self, other): - return self.sub_(other) - - def __rsub__(self, other): - return SubConstant(other, sub_tensor=True)(self) - - def __mul__(self, other): - return self.mul(other) - __rmul__ = __mul__ - - def __imul__(self, other): - return self.mul_(other) - - def __matmul__(self, other): - dim_self = self.dim() - try: - dim_other = other.dim() - except AttributeError: # not a Variable - return NotImplemented - if dim_self == 1 and dim_other == 1: - return self.dot(other) - if dim_self == 2 and dim_other == 1: - return self.mv(other) - if dim_self == 1 and dim_other == 2: - return self.unsqueeze(0).mm(other).squeeze(0) - elif dim_self == 2 and dim_other == 2: - return self.mm(other) - raise ValueError("both arguments to __matmul__ need to be 1D or 2D, " - "but they are {}D and {}D".format(dim_self, dim_other)) - - def __div__(self, other): - return self.div(other) - __truediv__ = __div__ - - def __rdiv__(self, other): - return DivConstant(other, div_by_tensor=True)(self) - __rtruediv__ = __rdiv__ - - def __idiv__(self, other): - return self.div_(other) - - def __pow__(self, other): - return 
self.pow(other) - - def __ipow__(self, other): - raise NotImplementedError("in-place pow not implemented") - - def __rpow__(self, other): - return PowConstant(other, tensor_power=True)(self) - - def __neg__(self): - return Negate()(self) - - def __len__(self): - return len(self.data) - - def __iter__(self): - return iter(map(lambda i: self[i], range(self.size(0)))) - - def __mod__(self, other): - return self.remainder(other) - - def __eq__(self, other): - return self.eq(other) - - def __ne__(self, other): - return self.ne(other) - - def __lt__(self, other): - return self.lt(other) - - def __le__(self, other): - return self.le(other) - - def __gt__(self, other): - return self.gt(other) - - def __ge__(self, other): - return self.ge(other) - - def __hash__(self): - return id(self) - - class _torch(object): - - @staticmethod - def cat(iterable, dim=0): - return Concat(dim)(*iterable) - - @staticmethod - def normal(means, std=1): - if isinstance(std, Variable): - return Normal()(means, std) - else: - return Normal(std)(means) - - @staticmethod - def _blas(cls, args, inplace): - num_args = len(args) - alpha = beta = 1 - if num_args > 5: - raise RuntimeError("too many args") - if num_args == 5: - alpha, beta = args[0], args[2] - tensors = args[1:2] + args[3:] - elif num_args == 4: - alpha = args[0] - tensors = args[1:] - else: - tensors = args - return cls(alpha, beta, inplace)(*tensors) - - @classmethod - def addmm(cls, *args): - return cls._blas(Addmm, args, False) - - @classmethod - def addbmm(cls, *args): - return cls._blas(Addbmm, args, False) - - @classmethod - def baddbmm(cls, *args): - return cls._blas(Baddbmm, args, False) - - @classmethod - def addmv(cls, *args): - return cls._blas(Addmv, args, False) - - @classmethod - def addr(cls, *args): - return cls._blas(Addr, args, False)
- - -for method in dir(Variable): - # This will also wrap some methods that normally aren't part of the - # functional interface, but we don't care, as they won't ever be used - if method.startswith('_') or method.endswith('_'): - continue - if hasattr(Variable._torch, method): - continue - as_static = staticmethod(getattr(Variable, method)) - setattr(Variable._torch, method, as_static) - - -from .engine import ImperativeEngine -Variable._execution_engine = ImperativeEngine() -
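Putting a few of the Variable methods documented above together (hooks, backward, detach); a hedged sketch against this legacy API:

import torch
from torch.autograd import Variable

v = Variable(torch.Tensor([1, 2, 3]), requires_grad=True)
h = v.register_hook(lambda grad: grad * 2)   # double incoming gradients
(v * v).sum().backward()
print(v.grad.data)                           # 2 * 2v = [4, 8, 12]
h.remove()

frozen = v.detach()                          # shares data, never requires grad
print(frozen.requires_grad)                  # False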
\ No newline at end of file
diff --git a/docs/_modules/torch/cuda.html b/docs/_modules/torch/cuda.html deleted file mode 100644 index 6aca35a05572..000000000000 --- a/docs/_modules/torch/cuda.html +++ /dev/null @@ -1,988 +0,0 @@
-    torch.cuda — PyTorch 0.1.11 documentation

    Source code for torch.cuda

    -"""
-This package adds support for CUDA tensor types, which implement the same
-functions as CPU tensors but utilize GPUs for computation.
    -
    -It is lazily initialized, so you can always import it, and use
    -:func:`is_available()` to determine if your system supports CUDA.
    -
    -:ref:`cuda-semantics` has more details about working with CUDA.
    -"""
    -
    -import contextlib
    -import platform
    -import ctypes
    -import os
    -import torch
    -from multiprocessing.util import register_after_fork as _register_after_fork
    -
    -_initialized = False
    -_in_bad_fork = False
    -_original_pid = False
    -_cudart = None
    -
    -
    -
    [docs]def is_available(): - """Returns a bool indicating if CUDA is currently available.""" - if (not hasattr(torch._C, '_cuda_isDriverSufficient') or - not torch._C._cuda_isDriverSufficient()): - return False - try: - return torch._C._cuda_getDeviceCount() > 0 - except RuntimeError as e: - if 'no CUDA-capable device is detected' in e.args[0]: - return False - raise
    - - -def _sleep(cycles): - torch._C._cuda_sleep(cycles) - - -def _load_cudart(): - # First check the main program for CUDA symbols - lib = ctypes.cdll.LoadLibrary(None) - if hasattr(lib, 'cudaGetErrorName'): - return lib - - raise RuntimeError( - "couldn't find libcudart. Make sure CUDA libraries are installed in a" - "default location, or that they're in {}." - .format('DYLD_LIBRARY_PATH' if platform.system() == 'Darwin' else - 'LD_LIBRARY_PATH')) - - -def _check_driver(): - if not hasattr(torch._C, '_cuda_isDriverSufficient'): - raise AssertionError("Torch not compiled with CUDA enabled") - if not torch._C._cuda_isDriverSufficient(): - if torch._C._cuda_getDriverVersion() == 0: - # found no NVIDIA driver on the system - raise AssertionError(""" -Found no NVIDIA driver on your system. Please check that you -have an NVIDIA GPU and installed a driver from -http://www.nvidia.com/Download/index.aspx""") - else: - # TODO: directly link to the alternative bin that needs install - raise AssertionError(""" -The NVIDIA driver on your system is too old (found version {}). -Please update your GPU driver by downloading and installing a new -version from the URL: http://www.nvidia.com/Download/index.aspx -Alternatively, go to: https://pytorch.org/binaries to install -a PyTorch version that has been compiled with your version -of the CUDA driver.""".format(str(torch._C._cuda_getDriverVersion()))) - - -def _lazy_init(): - global _initialized, _cudart, _original_pid - if _initialized: - return - if _in_bad_fork: - from sys import version_info - if version_info < (3, 4): - msg = ("To use CUDA with multiprocessing, you must use Python " - "3.4+ and the 'spawn' start method") - else: - msg = ("To use CUDA with multiprocessing, you must use the " - "'spawn' start method") - raise RuntimeError( - "Cannot re-initialize CUDA in forked subprocess. " + msg) - _check_driver() - assert torch._C._cuda_init() - assert torch._C._cuda_sparse_init() - _cudart = _load_cudart() - _cudart.cudaGetErrorName.restype = ctypes.c_char_p - _cudart.cudaGetErrorString.restype = ctypes.c_char_p - _original_pid = os.getpid() - _initialized = True - - -def _after_fork(arg): - global _initialized, _in_bad_fork - if _initialized and _original_pid != os.getpid(): - _initialized = False - _in_bad_fork = True - - -_register_after_fork(_after_fork, _after_fork) - - -def cudart(): - _lazy_init() - return _cudart - - -
    [docs]class device(object): - """Context-manager that changes the selected device. - - Arguments: - idx (int): device index to select. It's a no-op if this argument - is negative. - """ - - def __init__(self, idx): - self.idx = idx - self.prev_idx = -1 - - def __enter__(self): - if self.idx is -1: - return - _lazy_init() - self.prev_idx = torch._C._cuda_getDevice() - if self.prev_idx != self.idx: - torch._C._cuda_setDevice(self.idx) - - def __exit__(self, *args): - if self.prev_idx != self.idx: - torch._C._cuda_setDevice(self.prev_idx) - return False
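Hedged usage of the device context manager above:

import torch

if torch.cuda.is_available():
    with torch.cuda.device(0):                  # select GPU 0 inside the block
        a = torch.cuda.FloatTensor(4).fill_(1)  # allocated on GPU 0
    print(a.get_device())                       # 0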
    - - -
    [docs]class device_of(device): - """Context-manager that changes the current device to that of given object. - - You can use both tensors and storages as arguments. If a given object is - not allocated on a GPU, this is a no-op. - - Arguments: - obj (Tensor or Storage): object allocated on the selected device. - """ - - def __init__(self, obj): - idx = obj.get_device() if obj.is_cuda else -1 - super(device_of, self).__init__(idx)
    - - -
    [docs]def set_device(device): - """Sets the current device. - - Usage of this function is discouraged in favor of :any:`device`. In most - cases it's better to use ``CUDA_VISIBLE_DEVICES`` environmental variable. - - Arguments: - device (int): selected device. This function is a no-op if this - argument is negative. - """ - if device >= 0: - torch._C._cuda_setDevice(device)
    - - -@contextlib.contextmanager -
    [docs]def stream(stream): - """Context-manager that selects a given stream. - - All CUDA kernels queued within its context will be enqueued on a selected - stream. - - Arguments: - stream (Stream): selected stream. This manager is a no-op if it's - ``None``. - """ - if stream is None: - yield - return - prev_stream = current_stream() - torch._C._cuda_setStream(stream._cdata) - try: - yield - finally: - torch._C._cuda_setStream(prev_stream._cdata)
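A sketch of the stream selector, assuming a CUDA tensor x already exists; creating a side stream and synchronizing on it are illustrative choices.

import torch

s = torch.cuda.Stream()      # a new stream on the current device
with torch.cuda.stream(s):
    y = x * 2                # kernels launched here are enqueued on s, not the default stream
s.synchronize()              # block the host until all work queued on s has completed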
    - - -
    [docs]def device_count(): - """Returns the number of GPUs available.""" - if is_available(): - _lazy_init() - return torch._C._cuda_getDeviceCount() - else: - return 0
    - - -
    [docs]def current_device(): - """Returns the index of a currently selected device.""" - _lazy_init() - return torch._C._cuda_getDevice()
    - - -
    [docs]def synchronize(): - """Waits for all kernels in all streams on current device to complete.""" - _lazy_init() - return torch._C._cuda_synchronize()
    - - -
    [docs]def current_stream(): - """Returns a currently selected :class:`Stream`.""" - _lazy_init() - return torch.cuda.Stream(_cdata=torch._C._cuda_getCurrentStream())
    - - -
    [docs]def current_blas_handle(): - """Returns cublasHandle_t pointer to current cuBLAS handle""" - return torch._C._cuda_getCurrentBlasHandle()
    - - -def _host_allocator(): - _lazy_init() - return torch._C._cuda_cudaHostAllocator() - - -@contextlib.contextmanager -def _free_mutex(): - torch._C._cuda_lock_mutex() - try: - yield - finally: - torch._C._cuda_unlock_mutex() - - -from .random import * - -################################################################################ -# Define Storage and Tensor classes -################################################################################ - - -from ..tensor import _TensorBase -from ..storage import _StorageBase - - -def _dummy_type(name): - def init_err(self): - class_name = self.__class__.__name__ - raise RuntimeError( - "Tried to instantiate dummy base class {}".format(class_name)) - return type(storage_name, (object,), {"__init__": init_err}) - - -if not hasattr(torch._C, 'CudaDoubleStorageBase'): - # Define dummy base classes - for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte', 'Half']: - storage_name = 'Cuda{0}StorageBase'.format(t) - tensor_name = 'Cuda{0}TensorBase'.format(t) - - torch._C.__dict__[storage_name] = _dummy_type(storage_name) - torch._C.__dict__[tensor_name] = _dummy_type(tensor_name) - - torch._C.__dict__['_CudaStreamBase'] = _dummy_type('CudaStreamBase') - - -class _CudaBase(object): - is_cuda = True - is_sparse = False - - def type(self, *args, **kwargs): - with device(self.get_device()): - return super(_CudaBase, self).type(*args, **kwargs) - - def __new__(cls, *args, **kwargs): - _lazy_init() - # We need this method only for lazy init, so we can remove it - del _CudaBase.__new__ - return super(_CudaBase, cls).__new__(cls, *args, **kwargs) - - -class DoubleStorage(_CudaBase, torch._C.CudaDoubleStorageBase, _StorageBase): - pass - - -class FloatStorage(_CudaBase, torch._C.CudaFloatStorageBase, _StorageBase): - pass - - -class LongStorage(_CudaBase, torch._C.CudaLongStorageBase, _StorageBase): - pass - - -class IntStorage(_CudaBase, torch._C.CudaIntStorageBase, _StorageBase): - pass - - -class ShortStorage(_CudaBase, torch._C.CudaShortStorageBase, _StorageBase): - pass - - -class CharStorage(_CudaBase, torch._C.CudaCharStorageBase, _StorageBase): - pass - - -class ByteStorage(_CudaBase, torch._C.CudaByteStorageBase, _StorageBase): - pass - - -class HalfStorage(_CudaBase, torch._C.CudaHalfStorageBase, _StorageBase): - pass - - -class DoubleTensor(_CudaBase, torch._C.CudaDoubleTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return DoubleStorage - - -class FloatTensor(_CudaBase, torch._C.CudaFloatTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return FloatStorage - - -class LongTensor(_CudaBase, torch._C.CudaLongTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return LongStorage - - -class IntTensor(_CudaBase, torch._C.CudaIntTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return IntStorage - - -class ShortTensor(_CudaBase, torch._C.CudaShortTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return ShortStorage - - -class CharTensor(_CudaBase, torch._C.CudaCharTensorBase, _TensorBase): - - def is_signed(self): - # TODO - return False - - @classmethod - def storage_type(cls): - return CharStorage - - -class ByteTensor(_CudaBase, torch._C.CudaByteTensorBase, _TensorBase): - - def is_signed(self): - return False - - @classmethod - def 
storage_type(cls): - return ByteStorage - - -class HalfTensor(_CudaBase, torch._C.CudaHalfTensorBase, _TensorBase): - - def is_signed(self): - return True - - @classmethod - def storage_type(cls): - return HalfStorage - - -torch._storage_classes.add(DoubleStorage) -torch._storage_classes.add(FloatStorage) -torch._storage_classes.add(LongStorage) -torch._storage_classes.add(IntStorage) -torch._storage_classes.add(ShortStorage) -torch._storage_classes.add(CharStorage) -torch._storage_classes.add(ByteStorage) -torch._storage_classes.add(HalfStorage) - -torch._tensor_classes.add(DoubleTensor) -torch._tensor_classes.add(FloatTensor) -torch._tensor_classes.add(LongTensor) -torch._tensor_classes.add(IntTensor) -torch._tensor_classes.add(ShortTensor) -torch._tensor_classes.add(CharTensor) -torch._tensor_classes.add(ByteTensor) -torch._tensor_classes.add(HalfTensor) - -from . import sparse -from .streams import Stream, Event -
\ No newline at end of file
diff --git a/docs/_modules/torch/cuda/comm.html b/docs/_modules/torch/cuda/comm.html
deleted file mode 100644
index bc484c0ca47b..000000000000
--- a/docs/_modules/torch/cuda/comm.html
+++ /dev/null
@@ -1,822 +0,0 @@
-    torch.cuda.comm — PyTorch 0.1.11 documentation

    Source code for torch.cuda.comm

    -import torch
    -from . import nccl
    -from torch._utils import _accumulate
    -
    -# TODO: sync streams when implemented
    -
    -
    -
    [docs]def broadcast(tensor, devices): - """Broadcasts a tensor to a number of GPUs. - - Arguments: - tensor (Tensor): tensor to broadcast. - devices (Iterable): an iterable of devices among which to broadcast. - Note that it should be like (src, dst1, dst2, ...), the first element - of which is the source device to broadcast from. - - Returns: - A tuple containing copies of the ``tensor``, placed on devices - corresponding to indices from ``devices``. - """ - if nccl.is_available([tensor]) and len(set(devices)) == len(devices): - tensors = [tensor] - for device in devices[1:]: - with torch.cuda.device(device): - tensors.append(type(tensor)(tensor.size())) - nccl.broadcast(tensors) - return tuple(tensors) - - # TODO: copy to a pinned buffer first (if copy is from CPU) - return tuple(tensor.cuda(gpu, async=True) for gpu in devices)
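A usage sketch for broadcast, assuming two visible GPUs; the tensor contents are arbitrary.

import torch
from torch.cuda import comm

src = torch.cuda.FloatTensor(5).normal_()   # source tensor on GPU 0, the first device listed below
copies = comm.broadcast(src, (0, 1))        # tuple of copies, one per device in (src, dst, ...) order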
    - - -def broadcast_coalesced(tensors, devices, buffer_size=10485760): - """Broadcasts a sequence tensors to the specified GPUs. - - Small tensors are first coalesced into a buffer to reduce the number - of synchronizations. - - Arguments: - tensors (sequence): tensors to broadcast. - devices (Iterable): an iterable of devices among which to broadcast. - Note that it should be like (src, dst1, dst2, ...), the first element - of which is the source device to broadcast from. - buffer_size (int): maximum size of the buffer used for coalescing - - Returns: - A tuple containing copies of the ``tensor``, placed on devices - corresponding to indices from ``devices``. - """ - for tensor in tensors: - if tensor.get_device() != devices[0]: - raise RuntimeError('all tensors must be on devices[0]') - outputs = [[] for _ in devices] - # use the original tensors for the first device - outputs[0].extend(tensors) - for chunk in _take_tensors(tensors, buffer_size): - results = broadcast(_flatten_tensors(chunk), devices) - # use the broadcasted tensors for the remaining devices - for dst, res in zip(outputs[1:], results[1:]): - dst.extend(_unflatten_tensors(res, chunk)) - return tuple(outputs) - - -
    [docs]def reduce_add(inputs, destination=None): - """Sums tensors from multiple GPUs. - - All inputs should have matching shapes. - - Arguments: - inputs (Iterable[Tensor]): an iterable of tensors to add. - destination (int, optional): a device on which the output will be - placed (default: current device). - - Returns: - A tensor containing an elementwise sum of all inputs, placed on the - ``destination`` device. - """ - # TODO: try to find an input on another gpu, copy it, - # and accumulate into the copy - input_size = inputs[0].size() - for i, inp in enumerate(inputs): - assert inp.is_cuda, "reduce_add expects all inputs to be on GPUs" - if inp.size() != input_size: - got = 'x'.join(str(x) for x in inp.size()) - expected = 'x'.join(str(x) for x in input_size) - raise ValueError("input {} has invalid size: got {}, but expected " - "{}".format(i, got, expected)) - if destination is None: - destination = torch.cuda.current_device() - with torch.cuda.device(destination): - result = type(inp)(input_size).zero_() - - if nccl.is_available(inputs) and inputs[0].get_device() == destination: - outputs = [result] + [t.new(t.size()) for t in inputs[1:]] - nccl.reduce(inputs, outputs) - return result - - for inp in inputs: - input_correct_gpu = inp.cuda(result.get_device()) - result.add_(input_correct_gpu) - return result
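A sketch of reduce_add under the same two-GPU assumption; the fill values are arbitrary.

import torch
from torch.cuda import comm

a = torch.cuda.FloatTensor(3).fill_(1)           # lives on GPU 0
with torch.cuda.device(1):
    b = torch.cuda.FloatTensor(3).fill_(2)       # lives on GPU 1
total = comm.reduce_add([a, b], destination=0)   # elementwise sum, placed on GPU 0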
    - - -def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760): - """Sums tensors from multiple GPUs. - - Small tensors are first coalesced into a buffer to reduce the number - of synchronizations. - - Arguments: - inputs (Iterable[Tensor]): an iterable of tensors to add. - destination (int, optional): a device on which the output will be - placed (default: current device). - buffer_size (int): maximum size of the buffer used for coalescing - - Returns: - A tuple of tensors containing an elementwise sum of each group of - inputs, placed on the ``destination`` device. - """ - output = [] - itrs = [_take_tensors(tensors, buffer_size) for tensors in inputs] - for chunks in zip(*itrs): - flattened = [_flatten_tensors(chunk) for chunk in chunks] - result = reduce_add(flattened, destination) - output.extend(_unflatten_tensors(result, chunks[0])) - return tuple(output) - - -
[docs]def scatter(tensor, devices, chunk_sizes=None, dim=0, streams=None): - """Scatters tensor across multiple GPUs. - - Arguments: - tensor (Tensor): tensor to scatter. - devices (Iterable[int]): iterable of ints, specifying among which - devices the tensor should be scattered. - chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on - each device. It should match ``devices`` in length and sum to - ``tensor.size(dim)``. If not specified, the tensor will be divided - into equal chunks. - dim (int, optional): A dimension along which to chunk the tensor. - - Returns: - A tuple containing chunks of the ``tensor``, spread across given - ``devices``. - """ - if chunk_sizes is None: - chunks = tensor.chunk(len(devices), dim) - else: - assert sum(chunk_sizes) == tensor.size(dim), "given chunk sizes " \ - "don't sum up to the tensor's size (sum(chunk_sizes) == {}, but " \ - "expected {})".format(sum(chunk_sizes), tensor.size(dim)) - assert min(chunk_sizes) > 0, "got a negative chunk_size" - chunks = [tensor.narrow(dim, start - size, size) - for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)] - chunks = tuple(chunk.contiguous() for chunk in chunks) - # TODO: copy to a pinned buffer first (if copying from CPU) - if streams is None: - streams = [None] * len(devices) - outputs = [] - for device, chunk, stream in zip(devices, chunks, streams): - with torch.cuda.device(device), torch.cuda.stream(stream): - outputs.append(chunk.cuda(device, async=True)) - return tuple(outputs)
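A sketch of scatter, again assuming GPUs 0 and 1; the input shape and chunk sizes are invented for illustration.

import torch
from torch.cuda import comm

x = torch.randn(8, 10)                                # CPU tensor to spread across the listed devices
even = comm.scatter(x, (0, 1))                        # two 4x10 chunks along dim 0, one per GPU
uneven = comm.scatter(x, (0, 1), chunk_sizes=(6, 2))  # explicit sizes must sum to x.size(0)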
    - - -
    [docs]def gather(tensors, dim=0, destination=None): - """Gathers tensors from multiple GPUs. - - Tensor sizes in all dimension different than ``dim`` have to match. - - Arguments: - tensors (Iterable[Tensor]): iterable of tensors to gather. - dim (int): a dimension along which the tensors will be concatenated. - destination (int, optional): output device (-1 means CPU, default: - current device) - - Returns: - A tensor located on ``destination`` device, that is a result of - concatenating ``tensors`` along ``dim``. - """ - total_size = 0 - expected_size = list(tensors[0].size()) - for tensor in tensors: - assert tensor.is_cuda, "gather expects all inputs to be on GPUs" - expected_size[dim] = tensor.size(dim) - if list(tensor.size()) != expected_size: - got = 'x'.join(str(x) for x in tensor.size()) - expected = 'x'.join(str(x) for x in expected_size) - raise ValueError("gather got an input of invalid size: got {}, " - "but expected {}".format(got, expected)) - total_size += tensor.size(dim) - expected_size[dim] = total_size - expected_size = torch.Size(expected_size) - if destination is None: - destination = torch.cuda.current_device() - if destination == -1: - result = getattr(torch, type(tensors[0]).__name__)(expected_size) - else: - with torch.cuda.device(destination): - result = type(tensors[0])(expected_size) - - chunk_start = 0 - # TODO: if copying to CPU, allocate a pinned buffer, do async copies to it, - # and copy it to regular memory - for tensor in tensors: - result.narrow(dim, chunk_start, tensor.size(dim)).copy_(tensor, True) - chunk_start += tensor.size(dim) - return result
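A sketch of gather that reassembles the chunks produced by the scatter example above; destination=-1 collects the result on the CPU, as the docstring states.

from torch.cuda import comm

y = comm.gather(even, dim=0)                       # concatenated on the current GPU
y_cpu = comm.gather(even, dim=0, destination=-1)   # or gathered back into host memory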
    - - -def _flatten_tensors(tensors): - """Flatten tensors into a single contiguous 1D buffer""" - if len(tensors) == 1: - return tensors[0].contiguous().view(-1) - size = sum(tensor.numel() for tensor in tensors) - offset = 0 - flat = tensors[0].new(size) - for tensor in tensors: - flat.narrow(0, offset, tensor.numel()).copy_(tensor) - offset += tensor.numel() - return flat - - -def _unflatten_tensors(flat, tensors): - """View a flat buffer using the sizes of tensors""" - outputs = [] - offset = 0 - for tensor in tensors: - outputs.append(flat.narrow(0, offset, tensor.numel()).view_as(tensor)) - offset += tensor.numel() - return tuple(outputs) - - -def _take_tensors(tensors, size_limit): - """Groups tensors into lists of up to size_limit bytes""" - buf = [] - size = 0 - for tensor in tensors: - param_size = tensor.numel() * tensor.element_size() - if size + param_size > size_limit and size > 0: - yield buf - size = 0 - buf = [] - buf.append(tensor) - size += param_size - if len(buf) > 0: - yield buf -
\ No newline at end of file
diff --git a/docs/_modules/torch/cuda/streams.html b/docs/_modules/torch/cuda/streams.html
deleted file mode 100644
index 405d65c3324a..000000000000
--- a/docs/_modules/torch/cuda/streams.html
+++ /dev/null
@@ -1,777 +0,0 @@
-    torch.cuda.streams — PyTorch 0.1.11 documentation

    Source code for torch.cuda.streams

    -import ctypes
    -import torch
    -from . import cudart
    -
    -
    -SUCCESS = 0
    -ERROR_NOT_READY = 34
    -
    -
    -class CudaError(RuntimeError):
    -
    -    def __init__(self, code):
    -        msg = cudart().cudaGetErrorString(code).decode('utf-8')
    -        super(CudaError, self).__init__('{0} ({1})'.format(msg, code))
    -
    -
    -def check_error(res):
    -    if res != SUCCESS:
    -        raise CudaError(res)
    -
    -
    -
    [docs]class Stream(torch._C._CudaStreamBase): - """Wrapper around a CUDA stream. - - Arguments: - device(int, optional): a device on which to allocate the Stream. - priority(int, optional): priority of the stream. Lower numbers - represent higher priorities. - """ - - def __new__(cls, device=-1, priority=0, **kwargs): - with torch.cuda.device(device): - return super(Stream, cls).__new__(cls, priority=priority, **kwargs) - -
    [docs] def wait_event(self, event): - """Makes all future work submitted to the stream wait for an event. - - Arguments: - event (Event): an event to wait for. - """ - check_error(cudart().cudaStreamWaitEvent(self, event, ctypes.c_int(0)))
    - -
    [docs] def wait_stream(self, stream): - """Synchronizes with another stream. - - All future work submitted to this stream will wait until all kernels - submitted to a given stream at the time of call complete. - - Arguments: - stream (Stream): a stream to synchronize. - """ - self.wait_event(stream.record_event())
    - -
    [docs] def record_event(self, event=None): - """Records an event. - - Arguments: - event (Event, optional): event to record. If not given, a new one - will be allocated. - - Returns: - Recorded event. - """ - if event is None: - event = Event() - check_error(cudart().cudaEventRecord(event, self)) - return event
    - -
    [docs] def query(self): - """Checks if all the work submitted has been completed. - - Returns: - A boolean indicating if all kernels in this stream are completed. - """ - res = cudart().cudaStreamQuery(self) - if res == ERROR_NOT_READY: - return False - check_error(res) - return True
    - -
    [docs] def synchronize(self): - """Wait for all the kernels in this stream to complete.""" - check_error(cudart().cudaStreamSynchronize(self))
    - - @staticmethod - def priority_range(): - least_priority = ctypes.c_int() - greatest_priority = ctypes.c_int() - check_error(cudart().cudaDeviceGetStreamPriorityRange( - ctypes.byref(least_priority), ctypes.byref(greatest_priority))) - return (least_priority.value, greatest_priority.value) - - @property - def priority(self): - priority = ctypes.c_int() - check_error(cudart().cudaStreamGetPriority(self, ctypes.byref(priority))) - return priority.value - - @property - def _as_parameter_(self): - return ctypes.c_void_p(self.cuda_stream) - - def __eq__(self, o): - if isinstance(o, Stream): - return o.device == self.device and o.cuda_stream == self.cuda_stream - return False - - def __hash__(self): - return hash((self.cuda_stream, self.device)) - - def __repr__(self): - return ('<torch.cuda.Stream device={0} cuda_stream={1:#x}>' - .format(self.device, self.cuda_stream))
    - - -class EventHandle(ctypes.Structure): - IPC_HANDLE_SIZE = 64 - _fields_ = [('reserved', ctypes.c_char * IPC_HANDLE_SIZE)] - - -
    [docs]class Event(object): - """Wrapper around CUDA event. - - Arguments: - enable_timing (bool): indicates if the event should measure time - (default: False) - blocking (bool): if true, :meth:`wait` will be blocking (default: False) - interprocess (bool): if true, the event can be shared between processes - (default: False) - """ - - DEFAULT = 0x0 - BLOCKING_SYNC = 0x1 - DISABLE_TIMING = 0x2 - INTERPROCESS = 0x4 - - def __init__(self, enable_timing=False, blocking=False, interprocess=False, - _handle=None): - flags = Event.DEFAULT - if not enable_timing: - flags |= Event.DISABLE_TIMING - if blocking: - flags |= Event.BLOCKING_SYNC - if interprocess: - flags |= Event.INTERPROCESS - - ptr = ctypes.c_void_p() - self._cudart = cudart() - if _handle: - check_error(self._cudart.cudaIpcOpenEventHandle(ctypes.byref(ptr), _handle)) - else: - check_error(self._cudart.cudaEventCreateWithFlags(ctypes.byref(ptr), ctypes.c_uint(flags))) - self._as_parameter_ = ptr - - def __del__(self): - if hasattr(self, '_as_parameter_'): - check_error(self._cudart.cudaEventDestroy(self._as_parameter_)) - del self._as_parameter_ - -
    [docs] def record(self, stream=None): - """Records the event in a given stream.""" - if stream is None: - stream = torch.cuda.current_stream() - stream.record_event(self)
    - -
    [docs] def wait(self, stream=None): - """Makes a given stream wait for the event.""" - if stream is None: - stream = torch.cuda.current_stream() - stream.wait_event(self)
    - -
    [docs] def query(self): - """Checks if the event has been recorded. - - Returns: - A boolean indicating if the event has been recorded. - """ - res = cudart().cudaEventQuery(self) - if res == ERROR_NOT_READY: - return False - check_error(res) - return True
    - -
    [docs] def elapsed_time(self, end_event): - """Returns the time elapsed before the event was recorded.""" - time_ms = ctypes.c_float() - check_error(cudart().cudaEventElapsedTime( - ctypes.byref(time_ms), self, end_event)) - return time_ms.value
    - -
    [docs] def synchronize(self): - """Synchronizes with the event.""" - check_error(cudart().cudaEventSynchronize(self))
    - -
    [docs] def ipc_handle(self): - """Returns an IPC handle of this event.""" - handle = EventHandle() - check_error(cudart().cudaIpcGetEventHandle(ctypes.byref(handle), self)) - return handle
    - - def __repr__(self): - return '<torch.cuda.Event {0:#x}>'.format(self._as_parameter_.value)
\ No newline at end of file
diff --git a/docs/_modules/torch/functional.html b/docs/_modules/torch/functional.html
deleted file mode 100644
index 1a770e5d5c8c..000000000000
--- a/docs/_modules/torch/functional.html
+++ /dev/null
@@ -1,684 +0,0 @@
-    torch.functional — PyTorch 0.1.11 documentation

    Source code for torch.functional

    -import torch
    -from ._utils import _range
    -
    -
    -
    [docs]def split(tensor, split_size, dim=0): - """Splits the tensor into equally sized chunks (if possible). - - Last chunk will be smaller if the tensor size along a given dimension - is not divisible by ``split_size``. - - Arguments: - tensor (Tensor): tensor to split. - split_size (int): size of a single chunk. - dim (int): dimension along which to split the tensor. - """ - if dim < 0: - dim += tensor.dim() - dim_size = tensor.size(dim) - num_splits = (dim_size + split_size - 1) // split_size - last_split_size = split_size - (split_size * num_splits - dim_size) - - def get_split_size(i): - return split_size if i < num_splits - 1 else last_split_size - return tuple(tensor.narrow(int(dim), int(i * split_size), int(get_split_size(i))) for i - in _range(0, num_splits))
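A quick illustration of split; the shapes and split size are arbitrary.

import torch

x = torch.randn(10, 4)
parts = torch.split(x, 4)         # chunks of size 4, 4 and 2 along dim 0
cols = torch.split(x, 2, dim=1)   # two 10x2 chunks along dim 1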
    - - -
    [docs]def chunk(tensor, chunks, dim=0): - """Splits a tensor into a number of chunks along a given dimension. - - Arguments: - tensor (Tensor): tensor to split. - chunks (int): number of chunks to return. - dim (int): dimension along which to split the tensor. - """ - if dim < 0: - dim += tensor.dim() - split_size = (tensor.size(dim) + chunks - 1) // chunks - return split(tensor, split_size, dim)
    - - -
[docs]def stack(sequence, dim=0): - """Concatenates sequence of tensors along a new dimension. - - All tensors need to be of the same size. - - Arguments: - sequence (Sequence): sequence of tensors to concatenate. - dim (int): dimension to insert. Has to be between 0 and the number - of dimensions of concatenated tensors (inclusive). - """ - if len(sequence) == 0: - raise TypeError("stack expects a non-empty sequence of tensors") - if dim < 0: - dim += sequence[0].dim() - return torch.cat(list(t.unsqueeze(dim) for t in sequence), dim)
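A small illustration of stack; the shapes are arbitrary.

import torch

a = torch.randn(3, 4)
b = torch.randn(3, 4)
s0 = torch.stack((a, b))       # size 2x3x4: the new dimension is inserted at position 0
s1 = torch.stack((a, b), 1)    # size 3x2x4: the new dimension is inserted at position 1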
    - - -
    [docs]def unbind(tensor, dim=0): - """Removes a tensor dimension. - - Returns a tuple of all slices along a given dimension, already without it. - - Arguments: - tensor (Tensor): tensor to unbind. - dim (int): dimension to remove. - """ - return tuple(tensor.select(dim, i) for i in _range(tensor.size(dim)))
    - - -def btriunpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): - """Unpacks the data and pivots from a batched LU factorization (btrifact) of a tensor. - - Returns a tuple indexed by: - 0: The pivots. - 1: The L tensor. - 2: The U tensor. - - Arguments: - LU_data (Tensor): The packed LU factorization data. - LU_pivots (Tensor): The packed LU factorization pivots. - unpack_data (bool): Flag indicating if the data should be unpacked. - unpack_pivots (bool): Flag indicating if the pivots should be unpacked. - """ - - nBatch, sz, _ = LU_data.size() - - if unpack_data: - I_U = torch.triu(torch.ones(sz, sz)).type_as(LU_data).byte().unsqueeze(0).expand(nBatch, sz, sz) - I_L = 1 - I_U - L = LU_data.new(LU_data.size()).zero_() - U = LU_data.new(LU_data.size()).zero_() - I_diag = torch.eye(sz).type_as(LU_data).byte().unsqueeze(0).expand(nBatch, sz, sz) - L[I_diag] = 1.0 - L[I_L] = LU_data[I_L] - U[I_U] = LU_data[I_U] - else: - L = U = None - - if unpack_pivots: - P = torch.eye(sz).type_as(LU_data).unsqueeze(0).repeat(nBatch, 1, 1) - for i in range(nBatch): - for j in range(sz): - k = LU_pivots[i, j] - 1 - t = P[i, :, j].clone() - P[i, :, j] = P[i, :, k] - P[i, :, k] = t - else: - P = None - - return P, L, U -
\ No newline at end of file
diff --git a/docs/_modules/torch/multiprocessing.html b/docs/_modules/torch/multiprocessing.html
deleted file mode 100644
index 369fd3e41f5f..000000000000
--- a/docs/_modules/torch/multiprocessing.html
+++ /dev/null
@@ -1,640 +0,0 @@
-    torch.multiprocessing — PyTorch 0.1.11 documentation

    Source code for torch.multiprocessing

    -"""
    -torch.multiprocessing is a wrapper around the native :mod:`multiprocessing`
    -module. It registers custom reducers, that use shared memory to provide shared
    -views on the same data in different processes. Once the tensor/storage is moved
    -to shared_memory (see :func:`~torch.Tensor.share_memory_`), it will be possible
    -to send it to other processes without making any copies.
    -
    -The API is 100% compatible with the original module - it's enough to change
    -``import multiprocessing`` to ``import torch.multiprocessing`` to have all the
    -tensors sent through the queues or shared via other mechanisms, moved to shared
    -memory.
    -
    -Because of the similarity of APIs we do not document most of this package
    -contents, and we recommend referring to very good docs of the original module.
    -"""
    -import sys
    -from .reductions import init_reductions
    -import multiprocessing
    -
    -__all__ = ['set_sharing_strategy', 'get_sharing_strategy',
    -           'get_all_sharing_strategies']
    -
    -
    -from multiprocessing import *
    -
    -
    -__all__ += multiprocessing.__all__
    -
    -
    -if sys.version_info < (3, 3):
    -    """Override basic classes in Python 2.7 and Python 3.3 to use ForkingPickler
    -    for serialization. Later versions of Python already use ForkingPickler."""
    -    from .queue import Queue, SimpleQueue
    -    from .pool import Pool
    -
    -
    -if sys.platform == 'darwin':
    -    _sharing_strategy = 'file_system'
    -    _all_sharing_strategies = {'file_system'}
    -else:
    -    _sharing_strategy = 'file_descriptor'
    -    _all_sharing_strategies = {'file_descriptor', 'file_system'}
    -
    -
    -
    [docs]def set_sharing_strategy(new_strategy): - """Sets the strategy for sharing CPU tensors. - - Arguments: - new_strategy (str): Name of the selected strategy. Should be one of - the values returned by :func:`get_all_sharing_strategies()`. - """ - global _sharing_strategy - assert new_strategy in _all_sharing_strategies - _sharing_strategy = new_strategy
    - - -
    [docs]def get_sharing_strategy(): - """Returns the current strategy for sharing CPU tensors.""" - return _sharing_strategy
    - - -
    [docs]def get_all_sharing_strategies(): - """Returns a set of sharing strategies supported on a current system.""" - return _all_sharing_strategies
    - - -init_reductions() -
\ No newline at end of file
diff --git a/docs/_modules/torch/nn/functional.html b/docs/_modules/torch/nn/functional.html
deleted file mode 100644
index 4365a9cd28bb..000000000000
--- a/docs/_modules/torch/nn/functional.html
+++ /dev/null
@@ -1,1279 +0,0 @@
-    torch.nn.functional — PyTorch 0.1.11 documentation

    Source code for torch.nn.functional

    -"""Functional interface"""
    -
    -import torch
    -from . import _functions
    -from .modules import utils
    -from ._functions.padding import ConstantPad2d
    -from .modules.utils import _single, _pair, _triple
    -
    -# Convolutions
    -ConvNd = torch._C._functions.ConvNd
    -
    -
    -
    [docs]def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, - groups=1): - """Applies a 2D convolution over an input image composed of several input - planes. - - See :class:`~torch.nn.Conv2d` for details and output shape. - - Args: - input: input tensor (minibatch x in_channels x iH x iW) - weight: filters tensor (out_channels, in_channels/groups, kH, kW) - bias: optional bias tensor (out_channels) - stride: the stride of the convolving kernel. Can be a single number or - a tuple (sh x sw). Default: 1 - padding: implicit zero padding on the input. Can be a single number or - a tuple. Default: 0 - groups: split input into groups, in_channels should be divisible by - the number of groups - - Examples: - >>> # With square kernels and equal stride - >>> filters = autograd.Variable(torch.randn(8,4,3,3)) - >>> inputs = autograd.Variable(torch.randn(1,4,5,5)) - >>> F.conv2d(inputs, filters, padding=1) - """ - f = ConvNd(_pair(stride), _pair(padding), _pair(dilation), False, - _pair(0), groups, torch.backends.cudnn.benchmark, torch.backends.cudnn.enabled) - return f(input, weight, bias)
    - - -
    [docs]def conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, - groups=1): - """Applies a 1D convolution over an input signal composed of several input - planes. - - See :class:`~torch.nn.Conv1d` for details and output shape. - - Args: - input: input tensor of shape (minibatch x in_channels x iW) - weight: filters of shape (out_channels, in_channels, kW) - bias: optional bias of shape (out_channels) - stride: the stride of the convolving kernel, default 1 - - Examples: - >>> filters = autograd.Variable(torch.randn(33, 16, 3)) - >>> inputs = autograd.Variable(torch.randn(20, 16, 50)) - >>> F.conv1d(inputs, filters) - """ - f = ConvNd(_single(stride), _single(padding), _single(dilation), False, - _single(0), groups, torch.backends.cudnn.benchmark, torch.backends.cudnn.enabled) - return f(input, weight, bias)
    - - -
    [docs]def conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, - groups=1): - """Applies a 3D convolution over an input image composed of several input - planes. - - See :class:`~torch.nn.Conv3d` for details and output shape. - - Args: - input: input tensor of shape (minibatch x in_channels x iT x iH x iW) - weight: filters tensor of shape (out_channels, in_channels, kT, kH, kW) - bias: optional bias tensor of shape (out_channels) - stride: the stride of the convolving kernel. Can be a single number or - a tuple (st x sh x sw). Default: 1 - padding: implicit zero padding on the input. Can be a single number or - a tuple. Default: 0 - - Examples: - >>> filters = autograd.Variable(torch.randn(33, 16, 3, 3, 3)) - >>> inputs = autograd.Variable(torch.randn(20, 16, 50, 10, 20)) - >>> F.conv3d(inputs, filters) - """ - f = ConvNd(_triple(stride), _triple(padding), _triple(dilation), False, - _triple(0), groups, torch.backends.cudnn.benchmark, torch.backends.cudnn.enabled) - return f(input, weight, bias)
    - - -
    [docs]def conv_transpose1d(input, weight, bias=None, stride=1, padding=0, - output_padding=0, groups=1): - f = ConvNd(_single(stride), _single(padding), _single(1), True, - _single(output_padding), groups, torch.backends.cudnn.benchmark, torch.backends.cudnn.enabled) - return f(input, weight, bias)
    - - -
    [docs]def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, - output_padding=0, groups=1): - """Applies a 2D transposed convolution operator over an input image - composed of several input planes, sometimes also called "deconvolution". - - See :class:`~torch.nn.ConvTranspose2d` for details and output shape. - - Args: - input: input tensor of shape (minibatch x in_channels x iH x iW) - weight: filters of shape (in_channels x out_channels x kH x kW) - bias: optional bias of shape (out_channels) - stride: the stride of the convolving kernel, a single number or a - tuple (sh x sw). Default: 1 - padding: implicit zero padding on the input, a single number or a - tuple (padh x padw). Default: 0 - groups: split input into groups, in_channels should be divisible by - the number of groups - output_padding: A zero-padding of 0 <= padding < stride that should be - added to the output. Can be a single number or a tuple. Default: 0 - """ - f = ConvNd(_pair(stride), _pair(padding), _pair(1), True, - _pair(output_padding), groups, torch.backends.cudnn.benchmark, torch.backends.cudnn.enabled) - return f(input, weight, bias)
    - - -
    [docs]def conv_transpose3d(input, weight, bias=None, stride=1, padding=0, - output_padding=0, groups=1): - """Applies a 3D transposed convolution operator over an input image - composed of several input planes, sometimes also called "deconvolution" - - See :class:`~torch.nn.ConvTranspose3d` for details and output shape. - - Args: - input: input tensor of shape (minibatch x in_channels x iT x iH x iW) - weight: filters of shape (in_channels x out_channels x kH x kW) - bias: optional bias of shape (out_channels) - stride: the stride of the convolving kernel, a single number or a - tuple (sh x sw). Default: 1 - padding: implicit zero padding on the input, a single number or a - tuple (padh x padw). Default: 0 - """ - f = ConvNd(_triple(stride), _triple(padding), _triple(1), True, - _triple(output_padding), groups, torch.backends.cudnn.benchmark, torch.backends.cudnn.enabled) - return f(input, weight, bias)
    - - -# Pooling -
    [docs]def avg_pool1d(input, kernel_size, stride=None, padding=0, - ceil_mode=False, count_include_pad=True): - r"""Applies a 1D average pooling over an input signal composed of several - input planes. - - See :class:`~torch.nn.AvgPool1d` for details and output shape. - - Args: - kernel_size: the size of the window - stride: the stride of the window. Default value is :attr:`kernel_size` - padding: implicit zero padding to be added on both sides - ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad: when True, will include the zero-padding in the averaging calculation - - Example: - >>> # pool of square window of size=3, stride=2 - >>> input = Variable(torch.Tensor([[[1,2,3,4,5,6,7]]])) - >>> F.avg_pool1d(input, kernel_size=3, stride=2) - Variable containing: - (0 ,.,.) = - 2 4 6 - [torch.FloatTensor of size 1x1x3] - """ - if input.dim() != 3: - raise ValueError('expected 3D input (got {} dimensions)' - .format(input.dim())) - kernel_size = _single(kernel_size) + (1,) - stride = _single(stride) + (1,) if stride is not None else kernel_size - padding = _single(padding) + (0,) - f = _functions.thnn.AvgPool2d(kernel_size, stride, padding, - ceil_mode, count_include_pad) - return f(input.unsqueeze(3)).squeeze(3)
    - - -
    [docs]def avg_pool2d(input, kernel_size, stride=None, padding=0, - ceil_mode=False, count_include_pad=True): - """Applies 2D average-pooling operation in kh x kw regions by step size - dh x dw steps. The number of output features is equal to the number of - input planes. - - See :class:`~torch.nn.AvgPool2d` for details and output shape. - - Args: - input: input tensor (minibatch x in_channels x iH x iW) - kernel_size: size of the pooling region, a single number or a - tuple (kh x kw) - stride: stride of the pooling operation, a single number or a - tuple (sh x sw). Default is equal to kernel size - padding: implicit zero padding on the input, a single number or - a tuple (padh x padw), Default: 0 - ceil_mode: operation that defines spatial output shape - count_include_pad: divide by the number of elements inside the - original non-padded image or kh * kw - """ - return _functions.thnn.AvgPool2d(kernel_size, stride, padding, - ceil_mode, count_include_pad)(input)
    - - -
    [docs]def avg_pool3d(input, kernel_size, stride=None): - """Applies 3D average-pooling operation in kt x kh x kw regions by step - size kt x dh x dw steps. The number of output features is equal to the - number of input planes / dt. - """ - return _functions.thnn.AvgPool3d(kernel_size, stride)(input)
    - - -# share the same interface -
    [docs]def max_pool1d(input, kernel_size, stride=None, padding=0, dilation=1, - ceil_mode=False, return_indices=False): - return _functions.thnn.MaxPool1d(kernel_size, stride, padding, dilation, - return_indices, ceil_mode)(input)
    - - -
    [docs]def max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1, - ceil_mode=False, return_indices=False): - return _functions.thnn.MaxPool2d(kernel_size, stride, padding, dilation, - return_indices, ceil_mode)(input)
    - - -
    [docs]def max_pool3d(input, kernel_size, stride=None, padding=0, dilation=1, - ceil_mode=False, return_indices=False): - return _functions.thnn.MaxPool3d(kernel_size, stride, padding, dilation, - return_indices, ceil_mode)(input)
    - - -def _unpool_output_size(input, kernel_size, stride, padding, output_size): - input_size = input.size() - default_size = [] - for d in range(len(kernel_size)): - default_size.append((input_size[d + 2] - 1) * stride[d] + - kernel_size[d] - 2 * padding[d]) - if output_size is None: - return default_size - - output_size = list(output_size) - if len(output_size) == len(kernel_size) + 2: - output_size = output_size[2:] - if len(output_size) != len(kernel_size): - raise ValueError("output_size should be a sequence containing " - "{} or {} elements, but it has a length of '{}'" - .format(len(kernel_size), len(kernel_size) + 2, - len(output_size))) - for d in range(len(kernel_size)): - min_size = default_size[d] - stride[d] - max_size = default_size[d] + stride[d] - if not (min_size < output_size[d] < max_size): - raise ValueError( - 'invalid output_size "{}" (dim {} must be between {} and {})' - .format(output_size, d, min_size, max_size)) - - return output_size - - -
    [docs]def max_unpool1d(input, indices, kernel_size, stride=None, padding=0, - output_size=None): - kernel_size = _single(kernel_size) - stride = _single(stride) - padding = _single(padding) - output_size = _unpool_output_size(input, kernel_size, stride, padding, - output_size) - f = _functions.thnn.MaxUnpool2d(output_size + [1]) - return f(input.unsqueeze(3), indices.unsqueeze(3)).squeeze(3)
    - - -
    [docs]def max_unpool2d(input, indices, kernel_size, stride=None, padding=0, - output_size=None): - kernel_size = _pair(kernel_size) - stride = _pair(stride) - padding = _pair(padding) - output_size = _unpool_output_size(input, kernel_size, stride, padding, - output_size) - f = _functions.thnn.MaxUnpool2d(output_size) - return f(input, indices)
    - - -
    [docs]def max_unpool3d(input, indices, kernel_size, stride=None, padding=0, - output_size=None): - kernel_size = _triple(kernel_size) - stride = _triple(stride) - padding = _triple(padding) - output_size = _unpool_output_size(input, kernel_size, stride, padding, - output_size) - f = _functions.thnn.MaxUnpool3d(output_size, stride, padding) - return f(input, indices)
    - - -
    [docs]def lp_pool2d(input, norm_type, kernel_size, stride=None, ceil_mode=False): - kw, kh = utils._pair(kernel_size) - out = avg_pool2d(input.pow(norm_type), kernel_size, stride, 0, ceil_mode) - return out.mul(kw * kh).pow(1. / norm_type)
    - - -
    [docs]def adaptive_max_pool1d(input, output_size, return_indices=False): - r"""Applies a 1D adaptive max pooling over an input signal composed of - several input planes. - - See :class:`~torch.nn.AdaptiveMaxPool1d` for details and output shape. - - Args: - output_size: the target output size (single integer) - return_indices: whether to return pooling indices - """ - return _functions.thnn.AdaptiveMaxPool1d(output_size, return_indices)(input)
    - - -
    [docs]def adaptive_max_pool2d(input, output_size, return_indices=False): - r"""Applies a 2D adaptive max pooling over an input signal composed of - several input planes. - - See :class:`~torch.nn.AdaptiveMaxPool2d` for details and output shape. - - Args: - output_size: the target output size (single integer or double-integer tuple) - return_indices: whether to return pooling indices - """ - return _functions.thnn.AdaptiveMaxPool2d(output_size, return_indices)(input)
    - - -
    [docs]def adaptive_avg_pool1d(input, output_size): - r"""Applies a 1D adaptive average pooling over an input signal composed of - several input planes. - - See :class:`~torch.nn.AdaptiveAvgPool1d` for details and output shape. - - Args: - output_size: the target output size (single integer) - """ - return _functions.thnn.AdaptiveAvgPool1d(output_size)(input)
    - - -
    [docs]def adaptive_avg_pool2d(input, output_size): - r"""Applies a 2D adaptive average pooling over an input signal composed of - several input planes. - - See :class:`~torch.nn.AdaptiveAvgPool2d` for details and output shape. - - Args: - output_size: the target output size (single integer or double-integer tuple) - """ - return _functions.thnn.AdaptiveAvgPool2d(output_size)(input)
    - - -# Activation functions - -
    [docs]def dropout(input, p=0.5, training=False, inplace=False): - return _functions.dropout.Dropout(p, training, inplace)(input)
    - - -
    [docs]def threshold(input, threshold, value, inplace=False): - return _functions.thnn.auto.Threshold(threshold, value, inplace)(input)
    - - -
    [docs]def relu(input, inplace=False): - return _functions.thnn.auto.Threshold(0, 0, inplace)(input)
    - - -
    [docs]def hardtanh(input, min_val=-1., max_val=1., inplace=False): - return _functions.thnn.auto.Hardtanh(min_val, max_val, inplace)(input)
    - - -
    [docs]def relu6(input, inplace=False): - return _functions.thnn.auto.Hardtanh(0, 6, inplace)(input)
    - - -
    [docs]def elu(input, alpha=1., inplace=False): - return _functions.thnn.auto.ELU(alpha, inplace)(input)
    - - -
    [docs]def leaky_relu(input, negative_slope=1e-2, inplace=False): - return _functions.thnn.auto.LeakyReLU(negative_slope, inplace)(input)
    - - -
    [docs]def prelu(input, weight): - return _functions.thnn.PReLU()(input, weight)
    - - -
    [docs]def rrelu(input, lower=1. / 8, upper=1. / 3, training=False, inplace=False): - return _functions.thnn.RReLU(lower, upper, training, inplace)(input)
    - - -
    [docs]def logsigmoid(input): - return _functions.thnn.LogSigmoid()(input)
    - - -
    [docs]def hardshrink(input, lambd=0.5): - return _functions.thnn.auto.Hardshrink(lambd)(input)
    - - -
    [docs]def tanhshrink(input): - return input - torch.tanh(input)
    - - -
    [docs]def softsign(input): - return _functions.activation.Softsign()(input)
    - - -
    [docs]def softplus(input, beta=1, threshold=20): - return _functions.thnn.auto.Softplus(beta, threshold)(input)
    - - -
    [docs]def softmin(input): - return _functions.thnn.Softmin()(input)
    - - -
    [docs]def softmax(input): - return _functions.thnn.auto.Softmax()(input)
    - - -
    [docs]def softshrink(input, lambd=0.5): - return _functions.thnn.auto.Softshrink(lambd)(input)
    - - -
    [docs]def log_softmax(input): - return _functions.thnn.LogSoftmax()(input)
    - - -
    [docs]def tanh(input): - return torch.tanh(input)
    - - -
    [docs]def sigmoid(input): - return torch.sigmoid(input)
    - - -# etc. - -
    [docs]def linear(input, weight, bias=None): - state = _functions.linear.Linear() - return bias and state(input, weight, bias) or state(input, weight)
    - - -
    [docs]def batch_norm(input, running_mean, running_var, weight=None, bias=None, - training=False, momentum=0.1, eps=1e-5): - f = torch._C._functions.BatchNorm(running_mean, running_var, training, momentum, eps, torch.backends.cudnn.enabled) - return f(input, weight, bias)
    - - -# loss - -
    [docs]def nll_loss(input, target, weight=None, size_average=True): - r"""The negative log likelihood loss. - - See :class:`~torch.nn.NLLLoss` for details. - - Args: - input: :math:`(N, C)` where `C = number of classes` - target: :math:`(N)` where each value is `0 <= targets[i] <= C-1` - weight (Variable, optional): a manual rescaling weight given to each - class. If given, has to be a Variable of size "nclasses" - size_average (bool, optional): By default, the losses are averaged - over observations for each minibatch. However, if the field - sizeAverage is set to False, the losses are instead summed - for each minibatch. - - Attributes: - weight: the class-weights given as input to the constructor - - Example: - >>> # input is of size nBatch x nClasses = 3 x 5 - >>> input = autograd.Variable(torch.randn(3, 5)) - >>> # each element in target has to have 0 <= value < nclasses - >>> target = autograd.Variable(torch.LongTensor([1, 0, 4])) - >>> output = F.nll_loss(F.log_softmax(input), target) - >>> output.backward() - """ - dim = input.dim() - if dim == 2: - f = _functions.thnn.NLLLoss(size_average, weight=weight) - elif dim == 4: - f = _functions.thnn.NLLLoss2d(size_average, weight=weight) - else: - raise ValueError('Expected 2 or 4 dimensions (got {})'.format(dim)) - return f(input, target)
    - - -
    [docs]def kl_div(input, target, size_average=True): - r"""The `Kullback-Leibler divergence`_ Loss. - - See :class:`~torch.nn.KLDivLoss` for details. - - Args: - input: Variable of arbitrary shape - target: Variable of the same shape as input - size_average: if True the output is divided by the number of elements - in input tensor - """ - return _functions.thnn.KLDivLoss(size_average)(input, target)
    - - -
    [docs]def cross_entropy(input, target, weight=None, size_average=True): - r"""This criterion combines `log_softmax` and `nll_loss` in one single class. - - See :class:`torch.nn.CrossEntropyLoss` for details. - - Args: - input: Variable :math:`(N, C)` where `C = number of classes` - target: Variable :math:`(N)` where each value is `0 <= targets[i] <= C-1` - weight (Tensor, optional): a manual rescaling weight given to each - class. If given, has to be a Tensor of size "nclasses" - size_average (bool, optional): By default, the losses are averaged - over observations for each minibatch. However, if the field - sizeAverage is set to False, the losses are instead summed - for each minibatch. - """ - return nll_loss(log_softmax(input), target, weight, size_average)
    - - -
    [docs]def binary_cross_entropy(input, target, weight=None, size_average=True): - r"""Function that measures the Binary Cross Entropy - between the target and the output: - - See :class:`~torch.nn.BCELoss` for details. - - Args: - input: Variable of arbitrary shape - target: Variable of the same shape as input - weight (Variable, optional): a manual rescaling weight - if provided it's repeated to match input tensor shape - size_average (bool, optional): By default, the losses are averaged - over observations for each minibatch. However, if the field - sizeAverage is set to False, the losses are instead summed - for each minibatch. - """ - return _functions.thnn.BCELoss(size_average, weight=weight)(input, target)
    - - -
    [docs]def smooth_l1_loss(input, target, size_average=True): - return _functions.thnn.SmoothL1Loss(size_average)(input, target)
    - - -
    [docs]def pixel_shuffle(input, upscale_factor): - r"""Rearranges elements in a tensor of shape ``[*, C*r^2, H, W]`` to a - tensor of shape ``[C, H*r, W*r]``. - - See :class:`~torch.nn.PixelShuffle` for details. - - Args: - input (Variable): Input - upscale_factor (int): factor to increase spatial resolution by - - Examples: - >>> ps = nn.PixelShuffle(3) - >>> input = autograd.Variable(torch.Tensor(1, 9, 4, 4)) - >>> output = ps(input) - >>> print(output.size()) - torch.Size([1, 1, 12, 12]) - """ - batch_size, channels, in_height, in_width = input.size() - channels //= upscale_factor ** 2 - - out_height = in_height * upscale_factor - out_width = in_width * upscale_factor - - input_view = input.contiguous().view( - batch_size, channels, upscale_factor, upscale_factor, - in_height, in_width) - - shuffle_out = input_view.permute(0, 1, 4, 2, 5, 3).contiguous() - return shuffle_out.view(batch_size, channels, out_height, out_width)
    - - -def upsample_nearest(input, size=None, scale_factor=None): - """Upsamples the input, using nearest neighbours' pixel values. - - Currently only spatial upsampling is supported (i.e. expected inputs - are 4 dimensional). - - Args: - input (Variable): input - size (int or Tuple[int, int]): output spatial size. - scale_factor (int): multiplier for spatial size. Has to be an integer. - """ - return _functions.thnn.UpsamplingNearest2d(size, scale_factor)(input) - - -def upsample_bilinear(input, size=None, scale_factor=None): - """Upscales the input, using the bilinear upsampling. - - Currently only spatial upsampling is supported (i.e. expected inputs - are 4 dimensional). - - Args: - input (Variable): input - size (int or Tuple[int, int]): output spatial size. - scale_factor (int): multiplier for spatial size. Has to be an integer. - """ - return _functions.thnn.UpsamplingBilinear2d(size, scale_factor)(input) - - -
    [docs]def pad(input, pad, mode='constant', value=0): - """Pads tensor. - - Currently only 2D and 3D padding supported. - In case of 4D input tensor pad should be in form (pad_l, pad_r, pad_t, pad_b ) - In case of 5D pad should be (pleft, pright, ptop, pbottom, pfront, pback) - - Args: - input (Variable): 4D or 5D tensor - pad (tuple): 4-elem or 6-elem tuple - mode: 'constant', 'reflect' or 'replicate' - value: fill value for 'constant' padding - """ - if input.dim() == 4: - assert len(pad) == 4, '4D tensors expect 4 values for padding' - if mode == 'constant': - return ConstantPad2d(pad, value)(input) - elif mode == 'reflect': - return _functions.thnn.ReflectionPad2d(*pad)(input) - elif mode == 'replicate': - return _functions.thnn.ReplicationPad2d(*pad)(input) - elif input.dim() == 5: - assert len(pad) == 6, '5D tensors expect 6 values for padding' - if mode == 'constant': - raise NotImplementedError - elif mode == 'reflect': - raise NotImplementedError - elif mode == 'replicate': - return _functions.thnn.ReplicationPad3d(*pad)(input) - else: - raise NotImplementedError("Only 4D and 5D padding is supported for now")
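A short sketch of the pad helper in the Variable style used elsewhere in this file; the shapes are arbitrary.

import torch
from torch import autograd
import torch.nn.functional as F

x = autograd.Variable(torch.randn(1, 3, 4, 4))   # 4D input: pad is (left, right, top, bottom)
y = F.pad(x, (1, 1, 2, 2))                       # constant (zero) padding by default
print(y.size())                                  # torch.Size([1, 3, 8, 6])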
    - - -# distance - -
    [docs]def pairwise_distance(x1, x2, p=2, eps=1e-6): - r""" - Computes the batchwise pairwise distance between vectors v1,v2: - - .. math :: - \Vert x \Vert _p := \left( \sum_{i=1}^n \vert x_i \vert ^ p \right) ^ {1/p} - - Args: - x1: first input tensor - x2: second input tensor - p: the norm degree. Default: 2 - - Shape: - - Input: :math:`(N, D)` where `D = vector dimension` - - Output: :math:`(N, 1)` - - >>> input1 = autograd.Variable(torch.randn(100, 128)) - >>> input2 = autograd.Variable(torch.randn(100, 128)) - >>> output = F.pairwise_distance(input1, input2, p=2) - >>> output.backward() - """ - assert x1.size() == x2.size(), "Input sizes must be equal." - assert x1.dim() == 2, "Input must be a 2D matrix." - diff = torch.abs(x1 - x2) - out = torch.pow(diff + eps, p).sum(dim=1) - return torch.pow(out, 1. / p)
    - - -def triplet_margin_loss(anchor, positive, negative, margin=1.0, p=2, eps=1e-6, swap=False): - r"""Creates a criterion that measures the triplet loss given an input tensors x1, x2, x3 - and a margin with a value greater than 0. - This is used for measuring a relative similarity between samples. A triplet is composed by - `a`, `p` and `n`: anchor, positive examples and negative example respectively. - The shape of all input variables should be :math:`(N, D)`. - - The distance swap is described in detail in the paper `Learning shallow convolutional feature descriptors with - triplet losses`_ by V. Balntas, E. Riba et al. - - .. math:: - L(a, p, n) = \frac{1}{N} \left( \sum_{i=1}^N \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} \right) - - where :math: `d(x_i, y_i) = \| {\bf x}_i - {\bf y}_i \|_2^2`. - - Args: - anchor: anchor input tensor - positive: positive input tensor - negative: negative input tensor - p: the norm degree. Default: 2 - eps: small epsilon value to avoid numerical issues - swap: compute distance swap - - Shape: - - Input: :math:`(N, D)` where `D = vector dimension` - - Output: :math:`(N, 1)` - - >>> input1 = autograd.Variable(torch.randn(100, 128)) - >>> input2 = autograd.Variable(torch.randn(100, 128)) - >>> input3 = autograd.Variable(torch.randn(100, 128)) - >>> output = F.triplet_margin_loss(input1, input2, input3, p=2) - >>> output.backward() - - .. _Learning shallow convolutional feature descriptors with triplet losses: - http://www.iis.ee.ic.ac.uk/%7Evbalnt/shallow_descr/TFeat_paper.pdf - """ - assert anchor.size() == positive.size(), "Input sizes between positive and negative must be equal." - assert anchor.size() == negative.size(), "Input sizes between anchor and negative must be equal." - assert positive.size() == negative.size(), "Input sizes between positive and negative must be equal." - assert anchor.dim() == 2, "Inputd must be a 2D matrix." - assert margin > 0.0, 'Margin should be positive value.' - d_p = pairwise_distance(anchor, positive, p, eps) - d_n = pairwise_distance(anchor, negative, p, eps) - if swap: - d_s = pairwise_distance(positive, negative, p, eps) - d_n = torch.min(d_n, d_s) - - dist_hinge = torch.clamp(margin + d_p - d_n, min=0.0) - loss = torch.mean(dist_hinge) - return loss -
\ No newline at end of file
diff --git a/docs/_modules/torch/nn/init.html b/docs/_modules/torch/nn/init.html
deleted file mode 100644
index 3364874f8fe2..000000000000
--- a/docs/_modules/torch/nn/init.html
+++ /dev/null
@@ -1,829 +0,0 @@
-    torch.nn.init — PyTorch 0.1.11 documentation

    Source code for torch.nn.init

    -import math
    -import random
    -
    -import torch
    -from torch.autograd import Variable
    -
    -
    -
    [docs]def uniform(tensor, a=0, b=1): - """Fills the input Tensor or Variable with values drawn from a uniform U(a,b) - - Args: - tensor: a n-dimension torch.Tensor - a: the lower bound of the uniform distribution - b: the upper bound of the uniform distribution - - Examples: - >>> w = torch.Tensor(3, 5) - >>> nn.init.uniform(w) - """ - if isinstance(tensor, Variable): - uniform(tensor.data, a=a, b=b) - return tensor - return tensor.uniform_(a, b)
    - - -
    [docs]def normal(tensor, mean=0, std=1): - """Fills the input Tensor or Variable with values drawn from a normal distribution with the given mean and std - - Args: - tensor: a n-dimension torch.Tensor - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - - Examples: - >>> w = torch.Tensor(3, 5) - >>> nn.init.normal(w) - """ - if isinstance(tensor, Variable): - normal(tensor.data, mean=mean, std=std) - return tensor - return tensor.normal_(mean, std)
    - - -
[docs]def constant(tensor, val):
    """Fills the input Tensor or Variable with the value `val`.

    Args:
        tensor: an n-dimensional torch.Tensor
        val: the value to fill the tensor with

    Examples:
        >>> w = torch.Tensor(3, 5)
        >>> nn.init.constant(w, 0.3)
    """
    if isinstance(tensor, Variable):
        constant(tensor.data, val)
        return tensor
    return tensor.fill_(val)
    - - -def _calculate_fan_in_and_fan_out(tensor): - if tensor.ndimension() < 2: - raise ValueError("fan in and fan out can not be computed for tensor of size ", tensor.size()) - - if tensor.ndimension() == 2: # Linear - fan_in = tensor.size(1) - fan_out = tensor.size(0) - else: - num_input_fmaps = tensor.size(1) - num_output_fmaps = tensor.size(0) - receptive_field_size = 1 - if tensor.dim() > 2: - receptive_field_size = tensor[0][0].numel() - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - -
    [docs]def xavier_uniform(tensor, gain=1): - """Fills the input Tensor or Variable with values according to the method described in "Understanding the - difficulty of training deep feedforward neural networks" - Glorot, X. and Bengio, Y., using a uniform - distribution. The resulting tensor will have values sampled from U(-a, a) where a = gain * sqrt(2/(fan_in + - fan_out)) * sqrt(3) - - Args: - tensor: a n-dimension torch.Tensor - gain: an optional scaling factor to be applied - - Examples: - >>> w = torch.Tensor(3, 5) - >>> nn.init.xavier_uniform(w, gain=math.sqrt(2.0)) - """ - if isinstance(tensor, Variable): - xavier_uniform(tensor.data, gain=gain) - return tensor - - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - std = gain * math.sqrt(2.0 / (fan_in + fan_out)) - a = math.sqrt(3.0) * std - return tensor.uniform_(-a, a)
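As a quick sanity check on the bound above, here is the arithmetic spelled out for a 3x5 weight (a sketch; the tensor size is illustrative):

import math
import torch

w = torch.Tensor(3, 5)                       # 2D (Linear-style) weight
fan_in, fan_out = w.size(1), w.size(0)       # 5 and 3
gain = 1
std = gain * math.sqrt(2.0 / (fan_in + fan_out))
a = math.sqrt(3.0) * std                     # U(-a, a) has standard deviation `std`
w.uniform_(-a, a)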
    - - -
    [docs]def xavier_normal(tensor, gain=1): - """Fills the input Tensor or Variable with values according to the method described in "Understanding the - difficulty of training deep feedforward neural networks" - Glorot, X. and Bengio, Y., using a normal - distribution. The resulting tensor will have values sampled from normal distribution with mean=0 and std = gain * - sqrt(2/(fan_in + fan_out)) - - Args: - tensor: a n-dimension torch.Tensor - gain: an optional scaling factor to be applied - - Examples: - >>> w = torch.Tensor(3, 5) - >>> nn.init.xavier_normal(w) - """ - if isinstance(tensor, Variable): - xavier_normal(tensor.data, gain=gain) - return tensor - - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - std = gain * math.sqrt(2.0 / (fan_in + fan_out)) - return tensor.normal_(0, std)
    - - -def _calculate_correct_fan(tensor, mode): - mode = mode.lower() - valid_modes = ['fan_in', 'fan_out'] - if mode not in valid_modes: - raise ValueError("mode {} not supported, please use one of {}".format(mode, valid_modes)) - - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == 'fan_in': - return fan_in - else: - return fan_out - - -
    [docs]def kaiming_uniform(tensor, a=0, mode='fan_in'): - """Fills the input Tensor or Variable with values according to the method described in "Delving deep into - rectifiers: Surpassing human-level performance on ImageNet classification" - He, K. et al using a uniform - distribution. The resulting tensor will have values sampled from U(-bound, bound) where bound = sqrt(2/((1 + a^2) - * fan_in)) * sqrt(3) - - Args: - tensor: a n-dimension torch.Tensor - a: the coefficient of the slope of the rectifier used after this layer (0 for ReLU by default) - mode: either 'fan_in' (default) or 'fan_out'. Choosing `fan_in` preserves the magnitude of the variance of the - weights in the forward pass. Choosing `fan_out` preserves the magnitudes in the backwards pass. - - Examples: - >>> w = torch.Tensor(3, 5) - >>> nn.init.kaiming_uniform(w, mode='fan_in') - """ - if isinstance(tensor, Variable): - kaiming_uniform(tensor.data, a=a, mode=mode) - return tensor - - fan = _calculate_correct_fan(tensor, mode) - std = math.sqrt(2.0 / ((1 + a ** 2) * fan)) - bound = math.sqrt(3.0) * std - return tensor.uniform_(-bound, bound)
    - - -
    [docs]def kaiming_normal(tensor, a=0, mode='fan_in'): - """Fills the input Tensor or Variable with values according to the method described in "Delving deep into - rectifiers: Surpassing human-level performance on ImageNet classification" - He, K. et al using a normal - distribution. The resulting tensor will have values sampled from normal distribution with mean=0 and std = sqrt( - 2/((1 + a^2) * fan_in)) - - Args: - tensor: a n-dimension torch.Tensor - a: the coefficient of the slope of the rectifier used after this layer (0 for ReLU by default) - mode: either 'fan_in' (default) or 'fan_out'. Choosing `fan_in` preserves the magnitude of the variance of the - weights in the forward pass. Choosing `fan_out` preserves the magnitudes in the backwards pass. - - Examples: - >>> w = torch.Tensor(3, 5) - >>> nn.init.kaiming_normal(w, mode='fan_out') - """ - if isinstance(tensor, Variable): - kaiming_normal(tensor.data, a=a, mode=mode) - return tensor - - fan = _calculate_correct_fan(tensor, mode) - std = math.sqrt(2.0 / ((1 + a ** 2) * fan)) - return tensor.normal_(0, std)
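For convolutional weights the fan is the product of the input channel count and the receptive field size. A sketch of the 'fan_in' computation behind the std above (sizes are illustrative):

import math
import torch

w = torch.Tensor(64, 32, 3, 3)               # (out_channels, in_channels, kH, kW)
fan_in = w.size(1) * w[0][0].numel()         # 32 * 9 = 288
a = 0                                        # rectifier slope; 0 corresponds to ReLU
std = math.sqrt(2.0 / ((1 + a ** 2) * fan_in))
w.normal_(0, std)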
    - - -
[docs]def orthogonal(tensor, gain=1):
    """Fills the input Tensor or Variable with a (semi) orthogonal matrix. The input tensor must have at least 2
    dimensions, and for tensors with more than 2 dimensions the trailing dimensions are flattened: the tensor is
    viewed as a 2D matrix with rows equal to the first dimension and columns equal to the product of the remaining
    dimensions. Reference: "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks" -
    Saxe, A. et al.

    Args:
        tensor: an n-dimensional torch.Tensor, where n >= 2
        gain: optional gain to be applied

    Examples:
        >>> w = torch.Tensor(3, 5)
        >>> nn.init.orthogonal(w)
    """
    if isinstance(tensor, Variable):
        orthogonal(tensor.data, gain=gain)
        return tensor

    if tensor.ndimension() < 2:
        raise ValueError("Only tensors with 2 or more dimensions are supported.")
    rows = tensor.size(0)
    cols = tensor[0].numel()
    flattened = torch.Tensor(rows, cols).normal_(0, 1)

    u, s, v = torch.svd(flattened, some=True)
    if u.is_same_size(flattened):
        tensor.view_as(u).copy_(u)
    else:
        tensor.view_as(v.t()).copy_(v.t())

    tensor.mul_(gain)
    return tensor
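A sketch of what the SVD step above buys you: the initialized 2D view has orthonormal rows (or columns), so multiplying it by its transpose recovers the identity up to the gain (sizes are illustrative):

import torch

w = torch.randn(3, 5)
u, s, v = torch.svd(w, some=True)
q = u if u.size() == w.size() else v.t()         # same selection rule as in the code above
err = (q.mm(q.t()) - torch.eye(3)).abs().max()   # close to 0: rows are orthonormal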
    - - -
    [docs]def sparse(tensor, sparsity, std=0.01): - """Fills the 2D input Tensor or Variable as a sparse matrix, where the non-zero elements will be drawn from a - normal distribution with mean=0 and std=`std`. - - Args: - tensor: a n-dimension torch.Tensor - sparsity: The fraction of elements in each column to be set to zero - std: the standard deviation of the normal distribution used to generate the non-zero values - - Examples: - >>> w = torch.Tensor(3, 5) - >>> nn.init.sparse(w, sparsity=0.1) - """ - if isinstance(tensor, Variable): - sparse(tensor.data, sparsity, std=std) - return tensor - - if tensor.ndimension() != 2: - raise ValueError("Sparse initialization only supported for 2D inputs") - tensor.normal_(0, std) - rows, cols = tensor.size(0), tensor.size(1) - num_zeros = int(math.ceil(cols * sparsity)) - - for col_idx in range(tensor.size(1)): - row_indices = list(range(rows)) - random.shuffle(row_indices) - zero_indices = row_indices[:num_zeros] - for row_idx in zero_indices: - tensor[row_idx, col_idx] = 0 - - return tensor
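The per-column zeroing loop is the essential step. A standalone sketch of that step with illustrative sizes (the zero count is derived exactly as in the code above):

import math
import random
import torch

w = torch.Tensor(10, 5).normal_(0, 0.01)
sparsity = 0.2
rows, cols = w.size(0), w.size(1)
num_zeros = int(math.ceil(cols * sparsity))      # number of entries zeroed in each column
for col_idx in range(cols):
    for row_idx in random.sample(range(rows), num_zeros):
        w[row_idx, col_idx] = 0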
 \ No newline at end of file
diff --git a/docs/_modules/torch/nn/modules/activation.html b/docs/_modules/torch/nn/modules/activation.html
deleted file mode 100644
index edcbca428942..000000000000
--- a/docs/_modules/torch/nn/modules/activation.html
+++ /dev/null
@@ -1,1196 +0,0 @@
-torch.nn.modules.activation — PyTorch 0.1.11 documentation

    Source code for torch.nn.modules.activation

    -import torch
    -from torch.nn.parameter import Parameter
    -
    -from .module import Module
    -from .. import functional as F
    -
    -
    -
    [docs]class Threshold(Module): - """Thresholds each element of the input Tensor - - Threshold is defined as:: - - y = x if x >= threshold - value if x < threshold - - Args: - threshold: The value to threshold at - value: The value to replace with - inplace: can optionally do the operation in-place - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.Threshold(0.1, 20) - >>> input = Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, threshold, value, inplace=False): - super(Threshold, self).__init__() - self.threshold = threshold - self.value = value - self.inplace = inplace - # TODO: check in THNN (if inplace == True, then assert value <= threshold) - - def forward(self, input): - return F.threshold(input, self.threshold, self.value, self.inplace) - - def __repr__(self): - inplace_str = ', inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + str(self.threshold) \ - + ', ' + str(self.value) \ - + inplace_str + ')'
    - - -
    [docs]class ReLU(Threshold): - """Applies the rectified linear unit function element-wise :math:`{ReLU}(x)= max(0, x)` - - Args: - inplace: can optionally do the operation in-place - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.ReLU() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, inplace=False): - super(ReLU, self).__init__(0, 0, inplace) - - def __repr__(self): - inplace_str = 'inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + inplace_str + ')'
    - - -class RReLU(Module): - - def __init__(self, lower=1. / 8, upper=1. / 3, inplace=False): - super(RReLU, self).__init__() - self.lower = lower - self.upper = upper - self.inplace = inplace - - def forward(self, input): - return F.rrelu(input, self.lower, self.upper, self.training, self.inplace) - - def __repr__(self): - inplace_str = ', inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + str(self.lower) \ - + ', ' + str(self.upper) \ - + inplace_str + ')' - - -
    [docs]class Hardtanh(Module): - """Applies the HardTanh function element-wise - - HardTanh is defined as:: - - f(x) = +1, if x > 1 - f(x) = -1, if x < -1 - f(x) = x, otherwise - - The range of the linear region :math:`[-1, 1]` can be adjusted - - Args: - min_value: minimum value of the linear region range - max_value: maximum value of the linear region range - inplace: can optionally do the operation in-place - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.HardTanh(-2, 2) - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, min_value=-1, max_value=1, inplace=False): - super(Hardtanh, self).__init__() - self.min_val = min_value - self.max_val = max_value - self.inplace = inplace - assert self.max_val > self.min_val - - def forward(self, input): - return F.hardtanh(input, self.min_val, self.max_val, self.inplace) - - def __repr__(self): - inplace_str = ', inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + 'min_val=' + str(self.min_val) \ - + ', max_val=' + str(self.max_val) \ - + inplace_str + ')'
    - - -
    [docs]class ReLU6(Hardtanh): - """Applies the element-wise function :math:`{ReLU6}(x) = min(max(0,x), 6)` - - Args: - inplace: can optionally do the operation in-place - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.ReLU6() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, inplace=False): - super(ReLU6, self).__init__(0, 6, inplace) - - def __repr__(self): - inplace_str = 'inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + inplace_str + ')'
    - - -
    [docs]class Sigmoid(Module): - """Applies the element-wise function :math:`f(x) = 1 / ( 1 + exp(-x))` - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.Sigmoid() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - return torch.sigmoid(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()'
    - - -
    [docs]class Tanh(Module): - """Applies element-wise, :math:`f(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))` - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.Tanh() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - return torch.tanh(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()'
    - - -
    [docs]class ELU(Module): - """Applies element-wise, :math:`f(x) = max(0,x) + min(0, alpha * (exp(x) - 1))` - - Args: - alpha: the alpha value for the ELU formulation - inplace: can optionally do the operation in-place - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.ELU() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, alpha=1., inplace=False): - super(ELU, self).__init__() - self.alpha = alpha - self.inplace = inplace - - def forward(self, input): - return F.elu(input, self.alpha, self.inplace) - - def __repr__(self): - inplace_str = ', inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + 'alpha=' + str(self.alpha) \ - + inplace_str + ')'
    - - -class Hardshrink(Module): - """Applies the hard shrinkage function element-wise - Hardshrink is defined as:: - f(x) = x, if x > lambda - f(x) = x, if x < -lambda - f(x) = 0, otherwise - - Args: - lambd: the lambda value for the Hardshrink formulation. Default: 0.5 - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.Hardshrink() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, lambd=0.5): - super(Hardshrink, self).__init__() - self.lambd = lambd - - def forward(self, input): - return F.hardshrink(input, self.lambd) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + str(self.lambd) + ')' - - -
    [docs]class LeakyReLU(Module): - """Applies element-wise, :math:`f(x) = max(0, x) + {negative\_slope} * min(0, x)` - - Args: - negative_slope: Controls the angle of the negative slope. Default: 1e-2 - inplace: can optionally do the operation in-place - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.LeakyReLU(0.1) - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, negative_slope=1e-2, inplace=False): - super(LeakyReLU, self).__init__() - self.negative_slope = negative_slope - self.inplace = inplace - - def forward(self, input): - return F.leaky_relu(input, self.negative_slope, self.inplace) - - def __repr__(self): - inplace_str = ', inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + str(self.negative_slope) \ - + inplace_str + ')'
    - - -
    [docs]class LogSigmoid(Module): - """Applies element-wise :math:`LogSigmoid(x) = log( 1 / (1 + exp(-x_i)))` - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.LogSigmoid() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - return F.logsigmoid(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()'
    - - -
    [docs]class Softplus(Module): - """Applies element-wise :math:`f(x) = 1/beta * log(1 + exp(beta * x_i))` - - SoftPlus is a smooth approximation to the ReLU function and can be used - to constrain the output of a machine to always be positive. - - For numerical stability the implementation reverts to the linear function - for inputs above a certain value. - - Args: - beta: the beta value for the Softplus formulation. Default: 1 - threshold: values above this revert to a linear function. Default: 20 - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.Softplus() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, beta=1, threshold=20): - super(Softplus, self).__init__() - self.beta = beta - self.threshold = threshold - - def forward(self, input): - return F.softplus(input, self.beta, self.threshold) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + 'beta=' + str(self.beta) \ - + ', threshold=' + str(self.threshold) + ')'
    - - -
    [docs]class Softshrink(Module): - """Applies the soft shrinkage function elementwise - - SoftShrinkage operator is defined as:: - - f(x) = x-lambda, if x > lambda > f(x) = x+lambda, if x < -lambda - f(x) = 0, otherwise - - Args: - lambd: the lambda value for the Softshrink formulation. Default: 0.5 - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.Softshrink() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, lambd=0.5): - super(Softshrink, self).__init__() - self.lambd = lambd - - def forward(self, input): - return F.softshrink(input, self.lambd) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + str(self.lambd) + ')'
    - - -
    [docs]class PReLU(Module): - """Applies element-wise the function :math:`PReLU(x) = max(0,x) + a * min(0,x)` - Here "a" is a learnable parameter. - When called without arguments, nn.PReLU() uses a single parameter "a" - across all input channels. If called with nn.PReLU(nChannels), a separate - "a" is used for each input channel. - - - .. note:: - weight decay should not be used when learning "a" for good performance. - - Args: - num_parameters: number of "a" to learn. Default: 1 - init: the initial value of "a". Default: 0.25 - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.PReLU() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def __init__(self, num_parameters=1, init=0.25): - self.num_parameters = num_parameters - super(PReLU, self).__init__() - self.weight = Parameter(torch.Tensor(num_parameters).fill_(init)) - - def forward(self, input): - return F.prelu(input, self.weight) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + str(self.num_parameters) + ')'
    - - -
    [docs]class Softsign(Module): - """Applies element-wise, the function :math:`f(x) = x / (1 + |x|)` - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.Softsign() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - return F.softsign(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()'
    - - -
    [docs]class Tanhshrink(Module): - """Applies element-wise, :math:`Tanhshrink(x) = x - Tanh(x)` - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - - Output: :math:`(N, *)`, same shape as the input - - Examples:: - - >>> m = nn.Tanhshrink() - >>> input = autograd.Variable(torch.randn(2)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - return F.tanhshrink(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()'
    - - -
    [docs]class Softmin(Module): - """Applies the Softmin function to an n-dimensional input Tensor - rescaling them so that the elements of the n-dimensional output Tensor - lie in the range `(0, 1)` and sum to 1 - - :math:`f(x) = exp(-x_i - {shift}) / sum_j exp(-x_j - {shift})` - - where :math:`{shift} = max_i - x_i` - - Shape: - - Input: :math:`(N, L)` - - Output: :math:`(N, L)` - - Returns: - a Tensor of the same dimension and shape as the input, with - values in the range [0, 1] - - Examples:: - - >>> m = nn.Softmin() - >>> input = autograd.Variable(torch.randn(2, 3)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - return F.softmin(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()'
    - - -
    [docs]class Softmax(Module): - """Applies the Softmax function to an n-dimensional input Tensor - rescaling them so that the elements of the n-dimensional output Tensor - lie in the range (0,1) and sum to 1 - - Softmax is defined as :math:`f_i(x) = exp(x_i - shift) / sum_j exp(x_j - shift)` - where `shift = max_i x_i` - - Shape: - - Input: :math:`(N, L)` - - Output: :math:`(N, L)` - - Returns: - a Tensor of the same dimension and shape as the input with - values in the range [0, 1] - - .. note:: - This module doesn't work directly with NLLLoss, - which expects the Log to be computed between the Softmax and itself. - Use Logsoftmax instead (it's faster). - - Examples:: - - >>> m = nn.Softmax() - >>> input = autograd.Variable(torch.randn(2, 3)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - assert input.dim() == 2, 'Softmax requires a 2D tensor as input' - return F.softmax(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()'
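The shift in the formula above exists purely for numerical stability. A small sketch showing why it matters for large scores:

import torch

x = torch.Tensor([[1000.0, 1001.0, 1002.0]])   # naive exp(x) would overflow to inf
shift = x.max()                                # max over the row, per the formula above
e = (x - shift).exp()
probs = e / e.sum(1, keepdim=True)             # finite, positive, sums to 1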
    - - -class Softmax2d(Module): - """Applies SoftMax over features to each spatial location - - When given an image of Channels x Height x Width, it will - - apply Softmax to each location :math:`(Channels, h_i, w_j)` - - Shape: - - Input: :math:`(N, C, H, W)` - - Output: :math:`(N, C, H, W)` (same shape as input) - - Returns: - a Tensor of the same dimension and shape as the input with - values in the range [0, 1] - - Examples:: - - >>> m = nn.Softmax2d() - >>> # you softmax over the 2nd dimension - >>> input = autograd.Variable(torch.randn(2, 3, 12, 13)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - assert input.dim() == 4, 'Softmax2d requires a 4D tensor as input' - return F.softmax(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()' - - -
    [docs]class LogSoftmax(Module): - """Applies the Log(Softmax(x)) function to an n-dimensional input Tensor. - The LogSoftmax formulation can be simplified as - - :math:`f_i(x) = log(1 / a * exp(x_i))` where :math:`a = sum_j exp(x_j)` - - Shape: - - Input: :math:`(N, L)` - - Output: :math:`(N, L)` - - Returns: - a Tensor of the same dimension and shape as the input with - values in the range [-inf, 0) - - Examples:: - - >>> m = nn.LogSoftmax() - >>> input = autograd.Variable(torch.randn(2, 3)) - >>> print(input) - >>> print(m(input)) - """ - - def forward(self, input): - return F.log_softmax(input) - - def __repr__(self): - return self.__class__.__name__ + ' ()'
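Following the note above about pairing with NLLLoss, a minimal usage sketch (written against the current tensor API; the dim argument belongs to later releases, and in the 0.1.x era inputs would be wrapped in autograd.Variable):

import torch
import torch.nn as nn

m = nn.LogSoftmax(dim=1)              # dim=1: normalize over the class dimension
loss_fn = nn.NLLLoss()                # expects log-probabilities, not probabilities
scores = torch.randn(2, 3)            # unnormalized scores: 2 samples, 3 classes
target = torch.LongTensor([0, 2])     # class index per sample
loss = loss_fn(m(scores), target)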
 \ No newline at end of file
diff --git a/docs/_modules/torch/nn/modules/batchnorm.html b/docs/_modules/torch/nn/modules/batchnorm.html
deleted file mode 100644
index 5cc8064c74b2..000000000000
--- a/docs/_modules/torch/nn/modules/batchnorm.html
+++ /dev/null
@@ -1,747 +0,0 @@
-torch.nn.modules.batchnorm — PyTorch 0.1.11 documentation

    Source code for torch.nn.modules.batchnorm

    -import torch
    -from .module import Module
    -from torch.nn.parameter import Parameter
    -from .. import functional as F
    -
    -
    -# TODO: check contiguous in THNN
    -# TODO: use separate backend functions?
    -class _BatchNorm(Module):
    -
    -    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
    -        super(_BatchNorm, self).__init__()
    -        self.num_features = num_features
    -        self.affine = affine
    -        self.eps = eps
    -        self.momentum = momentum
    -        if self.affine:
    -            self.weight = Parameter(torch.Tensor(num_features))
    -            self.bias = Parameter(torch.Tensor(num_features))
    -        else:
    -            self.register_parameter('weight', None)
    -            self.register_parameter('bias', None)
    -        self.register_buffer('running_mean', torch.zeros(num_features))
    -        self.register_buffer('running_var', torch.ones(num_features))
    -        self.reset_parameters()
    -
    -    def reset_parameters(self):
    -        self.running_mean.zero_()
    -        self.running_var.fill_(1)
    -        if self.affine:
    -            self.weight.data.uniform_()
    -            self.bias.data.zero_()
    -
    -    def _check_input_dim(self, input):
    -        if input.size(1) != self.running_mean.nelement():
    -            raise ValueError('got {}-feature tensor, expected {}'
    -                             .format(input.size(1), self.num_features))
    -
    -    def forward(self, input):
    -        self._check_input_dim(input)
    -        return F.batch_norm(
    -            input, self.running_mean, self.running_var, self.weight, self.bias,
    -            self.training, self.momentum, self.eps)
    -
    -    def __repr__(self):
    -        return ('{name}({num_features}, eps={eps}, momentum={momentum},'
    -                ' affine={affine})'
    -                .format(name=self.__class__.__name__, **self.__dict__))
    -
    -
    -
    [docs]class BatchNorm1d(_BatchNorm): - r"""Applies Batch Normalization over a 2d or 3d input that is seen as a mini-batch. - - .. math:: - - y = \frac{x - mean[x]}{ \sqrt{Var[x]} + \epsilon} * gamma + beta - - The mean and standard-deviation are calculated per-dimension over - the mini-batches and gamma and beta are learnable parameter vectors - of size N (where N is the input size). - - During training, this layer keeps a running estimate of its computed mean - and variance. The running sum is kept with a default momentum of 0.1. - - During evaluation, this running mean/variance is used for normalization. - - Args: - num_features: num_features from an expected input of size `batch_size x num_features [x width]` - eps: a value added to the denominator for numerical stability. Default: 1e-5 - momentum: the value used for the running_mean and running_var computation. Default: 0.1 - affine: a boolean value that when set to true, gives the layer learnable affine parameters. - - Shape: - - Input: :math:`(N, C)` or :math:`(N, C, L)` - - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input) - - Examples: - >>> # With Learnable Parameters - >>> m = nn.BatchNorm1d(100) - >>> # Without Learnable Parameters - >>> m = nn.BatchNorm1d(100, affine=False) - >>> input = autograd.Variable(torch.randn(20, 100)) - >>> output = m(input) - """ - - def _check_input_dim(self, input): - if input.dim() != 2 and input.dim() != 3: - raise ValueError('expected 2D or 3D input (got {}D input)' - .format(input.dim())) - super(BatchNorm1d, self)._check_input_dim(input)
    - - -
    [docs]class BatchNorm2d(_BatchNorm): - r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch of 3d inputs - - .. math:: - - y = \frac{x - mean[x]}{ \sqrt{Var[x]} + \epsilon} * gamma + beta - - The mean and standard-deviation are calculated per-dimension over - the mini-batches and gamma and beta are learnable parameter vectors - of size N (where N is the input size). - - During training, this layer keeps a running estimate of its computed mean - and variance. The running sum is kept with a default momentum of 0.1. - - During evaluation, this running mean/variance is used for normalization. - - Args: - num_features: num_features from an expected input of size batch_size x num_features x height x width - eps: a value added to the denominator for numerical stability. Default: 1e-5 - momentum: the value used for the running_mean and running_var computation. Default: 0.1 - affine: a boolean value that when set to true, gives the layer learnable affine parameters. - - Shape: - - Input: :math:`(N, C, H, W)` - - Output: :math:`(N, C, H, W)` (same shape as input) - - Examples: - >>> # With Learnable Parameters - >>> m = nn.BatchNorm2d(100) - >>> # Without Learnable Parameters - >>> m = nn.BatchNorm2d(100, affine=False) - >>> input = autograd.Variable(torch.randn(20, 100, 35, 45)) - >>> output = m(input) - """ - - def _check_input_dim(self, input): - if input.dim() != 4: - raise ValueError('expected 4D input (got {}D input)' - .format(input.dim())) - super(BatchNorm2d, self)._check_input_dim(input)
    - - -
    [docs]class BatchNorm3d(_BatchNorm): - r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch of 4d inputs - - .. math:: - - y = \frac{x - mean[x]}{ \sqrt{Var[x]} + \epsilon} * gamma + beta - - The mean and standard-deviation are calculated per-dimension over - the mini-batches and gamma and beta are learnable parameter vectors - of size N (where N is the input size). - - During training, this layer keeps a running estimate of its computed mean - and variance. The running sum is kept with a default momentum of 0.1. - - During evaluation, this running mean/variance is used for normalization. - - Args: - num_features: num_features from an expected input of size batch_size x num_features x height x width - eps: a value added to the denominator for numerical stability. Default: 1e-5 - momentum: the value used for the running_mean and running_var computation. Default: 0.1 - affine: a boolean value that when set to true, gives the layer learnable affine parameters. - - Shape: - - Input: :math:`(N, C, D, H, W)` - - Output: :math:`(N, C, D, H, W)` (same shape as input) - - Examples: - >>> # With Learnable Parameters - >>> m = nn.BatchNorm3d(100) - >>> # Without Learnable Parameters - >>> m = nn.BatchNorm3d(100, affine=False) - >>> input = autograd.Variable(torch.randn(20, 100, 35, 45, 10)) - >>> output = m(input) - """ - - def _check_input_dim(self, input): - if input.dim() != 5: - raise ValueError('expected 5D input (got {}D input)' - .format(input.dim())) - super(BatchNorm3d, self)._check_input_dim(input)
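A sketch of what the F.batch_norm call in forward() computes in training mode: the batch statistics are used, and eps is added to the (biased) variance under the square root (sizes are illustrative):

import torch
import torch.nn.functional as F

x = torch.randn(20, 100)                                 # (batch, num_features)
mean, var = x.mean(0), x.var(0, unbiased=False)
eps = 1e-5
y_ref = (x - mean) / (var + eps).sqrt()                  # per-feature normalization

running_mean, running_var = torch.zeros(100), torch.ones(100)
y = F.batch_norm(x, running_mean, running_var, None, None,
                 training=True, momentum=0.1, eps=eps)   # matches y_ref up to float error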
 \ No newline at end of file
diff --git a/docs/_modules/torch/nn/modules/container.html b/docs/_modules/torch/nn/modules/container.html
deleted file mode 100644
index 0a82009ff131..000000000000
--- a/docs/_modules/torch/nn/modules/container.html
+++ /dev/null
@@ -1,776 +0,0 @@
-torch.nn.modules.container — PyTorch 0.1.11 documentation

    Source code for torch.nn.modules.container

    -from collections import OrderedDict
    -import string
    -import torch
    -import warnings
    -from .module import Module
    -
    -
    -class Container(Module):
    -
    -    def __init__(self, **kwargs):
    -        super(Container, self).__init__()
    -        # DeprecationWarning is ignored by default <sigh>
    -        warnings.warn("nn.Container is deprecated. All of it's functionality "
    -                      "is now implemented in nn.Module. Subclass that instead.")
    -        for key, value in kwargs.items():
    -            self.add_module(key, value)
    -
    -
    -
    [docs]class Sequential(Module): - """A sequential container. - Modules will be added to it in the order they are passed in the constructor. - Alternatively, an ordered dict of modules can also be passed in. - - To make it easier to understand, given is a small example:: - - # Example of using Sequential - model = nn.Sequential( - nn.Conv2d(1,20,5), - nn.ReLU(), - nn.Conv2d(20,64,5), - nn.ReLU() - ) - - # Example of using Sequential with OrderedDict - model = nn.Sequential(OrderedDict([ - ('conv1', nn.Conv2d(1,20,5)), - ('relu1', nn.ReLU()), - ('conv2', nn.Conv2d(20,64,5)), - ('relu2', nn.ReLU()) - ])) - """ - - def __init__(self, *args): - super(Sequential, self).__init__() - if len(args) == 1 and isinstance(args[0], OrderedDict): - for key, module in args[0].items(): - self.add_module(key, module) - else: - idx = 0 - for module in args: - self.add_module(str(idx), module) - idx += 1 - - def __getitem__(self, idx): - if idx < 0 or idx >= len(self._modules): - raise IndexError('index {} is out of range'.format(idx)) - it = iter(self._modules.values()) - for i in range(idx): - next(it) - return next(it) - - def forward(self, input): - for module in self._modules.values(): - input = module(input) - return input
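A small usage sketch (current tensor API, illustrative layer sizes): the container simply chains the forward calls in insertion order, and integer indexing walks the registered submodules.

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 5))
x = torch.randn(4, 10)
y = model(x)                 # equivalent to model[2](model[1](model[0](x)))
first = model[0]             # Linear(10, 20), via __getitem__ above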
    - - -
    [docs]class ModuleList(Module): - """Holds submodules in a list. - - ModuleList can be indexed like a regular Python list, but modules it contains - are properly registered, and will be visible by all Module methods. - - Arguments: - modules (list, optional): a list of modules to add - - Example:: - - class MyModule(nn.Module): - def __init__(self): - super(MyModule, self).__init__() - self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)]) - - def forward(self, x): - # ModuleList can act as an iterable, or be indexed using ints - for i, l in enumerate(self.linears): - x = self.linears[i // 2](x) + l(x) - return x - """ - - def __init__(self, modules=None): - super(ModuleList, self).__init__() - if modules is not None: - self += modules - - def __getitem__(self, idx): - if idx < 0: - idx += len(self) - return self._modules[str(idx)] - - def __setitem__(self, idx, module): - return setattr(self, str(idx), module) - - def __len__(self): - return len(self._modules) - - def __iter__(self): - return iter(self._modules.values()) - - def __iadd__(self, modules): - return self.extend(modules) - -
    [docs] def append(self, module): - """Appends a given module at the end of the list. - - Arguments: - module (nn.Module): module to append - """ - self.add_module(str(len(self)), module) - return self
    - -
    [docs] def extend(self, modules): - """Appends modules from a Python list at the end. - - Arguments: - modules (list): list of modules to append - """ - if not isinstance(modules, list): - raise TypeError("ModuleList.extend should be called with a " - "list, but got " + type(modules).__name__) - offset = len(self) - for i, module in enumerate(modules): - self.add_module(str(offset + i), module) - return self
    - - -
    [docs]class ParameterList(Module): - """Holds submodules in a list. - - ParameterList can be indexed like a regular Python list, but parameters it contains - are properly registered, and will be visible by all Module methods. - - Arguments: - modules (list, optional): a list of :class:`nn.Parameter`` to add - - Example:: - - class MyModule(nn.Module): - def __init__(self): - super(MyModule, self).__init__() - self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)]) - - def forward(self, x): - # ModuleList can act as an iterable, or be indexed using ints - for i, p in enumerate(self.params): - x = self.params[i // 2].mm(x) + p.mm(x) - return x - """ - - def __init__(self, parameters=None): - super(ParameterList, self).__init__() - if parameters is not None: - self += parameters - - def __getitem__(self, idx): - if idx < 0: - idx += len(self) - return self._parameters[str(idx)] - - def __setitem__(self, idx, param): - return self.register_parameter(str(idx), param) - - def __len__(self): - return len(self._parameters) - - def __iter__(self): - return iter(self._parameters.values()) - - def __iadd__(self, parameters): - return self.extend(parameters) - -
    [docs] def append(self, parameter): - """Appends a given parameter at the end of the list. - - Arguments: - parameter (nn.Parameter): parameter to append - """ - self.register_parameter(str(len(self)), parameter) - return self
    - -
    [docs] def extend(self, parameters): - """Appends parameters from a Python list at the end. - - Arguments: - parameters (list): list of parameters to append - """ - if not isinstance(parameters, list): - raise TypeError("ParameterList.extend should be called with a " - "list, but got " + type(parameters).__name__) - offset = len(self) - for i, param in enumerate(parameters): - self.register_parameter(str(offset + i), param) - return self
 \ No newline at end of file
diff --git a/docs/_modules/torch/nn/modules/conv.html b/docs/_modules/torch/nn/modules/conv.html
deleted file mode 100644
index 3b789268d8d9..000000000000
--- a/docs/_modules/torch/nn/modules/conv.html
+++ /dev/null
@@ -1,1191 +0,0 @@
-torch.nn.modules.conv — PyTorch 0.1.11 documentation

    Source code for torch.nn.modules.conv

    -import math
    -import torch
    -from torch.nn.parameter import Parameter
    -from .. import functional as F
    -from .module import Module
    -from .utils import _single, _pair, _triple
    -
    -
    -class _ConvNd(Module):
    -
    -    def __init__(self, in_channels, out_channels, kernel_size, stride,
    -                 padding, dilation, transposed, output_padding, groups, bias):
    -        super(_ConvNd, self).__init__()
    -        if in_channels % groups != 0:
    -            raise ValueError('in_channels must be divisible by groups')
    -        if out_channels % groups != 0:
    -            raise ValueError('out_channels must be divisible by groups')
    -        self.in_channels = in_channels
    -        self.out_channels = out_channels
    -        self.kernel_size = kernel_size
    -        self.stride = stride
    -        self.padding = padding
    -        self.dilation = dilation
    -        self.transposed = transposed
    -        self.output_padding = output_padding
    -        self.groups = groups
    -        if transposed:
    -            self.weight = Parameter(torch.Tensor(
    -                in_channels, out_channels // groups, *kernel_size))
    -        else:
    -            self.weight = Parameter(torch.Tensor(
    -                out_channels, in_channels // groups, *kernel_size))
    -        if bias:
    -            self.bias = Parameter(torch.Tensor(out_channels))
    -        else:
    -            self.register_parameter('bias', None)
    -        self.reset_parameters()
    -
    -    def reset_parameters(self):
    -        n = self.in_channels
    -        for k in self.kernel_size:
    -            n *= k
    -        stdv = 1. / math.sqrt(n)
    -        self.weight.data.uniform_(-stdv, stdv)
    -        if self.bias is not None:
    -            self.bias.data.uniform_(-stdv, stdv)
    -
    -    def __repr__(self):
    -        s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
    -             ', stride={stride}')
    -        if self.padding != (0,) * len(self.padding):
    -            s += ', padding={padding}'
    -        if self.dilation != (1,) * len(self.dilation):
    -            s += ', dilation={dilation}'
    -        if self.output_padding != (0,) * len(self.output_padding):
    -            s += ', output_padding={output_padding}'
    -        if self.groups != 1:
    -            s += ', groups={groups}'
    -        if self.bias is None:
    -            s += ', bias=False'
    -        s += ')'
    -        return s.format(name=self.__class__.__name__, **self.__dict__)
    -
    -
    -
    [docs]class Conv1d(_ConvNd): - r"""Applies a 1D convolution over an input signal composed of several input - planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C_{in}, L)` - and output :math:`(N, C_{out}, L_{out})` can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_{out_j}) = bias(C_{out_j}) - + \sum_{{k}=0}^{C_{in}-1} weight(C_{out_j}, k) \star input(N_i, k) - \end{array} - - where :math:`\star` is the valid `cross-correlation`_ operator - - | :attr:`stride` controls the stride for the cross-correlation. - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - | :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, - but this `link`_ has a nice visualization of what :attr:`dilation` does. - | :attr:`groups` controls the connections between inputs and outputs. - | At groups=1, all inputs are convolved to all outputs. - | At groups=2, the operation becomes equivalent to having two conv layers - side by side, each seeing half the input channels, - and producing half the output channels, and both subsequently concatenated. - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - dilation (int or tuple, optional): Spacing between kernel elements - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - - Shape: - - Input: :math:`(N, C_{in}, L_{in})` - - Output: :math:`(N, C_{out}, L_{out})` where - :math:`L_{out} = floor((L_{in} + 2 * padding - dilation * (kernel\_size - 1) - 1) / stride + 1)` - - Attributes: - weight (Tensor): the learnable weights of the module of shape (out_channels, in_channels, kernel_size) - bias (Tensor): the learnable bias of the module of shape (out_channels) - - Examples:: - - >>> m = nn.Conv1d(16, 33, 3, stride=2) - >>> input = autograd.Variable(torch.randn(20, 16, 50)) - >>> output = m(input) - - .. _cross-correlation: - https://en.wikipedia.org/wiki/Cross-correlation - - .. _link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, bias=True): - kernel_size = _single(kernel_size) - stride = _single(stride) - padding = _single(padding) - dilation = _single(dilation) - super(Conv1d, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - False, _single(0), groups, bias) - - def forward(self, input): - return F.conv1d(input, self.weight, self.bias, self.stride, - self.padding, self.dilation, self.groups)
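A sketch checking the L_out formula above against an actual forward pass (current tensor API; the sizes mirror the docstring example):

import torch
import torch.nn as nn

m = nn.Conv1d(16, 33, 3, stride=2)                 # padding=0, dilation=1
x = torch.randn(20, 16, 50)
L_out = (50 + 2 * 0 - 1 * (3 - 1) - 1) // 2 + 1    # floor((L_in + 2p - d*(k-1) - 1)/s) + 1 = 24
assert m(x).size() == (20, 33, L_out)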
    - - -
    [docs]class Conv2d(_ConvNd): - r"""Applies a 2D convolution over an input signal composed of several input - planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C_{in}, H, W)` - and output :math:`(N, C_{out}, H_{out}, W_{out})` can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_{out_j}) = bias(C_{out_j}) - + \sum_{{k}=0}^{C_{in}-1} weight(C_{out_j}, k) \star input(N_i, k) - \end{array} - - where :math:`\star` is the valid 2D `cross-correlation`_ operator - - | :attr:`stride` controls the stride for the cross-correlation. - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - | :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, - but this `link`_ has a nice visualization of what :attr:`dilation` does. - | :attr:`groups` controls the connections between inputs and outputs. - | At groups=1, all inputs are convolved to all outputs. - | At groups=2, the operation becomes equivalent to having two conv layers - side by side, each seeing half the input channels, - and producing half the output channels, and both subsequently concatenated. - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, - and the second `int` for the width dimension - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - dilation (int or tuple, optional): Spacing between kernel elements - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - - Shape: - - Input: :math:`(N, C_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where - :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)` - :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)` - - Attributes: - weight (Tensor): the learnable weights of the module of shape - (out_channels, in_channels, kernel_size[0], kernel_size[1]) - bias (Tensor): the learnable bias of the module of shape (out_channels) - - Examples:: - - >>> # With square kernels and equal stride - >>> m = nn.Conv2d(16, 33, 3, stride=2) - >>> # non-square kernels and unequal stride and with padding - >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) - >>> # non-square kernels and unequal stride and with padding and dilation - >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) - >>> input = autograd.Variable(torch.randn(20, 16, 50, 100)) - >>> output = m(input) - - .. _cross-correlation: - https://en.wikipedia.org/wiki/Cross-correlation - - .. 
_link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, bias=True): - kernel_size = _pair(kernel_size) - stride = _pair(stride) - padding = _pair(padding) - dilation = _pair(dilation) - super(Conv2d, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - False, _pair(0), groups, bias) - - def forward(self, input): - return F.conv2d(input, self.weight, self.bias, self.stride, - self.padding, self.dilation, self.groups)
    - - -
    [docs]class Conv3d(_ConvNd): - r"""Applies a 3D convolution over an input signal composed of several input - planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C_{in}, D, H, W)` - and output :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_{out_j}) = bias(C_{out_j}) - + \sum_{{k}=0}^{C_{in}-1} weight(C_{out_j}, k) \star input(N_i, k) - \end{array} - - where :math:`\star` is the valid 3D `cross-correlation`_ operator - - | :attr:`stride` controls the stride for the cross-correlation. - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - | :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, - but this `link`_ has a nice visualization of what :attr:`dilation` does. - | :attr:`groups` controls the connections between inputs and outputs. - | At groups=1, all inputs are convolved to all outputs. - | At groups=2, the operation becomes equivalent to having two conv layers - side by side, each seeing half the input channels, - and producing half the output channels, and both subsequently concatenated. - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, - the second `int` for the height dimension and the third `int` for the width dimension - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - dilation (int or tuple, optional): Spacing between kernel elements - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - - Shape: - - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` where - :math:`D_{out} = floor((D_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)` - :math:`H_{out} = floor((H_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)` - :math:`W_{out} = floor((W_{in} + 2 * padding[2] - dilation[2] * (kernel\_size[2] - 1) - 1) / stride[2] + 1)` - - Attributes: - weight (Tensor): the learnable weights of the module of shape - (out_channels, in_channels, kernel_size[0], kernel_size[1], kernel_size[2]) - bias (Tensor): the learnable bias of the module of shape (out_channels) - - Examples:: - - >>> # With square kernels and equal stride - >>> m = nn.Conv3d(16, 33, 3, stride=2) - >>> # non-square kernels and unequal stride and with padding - >>> m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0)) - >>> input = autograd.Variable(torch.randn(20, 16, 10, 50, 100)) - >>> output = m(input) - - .. 
_cross-correlation: - https://en.wikipedia.org/wiki/Cross-correlation - - .. _link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, bias=True): - kernel_size = _triple(kernel_size) - stride = _triple(stride) - padding = _triple(padding) - dilation = _triple(dilation) - super(Conv3d, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - False, _triple(0), groups, bias) - - def forward(self, input): - return F.conv3d(input, self.weight, self.bias, self.stride, - self.padding, self.dilation, self.groups)
    - - -class _ConvTransposeMixin(object): - - def forward(self, input, output_size=None): - output_padding = self._output_padding(input, output_size) - func = self._backend.ConvNd( - self.stride, self.padding, self.dilation, self.transposed, - output_padding, self.groups) - if self.bias is None: - return func(input, self.weight) - else: - return func(input, self.weight, self.bias) - - def _output_padding(self, input, output_size): - if output_size is None: - return self.output_padding - - output_size = list(output_size) - k = input.dim() - 2 - if len(output_size) == k + 2: - output_size = output_size[-2:] - if len(output_size) != k: - raise ValueError( - "output_size must have {} or {} elements (got {})" - .format(k, k + 2, len(output_size))) - - def dim_size(d): - return ((input.size(d + 2) - 1) * self.stride[d] - - 2 * self.padding[d] + self.kernel_size[d]) - - min_sizes = [dim_size(d) for d in range(k)] - max_sizes = [min_sizes[d] + self.stride[d] - 1 for d in range(k)] - for size, min_size, max_size in zip(output_size, min_sizes, max_sizes): - if size < min_size or size > max_size: - raise ValueError(( - "requested an output size of {}, but valid sizes range " - "from {} to {} (for an input of {})").format( - output_size, min_sizes, max_sizes, input.size()[2:])) - - return tuple([output_size[d] - min_sizes[d] for d in range(k)]) - - -
    [docs]class ConvTranspose1d(_ConvTransposeMixin, _ConvNd): - """Applies a 1D transposed convolution operator over an input image - composed of several input planes. - - This module can be seen as the gradient of Conv1d with respect to its input. - It is sometimes (but incorrectly) refered to as a deconvolutional operation. - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - output_padding (int or tuple, optional): Zero-padding added to one side of the output - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - - Shape: - - Input: :math:`(N, C_{in}, L_{in})` - - Output: :math:`(N, C_{out}, L_{out})` where - :math:`L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size + output\_padding` - - Attributes: - weight (Tensor): the learnable weights of the module of shape - (in_channels, out_channels, kernel_size[0], kernel_size[1]) - bias (Tensor): the learnable bias of the module of shape (out_channels) - """ - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, output_padding=0, groups=1, bias=True): - kernel_size = _single(kernel_size) - stride = _single(stride) - padding = _single(padding) - dilation = _single(1) - output_padding = _single(output_padding) - super(ConvTranspose1d, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - True, output_padding, groups, bias) - - def forward(self, input, output_size=None): - output_padding = self._output_padding(input, output_size) - return F.conv_transpose1d( - input, self.weight, self.bias, self.stride, self.padding, - output_padding, self.groups)
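And the corresponding check for the transposed variant: the output length formula above inverts the Conv1d size arithmetic (a sketch with the same illustrative sizes):

import torch
import torch.nn as nn

up = nn.ConvTranspose1d(16, 33, 3, stride=2)       # padding=0, output_padding=0
x = torch.randn(20, 16, 50)
L_out = (50 - 1) * 2 - 2 * 0 + 3 + 0               # (L_in - 1)*stride - 2*padding + kernel + output_padding = 101
assert up(x).size() == (20, 33, L_out)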
    - - -
    [docs]class ConvTranspose2d(_ConvTransposeMixin, _ConvNd): - r"""Applies a 2D transposed convolution operator over an input image - composed of several input planes. - - This module can be seen as the gradient of Conv2d with respect to its input. - It is sometimes (but incorrectly) refered to as a deconvolutional operation. - - | :attr:`stride` controls the stride for the cross-correlation. - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - | If :attr:`output_padding` is non-zero, then the output is implicitly zero-padded on one side - for :attr:`output_padding` number of points - | :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, - but this `link`_ has a nice visualization of what :attr:`dilation` does. - | :attr:`groups` controls the connections between inputs and outputs. - | At groups=1, all inputs are convolved to all outputs. - | At groups=2, the operation becomes equivalent to having two conv layers - side by side, each seeing half the input channels, - and producing half the output channels, and both subsequently concatenated. - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding` - can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, - and the second `int` for the width dimension - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - output_padding (int or tuple, optional): Zero-padding added to one side of the output - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - - Shape: - - Input: :math:`(N, C_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where - :math:`H_{out} = (H_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + output\_padding[0]` - :math:`W_{out} = (W_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + output\_padding[1]` - - Attributes: - weight (Tensor): the learnable weights of the module of shape - (in_channels, out_channels, kernel_size[0], kernel_size[1]) - bias (Tensor): the learnable bias of the module of shape (out_channels) - - Examples:: - - >>> # With square kernels and equal stride - >>> m = nn.ConvTranspose2d(16, 33, 3, stride=2) - >>> # non-square kernels and unequal stride and with padding - >>> m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) - >>> input = autograd.Variable(torch.randn(20, 16, 50, 100)) - >>> output = m(input) - >>> # exact output size can be also specified as an argument - >>> input = autograd.Variable(torch.randn(1, 16, 12, 12)) - >>> downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1) - >>> upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1) - >>> h = downsample(input) - >>> h.size() - torch.Size([1, 
16, 6, 6]) - >>> output = upsample(h, output_size=input.size()) - >>> output.size() - torch.Size([1, 16, 12, 12]) - - .. _cross-correlation: - https://en.wikipedia.org/wiki/Cross-correlation - - .. _link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, output_padding=0, groups=1, bias=True): - kernel_size = _pair(kernel_size) - stride = _pair(stride) - padding = _pair(padding) - dilation = _pair(1) - output_padding = _pair(output_padding) - super(ConvTranspose2d, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - True, output_padding, groups, bias) - - def forward(self, input, output_size=None): - output_padding = self._output_padding(input, output_size) - return F.conv_transpose2d( - input, self.weight, self.bias, self.stride, self.padding, - output_padding, self.groups)
    - - -
    [docs]class ConvTranspose3d(_ConvTransposeMixin, _ConvNd): - r"""Applies a 3D transposed convolution operator over an input image composed of several input - planes. - The transposed convolution operator multiplies each input value element-wise by a learnable kernel, - and sums over the outputs from all input feature planes. - - **This module can be seen as the exact reverse of Conv3d**. - It is sometimes (but incorrectly) refered to as a deconvolutional operation. - - | :attr:`stride` controls the stride for the cross-correlation. - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - | If :attr:`output_padding` is non-zero, then the output is implicitly zero-padded on one side - for :attr:`output_padding` number of points - | :attr:`groups` controls the connections between inputs and outputs. - | At groups=1, all inputs are convolved to all outputs. - | At groups=2, the operation becomes equivalent to having two conv layers - side by side, each seeing half the input channels, - and producing half the output channels, and both subsequently concatenated. - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding` - can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, - the second `int` for the width dimension and the third `int` for the width dimension - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - output_padding (int or tuple, optional): Zero-padding added to one side of the output - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - - Shape: - - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` where - :math:`D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + output\_padding[0]` - :math:`H_{out} = (H_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + output\_padding[1]` - :math:`W_{out} = (W_{in} - 1) * stride[2] - 2 * padding[2] + kernel\_size[2] + output\_padding[2]` - - Attributes: - weight (Tensor): the learnable weights of the module of shape - (in_channels, out_channels, kernel_size[0], kernel_size[1], kernel_size[2]) - bias (Tensor): the learnable bias of the module of shape (out_channels) - - Examples:: - - >>> # With square kernels and equal stride - >>> m = nn.ConvTranspose3d(16, 33, 3, stride=2) - >>> # non-square kernels and unequal stride and with padding - >>> m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(0, 4, 2)) - >>> input = autograd.Variable(torch.randn(20, 16, 10, 50, 100)) - >>> output = m(input) - - .. _cross-correlation: - https://en.wikipedia.org/wiki/Cross-correlation - - .. 
_link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, output_padding=0, groups=1, bias=True): - kernel_size = _triple(kernel_size) - stride = _triple(stride) - padding = _triple(padding) - dilation = _triple(1) - output_padding = _triple(output_padding) - super(ConvTranspose3d, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - True, output_padding, groups, bias) - - def forward(self, input, output_size=None): - output_padding = self._output_padding(input, output_size) - return F.conv_transpose3d( - input, self.weight, self.bias, self.stride, self.padding, - output_padding, self.groups)
    - - -# TODO: Conv2dLocal -# TODO: Conv2dMap -# TODO: ConvTranspose2dMap -

    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/distance.html b/docs/_modules/torch/nn/modules/distance.html deleted file mode 100644 index a645d592a117..000000000000 --- a/docs/_modules/torch/nn/modules/distance.html +++ /dev/null @@ -1,610 +0,0 @@ - - - - - - - - - - - torch.nn.modules.distance — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Source code for torch.nn.modules.distance

    -import torch
    -from .module import Module
    -from .. import functional as F
    -
    -
    -
    [docs]class PairwiseDistance(Module): - r""" - Computes the batchwise pairwise distance between vectors v1,v2: - - .. math :: - \Vert x \Vert _p := \left( \sum_{i=1}^n \vert x_i \vert ^ p \right) ^ {1/p} - - Args: - x (Tensor): input tensor containing the two input batches - p (real): the norm degree. Default: 2 - - Shape: - - Input: :math:`(N, D)` where `D = vector dimension` - - Output: :math:`(N, 1)` - - >>> pdist = nn.PairwiseDistance(2) - >>> input1 = autograd.Variable(torch.randn(100, 128)) - >>> input2 = autograd.Variable(torch.randn(100, 128)) - >>> output = pdist(input1, input2) - """ - def __init__(self, p=2, eps=1e-6): - super(PairwiseDistance, self).__init__() - self.norm = p - self.eps = eps - - def forward(self, x1, x2): - return F.pairwise_distance(x1, x2, self.norm, self.eps)
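To relate the module to the norm formula in its docstring, a small sanity-check sketch (the tiny `eps` added internally for numerical stability is ignored in the manual computation):

>>> import torch
>>> from torch import nn
>>> from torch.autograd import Variable
>>> pdist = nn.PairwiseDistance(2)
>>> x1 = Variable(torch.randn(100, 128))
>>> x2 = Variable(torch.randn(100, 128))
>>> d = pdist(x1, x2)
>>> # manual p=2 norm of the row-wise difference, per the formula above
>>> manual = (x1 - x2).pow(2).sum(1).sqrt()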
    - -# TODO: Cosine -# TODO: CosineDistance - make sure lua's CosineDistance isn't actually cosine similarity -# TODO: Euclidean -# TODO: WeightedEuclidean -
    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/dropout.html b/docs/_modules/torch/nn/modules/dropout.html deleted file mode 100644 index 1d3fc951d4b7..000000000000 --- a/docs/_modules/torch/nn/modules/dropout.html +++ /dev/null @@ -1,716 +0,0 @@ - - - - - - - - - - - torch.nn.modules.dropout — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Source code for torch.nn.modules.dropout

    -from .module import Module
    -from .. import functional as F
    -
    -
    -
    [docs]class Dropout(Module): - r"""Randomly zeroes some of the elements of the input tensor. - The elements to zero are randomized on every forward call. - - Args: - p: probability of an element to be zeroed. Default: 0.5 - inplace: If set to True, will do this operation in-place. Default: false - - Shape: - - Input: `Any`. Input can be of any shape - - Output: `Same`. Output is of the same shape as input - - Examples:: - - >>> m = nn.Dropout(p=0.2) - >>> input = autograd.Variable(torch.randn(20, 16)) - >>> output = m(input) - """ - - def __init__(self, p=0.5, inplace=False): - super(Dropout, self).__init__() - if p < 0 or p > 1: - raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) - self.p = p - self.inplace = inplace - - def forward(self, input): - return F.dropout(input, self.p, self.training, self.inplace) - - def __repr__(self): - inplace_str = ', inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + 'p = ' + str(self.p) \ - + inplace_str + ')'
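Because dropout is only active in training mode, it interacts with `Module.train()` / `Module.eval()` (documented further below). A short illustrative sketch, assuming the usual inverted-dropout behaviour where surviving activations are rescaled at training time:

>>> import torch
>>> from torch import nn
>>> from torch.autograd import Variable
>>> m = nn.Dropout(p=0.5)
>>> x = Variable(torch.ones(5, 5))
>>> _ = m.train()      # training mode: roughly half the entries are zeroed
>>> train_out = m(x)
>>> _ = m.eval()       # evaluation mode: dropout becomes the identity
>>> eval_out = m(x)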
    - - -
    [docs]class Dropout2d(Module): - r"""Randomly zeroes whole channels of the input tensor. - The channels to zero-out are randomized on every forward call. - - *Usually the input comes from Conv2d modules.* - - As described in the paper - `Efficient Object Localization Using Convolutional Networks`_ , - if adjacent pixels within feature maps are strongly correlated - (as is normally the case in early convolution layers) then iid dropout - will not regularize the activations and will otherwise just result - in an effective learning rate decrease. - - In this case, :func:`nn.Dropout2d` will help promote independence between - feature maps and should be used instead. - - Args: - p (float, optional): probability of an element to be zeroed. - inplace (bool, optional): If set to True, will do this operation in-place - - Shape: - - Input: :math:`(N, C, H, W)` - - Output: :math:`(N, C, H, W)` (same shape as input) - - Examples:: - - >>> m = nn.Dropout2d(p=0.2) - >>> input = autograd.Variable(torch.randn(20, 16, 32, 32)) - >>> output = m(input) - - .. _Efficient Object Localization Using Convolutional Networks: - http://arxiv.org/abs/1411.4280 - """ - - def __init__(self, p=0.5, inplace=False): - super(Dropout2d, self).__init__() - if p < 0 or p > 1: - raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) - self.p = p - self.inplace = inplace - - def forward(self, input): - return self._backend.Dropout2d(self.p, self.training, self.inplace)(input) - - def __repr__(self): - inplace_str = ', inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + 'p=' + str(self.p) \ - + inplace_str + ')'
    - - -
    [docs]class Dropout3d(Module): - r"""Randomly zeroes whole channels of the input tensor. - The channels to zero are randomized on every forward call. - - *Usually the input comes from Conv3d modules.* - - As described in the paper - `Efficient Object Localization Using Convolutional Networks`_ , - if adjacent pixels within feature maps are strongly correlated - (as is normally the case in early convolution layers) then iid dropout - will not regularize the activations and will otherwise just result - in an effective learning rate decrease. - - In this case, :func:`nn.Dropout3d` will help promote independence between - feature maps and should be used instead. - - Args: - p (float, optional): probability of an element to be zeroed. - inplace (bool, optional): If set to True, will do this operation in-place - - Shape: - - Input: :math:`(N, C, D, H, W)` - - Output: :math:`(N, C, D, H, W)` (same shape as input) - - Examples:: - - >>> m = nn.Dropout3d(p=0.2) - >>> input = autograd.Variable(torch.randn(20, 16, 4, 32, 32)) - >>> output = m(input) - - .. _Efficient Object Localization Using Convolutional Networks: - http://arxiv.org/abs/1411.4280 - """ - - def __init__(self, p=0.5, inplace=False): - super(Dropout3d, self).__init__() - if p < 0 or p > 1: - raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) - self.p = p - self.inplace = inplace - - def forward(self, input): - return self._backend.Dropout3d(self.p, self.training, self.inplace)(input) - - def __repr__(self): - inplace_str = ', inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + 'p=' + str(self.p) \ - + inplace_str + ')'
    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/linear.html b/docs/_modules/torch/nn/modules/linear.html deleted file mode 100644 index 1b8e207819ee..000000000000 --- a/docs/_modules/torch/nn/modules/linear.html +++ /dev/null @@ -1,636 +0,0 @@ - - - - - - - - - - - torch.nn.modules.linear — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Source code for torch.nn.modules.linear

    -import math
    -
    -import torch
    -from torch.nn.parameter import Parameter
    -
    -from .module import Module
    -
    -
    -
    [docs]class Linear(Module): - r"""Applies a linear transformation to the incoming data: :math:`y = Ax + b` - - Args: - in_features: size of each input sample - out_features: size of each output sample - bias: If set to False, the layer will not learn an additive bias. Default: True - - Shape: - - Input: :math:`(N, in\_features)` - - Output: :math:`(N, out\_features)` - - Attributes: - weight: the learnable weights of the module of shape (out_features x in_features) - bias: the learnable bias of the module of shape (out_features) - - Examples:: - - >>> m = nn.Linear(20, 30) - >>> input = autograd.Variable(torch.randn(128, 20)) - >>> output = m(input) - >>> print(output.size()) - """ - - def __init__(self, in_features, out_features, bias=True): - super(Linear, self).__init__() - self.in_features = in_features - self.out_features = out_features - self.weight = Parameter(torch.Tensor(out_features, in_features)) - if bias: - self.bias = Parameter(torch.Tensor(out_features)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1. / math.sqrt(self.weight.size(1)) - self.weight.data.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.data.uniform_(-stdv, stdv) - - def forward(self, input): - if self.bias is None: - return self._backend.Linear()(input, self.weight) - else: - return self._backend.Linear()(input, self.weight, self.bias) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + str(self.in_features) + ' -> ' \ - + str(self.out_features) + ')'
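A quick sketch relating the forward pass to the documented weight shape `(out_features x in_features)`; bias is disabled so the check reduces to a single matrix product (this is an illustration, not part of the original docs):

>>> import torch
>>> from torch import nn
>>> from torch.autograd import Variable
>>> m = nn.Linear(20, 30, bias=False)
>>> x = Variable(torch.randn(128, 20))
>>> y = m(x)
>>> manual = x.mm(m.weight.t())   # y = x W^T, with W of shape (30, 20)
>>> (y - manual).data.abs().max() < 1e-5
True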
    - - -# TODO: Bilinear -# TODO: PartialLinear - maybe in sparse? -
    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/loss.html b/docs/_modules/torch/nn/modules/loss.html deleted file mode 100644 index 62db3cb91357..000000000000 --- a/docs/_modules/torch/nn/modules/loss.html +++ /dev/null @@ -1,1052 +0,0 @@ - - - - - - - - - - - torch.nn.modules.loss — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Source code for torch.nn.modules.loss

    -from torch.autograd import Variable
    -import torch
    -from .module import Module
    -from .container import Sequential
    -from .activation import LogSoftmax
    -from .. import functional as F
    -
    -
    -def _assert_no_grad(variable):
    -    assert not variable.requires_grad, \
    -        "nn criterions don't compute the gradient w.r.t. targets - please " \
    -        "mark these variables as volatile or not requiring gradients"
    -
    -
    -class _Loss(Module):
    -
    -    def __init__(self, size_average=True):
    -        super(_Loss, self).__init__()
    -        self.size_average = size_average
    -
    -    def forward(self, input, target):
    -        _assert_no_grad(target)
    -        backend_fn = getattr(self._backend, type(self).__name__)
    -        return backend_fn(self.size_average)(input, target)
    -
    -
    -class _WeightedLoss(_Loss):
    -
    -    def __init__(self, weight=None, size_average=True):
    -        super(_WeightedLoss, self).__init__(size_average)
    -        self.register_buffer('weight', weight)
    -
    -    def forward(self, input, target):
    -        _assert_no_grad(target)
    -        backend_fn = getattr(self._backend, type(self).__name__)
    -        return backend_fn(self.size_average, weight=self.weight)(input, target)
    -
    -
    -
    [docs]class L1Loss(_Loss): - r"""Creates a criterion that measures the mean absolute value of the - element-wise difference between input `x` and target `y`: - - :math:`{loss}(x, y) = 1/n \sum |x_i - y_i|` - - `x` and `y` arbitrary shapes with a total of `n` elements each. - - The sum operation still operates over all the elements, and divides by `n`. - - The division by `n` can be avoided if one sets the constructor argument `size_average=False` - """ - pass
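L1Loss ships without a usage example here; a minimal sketch following the same Variable-based pattern as the other criteria in this file:

>>> import torch
>>> from torch import nn
>>> from torch.autograd import Variable
>>> loss = nn.L1Loss()
>>> input = Variable(torch.randn(3, 5), requires_grad=True)
>>> target = Variable(torch.randn(3, 5))
>>> output = loss(input, target)   # mean of |input - target| over all 15 elements
>>> output.backward()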
    - - -
    [docs]class NLLLoss(_WeightedLoss): - r"""The negative log likelihood loss. It is useful to train a classification problem with n classes - - If provided, the optional argument `weights` should be a 1D Tensor assigning - weight to each of the classes. - - This is particularly useful when you have an unbalanced training set. - - The input given through a forward call is expected to contain log-probabilities - of each class: input has to be a 2D Tensor of size `(minibatch, n)` - - Obtaining log-probabilities in a neural network is easily achieved by - adding a `LogSoftmax` layer in the last layer of your network. - - You may use `CrossEntropyLoss` instead, if you prefer not to add an extra layer. - - The target that this loss expects is a class index `(0 to N-1, where N = number of classes)` - - The loss can be described as:: - - loss(x, class) = -x[class] - - or in the case of the weights argument it is specified as follows:: - - loss(x, class) = -weights[class] * x[class] - - Args: - weight (Tensor, optional): a manual rescaling weight given to each class. - If given, has to be a Tensor of size "nclasses" - size_average (bool, optional): By default, the losses are averaged over observations for each minibatch. - However, if the field size_average is set to False, - the losses are instead summed for each minibatch. - - - Shape: - - Input: :math:`(N, C)` where `C = number of classes` - - Target: :math:`(N)` where each value is `0 <= targets[i] <= C-1` - - Attributes: - weight: the class-weights given as input to the constructor - - Examples:: - - >>> m = nn.LogSoftmax() - >>> loss = nn.NLLLoss() - >>> # input is of size nBatch x nClasses = 3 x 5 - >>> input = autograd.Variable(torch.randn(3, 5), requires_grad=True) - >>> # each element in target has to have 0 <= value < nclasses - >>> target = autograd.Variable(torch.LongTensor([1, 0, 4])) - >>> output = loss(m(input), target) - >>> output.backward() - """ - pass
    - - -
[docs]class NLLLoss2d(_WeightedLoss): - r"""This is the negative log likelihood loss, but for image inputs. It computes NLL loss per-pixel. - - Args: - weight (Tensor, optional): a manual rescaling weight given to each class. - If given, has to be a 1D Tensor having as many elements as there are classes. - size_average: By default, the losses are averaged over observations for each minibatch. - However, if the field size_average is set to False, the losses - are instead summed for each minibatch. Default: True - - Shape: - - Input: :math:`(N, C, H, W)` where `C = number of classes` - - Target: :math:`(N, H, W)` where each value is `0 <= targets[i] <= C-1` - - Examples: - >>> m = nn.Conv2d(16, 32, (3, 3)).float() - >>> loss = nn.NLLLoss2d() - >>> # input is of size nBatch x nClasses x height x width - >>> input = autograd.Variable(torch.randn(3, 16, 10, 10)) - >>> # each element in target has to have 0 <= value < nclasses - >>> target = autograd.Variable(torch.LongTensor(3, 8, 8).random_(0, 4)) - >>> output = loss(m(input), target) - >>> output.backward() - """ - pass
    - - -
    [docs]class KLDivLoss(_WeightedLoss): - r"""The `Kullback-Leibler divergence`_ Loss - - KL divergence is a useful distance measure for continuous distributions - and is often useful when performing direct regression over the space of - (discretely sampled) continuous output distributions. - - As with `NLLLoss`, the `input` given is expected to contain - *log-probabilities*, however unlike `ClassNLLLoss`, `input` is not - restricted to a 2D Tensor, because the criterion is applied element-wise. - - This criterion expects a `target` `Tensor` of the same size as the - `input` `Tensor`. - - The loss can be described as: - - .. math:: loss(x, target) = 1/n \sum(target_i * (log(target_i) - x_i)) - - By default, the losses are averaged for each minibatch over observations - **as well as** over dimensions. However, if the field - `size_average` is set to `False`, the losses are instead summed. - - .. _Kullback-Leibler divergence: - https://en.wikipedia.org/wiki/Kullback-Leibler_divergence - """ - pass
    - - -
    [docs]class MSELoss(_Loss): - r"""Creates a criterion that measures the mean squared error between - `n` elements in the input `x` and target `y`: - - :math:`{loss}(x, y) = 1/n \sum |x_i - y_i|^2` - - `x` and `y` arbitrary shapes with a total of `n` elements each. - - The sum operation still operates over all the elements, and divides by `n`. - - The division by `n` can be avoided if one sets the internal variable - `size_average` to `False`. - - """ - pass
    - - -
    [docs]class BCELoss(_WeightedLoss): - r"""Creates a criterion that measures the Binary Cross Entropy - between the target and the output: - - .. math:: loss(o, t) = - 1/n \sum_i (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i])) - - or in the case of the weights argument being specified: - - .. math:: loss(o, t) = - 1/n \sum_i weights[i] * (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i])) - - This is used for measuring the error of a reconstruction in for example - an auto-encoder. Note that the targets `t[i]` should be numbers between 0 and 1. - - By default, the losses are averaged for each minibatch over observations - *as well as* over dimensions. However, if the field `size_average` is set - to `False`, the losses are instead summed. - - """ - pass
    - - -
[docs]class HingeEmbeddingLoss(_Loss): - r"""Measures the loss given an input `x` which is a 2D mini-batch tensor - and a label `y`, a 1D tensor containing values (`1` or `-1`). - This is usually used for measuring whether two inputs are similar or dissimilar, - e.g. using the L1 pairwise distance, and is typically used for learning - nonlinear embeddings or semi-supervised learning:: - - { x_i, if y_i == 1 - loss(x, y) = 1/n { - { max(0, margin - x_i), if y_i == -1 - - `x` and `y` arbitrary shapes with a total of `n` elements each - the sum operation still operates over all the elements, and divides by `n`. - - The division by `n` can be avoided if one sets the internal variable `size_average=False`. - - The `margin` has a default value of `1`, or can be set in the constructor. - """ - pass
    - - -
    [docs]class MultiLabelMarginLoss(_Loss): - r"""Creates a criterion that optimizes a multi-class multi-classification - hinge loss (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and - output `y` (which is a 2D `Tensor` of target class indices). - For each sample in the mini-batch:: - - loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(0) - - where `i == 0` to `x.size(0)`, `j == 0` to `y.size(0)`, - `y[j] != 0`, and `i != y[j]` for all `i` and `j`. - - `y` and `x` must have the same size. - - The criterion only considers the first non zero `y[j]` targets. - - This allows for different samples to have variable amounts of target classes - """ - pass
    - - -
    [docs]class SmoothL1Loss(_Loss): - r"""Creates a criterion that uses a squared term if the absolute - element-wise error falls below 1 and an L1 term otherwise. - It is less sensitive to outliers than the `MSELoss` and in some cases - prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick). - Also known as the Huber loss:: - - { 0.5 * (x_i - y_i)^2, if |x_i - y_i| < 1 - loss(x, y) = 1/n \sum { - { |x_i - y_i| - 0.5, otherwise - - `x` and `y` arbitrary shapes with a total of `n` elements each - the sum operation still operates over all the elements, and divides by `n`. - - The division by `n` can be avoided if one sets the internal variable - `size_average` to `False` - """ - pass
    - - -
    [docs]class SoftMarginLoss(_Loss): - r"""Creates a criterion that optimizes a two-class classification - logistic loss between input `x` (a 2D mini-batch Tensor) and - target `y` (which is a tensor containing either `1` or `-1`). - - :: - - loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x.nelement() - - The normalization by the number of elements in the input can be disabled by - setting `self.size_average` to `False`. - """ - pass
    - - -
    [docs]class CrossEntropyLoss(_WeightedLoss): - r"""This criterion combines `LogSoftMax` and `NLLLoss` in one single class. - - It is useful when training a classification problem with `n` classes. - If provided, the optional argument `weights` should be a 1D `Tensor` - assigning weight to each of the classes. - This is particularly useful when you have an unbalanced training set. - - The `input` is expected to contain scores for each class. - - `input` has to be a 2D `Tensor` of size `batch x n`. - - This criterion expects a class index (0 to nClasses-1) as the - `target` for each value of a 1D tensor of size `n` - - The loss can be described as:: - - loss(x, class) = -log(exp(x[class]) / (\sum_j exp(x[j]))) - = -x[class] + log(\sum_j exp(x[j])) - - or in the case of the `weights` argument being specified:: - - loss(x, class) = weights[class] * (-x[class] + log(\sum_j exp(x[j]))) - - The losses are averaged across observations for each minibatch. - - Shape: - - Input: :math:`(N, C)` where `C = number of classes` - - Target: :math:`(N)` where each value is `0 <= targets[i] <= C-1` - - """ - - def forward(self, input, target): - _assert_no_grad(target) - return F.cross_entropy(input, target, - self.weight, self.size_average)
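CrossEntropyLoss has no example block in this version of the docs; a minimal sketch mirroring the NLLLoss example above, except that raw, unnormalized scores are passed in (the LogSoftmax is applied internally):

>>> import torch
>>> from torch import nn
>>> from torch.autograd import Variable
>>> loss = nn.CrossEntropyLoss()
>>> # input is of size nBatch x nClasses = 3 x 5, containing raw scores
>>> input = Variable(torch.randn(3, 5), requires_grad=True)
>>> # each element in target is a class index in [0, nClasses - 1]
>>> target = Variable(torch.LongTensor([1, 0, 4]))
>>> output = loss(input, target)
>>> output.backward()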
    - - -
    [docs]class MultiLabelSoftMarginLoss(_WeightedLoss): - r"""Creates a criterion that optimizes a multi-label one-versus-all - loss based on max-entropy, between input `x` (a 2D mini-batch `Tensor`) and - target `y` (a binary 2D `Tensor`). For each sample in the minibatch:: - - loss(x, y) = - sum_i (y[i] log( exp(x[i]) / (1 + exp(x[i]))) - + (1-y[i]) log(1/(1+exp(x[i])))) / x:nElement() - - where `i == 0` to `x.nElement()-1`, `y[i] in {0,1}`. - `y` and `x` must have the same size. - """ - - def forward(self, input, target): - return F.binary_cross_entropy(torch.sigmoid(input), target, - self.weight, self.size_average)
    - - -
    [docs]class CosineEmbeddingLoss(Module): - r"""Creates a criterion that measures the loss given an input tensors x1, x2 - and a `Tensor` label `y` with values 1 or -1. - This is used for measuring whether two inputs are similar or dissimilar, - using the cosine distance, and is typically used for learning nonlinear - embeddings or semi-supervised learning. - - `margin` should be a number from `-1` to `1`, `0` to `0.5` is suggested. - If `margin` is missing, the default value is `0`. - - The loss function for each sample is:: - - { 1 - cos(x1, x2), if y == 1 - loss(x, y) = { - { max(0, cos(x1, x2) - margin), if y == -1 - - If the internal variable `size_average` is equal to `True`, - the loss function averages the loss over the batch samples; - if `size_average` is `False`, then the loss function sums over the - batch samples. By default, `size_average = True`. - """ - - def __init__(self, margin=0, size_average=True): - super(CosineEmbeddingLoss, self).__init__() - self.margin = margin - self.size_average = size_average - - def forward(self, input1, input2, target): - return self._backend.CosineEmbeddingLoss(self.margin, - self.size_average)(input1, input2, target)
    - - -
[docs]class MarginRankingLoss(Module): - r"""Creates a criterion that measures the loss given - inputs `x1`, `x2`, two 1D mini-batch `Tensor`s, - and a label 1D mini-batch tensor `y` with values (`1` or `-1`). - - If `y == 1` then it is assumed the first input should be ranked higher - (have a larger value) than the second input, and vice-versa for `y == -1`. - - The loss function for each sample in the mini-batch is:: - - loss(x, y) = max(0, -y * (x1 - x2) + margin) - - If the internal variable `size_average = True`, - the loss function averages the loss over the batch samples; - if `size_average = False`, then the loss function sums over the batch samples. - By default, `size_average` equals `True`. - """ - - def __init__(self, margin=0, size_average=True): - super(MarginRankingLoss, self).__init__() - self.margin = margin - self.size_average = size_average - - def forward(self, input1, input2, target): - return self._backend.MarginRankingLoss(self.margin, - self.size_average)(input1, input2, target)
    - - -
    [docs]class MultiMarginLoss(Module): - r"""Creates a criterion that optimizes a multi-class classification hinge loss - (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and - output `y` (which is a 1D tensor of target class indices, `0` <= `y` <= `x.size(1)`): - - For each mini-batch sample:: - - loss(x, y) = sum_i(max(0, (margin - x[y] + x[i]))^p) / x.size(0) - where `i == 0` to `x.size(0)` and `i != y`. - - Optionally, you can give non-equal weighting on the classes by passing - a 1D `weights` tensor into the constructor. - - The loss function then becomes: - - loss(x, y) = sum_i(max(0, w[y] * (margin - x[y] - x[i]))^p) / x.size(0) - - By default, the losses are averaged over observations for each minibatch. - However, if the field `size_average` is set to `False`, - the losses are instead summed. - """ - - def __init__(self, p=1, margin=1, weight=None, size_average=True): - super(MultiMarginLoss, self).__init__() - if p != 1 and p != 2: - raise ValueError("only p == 1 and p == 2 supported") - assert weight is None or weight.dim() == 1 - self.p = p - self.margin = margin - self.size_average = size_average - self.weight = weight - - def forward(self, input, target): - return self._backend.MultiMarginLoss(self.size_average, self.p, - self.margin, weight=self.weight)(input, target)
    - - -class TripletMarginLoss(Module): - r"""Creates a criterion that measures the triplet loss given an input tensors x1, x2, x3 - and a margin with a value greater than 0. - This is used for measuring a relative similarity between samples. A triplet is composed by - `a`, `p` and `n`: anchor, positive examples and negative example respectively. - The shape of all input variables should be :math:`(N, D)`. - - The distance swap is described in detail in the paper `Learning shallow convolutional feature descriptors with - triplet losses`_ by V. Balntas, E. Riba et al. - - .. math:: - L(a, p, n) = \frac{1}{N} \left( \sum_{i=1}^N \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} \right) - - where :math: `d(x_i, y_i) = \| {\bf x}_i - {\bf y}_i \|_2^2`. - - Args: - anchor: anchor input tensor - positive: positive input tensor - negative: negative input tensor - p: the norm degree. Default: 2 - - Shape: - - Input: :math:`(N, D)` where `D = vector dimension` - - Output: :math:`(N, 1)` - - >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2) - >>> input1 = autograd.Variable(torch.randn(100, 128)) - >>> input2 = autograd.Variable(torch.randn(100, 128)) - >>> input3 = autograd.Variable(torch.randn(100, 128)) - >>> output = triplet_loss(input1, input2, input3) - >>> output.backward() - - .. _Learning shallow convolutional feature descriptors with triplet losses: - http://www.iis.ee.ic.ac.uk/%7Evbalnt/shallow_descr/TFeat_paper.pdf - """ - - def __init__(self, margin=1.0, p=2, eps=1e-6, swap=False): - super(TripletMarginLoss, self).__init__() - self.margin = margin - self.p = p - self.eps = eps - self.swap = swap - - def forward(self, anchor, positive, negative): - return F.triplet_margin_loss(anchor, positive, negative, self.margin, - self.p, self.eps, self.swap) - -# TODO: L1HingeEmbeddingCriterion -# TODO: MSECriterion weight -# TODO: ClassSimplexCriterion -
    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/module.html b/docs/_modules/torch/nn/modules/module.html deleted file mode 100644 index 5da9043b0c1a..000000000000 --- a/docs/_modules/torch/nn/modules/module.html +++ /dev/null @@ -1,1046 +0,0 @@ - - - - - - - - - - - torch.nn.modules.module — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Source code for torch.nn.modules.module

    -from itertools import chain
    -from collections import OrderedDict
    -import functools
    -
    -import torch
    -from ..backends.thnn import backend as thnn_backend
    -from ..parameter import Parameter
    -from torch.autograd import Variable
    -import torch.utils.hooks as hooks
    -
    -
    -def _addindent(s_, numSpaces):
    -    s = s_.split('\n')
-    # don't do anything for single-line stuff
    -    if len(s) == 1:
    -        return s_
    -    first = s.pop(0)
    -    s = [(numSpaces * ' ') + line for line in s]
    -    s = '\n'.join(s)
    -    s = first + '\n' + s
    -    return s
    -
    -
    -
    [docs]class Module(object): - """Base class for all neural network modules. - - Your models should also subclass this class. - - Modules can also contain other Modules, allowing to nest them in - a tree structure. You can assign the submodules as regular attributes:: - - import torch.nn as nn - import torch.nn.functional as F - - class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - self.conv1 = nn.Conv2d(1, 20, 5) - self.conv2 = nn.Conv2d(20, 20, 5) - - def forward(self, x): - x = F.relu(self.conv1(x)) - return F.relu(self.conv2(x)) - - Submodules assigned in this way will be registered, and will have their - parameters converted too when you call .cuda(), etc. - """ - - dump_patches = False - - def __init__(self): - self._backend = thnn_backend - self._parameters = OrderedDict() - self._buffers = OrderedDict() - self._backward_hooks = OrderedDict() - self._forward_hooks = OrderedDict() - self._modules = OrderedDict() - self.training = True - -
[docs] def forward(self, *input): - """Defines the computation performed at every call. - - Should be overridden by all subclasses. - """ - raise NotImplementedError
    - -
    [docs] def register_buffer(self, name, tensor): - """Adds a persistent buffer to the module. - - This is typically used to register a buffer that should not to be - considered a model parameter. For example, BatchNorm's ``running_mean`` - is not a parameter, but is part of the persistent state. - - Buffers can be accessed as attributes using given names. - - Example: - >>> self.register_buffer('running_mean', torch.zeros(num_features)) - """ - self._buffers[name] = tensor
    - -
    [docs] def register_parameter(self, name, param): - """Adds a parameter to the module. - - The parameter can be accessed as an attribute using given name. - """ - if '_parameters' not in self.__dict__: - raise AttributeError( - "cannot assign parameter before Module.__init__() call") - if param is None: - self._parameters[name] = None - elif not isinstance(param, Parameter): - raise TypeError("cannot assign '{}' object to parameter '{}' " - "(torch.nn.Parameter or None required)" - .format(torch.typename(param), name)) - elif param.creator: - raise ValueError( - "Cannot assign non-leaf Variable to parameter '{0}'. Model " - "parameters must be created explicitly. To express '{0}' " - "as a function of another variable, compute the value in " - "the forward() method.".format(name)) - else: - self._parameters[name] = param
    - -
    [docs] def add_module(self, name, module): - """Adds a child module to the current module. - - The module can be accessed as an attribute using the given name. - """ - if hasattr(self, name): - raise KeyError("attribute already exists '{}'".format(name)) - if not isinstance(module, Module) and module is not None: - raise TypeError("{} is not a Module subclass".format( - torch.typename(module))) - self._modules[name] = module
    - - def _apply(self, fn): - for module in self.children(): - module._apply(fn) - - for param in self._parameters.values(): - if param is not None: - # Variables stored in modules are graph leaves, and we don't - # want to create copy nodes, so we have to unpack the data. - param.data = fn(param.data) - if param._grad is not None: - param._grad.data = fn(param._grad.data) - - for key, buf in self._buffers.items(): - if buf is not None: - self._buffers[key] = fn(buf) - - return self - - def apply(self, fn): - for module in self.children(): - module.apply(fn) - fn(self) - return self - -
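`apply` has no example of its own; a common use is recursive weight initialization. A brief sketch (the init function and the range values are illustrative only):

>>> import torch
>>> from torch import nn
>>> def init_weights(m):
...     # apply() visits every child module first, then the module itself
...     if isinstance(m, nn.Linear):
...         m.weight.data.uniform_(-0.1, 0.1)
...
>>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
>>> _ = net.apply(init_weights)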
    [docs] def cuda(self, device_id=None): - """Moves all model parameters and buffers to the GPU. - - Arguments: - device_id (int, optional): if specified, all parameters will be - copied to that device - """ - return self._apply(lambda t: t.cuda(device_id))
    - -
    [docs] def cpu(self, device_id=None): - """Moves all model parameters and buffers to the CPU.""" - return self._apply(lambda t: t.cpu())
    - - def type(self, dst_type): - return self._apply(lambda t: t.type(dst_type)) - -
    [docs] def float(self): - """Casts all parameters and buffers to float datatype.""" - return self._apply(lambda t: t.float())
    - -
    [docs] def double(self): - """Casts all parameters and buffers to double datatype.""" - return self._apply(lambda t: t.double())
    - -
    [docs] def half(self): - """Casts all parameters and buffers to half datatype.""" - return self._apply(lambda t: t.half())
    - -
    [docs] def register_backward_hook(self, hook): - """Registers a backward hook on the module. - - The hook will be called every time the gradients with respect to module - inputs are computed. The hook should have the following signature:: - - hook(module, grad_input, grad_output) -> Tensor or None - - The :attr:`grad_input` and :attr:`grad_output` may be tuples if the - module has multiple inputs or outputs. The hook should not modify its - arguments, but it can optionally return a new gradient with respect to - input that will be used in place of :attr:`grad_input` in subsequent - computations. - - This function returns a handle with a method ``handle.remove()`` - that removes the hook from the module. - """ - handle = hooks.RemovableHandle(self._backward_hooks) - self._backward_hooks[handle.id] = hook - return handle
    - -
    [docs] def register_forward_hook(self, hook): - """Registers a forward hook on the module. - - The hook will be called every time :func:`forward` computes an output. - It should have the following signature:: - - hook(module, input, output) -> None - - The hook should not modify the input or output. - This function returns a handle with a method ``handle.remove()`` - that removes the hook from the module. - """ - handle = hooks.RemovableHandle(self._forward_hooks) - self._forward_hooks[handle.id] = hook - return handle
    - - def __call__(self, *input, **kwargs): - result = self.forward(*input, **kwargs) - for hook in self._forward_hooks.values(): - hook_result = hook(self, input, result) - if hook_result is not None: - raise RuntimeError( - "forward hooks should never return any values, but '{}'" - "didn't return None".format(hook)) - var = result - while not isinstance(var, Variable): - var = var[0] - creator = var.creator - if creator is not None and len(self._backward_hooks) > 0: - for hook in self._backward_hooks.values(): - wrapper = functools.partial(hook, self) - functools.update_wrapper(wrapper, hook) - creator.register_hook(wrapper) - return result - - def __getattr__(self, name): - if '_parameters' in self.__dict__: - _parameters = self.__dict__['_parameters'] - if name in _parameters: - return _parameters[name] - if '_buffers' in self.__dict__: - _buffers = self.__dict__['_buffers'] - if name in _buffers: - return _buffers[name] - if '_modules' in self.__dict__: - modules = self.__dict__['_modules'] - if name in modules: - return modules[name] - raise AttributeError("'{}' object has no attribute '{}'".format( - type(self).__name__, name)) - - def __setattr__(self, name, value): - def remove_from(*dicts): - for d in dicts: - if name in d: - del d[name] - - params = self.__dict__.get('_parameters') - if isinstance(value, Parameter): - if params is None: - raise AttributeError( - "cannot assign parameters before Module.__init__() call") - remove_from(self.__dict__, self._buffers, self._modules) - self.register_parameter(name, value) - elif params is not None and name in params: - if value is not None: - raise TypeError("cannot assign '{}' as parameter '{}' " - "(torch.nn.Parameter or None expected)" - .format(torch.typename(value), name)) - self.register_parameter(name, value) - else: - modules = self.__dict__.get('_modules') - if isinstance(value, Module): - if modules is None: - raise AttributeError( - "cannot assign module before Module.__init__() call") - remove_from(self.__dict__, self._parameters, self._buffers) - modules[name] = value - elif modules is not None and name in modules: - if value is not None: - raise TypeError("cannot assign '{}' as child module '{}' " - "(torch.nn.Module or None expected)" - .format(torch.typename(value), name)) - modules[name] = value - else: - buffers = self.__dict__.get('_buffers') - if buffers is not None and name in buffers: - if value is not None and not torch.is_tensor(value): - raise TypeError("cannot assign '{}' as buffer '{}' " - "(torch.Tensor or None expected)" - .format(torch.typename(value), name)) - buffers[name] = value - else: - object.__setattr__(self, name, value) - - def __delattr__(self, name): - if name in self._parameters: - del self._parameters[name] - elif name in self._buffers: - del self._buffers[name] - elif name in self._modules: - del self._modules[name] - else: - object.__delattr__(self, name) - -
    [docs] def state_dict(self, destination=None, prefix=''): - """Returns a dictionary containing a whole state of the module. - - Both parameters and persistent buffers (e.g. running averages) are - included. Keys are corresponding parameter and buffer names. - - Example: - >>> module.state_dict().keys() - ['bias', 'weight'] - """ - if destination is None: - destination = OrderedDict() - for name, param in self._parameters.items(): - if param is not None: - destination[prefix + name] = param.data - for name, buf in self._buffers.items(): - if buf is not None: - destination[prefix + name] = buf - for name, module in self._modules.items(): - if module is not None: - module.state_dict(destination, prefix + name + '.') - return destination
    - -
    [docs] def load_state_dict(self, state_dict): - """Copies parameters and buffers from :attr:`state_dict` into - this module and its descendants. The keys of :attr:`state_dict` must - exactly match the keys returned by this module's :func:`state_dict()` - function. - - Arguments: - state_dict (dict): A dict containing parameters and - persistent buffers. - """ - own_state = self.state_dict() - for name, param in state_dict.items(): - if name not in own_state: - raise KeyError('unexpected key "{}" in state_dict' - .format(name)) - if isinstance(param, Parameter): - # backwards compatibility for serialized parameters - param = param.data - own_state[name].copy_(param) - - missing = set(own_state.keys()) - set(state_dict.keys()) - if len(missing) > 0: - raise KeyError('missing keys in state_dict: "{}"'.format(missing))
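Together, `state_dict` and `load_state_dict` give the usual save/restore round trip. A minimal sketch (the file name is arbitrary; any writable path works):

>>> import torch
>>> from torch import nn
>>> model = nn.Linear(2, 2)
>>> torch.save(model.state_dict(), 'model.pth')        # persist parameters and buffers
>>> model2 = nn.Linear(2, 2)                           # a module with matching keys
>>> model2.load_state_dict(torch.load('model.pth'))    # copy them back in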
    - -
    [docs] def parameters(self, memo=None): - """Returns an iterator over module parameters. - - This is typically passed to an optimizer. - - Example: - >>> for param in model.parameters(): - >>> print(type(param.data), param.size()) - <class 'torch.FloatTensor'> (20L,) - <class 'torch.FloatTensor'> (20L, 1L, 5L, 5L) - """ - if memo is None: - memo = set() - for p in self._parameters.values(): - if p is not None and p not in memo: - memo.add(p) - yield p - for module in self.children(): - for p in module.parameters(memo): - yield p
    - -
    [docs] def children(self): - """Returns an iterator over immediate children modules.""" - for name, module in self.named_children(): - yield module
    - -
    [docs] def named_children(self): - """Returns an iterator over immediate children modules, yielding both - the name of the module as well as the module itself. - - Example: - >>> for name, module in model.named_children(): - >>> if name in ['conv4', 'conv5']: - >>> print(module) - """ - memo = set() - for name, module in self._modules.items(): - if module is not None and module not in memo: - memo.add(module) - yield name, module
    - -
    [docs] def modules(self): - """Returns an iterator over all modules in the network. - - Note: - Duplicate modules are returned only once. In the following - example, ``l`` will be returned only once. - - >>> l = nn.Linear(2, 2) - >>> net = nn.Sequential(l, l) - >>> for idx, m in enumerate(net.modules()): - >>> print(idx, '->', m) - 0 -> Sequential ( - (0): Linear (2 -> 2) - (1): Linear (2 -> 2) - ) - 1 -> Linear (2 -> 2) - """ - for name, module in self.named_modules(): - yield module
    - -
    [docs] def named_modules(self, memo=None, prefix=''): - """Returns an iterator over all modules in the network, yielding - both the name of the module as well as the module itself. - - Note: - Duplicate modules are returned only once. In the following - example, ``l`` will be returned only once. - - >>> l = nn.Linear(2, 2) - >>> net = nn.Sequential(l, l) - >>> for idx, m in enumerate(net.named_modules()): - >>> print(idx, '->', m) - 0 -> ('', Sequential ( - (0): Linear (2 -> 2) - (1): Linear (2 -> 2) - )) - 1 -> ('0', Linear (2 -> 2)) - """ - - if memo is None: - memo = set() - if self not in memo: - memo.add(self) - yield prefix, self - for name, module in self._modules.items(): - submodule_prefix = prefix + ('.' if prefix else '') + name - for m in module.named_modules(memo, submodule_prefix): - yield m
    - -
    [docs] def train(self, mode=True): - """Sets the module in training mode. - - This has any effect only on modules such as Dropout or BatchNorm. - """ - self.training = mode - for module in self.children(): - module.train(mode) - return self
    - -
    [docs] def eval(self): - """Sets the module in evaluation mode. - - This has any effect only on modules such as Dropout or BatchNorm. - """ - return self.train(False)
    - -
    [docs] def zero_grad(self): - """Sets gradients of all model parameters to zero.""" - for p in self.parameters(): - if p.grad is not None: - p.grad.data.zero_()
    - - def share_memory(self): - return self._apply(lambda t: t.share_memory_()) - - def __repr__(self): - tmpstr = self.__class__.__name__ + ' (\n' - for key, module in self._modules.items(): - modstr = module.__repr__() - modstr = _addindent(modstr, 2) - tmpstr = tmpstr + ' (' + key + '): ' + modstr + '\n' - tmpstr = tmpstr + ')' - return tmpstr - - def __dir__(self): - module_attrs = dir(self.__class__) - attrs = list(self.__dict__.keys()) - parameters = list(self._parameters.keys()) - modules = list(self._modules.keys()) - buffers = list(self._buffers.keys()) - keys = module_attrs + attrs + parameters + modules + buffers - return sorted(keys)
    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/pixelshuffle.html b/docs/_modules/torch/nn/modules/pixelshuffle.html deleted file mode 100644 index 051e805191af..000000000000 --- a/docs/_modules/torch/nn/modules/pixelshuffle.html +++ /dev/null @@ -1,616 +0,0 @@ - - - - - - - - - - - torch.nn.modules.pixelshuffle — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Source code for torch.nn.modules.pixelshuffle

    -from .module import Module
    -from .. import functional as F
    -
    -
    -
[docs]class PixelShuffle(Module): - r"""Rearranges elements in a Tensor of shape :math:`(*, C * r^2, H, W)` to a - tensor of shape :math:`(*, C, H * r, W * r)`. - - This is useful for implementing efficient sub-pixel convolution - with a stride of :math:`1/r`. - - Look at the paper: - `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ - by Shi et al. (2016) for more details. - - Args: - upscale_factor (int): factor to increase spatial resolution by - - Shape: - - Input: :math:`(N, C * {upscale\_factor}^2, H, W)` - - Output: :math:`(N, C, H * {upscale\_factor}, W * {upscale\_factor})` - - Examples:: - - >>> ps = nn.PixelShuffle(3) - >>> input = autograd.Variable(torch.Tensor(1, 9, 4, 4)) - >>> output = ps(input) - >>> print(output.size()) - torch.Size([1, 1, 12, 12]) - - .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network: - https://arxiv.org/abs/1609.05158 - """ - - def __init__(self, upscale_factor): - super(PixelShuffle, self).__init__() - self.upscale_factor = upscale_factor - - def forward(self, input): - return F.pixel_shuffle(input, self.upscale_factor) - - def __repr__(self): - return self.__class__.__name__ + ' (upscale_factor=' + str(self.upscale_factor) + ')'
    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/pooling.html b/docs/_modules/torch/nn/modules/pooling.html deleted file mode 100644 index 1e73f67134c1..000000000000 --- a/docs/_modules/torch/nn/modules/pooling.html +++ /dev/null @@ -1,1416 +0,0 @@ - - - - - - - - - - - torch.nn.modules.pooling — PyTorch 0.1.11 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Source code for torch.nn.modules.pooling

    -import torch
    -from torch.autograd import Variable
    -
    -from .module import Module
    -from .utils import _single, _pair, _triple
    -from .. import functional as F
    -
    -
    -
    [docs]class MaxPool1d(Module): - r"""Applies a 1D max pooling over an input signal composed of several input - planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C, L)` - and output :math:`(N, C, L_{out})` can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_j, k) = \max_{{m}=0}^{{kernel\_size}-1} input(N_i, C_j, stride * k + m) - \end{array} - - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - | :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, - but this `link`_ has a nice visualization of what :attr:`dilation` does. - - Args: - kernel_size: the size of the window to take a max over - stride: the stride of the window. Default value is :attr:`kernel_size` - padding: implicit zero padding to be added on both sides - dilation: a parameter that controls the stride of elements in the window - return_indices: if True, will return the max indices along with the outputs. - Useful when Unpooling later - ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape - - Shape: - - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})` where - :math:`L_{out} = floor((L_{in} + 2 * padding - dilation * (kernel\_size - 1) - 1) / stride + 1)` - - Examples:: - - >>> # pool of size=3, stride=2 - >>> m = nn.MaxPool1d(3, stride=2) - >>> input = autograd.Variable(torch.randn(20, 16, 50)) - >>> output = m(input) - - .. _link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ - - def __init__(self, kernel_size, stride=None, padding=0, dilation=1, - return_indices=False, ceil_mode=False): - super(MaxPool1d, self).__init__() - self.kernel_size = kernel_size - self.stride = stride or kernel_size - self.padding = padding - self.dilation = dilation - self.return_indices = return_indices - self.ceil_mode = ceil_mode - - def forward(self, input): - return F.max_pool1d(input, self.kernel_size, self.stride, - self.padding, self.dilation, self.ceil_mode, - self.return_indices) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + 'size=' + str(self.kernel_size) \ - + ', stride=' + str(self.stride) \ - + ', padding=' + str(self.padding) \ - + ', dilation=' + str(self.dilation) \ - + ', ceil_mode=' + str(self.ceil_mode) + ')'
    - - -
    [docs]class MaxPool2d(Module): - r"""Applies a 2D max pooling over an input signal composed of several input - planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`, - output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)` - can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_j, h, w) = \max_{{m}=0}^{kH-1} \max_{{n}=0}^{kW-1} - input(N_i, C_j, stride[0] * h + m, stride[1] * w + n) - \end{array} - - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - | :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, - but this `link`_ has a nice visualization of what :attr:`dilation` does. - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, - and the second `int` for the width dimension - - Args: - kernel_size: the size of the window to take a max over - stride: the stride of the window. Default value is :attr:`kernel_size` - padding: implicit zero padding to be added on both sides - dilation: a parameter that controls the stride of elements in the window - return_indices: if True, will return the max indices along with the outputs. - Useful when Unpooling later - ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape - - Shape: - - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where - :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)` - :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)` - - Examples:: - - >>> # pool of square window of size=3, stride=2 - >>> m = nn.MaxPool2d(3, stride=2) - >>> # pool of non-square window - >>> m = nn.MaxPool2d((3, 2), stride=(2, 1)) - >>> input = autograd.Variable(torch.randn(20, 16, 50, 32)) - >>> output = m(input) - - .. _link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ - - def __init__(self, kernel_size, stride=None, padding=0, dilation=1, - return_indices=False, ceil_mode=False): - super(MaxPool2d, self).__init__() - self.kernel_size = kernel_size - self.stride = stride or kernel_size - self.padding = padding - self.dilation = dilation - self.return_indices = return_indices - self.ceil_mode = ceil_mode - - def forward(self, input): - return F.max_pool2d(input, self.kernel_size, self.stride, - self.padding, self.dilation, self.ceil_mode, - self.return_indices) - - def __repr__(self): - kh, kw = _pair(self.kernel_size) - dh, dw = _pair(self.stride) - padh, padw = _pair(self.padding) - dilh, dilw = _pair(self.dilation) - padding_str = ', padding=(' + str(padh) + ', ' + str(padw) + ')' \ - if padh != 0 and padw != 0 else '' - dilation_str = (', dilation=(' + str(dilh) + ', ' + str(dilw) + ')' - if dilh != 0 and dilw != 0 else '') - return self.__class__.__name__ + ' (' \ - + 'size=(' + str(kh) + ', ' + str(kw) + ')' \ - + ', stride=(' + str(dh) + ', ' + str(dw) + ')' \ - + padding_str + dilation_str + ')'
    - - -
    [docs]class MaxUnpool1d(Module): - r"""Computes a partial inverse of :class:`MaxPool1d`. - - :class:`MaxPool1d` is not fully invertible, since the non-maximal values are lost. - - :class:`MaxUnpool1d` takes in as input the output of :class:`MaxPool1d` - including the indices of the maximal values and computes a partial inverse - in which all non-maximal values are set to zero. - - .. note:: `MaxPool1d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. - To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. - See the Inputs and Example below. - - Args: - kernel_size (int or tuple): Size of the max pooling window. - stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. - padding (int or tuple): Padding that was added to the input - - Inputs: - - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool1d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size - - Shape: - - Input: :math:`(N, C, H_{in})` - - Output: :math:`(N, C, H_{out})` where - :math:`H_{out} = (H_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0]` - or as given by :attr:`output_size` in the call operator - - Example:: - - >>> pool = nn.MaxPool1d(2, stride=2, return_indices=True) - >>> unpool = nn.MaxUnpool1d(2, stride=2) - >>> input = Variable(torch.Tensor([[[1, 2, 3, 4, 5, 6, 7, 8]]])) - >>> output, indices = pool(input) - >>> unpool(output, indices) - Variable containing: - (0 ,.,.) = - 0 2 0 4 0 6 0 8 - [torch.FloatTensor of size 1x1x8] - - >>> # Example showcasing the use of output_size - >>> input = Variable(torch.Tensor([[[1, 2, 3, 4, 5, 6, 7, 8, 9]]])) - >>> output, indices = pool(input) - >>> unpool(output, indices, output_size=input.size()) - Variable containing: - (0 ,.,.) = - 0 2 0 4 0 6 0 8 0 - [torch.FloatTensor of size 1x1x9] - - >>> unpool(output, indices) - Variable containing: - (0 ,.,.) = - 0 2 0 4 0 6 0 8 - [torch.FloatTensor of size 1x1x8] - - """ - - def __init__(self, kernel_size, stride=None, padding=0): - super(MaxUnpool1d, self).__init__() - self.kernel_size = _single(kernel_size) - self.stride = _single(stride if stride is not None else kernel_size) - self.padding = _single(padding) - - def forward(self, input, indices, output_size=None): - return F.max_unpool1d(input, indices, self.kernel_size, self.stride, - self.padding, output_size)
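The Shape block above states that H_out = (H_in - 1) * stride - 2 * padding + kernel_size. A quick check, using an illustrative helper of our own, that this recovers the original length for the docstring's pool-of-2, stride-2 example::

    def max_unpool1d_out_len(h_in, kernel_size, stride=None, padding=0):
        """Default output length of MaxUnpool1d, following the documented formula."""
        stride = stride if stride is not None else kernel_size
        return (h_in - 1) * stride - 2 * padding + kernel_size

    # Pooling a length-8 sequence with kernel 2, stride 2 gives length 4;
    # unpooling length 4 restores length 8, as in the example above.
    assert max_unpool1d_out_len(4, 2, stride=2) == 8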
    - - -
    [docs]class MaxUnpool2d(Module): - r"""Computes a partial inverse of :class:`MaxPool2d`. - - :class:`MaxPool2d` is not fully invertible, since the non-maximal values are lost. - - :class:`MaxUnpool2d` takes in as input the output of :class:`MaxPool2d` - including the indices of the maximal values and computes a partial inverse - in which all non-maximal values are set to zero. - - .. note:: `MaxPool2d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. - To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. - See the Inputs and Example below. - - Args: - kernel_size (int or tuple): Size of the max pooling window. - stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. - padding (int or tuple): Padding that was added to the input - - Inputs: - - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool2d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size - - Shape: - - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where - :math:`H_{out} = (H_{in} - 1) * stride[0] -2 * padding[0] + kernel\_size[0]` - :math:`W_{out} = (W_{in} - 1) * stride[1] -2 * padding[1] + kernel\_size[1]` - or as given by :attr:`output_size` in the call operator - - Example:: - - >>> pool = nn.MaxPool2d(2, stride=2, return_indices=True) - >>> unpool = nn.MaxUnpool2d(2, stride=2) - >>> input = Variable(torch.Tensor([[[[ 1, 2, 3, 4], - ... [ 5, 6, 7, 8], - ... [ 9, 10, 11, 12], - ... [13, 14, 15, 16]]]])) - >>> output, indices = pool(input) - >>> unpool(output, indices) - Variable containing: - (0 ,0 ,.,.) = - 0 0 0 0 - 0 6 0 8 - 0 0 0 0 - 0 14 0 16 - [torch.FloatTensor of size 1x1x4x4] - - >>> # specify a different output size than input size - >>> unpool(output, indices, output_size=torch.Size([1, 1, 5, 5])) - Variable containing: - (0 ,0 ,.,.) = - 0 0 0 0 0 - 6 0 8 0 0 - 0 0 0 14 0 - 16 0 0 0 0 - 0 0 0 0 0 - [torch.FloatTensor of size 1x1x5x5] - - """ - - def __init__(self, kernel_size, stride=None, padding=0): - super(MaxUnpool2d, self).__init__() - self.kernel_size = _pair(kernel_size) - self.stride = _pair(stride if stride is not None else kernel_size) - self.padding = _pair(padding) - - def forward(self, input, indices, output_size=None): - return F.max_unpool2d(input, indices, self.kernel_size, self.stride, - self.padding, output_size)
    - - -
    [docs]class MaxUnpool3d(Module): - r"""Computes a partial inverse of :class:`MaxPool3d`. - - :class:`MaxPool3d` is not fully invertible, since the non-maximal values are lost. - :class:`MaxUnpool3d` takes in as input the output of :class:`MaxPool3d` - including the indices of the maximal values and computes a partial inverse - in which all non-maximal values are set to zero. - - .. note:: `MaxPool3d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. - To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. - See the Inputs section below. - - Args: - kernel_size (int or tuple): Size of the max pooling window. - stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. - padding (int or tuple): Padding that was added to the input - - Inputs: - - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool3d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size - - Shape: - - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where - :math:`D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0]` - :math:`H_{out} = (H_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1]` - :math:`W_{out} = (W_{in} - 1) * stride[2] - 2 * padding[2] + kernel\_size[2]` - or as given by :attr:`output_size` in the call operator - - Example:: - - >>> # pool of square window of size=3, stride=2 - >>> pool = nn.MaxPool3d(3, stride=2, return_indices=True) - >>> unpool = nn.MaxUnpool3d(3, stride=2) - >>> output, indices = pool(Variable(torch.randn(20, 16, 51, 33, 15))) - >>> unpooled_output = unpool(output, indices) - >>> unpooled_output.size() - torch.Size([20, 16, 51, 33, 15]) - """ - - def __init__(self, kernel_size, stride=None, padding=0): - super(MaxUnpool3d, self).__init__() - self.kernel_size = _triple(kernel_size) - self.stride = _triple(stride if stride is not None else kernel_size) - self.padding = _triple(padding) - - def forward(self, input, indices, output_size=None): - return F.max_unpool3d(input, indices, self.kernel_size, self.stride, - self.padding, output_size)
    - - -
    [docs]class AvgPool1d(Module): - r"""Applies a 1D average pooling over an input signal composed of several - input planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C, L)`, - output :math:`(N, C, L_{out})` and :attr:`kernel_size` :math:`k` - can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_j, l) = 1 / k * \sum_{{m}=0}^{k} - input(N_i, C_j, stride * l + m) - \end{array} - - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can each be - an ``int`` or a one-element tuple. - - Args: - kernel_size: the size of the window - stride: the stride of the window. Default value is :attr:`kernel_size` - padding: implicit zero padding to be added on both sides - ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad: when True, will include the zero-padding in the averaging calculation - - Shape: - - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})` where - :math:`L_{out} = floor((L_{in} + 2 * padding - kernel\_size) / stride + 1)` - - Examples:: - - >>> # pool with window of size=3, stride=2 - >>> m = nn.AvgPool1d(3, stride=2) - >>> m(Variable(torch.Tensor([[[1,2,3,4,5,6,7]]]))) - Variable containing: - (0 ,.,.) = - 2 4 6 - [torch.FloatTensor of size 1x1x3] - """ - - def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, - count_include_pad=True): - super(AvgPool1d, self).__init__() - self.kernel_size = _single(kernel_size) - self.stride = _single(stride if stride is not None else kernel_size) - self.padding = _single(padding) - self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad - - def forward(self, input): - return F.avg_pool1d( - input, self.kernel_size, self.stride, self.padding, self.ceil_mode, - self.count_include_pad)
    - - -
    [docs]class AvgPool2d(Module): - r"""Applies a 2D average pooling over an input signal composed of several input - planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`, - output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)` - can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_j, h, w) = 1 / (kH * kW) * \sum_{{m}=0}^{kH-1} \sum_{{n}=0}^{kW-1} - input(N_i, C_j, stride[0] * h + m, stride[1] * w + n) - \end{array} - - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, - and the second `int` for the width dimension - - Args: - kernel_size: the size of the window - stride: the stride of the window. Default value is :attr:`kernel_size` - padding: implicit zero padding to be added on both sides - ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad: when True, will include the zero-padding in the averaging calculation - - Shape: - - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where - :math:`H_{out} = floor((H_{in} + 2 * padding[0] - kernel\_size[0]) / stride[0] + 1)` - :math:`W_{out} = floor((W_{in} + 2 * padding[1] - kernel\_size[1]) / stride[1] + 1)` - - Examples:: - - >>> # pool of square window of size=3, stride=2 - >>> m = nn.AvgPool2d(3, stride=2) - >>> # pool of non-square window - >>> m = nn.AvgPool2d((3, 2), stride=(2, 1)) - >>> input = autograd.Variable(torch.randn(20, 16, 50, 32)) - >>> output = m(input) - """ - - def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, - count_include_pad=True): - super(AvgPool2d, self).__init__() - self.kernel_size = kernel_size - self.stride = stride or kernel_size - self.padding = padding - self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad - - def forward(self, input): - return F.avg_pool2d(input, self.kernel_size, self.stride, - self.padding, self.ceil_mode, self.count_include_pad)
    - - -
    [docs]class MaxPool3d(Module): - r"""Applies a 3D max pooling over an input signal composed of several input - planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`, - output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)` - can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_j, d, h, w) = \max_{{k}=0}^{kD-1} \max_{{m}=0}^{kH-1} \max_{{n}=0}^{kW-1} - input(N_i, C_j, stride[0] * k + d, stride[1] * h + m, stride[2] * w + n) - \end{array} - - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points - | :attr:`dilation` controls the spacing between the kernel points. It is harder to describe, - but this `link`_ has a nice visualization of what :attr:`dilation` does. - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, - the second `int` for the width dimension and the third `int` for the width dimension - - Args: - kernel_size: the size of the window to take a max over - stride: the stride of the window. Default value is :attr:`kernel_size` - padding: implicit zero padding to be added on both sides - dilation: a parameter that controls the stride of elements in the window - return_indices: if True, will return the max indices along with the outputs. - Useful when Unpooling later - ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape - - Shape: - - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where - :math:`D_{out} = floor((D_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)` - :math:`H_{out} = floor((H_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)` - :math:`W_{out} = floor((W_{in} + 2 * padding[2] - dilation[2] * (kernel\_size[2] - 1) - 1) / stride[2] + 1)` - - Examples:: - - >>> # pool of square window of size=3, stride=2 - >>> m = nn.MaxPool3d(3, stride=2) - >>> # pool of non-square window - >>> m = nn.MaxPool3d((3, 2, 2), stride=(2, 1, 2)) - >>> input = autograd.Variable(torch.randn(20, 16, 50,44, 31)) - >>> output = m(input) - - .. _link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ - - def __init__(self, kernel_size, stride=None, padding=0, dilation=1, - return_indices=False, ceil_mode=False): - super(MaxPool3d, self).__init__() - self.kernel_size = kernel_size - self.stride = stride or kernel_size - self.padding = padding - self.dilation = dilation - self.return_indices = return_indices - self.ceil_mode = ceil_mode - - def forward(self, input): - return F.max_pool3d(input, self.kernel_size, self.stride, - self.padding, self.dilation, self.ceil_mode, - self.return_indices)
    - - -
    [docs]class AvgPool3d(Module): - r"""Applies a 3D average pooling over an input signal composed of several input - planes. - - In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`, - output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)` - can be precisely described as: - - .. math:: - - \begin{array}{ll} - out(N_i, C_j, d, h, w) = 1 / (kD * kH * kW) * \sum_{{k}=0}^{kD-1} \sum_{{m}=0}^{kH-1} \sum_{{n}=0}^{kW-1} - input(N_i, C_j, stride[0] * d + k, stride[1] * h + m, stride[2] * w + n) - \end{array} - - The parameters :attr:`kernel_size`, :attr:`stride` can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, - the second `int` for the width dimension and the third `int` for the width dimension - - Args: - kernel_size: the size of the window - stride: the stride of the window. Default value is :attr:`kernel_size` - - Shape: - - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where - :math:`D_{out} = floor((D_{in} - kernel\_size[0]) / stride[0] + 1)` - :math:`H_{out} = floor((H_{in} - kernel\_size[1]) / stride[1] + 1)` - :math:`W_{out} = floor((W_{in} - kernel\_size[2]) / stride[2] + 1)` - - Examples:: - - >>> # pool of square window of size=3, stride=2 - >>> m = nn.AvgPool3d(3, stride=2) - >>> # pool of non-square window - >>> m = nn.AvgPool3d((3, 2, 2), stride=(2, 1, 2)) - >>> input = autograd.Variable(torch.randn(20, 16, 50,44, 31)) - >>> output = m(input) - """ - - def __init__(self, kernel_size, stride=None): - super(AvgPool3d, self).__init__() - self.kernel_size = kernel_size - self.stride = stride - - def forward(self, input): - return F.avg_pool3d(input, self.kernel_size, self.stride)
    - - -
    [docs]class FractionalMaxPool2d(Module): - """Applies a 2D fractional max pooling over an input signal composed of several input planes. - - Fractiona MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham - - The max-pooling operation is applied in kHxkW regions by a stochastic - step size determined by the target output size. - The number of output features is equal to the number of input planes. - - Args: - kernel_size: the size of the window to take a max over. - Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw) - output_size: the target output size of the image of the form oH x oW. - Can be a tuple (oH, oW) or a single number oH for a square image oH x oH - output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. - This has to be a number or tuple in the range (0, 1) - return_indices: if True, will return the indices along with the outputs. - Useful to pass to nn.MaxUnpool2d . Default: False - - Examples: - >>> # pool of square window of size=3, and target output size 13x12 - >>> m = nn.FractionalMaxPool2d(3, output_size=(13, 12)) - >>> # pool of square window and target output size being half of input image size - >>> m = nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5)) - >>> input = autograd.Variable(torch.randn(20, 16, 50, 32)) - >>> output = m(input) - - .. _Fractional MaxPooling: - http://arxiv.org/abs/1412.6071 - """ - - def __init__(self, kernel_size, output_size=None, output_ratio=None, - return_indices=False, _random_samples=None): - super(FractionalMaxPool2d, self).__init__() - self.kh, self.kw = _pair(kernel_size) - self.return_indices = return_indices - self.register_buffer('_random_samples', _random_samples) - if output_size is not None: - self.outh, self.outw = _pair(output_size) - self.rh, self.rw = None, None - assert output_ratio is None - elif output_ratio is not None: - self.outh, self.outw = None, None - self.rh, self.rw = _pair(output_ratio) - assert output_size is None - assert 0 < self.rh < 1 - assert 0 < self.rw < 1 - else: - raise ValueError("FractionalMaxPool2d requires specifying either " - "an output size, or a pooling ratio") - - def forward(self, input): - kwargs = {} - if self.outh is not None: - kwargs['output_size'] = self.outh, self.outw - else: - kwargs['output_ratio'] = self.rh, self.rw - func = self._backend.FractionalMaxPool2d(self.kw, self.kh, - return_indices=self.return_indices, - _random_samples=self._random_samples, **kwargs) - return func(input)
    - - -
    [docs]class LPPool2d(Module): - r"""Applies a 2D power-average pooling over an input signal composed of several input - planes. - - On each window, the function computed is: :math:`f(X) = pow(sum(pow(X, p)), 1/p)` - - - At p = infinity, one gets Max Pooling - - At p = 1, one gets Average Pooling - - The parameters :attr:`kernel_size`, :attr:`stride` can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimension - - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, - and the second `int` for the width dimension - - Args: - kernel_size: the size of the window - stride: the stride of the window. Default value is :attr:`kernel_size` - ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape - - Shape: - - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where - :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)` - :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)` - - Examples:: - - >>> # power-2 pool of square window of size=3, stride=2 - >>> m = nn.LPPool2d(2, 3, stride=2) - >>> # pool of non-square window of power 1.2 - >>> m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1)) - >>> input = autograd.Variable(torch.randn(20, 16, 50, 32)) - >>> output = m(input) - - """ - - def __init__(self, norm_type, kernel_size, stride=None, ceil_mode=False): - super(LPPool2d, self).__init__() - self.norm_type = norm_type - self.kernel_size = kernel_size - self.stride = stride - self.ceil_mode = ceil_mode - - def forward(self, input): - return F.lp_pool2d(input, self.norm_type, self.kernel_size, - self.stride, self.ceil_mode)
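The power-average formula f(X) = pow(sum(pow(X, p)), 1/p) is easiest to see on a single window. The snippet below is a plain-Python illustration of the formula only and does not call the module; note that p = 1 yields the window sum (proportional to the average), while large p approaches the window maximum::

    def lp_pool(values, p):
        """Apply the LP pooling formula to a single window of values."""
        return sum(v ** p for v in values) ** (1.0 / p)

    window = [1.0, 2.0, 3.0]
    print(lp_pool(window, 1))    # 6.0  -- p = 1 reduces to the sum over the window
    print(lp_pool(window, 100))  # ~3.0 -- large p approaches the maximum of the window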
    - - -
    [docs]class AdaptiveMaxPool1d(Module): - """Applies a 1D adaptive max pooling over an input signal composed of several input planes. - - The output size is H, for any input size. - The number of output features is equal to the number of input planes. - - Args: - output_size: the target output size H - return_indices: if True, will return the indices along with the outputs. - Useful to pass to nn.MaxUnpool2d . Default: False - - Examples: - >>> # target output size of 5 - >>> m = nn.AdaptiveMaxPool1d(5) - >>> input = autograd.Variable(torch.randn(1, 64, 8)) - >>> output = m(input) - - """ - - def __init__(self, output_size, return_indices=False): - super(AdaptiveMaxPool1d, self).__init__() - self.output_size = output_size - self.return_indices = return_indices - - def forward(self, input): - return F.adaptive_max_pool1d(input, self.output_size, self.return_indices) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + 'output_size=' + str(self.output_size) + ')'
    - - -
    [docs]class AdaptiveMaxPool2d(Module): - """Applies a 2D adaptive max pooling over an input signal composed of several input planes. - - The output is of size H x W, for any input size. - The number of output features is equal to the number of input planes. - - Args: - output_size: the target output size of the image of the form H x W. - Can be a tuple (H, W) or a single number H for a square image H x H - return_indices: if True, will return the indices along with the outputs. - Useful to pass to nn.MaxUnpool2d . Default: False - - Examples: - >>> # target output size of 5x7 - >>> m = nn.AdaptiveMaxPool2d((5,7)) - >>> input = autograd.Variable(torch.randn(1, 64, 8, 9)) - >>> # target output size of 7x7 (square) - >>> m = nn.AdaptiveMaxPool2d(7) - >>> input = autograd.Variable(torch.randn(1, 64, 10, 9)) - >>> output = m(input) - - """ - - def __init__(self, output_size, return_indices=False): - super(AdaptiveMaxPool2d, self).__init__() - self.output_size = output_size - self.return_indices = return_indices - - def forward(self, input): - return F.adaptive_max_pool2d(input, self.output_size, self.return_indices) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + 'output_size=' + str(self.output_size) + ')'
    - - -
    [docs]class AdaptiveAvgPool1d(Module): - """Applies a 1D adaptive average pooling over an input signal composed of several input planes. - - The output size is H, for any input size. - The number of output features is equal to the number of input planes. - - Args: - output_size: the target output size H - - Examples: - >>> # target output size of 5 - >>> m = nn.AdaptiveAvgPool1d(5) - >>> input = autograd.Variable(torch.randn(1, 64, 8)) - >>> output = m(input) - - """ - - def __init__(self, output_size): - super(AdaptiveAvgPool1d, self).__init__() - self.output_size = output_size - - def forward(self, input): - return F.adaptive_avg_pool1d(input, self.output_size) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + 'output_size=' + str(self.output_size) + ')'
    - - -
    [docs]class AdaptiveAvgPool2d(Module): - """Applies a 2D adaptive average pooling over an input signal composed of several input planes. - - The output is of size H x W, for any input size. - The number of output features is equal to the number of input planes. - - Args: - output_size: the target output size of the image of the form H x W. - Can be a tuple (H, W) or a single number H for a square image H x H - - Examples: - >>> # target output size of 5x7 - >>> m = nn.AdaptiveAvgPool2d((5,7)) - >>> input = autograd.Variable(torch.randn(1, 64, 8, 9)) - >>> # target output size of 7x7 (square) - >>> m = nn.AdaptiveAvgPool2d(7) - >>> input = autograd.Variable(torch.randn(1, 64, 10, 9)) - >>> output = m(input) - - """ - - def __init__(self, output_size): - super(AdaptiveAvgPool2d, self).__init__() - self.output_size = output_size - - def forward(self, input): - return F.adaptive_avg_pool2d(input, self.output_size) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + 'output_size=' + str(self.output_size) + ')'
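Adaptive pooling derives a per-output-cell input range from the requested output size rather than from an explicit kernel and stride. The 1D sketch below assumes the common convention start = floor(i * in / out), end = ceil((i + 1) * in / out); it is a reference for intuition, not the backend implementation::

    import math

    def adaptive_avg_pool1d_ref(xs, output_size):
        """Reference 1D adaptive average pool over a plain Python list (illustrative only)."""
        n = len(xs)
        out = []
        for i in range(output_size):
            start = (i * n) // output_size
            end = math.ceil((i + 1) * n / output_size)
            window = xs[start:end]
            out.append(sum(window) / len(window))
        return out

    print(adaptive_avg_pool1d_ref([1, 2, 3, 4, 5, 6, 7, 8], 4))  # [1.5, 3.5, 5.5, 7.5]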
\ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/rnn.html b/docs/_modules/torch/nn/modules/rnn.html deleted file mode 100644 index f031f9c50dfa..000000000000 --- a/docs/_modules/torch/nn/modules/rnn.html +++ /dev/null @@ -1,1142 +0,0 @@ - torch.nn.modules.rnn — PyTorch 0.1.11 documentation

    Source code for torch.nn.modules.rnn

    -import math
    -import torch
    -
    -from .module import Module
    -from ..parameter import Parameter
    -from ..utils.rnn import PackedSequence
    -
    -
    -class RNNBase(Module):
    -
    -    def __init__(self, mode, input_size, hidden_size,
    -                 num_layers=1, bias=True, batch_first=False,
    -                 dropout=0, bidirectional=False):
    -        super(RNNBase, self).__init__()
    -        self.mode = mode
    -        self.input_size = input_size
    -        self.hidden_size = hidden_size
    -        self.num_layers = num_layers
    -        self.bias = bias
    -        self.batch_first = batch_first
    -        self.dropout = dropout
    -        self.dropout_state = {}
    -        self.bidirectional = bidirectional
    -        num_directions = 2 if bidirectional else 1
    -
    -        self._all_weights = []
    -        for layer in range(num_layers):
    -            for direction in range(num_directions):
    -                layer_input_size = input_size if layer == 0 else hidden_size * num_directions
    -                if mode == 'LSTM':
    -                    gate_size = 4 * hidden_size
    -                elif mode == 'GRU':
    -                    gate_size = 3 * hidden_size
    -                else:
    -                    gate_size = hidden_size
    -
    -                w_ih = Parameter(torch.Tensor(gate_size, layer_input_size))
    -                w_hh = Parameter(torch.Tensor(gate_size, hidden_size))
    -                b_ih = Parameter(torch.Tensor(gate_size))
    -                b_hh = Parameter(torch.Tensor(gate_size))
    -
    -                suffix = '_reverse' if direction == 1 else ''
    -                weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}']
    -                weights = [x.format(layer, suffix) for x in weights]
    -                setattr(self, weights[0], w_ih)
    -                setattr(self, weights[1], w_hh)
    -                if bias:
    -                    setattr(self, weights[2], b_ih)
    -                    setattr(self, weights[3], b_hh)
    -                    self._all_weights += [weights]
    -                else:
    -                    self._all_weights += [weights[:2]]
    -
    -        self.reset_parameters()
    -
    -    def reset_parameters(self):
    -        stdv = 1.0 / math.sqrt(self.hidden_size)
    -        for weight in self.parameters():
    -            weight.data.uniform_(-stdv, stdv)
    -
    -    def forward(self, input, hx=None):
    -        is_packed = isinstance(input, PackedSequence)
    -        if is_packed:
    -            input, batch_sizes = input
    -            max_batch_size = batch_sizes[0]
    -        else:
    -            batch_sizes = None
    -            max_batch_size = input.size(0) if self.batch_first else input.size(1)
    -
    -        if hx is None:
    -            num_directions = 2 if self.bidirectional else 1
    -            hx = torch.autograd.Variable(input.data.new(self.num_layers *
    -                                                        num_directions,
    -                                                        max_batch_size,
    -                                                        self.hidden_size).zero_())
    -            if self.mode == 'LSTM':
    -                hx = (hx, hx)
    -
    -        func = self._backend.RNN(
    -            self.mode,
    -            self.input_size,
    -            self.hidden_size,
    -            num_layers=self.num_layers,
    -            batch_first=self.batch_first,
    -            dropout=self.dropout,
    -            train=self.training,
    -            bidirectional=self.bidirectional,
    -            batch_sizes=batch_sizes,
    -            dropout_state=self.dropout_state
    -        )
    -        output, hidden = func(input, self.all_weights, hx)
    -        if is_packed:
    -            output = PackedSequence(output, batch_sizes)
    -        return output, hidden
    -
    -    def __repr__(self):
    -        s = '{name}({input_size}, {hidden_size}'
    -        if self.num_layers != 1:
    -            s += ', num_layers={num_layers}'
    -        if self.bias is not True:
    -            s += ', bias={bias}'
    -        if self.batch_first is not False:
    -            s += ', batch_first={batch_first}'
    -        if self.dropout != 0:
    -            s += ', dropout={dropout}'
    -        if self.bidirectional is not False:
    -            s += ', bidirectional={bidirectional}'
    -        s += ')'
    -        return s.format(name=self.__class__.__name__, **self.__dict__)
    -
    -    def __setstate__(self, d):
    -        self.__dict__.update(d)
    -        if 'all_weights' in d:
    -            self._all_weights = d['all_weights']
    -        if isinstance(self._all_weights[0][0], str):
    -            return
    -        num_layers = self.num_layers
    -        num_directions = 2 if self.bidirectional else 1
    -        self._all_weights = []
    -        for layer in range(num_layers):
    -            for direction in range(num_directions):
    -                suffix = '_reverse' if direction == 1 else ''
    -                weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}']
    -                weights = [x.format(layer, suffix) for x in weights]
    -                if self.bias:
    -                    self._all_weights += [weights]
    -                else:
    -                    self._all_weights += [weights[:2]]
    -
    -    @property
    -    def all_weights(self):
    -        return [[getattr(self, weight) for weight in weights] for weights in self._all_weights]
    -
    -
    -
    [docs]class RNN(RNNBase): - r"""Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an input sequence. - - - For each element in the input sequence, each layer computes the following - function: - - .. math:: - - h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh}) - - where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is the hidden - state of the previous layer at time `t` or :math:`input_t` for the first layer. - If nonlinearity='relu', then `ReLU` is used instead of `tanh`. - - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - num_layers: Number of recurrent layers. - nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' - bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True - batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) - dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer - bidirectional: If True, becomes a bidirectional RNN. Default: False - - Inputs: input, h_0 - - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence. - The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` - for details. - - **h_0** (num_layers * num_directions, batch, hidden_size): tensor containing the initial hidden state - for each element in the batch. - - Outputs: output, h_n - - **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output features (h_k) - from the last layer of the RNN, for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has been given - as the input, the output will also be a packed sequence. - - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for k=seq_len. - - Attributes: - weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, - of shape `(input_size x hidden_size)` - weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, - of shape `(hidden_size x hidden_size)` - bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, of shape `(hidden_size)` - bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, of shape `(hidden_size)` - - Examples:: - - >>> rnn = nn.RNN(10, 20, 2) - >>> input = Variable(torch.randn(5, 3, 10)) - >>> h0 = Variable(torch.randn(2, 3, 20)) - >>> output, hn = rnn(input, h0) - """ - - def __init__(self, *args, **kwargs): - if 'nonlinearity' in kwargs: - if kwargs['nonlinearity'] == 'tanh': - mode = 'RNN_TANH' - elif kwargs['nonlinearity'] == 'relu': - mode = 'RNN_RELU' - else: - raise ValueError("Unknown nonlinearity '{}'".format( - kwargs['nonlinearity'])) - del kwargs['nonlinearity'] - else: - mode = 'RNN_TANH' - - super(RNN, self).__init__(mode, *args, **kwargs)
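The recurrence h_t = tanh(w_ih x_t + b_ih + w_hh h_{t-1} + b_hh) is compact enough to write out directly. The NumPy sketch below (our own helper; single layer, single direction) is a readable reference for the math, not the backend path used by the module::

    import numpy as np

    def elman_rnn_step(x_t, h_prev, w_ih, w_hh, b_ih, b_hh):
        """One Elman RNN step: h_t = tanh(W_ih x_t + b_ih + W_hh h_{t-1} + b_hh)."""
        return np.tanh(w_ih @ x_t + b_ih + w_hh @ h_prev + b_hh)

    input_size, hidden_size = 10, 20
    rng = np.random.default_rng(0)
    w_ih = rng.standard_normal((hidden_size, input_size))
    w_hh = rng.standard_normal((hidden_size, hidden_size))
    b_ih = rng.standard_normal(hidden_size)
    b_hh = rng.standard_normal(hidden_size)

    h = np.zeros(hidden_size)
    for x_t in rng.standard_normal((5, input_size)):  # a length-5 input sequence
        h = elman_rnn_step(x_t, h, w_ih, w_hh, b_ih, b_hh)
    print(h.shape)  # (20,)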
    - - -
    [docs]class LSTM(RNNBase): - r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence. - - - For each element in the input sequence, each layer computes the following - function: - - .. math:: - - \begin{array}{ll} - i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ - f_t = sigmoid(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ - g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\ - o_t = sigmoid(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ - c_t = f_t * c_{(t-1)} + i_t * g_t \\ - h_t = o_t * \tanh(c_t) - \end{array} - - where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell state at time `t`, - :math:`x_t` is the hidden state of the previous layer at time `t` or :math:`input_t` for the first layer, - and :math:`i_t`, :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, - cell, and out gates, respectively. - - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - num_layers: Number of recurrent layers. - bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True - batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) - dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer - bidirectional: If True, becomes a bidirectional RNN. Default: False - - Inputs: input, (h_0, c_0) - - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence. - The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` - for details. - - **h_0** (num_layers \* num_directions, batch, hidden_size): tensor containing - the initial hidden state for each element in the batch. - - **c_0** (num_layers \* num_directions, batch, hidden_size): tensor containing - the initial cell state for each element in the batch. - - - Outputs: output, (h_n, c_n) - - **output** (seq_len, batch, hidden_size * num_directions): tensor containing - the output features `(h_t)` from the last layer of the RNN, for each t. If a - :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output will also be a - packed sequence. - - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t=seq_len - - **c_n** (num_layers * num_directions, batch, hidden_size): tensor containing the cell state for t=seq_len - - Attributes: - weight_ih_l[k] : the learnable input-hidden weights of the k-th layer `(W_ii|W_if|W_ig|W_io)`, of shape - `(input_size x 4*hidden_size)` - weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer `(W_hi|W_hf|W_hg|W_ho)`, of shape - `(hidden_size x 4*hidden_size)` - bias_ih_l[k] : the learnable input-hidden bias of the k-th layer `(b_ii|b_if|b_ig|b_io)`, of shape - `(4*hidden_size)` - bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer `(W_hi|W_hf|W_hg|b_ho)`, of shape - `(4*hidden_size)` - - Examples:: - - >>> rnn = nn.LSTM(10, 20, 2) - >>> input = Variable(torch.randn(5, 3, 10)) - >>> h0 = Variable(torch.randn(2, 3, 20)) - >>> c0 = Variable(torch.randn(2, 3, 20)) - >>> output, hn = rnn(input, (h0, c0)) - """ - - def __init__(self, *args, **kwargs): - super(LSTM, self).__init__('LSTM', *args, **kwargs)
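The gate equations are easier to follow as straight-line code. The NumPy sketch below runs one example through one step, stacking the i, f, g, o gates along the first weight axis as in the attribute layout documented above; it is a reference for the formulas only, not the module's implementation::

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def lstm_step(x, h, c, w_ih, w_hh, b_ih, b_hh):
        """One LSTM step; weights stack the i, f, g, o gates along the first axis."""
        gates = w_ih @ x + b_ih + w_hh @ h + b_hh
        i, f, g, o = np.split(gates, 4)
        i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)
        g = np.tanh(g)
        c_next = f * c + i * g          # new cell state
        h_next = o * np.tanh(c_next)    # new hidden state
        return h_next, c_next

    input_size, hidden_size = 10, 20
    rng = np.random.default_rng(0)
    w_ih = rng.standard_normal((4 * hidden_size, input_size))
    w_hh = rng.standard_normal((4 * hidden_size, hidden_size))
    b_ih = rng.standard_normal(4 * hidden_size)
    b_hh = rng.standard_normal(4 * hidden_size)
    h, c = lstm_step(rng.standard_normal(input_size),
                     np.zeros(hidden_size), np.zeros(hidden_size),
                     w_ih, w_hh, b_ih, b_hh)
    print(h.shape, c.shape)  # (20,) (20,)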
    - - -
    [docs]class GRU(RNNBase): - r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence. - - - For each element in the input sequence, each layer computes the following - function: - - .. math:: - - \begin{array}{ll} - r_t = sigmoid(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ - i_t = sigmoid(W_{ii} x_t + b_{ii} + W_hi h_{(t-1)} + b_{hi}) \\ - n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ - h_t = (1 - i_t) * n_t + i_t * h_{(t-1)} \\ - \end{array} - - where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden - state of the previous layer at time `t` or :math:`input_t` for the first layer, - and :math:`r_t`, :math:`i_t`, :math:`n_t` are the reset, input, and new gates, respectively. - - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - num_layers: Number of recurrent layers. - bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True - batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) - dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer - bidirectional: If True, becomes a bidirectional RNN. Default: False - - Inputs: input, h_0 - - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence. - The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` - for details. - - **h_0** (num_layers * num_directions, batch, hidden_size): tensor containing the initial - hidden state for each element in the batch. - - Outputs: output, h_n - - **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output features h_t from - the last layer of the RNN, for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been given as the - input, the output will also be a packed sequence. - - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t=seq_len - - Attributes: - weight_ih_l[k] : the learnable input-hidden weights of the k-th layer (W_ir|W_ii|W_in), of shape - `(input_size x 3*hidden_size)` - weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer (W_hr|W_hi|W_hn), of shape - `(hidden_size x 3*hidden_size)` - bias_ih_l[k] : the learnable input-hidden bias of the k-th layer (b_ir|b_ii|b_in), of shape - `(3*hidden_size)` - bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer (W_hr|W_hi|W_hn), of shape - `(3*hidden_size)` - Examples:: - - >>> rnn = nn.GRU(10, 20, 2) - >>> input = Variable(torch.randn(5, 3, 10)) - >>> h0 = Variable(torch.randn(2, 3, 20)) - >>> output, hn = rnn(input, h0) - """ - - def __init__(self, *args, **kwargs): - super(GRU, self).__init__('GRU', *args, **kwargs)
    - - -class RNNCellBase(Module): - - def __repr__(self): - s = '{name}({input_size}, {hidden_size}' - if 'bias' in self.__dict__ and self.bias is not True: - s += ', bias={bias}' - if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh": - s += ', nonlinearity={nonlinearity}' - s += ')' - return s.format(name=self.__class__.__name__, **self.__dict__) - - -
    [docs]class RNNCell(RNNCellBase): - r"""An Elman RNN cell with tanh or ReLU non-linearity. - - .. math:: - - h' = \tanh(w_{ih} * x + b_{ih} + w_{hh} * h + b_{hh}) - - If nonlinearity='relu', then ReLU is used in place of tanh. - - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True - nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' - - Inputs: input, hidden - - **input** (batch, input_size): tensor containing input features - - **hidden** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch. - - Outputs: h' - - **h'** (batch, hidden_size): tensor containing the next hidden state for each element in the batch - - Attributes: - weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)` - weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)` - bias_ih: the learnable input-hidden bias, of shape `(hidden_size)` - bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)` - - Examples:: - - >>> rnn = nn.RNNCell(10, 20) - >>> input = Variable(torch.randn(6, 3, 10)) - >>> hx = Variable(torch.randn(3, 20)) - >>> output = [] - >>> for i in range(6): - ... hx = rnn(input[i], hx) - ... output.append(hx) - """ - - def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"): - super(RNNCell, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.bias = bias - self.nonlinearity = nonlinearity - self.weight_ih = Parameter(torch.Tensor(hidden_size, input_size)) - self.weight_hh = Parameter(torch.Tensor(hidden_size, hidden_size)) - if bias: - self.bias_ih = Parameter(torch.Tensor(hidden_size)) - self.bias_hh = Parameter(torch.Tensor(hidden_size)) - else: - self.register_parameter('bias_ih', None) - self.register_parameter('bias_hh', None) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1.0 / math.sqrt(self.hidden_size) - for weight in self.parameters(): - weight.data.uniform_(-stdv, stdv) - - def forward(self, input, hx): - if self.nonlinearity == "tanh": - func = self._backend.RNNTanhCell - elif self.nonlinearity == "relu": - func = self._backend.RNNReLUCell - else: - raise RuntimeError( - "Unknown nonlinearity: {}".format(self.nonlinearity)) - - return func( - input, hx, - self.weight_ih, self.weight_hh, - self.bias_ih, self.bias_hh, - )
    - - -
    [docs]class LSTMCell(RNNCellBase): - r"""A long short-term memory (LSTM) cell. - - .. math:: - - \begin{array}{ll} - i = sigmoid(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ - f = sigmoid(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ - g = \tanh(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\ - o = sigmoid(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ - c' = f * c + i * g \\ - h' = o * \tanh(c_t) \\ - \end{array} - - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - bias: If `False`, then the layer does not use bias weights `b_ih` and `b_hh`. Default: True - - Inputs: input, (h_0, c_0) - - **input** (batch, input_size): tensor containing input features - - **h_0** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch. - - **c_0** (batch. hidden_size): tensor containing the initial cell state for each element in the batch. - - Outputs: h_1, c_1 - - **h_1** (batch, hidden_size): tensor containing the next hidden state for each element in the batch - - **c_1** (batch, hidden_size): tensor containing the next cell state for each element in the batch - - Attributes: - weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)` - weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)` - bias_ih: the learnable input-hidden bias, of shape `(hidden_size)` - bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)` - - Examples:: - - >>> rnn = nn.LSTMCell(10, 20) - >>> input = Variable(torch.randn(6, 3, 10)) - >>> hx = Variable(torch.randn(3, 20)) - >>> cx = Variable(torch.randn(3, 20)) - >>> output = [] - >>> for i in range(6): - ... hx, cx = rnn(input[i], (hx, cx)) - ... output.append(hx) - """ - - def __init__(self, input_size, hidden_size, bias=True): - super(LSTMCell, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.bias = bias - self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size)) - self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size)) - if bias: - self.bias_ih = Parameter(torch.Tensor(4 * hidden_size)) - self.bias_hh = Parameter(torch.Tensor(4 * hidden_size)) - else: - self.register_parameter('bias_ih', None) - self.register_parameter('bias_hh', None) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1.0 / math.sqrt(self.hidden_size) - for weight in self.parameters(): - weight.data.uniform_(-stdv, stdv) - - def forward(self, input, hx): - return self._backend.LSTMCell( - input, hx, - self.weight_ih, self.weight_hh, - self.bias_ih, self.bias_hh, - )
    - - -
    [docs]class GRUCell(RNNCellBase): - r"""A gated recurrent unit (GRU) cell - - .. math:: - - \begin{array}{ll} - r = sigmoid(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\ - i = sigmoid(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ - n = \tanh(W_{in} x + b_{in} + r * (W_{hn} h + b_{hn})) \\ - h' = (1 - i) * n + i * h - \end{array} - - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - bias: If `False`, then the layer does not use bias weights `b_ih` and `b_hh`. Default: `True` - - Inputs: input, hidden - - **input** (batch, input_size): tensor containing input features - - **hidden** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch. - - Outputs: h' - - **h'**: (batch, hidden_size): tensor containing the next hidden state for each element in the batch - - Attributes: - weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)` - weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)` - bias_ih: the learnable input-hidden bias, of shape `(hidden_size)` - bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)` - - Examples:: - - >>> rnn = nn.GRUCell(10, 20) - >>> input = Variable(torch.randn(6, 3, 10)) - >>> hx = Variable(torch.randn(3, 20)) - >>> output = [] - >>> for i in range(6): - ... hx = rnn(input[i], hx) - ... output.append(hx) - """ - - def __init__(self, input_size, hidden_size, bias=True): - super(GRUCell, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.bias = bias - self.weight_ih = Parameter(torch.Tensor(3 * hidden_size, input_size)) - self.weight_hh = Parameter(torch.Tensor(3 * hidden_size, hidden_size)) - if bias: - self.bias_ih = Parameter(torch.Tensor(3 * hidden_size)) - self.bias_hh = Parameter(torch.Tensor(3 * hidden_size)) - else: - self.register_parameter('bias_ih', None) - self.register_parameter('bias_hh', None) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1.0 / math.sqrt(self.hidden_size) - for weight in self.parameters(): - weight.data.uniform_(-stdv, stdv) - - def forward(self, input, hx): - return self._backend.GRUCell( - input, hx, - self.weight_ih, self.weight_hh, - self.bias_ih, self.bias_hh, - )
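Mirroring the two sketches above, the GRU update written out in NumPy, with the r, i, n gates stacked in that order; again a readable reference for the formulas, not the backend cell::

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def gru_step(x, h, w_ih, w_hh, b_ih, b_hh):
        """One GRU step; weights stack the r, i, n gates along the first axis."""
        gi = w_ih @ x + b_ih              # input contributions for r, i, n
        gh = w_hh @ h + b_hh              # hidden contributions for r, i, n
        r_x, i_x, n_x = np.split(gi, 3)
        r_h, i_h, n_h = np.split(gh, 3)
        r = sigmoid(r_x + r_h)
        i = sigmoid(i_x + i_h)
        n = np.tanh(n_x + r * n_h)        # reset gate modulates the hidden contribution
        return (1 - i) * n + i * h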
\ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/sparse.html b/docs/_modules/torch/nn/modules/sparse.html deleted file mode 100644 index 1b6f59ce0c08..000000000000 --- a/docs/_modules/torch/nn/modules/sparse.html +++ /dev/null @@ -1,685 +0,0 @@ - torch.nn.modules.sparse — PyTorch 0.1.11 documentation

    Source code for torch.nn.modules.sparse

    -import torch
    -from torch.nn.parameter import Parameter
    -
    -from .module import Module
    -
    -
    -
    [docs]class Embedding(Module): - r"""A simple lookup table that stores embeddings of a fixed dictionary and size. - - This module is often used to store word embeddings and retrieve them using indices. - The input to the module is a list of indices, and the output is the corresponding - word embeddings. - - Args: - num_embeddings (int): size of the dictionary of embeddings - embedding_dim (int): the size of each embedding vector - padding_idx (int, optional): If given, pads the output with zeros whenever it encounters the index. - max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this - norm_type (float, optional): The p of the p-norm to compute for the max_norm option - scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of - the words in the dictionary. - - Attributes: - weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim) - - Shape: - - Input: LongTensor `(N, W)`, N = mini-batch, W = number of indices to extract per mini-batch - - Output: `(N, W, embedding_dim)` - - Examples:: - - >>> # an Embedding module containing 10 tensors of size 3 - >>> embedding = nn.Embedding(10, 3) - >>> # a batch of 2 samples of 4 indices each - >>> input = Variable(torch.LongTensor([[1,2,4,5],[4,3,2,9]])) - >>> embedding(input) - - Variable containing: - (0 ,.,.) = - -1.0822 1.2522 0.2434 - 0.8393 -0.6062 -0.3348 - 0.6597 0.0350 0.0837 - 0.5521 0.9447 0.0498 - - (1 ,.,.) = - 0.6597 0.0350 0.0837 - -0.1527 0.0877 0.4260 - 0.8393 -0.6062 -0.3348 - -0.8738 -0.9054 0.4281 - [torch.FloatTensor of size 2x4x3] - - >>> # example with padding_idx - >>> embedding = nn.Embedding(10, 3, padding_idx=0) - >>> input = Variable(torch.LongTensor([[0,2,0,5]])) - >>> embedding(input) - - Variable containing: - (0 ,.,.) = - 0.0000 0.0000 0.0000 - 0.3452 0.4937 -0.9361 - 0.0000 0.0000 0.0000 - 0.0706 -2.1962 -0.6276 - [torch.FloatTensor of size 1x4x3] - - """ - - def __init__(self, num_embeddings, embedding_dim, padding_idx=None, - max_norm=None, norm_type=2, scale_grad_by_freq=False, - sparse=False): - super(Embedding, self).__init__() - self.num_embeddings = num_embeddings - self.embedding_dim = embedding_dim - self.padding_idx = padding_idx - self.max_norm = max_norm - self.norm_type = norm_type - self.scale_grad_by_freq = scale_grad_by_freq - self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim)) - self.sparse = sparse - - self.reset_parameters() - - def reset_parameters(self): - self.weight.data.normal_(0, 1) - if self.padding_idx is not None: - self.weight.data[self.padding_idx].fill_(0) - - def forward(self, input): - padding_idx = self.padding_idx - if padding_idx is None: - padding_idx = -1 - return self._backend.Embedding( - padding_idx, self.max_norm, self.norm_type, - self.scale_grad_by_freq, self.sparse - )(input, self.weight) - - def __repr__(self): - s = '{name}({num_embeddings}, {embedding_dim}' - if self.padding_idx is not None: - s += ', padding_idx={padding_idx}' - if self.max_norm is not None: - s += ', max_norm={max_norm}' - if self.norm_type != 2: - s += ', norm_type={norm_type}' - if self.scale_grad_by_freq is not False: - s += ', scale_grad_by_freq={scale_grad_by_freq}' - if self.sparse is not False: - s += ', sparse=True' - s += ')' - return s.format(name=self.__class__.__name__, **self.__dict__)
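Ignoring max_norm and scale_grad_by_freq, the forward pass is a row lookup into the (num_embeddings, embedding_dim) weight matrix. A minimal NumPy sketch (illustrative function name; here the zeroing for padding_idx is applied to the output, whereas the module zeroes the corresponding weight row at initialization)::

    import numpy as np

    def embedding_lookup(weight, indices, padding_idx=None):
        """Gather rows of `weight` for each index; rows at padding_idx come out as zeros."""
        idx = np.asarray(indices)
        out = weight[idx]
        if padding_idx is not None:
            out[idx == padding_idx] = 0.0
        return out

    weight = np.random.default_rng(0).standard_normal((10, 3))  # 10 embeddings of size 3
    print(embedding_lookup(weight, [[1, 2, 4, 5], [4, 3, 2, 9]]).shape)  # (2, 4, 3)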
    - - -# TODO: SparseLinear -
\ No newline at end of file diff --git a/docs/_modules/torch/nn/modules/upsampling.html b/docs/_modules/torch/nn/modules/upsampling.html deleted file mode 100644 index 4d473f6a86ca..000000000000 --- a/docs/_modules/torch/nn/modules/upsampling.html +++ /dev/null @@ -1,687 +0,0 @@ - torch.nn.modules.upsampling — PyTorch 0.1.11 documentation

    Source code for torch.nn.modules.upsampling

    -from numbers import Integral
    -
    -from .module import Module
    -from .. import functional as F
    -from .utils import _pair
    -
    -
    -class _UpsamplingBase(Module):
    -
    -    def __init__(self, size=None, scale_factor=None):
    -        super(_UpsamplingBase, self).__init__()
    -        if size is None and scale_factor is None:
    -            raise ValueError('either size or scale_factor should be defined')
    -        if scale_factor is not None and not isinstance(scale_factor, Integral):
    -            raise ValueError('scale_factor must be of integer type')
    -        self.size = _pair(size)
    -        self.scale_factor = scale_factor
    -
    -    def __repr__(self):
    -        if self.scale_factor is not None:
    -            info = 'scale_factor=' + str(self.scale_factor)
    -        else:
    -            info = 'size=' + str(self.size)
    -        return self.__class__.__name__ + '(' + info + ')'
    -
    -
    -
    [docs]class UpsamplingNearest2d(_UpsamplingBase): - """ - Applies a 2D nearest neighbor upsampling to an input signal composed of several input - channels. - - To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor` - as it's constructor argument. - - When `size` is given, it is the output size of the image (h, w). - - Args: - size (tuple, optional): a tuple of ints (H_out, W_out) output sizes - scale_factor (int, optional): the multiplier for the image height / width - - Shape: - - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where - :math:`H_{out} = floor(H_{in} * scale\_factor)` - :math:`W_{out} = floor(W_{in} * scale\_factor)` - - Examples:: - - >>> inp - Variable containing: - (0 ,0 ,.,.) = - 1 2 - 3 4 - [torch.FloatTensor of size 1x1x2x2] - - >>> m = nn.UpsamplingNearest2d(scale_factor=2) - >>> m(inp) - Variable containing: - (0 ,0 ,.,.) = - 1 1 2 2 - 1 1 2 2 - 3 3 4 4 - 3 3 4 4 - [torch.FloatTensor of size 1x1x4x4] - - """ - - def forward(self, input): - return F.upsample_nearest(input, self.size, self.scale_factor)
    - - -
    [docs]class UpsamplingBilinear2d(_UpsamplingBase): - """ - Applies a 2D bilinear upsampling to an input signal composed of several input - channels. - - To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor` - as it's constructor argument. - - When `size` is given, it is the output size of the image (h, w). - - Args: - size (tuple, optional): a tuple of ints (H_out, W_out) output sizes - scale_factor (int, optional): the multiplier for the image height / width - - Shape: - - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where - :math:`H_{out} = floor(H_{in} * scale\_factor)` - :math:`W_{out} = floor(W_{in} * scale\_factor)` - - Examples:: - - >>> inp - Variable containing: - (0 ,0 ,.,.) = - 1 2 - 3 4 - [torch.FloatTensor of size 1x1x2x2] - - >>> m = nn.UpsamplingBilinear2d(scale_factor=2) - >>> m(inp) - Variable containing: - (0 ,0 ,.,.) = - 1.0000 1.3333 1.6667 2.0000 - 1.6667 2.0000 2.3333 2.6667 - 2.3333 2.6667 3.0000 3.3333 - 3.0000 3.3333 3.6667 4.0000 - [torch.FloatTensor of size 1x1x4x4] - - """ - - def forward(self, input): - return F.upsample_bilinear(input, self.size, self.scale_factor)
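The docstring examples only exercise ``scale_factor``; an equivalent call with an explicit output ``size`` (my sketch, same-era API) looks like this::

    >>> import torch
    >>> import torch.nn as nn
    >>> from torch.autograd import Variable
    >>> inp = Variable(torch.randn(1, 1, 2, 2))
    >>> m = nn.UpsamplingBilinear2d(size=(4, 4))   # same output shape as scale_factor=2 here
    >>> m(inp).size()                              # (1, 1, 4, 4)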
\ No newline at end of file
diff --git a/docs/_modules/torch/nn/parallel/data_parallel.html b/docs/_modules/torch/nn/parallel/data_parallel.html
deleted file mode 100644
index fbe86ec8e7f2..000000000000
--- a/docs/_modules/torch/nn/parallel/data_parallel.html
+++ /dev/null
@@ -1,679 +0,0 @@
torch.nn.parallel.data_parallel — PyTorch 0.1.11 documentation

    Source code for torch.nn.parallel.data_parallel

    -import torch
    -from ..modules import Module
    -from .scatter_gather import scatter_kwargs, gather
    -from .replicate import replicate
    -from .parallel_apply import parallel_apply
    -
    -
    -
    [docs]class DataParallel(Module): - - """Implements data parallelism at the module level. - - This container parallelizes the application of the given module by - splitting the input across the specified devices by chunking in the batch - dimension. In the forward pass, the module is replicated on each device, - and each replica handles a portion of the input. During the backwards - pass, gradients from each replica are summed into the original module. - - The batch size should be larger than the number of GPUs used. It should - also be an integer multiple of the number of GPUs so that each chunk is the - same size (so that each GPU processes the same number of samples). - - See also: :ref:`cuda-nn-dataparallel-instead` - - Arbitrary positional and keyword inputs are allowed to be passed into - DataParallel EXCEPT Tensors. All variables will be scattered on dim - specified (default 0). Primitive types will be broadcasted, but all - other types will be a shallow copy and can be corrupted if written to in - the model's forward pass. - - Args: - module: module to be parallelized - device_ids: CUDA devices (default: all devices) - output_device: device location of output (default: device_ids[0]) - - Example:: - - >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) - >>> output = net(input_var) - """ - - # TODO: update notes/cuda.rst when this class handles 8+ GPUs well - - def __init__(self, module, device_ids=None, output_device=None, dim=0): - super(DataParallel, self).__init__() - if device_ids is None: - device_ids = list(range(torch.cuda.device_count())) - if output_device is None: - output_device = device_ids[0] - self.dim = dim - self.module = module - self.device_ids = device_ids - self.output_device = output_device - if len(self.device_ids) == 1: - self.module.cuda(device_ids[0]) - - def forward(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - if len(self.device_ids) == 1: - return self.module(*inputs[0], **kwargs[0]) - replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) - outputs = self.parallel_apply(replicas, inputs, kwargs) - return self.gather(outputs, self.output_device) - - def replicate(self, module, device_ids): - return replicate(module, device_ids) - - def scatter(self, inputs, kwargs, device_ids): - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def parallel_apply(self, replicas, inputs, kwargs): - return parallel_apply(replicas, inputs, kwargs) - - def gather(self, outputs, output_device): - return gather(outputs, output_device, dim=self.dim)
    - - -def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None): - """Evaluates module(input) in parallel across the GPUs given in device_ids. - - This is the functional version of the DataParallel module. - - Args: - module: the module to evaluate in parallel - inputs: inputs to the module - device_ids: GPU ids on which to replicate module - output_device: GPU location of the output Use -1 to indicate the CPU. - (default: device_ids[0]) - Returns: - a Variable containing the result of module(input) located on - output_device - """ - if not isinstance(inputs, tuple): - inputs = (inputs,) - - if device_ids is None: - device_ids = list(range(torch.cuda.device_count())) - - if output_device is None: - output_device = device_ids[0] - - inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim) - if len(device_ids) == 1: - return module(*inputs[0], **module_kwargs[0]) - replicas = replicate(module, device_ids[:len(inputs)]) - outputs = parallel_apply(replicas, inputs, module_kwargs) - return gather(outputs, output_device, dim) -
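A minimal end-to-end sketch of both the module form and the functional form above (mine, not from the page; it assumes at least two visible CUDA devices)::

    >>> import torch
    >>> import torch.nn as nn
    >>> from torch.autograd import Variable
    >>> from torch.nn.parallel import data_parallel
    >>> model = nn.Linear(10, 5).cuda()
    >>> input_var = Variable(torch.randn(8, 10).cuda())    # batch dim 0 is what gets scattered
    >>> net = nn.DataParallel(model, device_ids=[0, 1])
    >>> out = net(input_var)                                # gathered on device_ids[0]
    >>> out_fn = data_parallel(model, input_var, device_ids=[0, 1])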
\ No newline at end of file
diff --git a/docs/_modules/torch/nn/parameter.html b/docs/_modules/torch/nn/parameter.html
deleted file mode 100644
index 1037536c2510..000000000000
--- a/docs/_modules/torch/nn/parameter.html
+++ /dev/null
@@ -1,601 +0,0 @@
torch.nn.parameter — PyTorch 0.1.11 documentation

    Source code for torch.nn.parameter

    -from torch.autograd import Variable
    -
    -
    -
    [docs]class Parameter(Variable): - """A kind of Variable that is to be considered a module parameter. - - Parameters are :class:`~torch.autograd.Variable` subclasses, that have a - very special property when used with :class:`Module` s - when they're - assigned as Module attributes they are automatically added to the list of - its parameters, and will appear e.g. in :meth:`~Module.parameters` iterator. - Assigning a Variable doesn't have such effect. This is because one might - want to cache some temporary state, like last hidden state of the RNN, in - the model. If there was no such class as :class:`Parameter`, these - temporaries would get registered too. - - Another difference is that parameters can't be volatile and that they - require gradient by default. - - Arguments: - data (Tensor): parameter tensor. - requires_grad (bool, optional): if the parameter requires gradient. See - :ref:`excluding-subgraphs` for more details. - """ - def __new__(cls, data=None, requires_grad=True): - return super(Parameter, cls).__new__(cls, data, requires_grad=requires_grad) - - def __repr__(self): - return 'Parameter containing:' + self.data.__repr__()
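The registration behaviour described in the docstring, in a minimal sketch of my own (the ``Scale`` module is illustrative only)::

    >>> import torch
    >>> import torch.nn as nn
    >>> from torch.nn import Parameter
    >>> from torch.autograd import Variable
    >>> class Scale(nn.Module):
    ...     def __init__(self):
    ...         super(Scale, self).__init__()
    ...         self.gain = Parameter(torch.ones(1))          # registered as a module parameter
    ...         self.cached = Variable(torch.zeros(1))        # plain Variable: cached state, not registered
    ...     def forward(self, x):
    ...         return x * self.gain.expand_as(x)
    >>> len(list(Scale().parameters()))                       # only 'gain' appears
    1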
\ No newline at end of file
diff --git a/docs/_modules/torch/nn/utils/clip_grad.html b/docs/_modules/torch/nn/utils/clip_grad.html
deleted file mode 100644
index 45a465fc1ef6..000000000000
--- a/docs/_modules/torch/nn/utils/clip_grad.html
+++ /dev/null
@@ -1,605 +0,0 @@
torch.nn.utils.clip_grad — PyTorch 0.1.11 documentation

    Source code for torch.nn.utils.clip_grad

    -
    -
    [docs]def clip_grad_norm(parameters, max_norm, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - The norm is computed over all gradients together, as if they were - concatenated into a single vector. Gradients are modified in-place. - - Arguments: - parameters (Iterable[Variable]): an iterable of Variables that will have - gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - parameters = list(filter(lambda p: p.grad is not None, parameters)) - max_norm = float(max_norm) - norm_type = float(norm_type) - if norm_type == float('inf'): - total_norm = max(p.grad.data.abs().max() for p in parameters) - else: - total_norm = 0 - for p in parameters: - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm ** norm_type - total_norm = total_norm ** (1. / norm_type) - clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in parameters: - p.grad.data.mul_(clip_coef) - return total_norm
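The typical call site is between ``backward()`` and ``optimizer.step()``, since the gradients are modified in place; a minimal sketch (mine, same-era API)::

    >>> import torch
    >>> import torch.nn as nn
    >>> from torch.autograd import Variable
    >>> from torch.nn.utils import clip_grad_norm
    >>> model = nn.Linear(10, 2)
    >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    >>> loss = model(Variable(torch.randn(4, 10))).sum()
    >>> optimizer.zero_grad()
    >>> loss.backward()
    >>> total_norm = clip_grad_norm(model.parameters(), max_norm=1.0)   # norm before clipping
    >>> optimizer.step()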
\ No newline at end of file
diff --git a/docs/_modules/torch/nn/utils/rnn.html b/docs/_modules/torch/nn/utils/rnn.html
deleted file mode 100644
index 308f0d1deecb..000000000000
--- a/docs/_modules/torch/nn/utils/rnn.html
+++ /dev/null
@@ -1,699 +0,0 @@
torch.nn.utils.rnn — PyTorch 0.1.11 documentation

    Source code for torch.nn.utils.rnn

    -from collections import namedtuple
    -import torch
    -from torch.autograd import Variable
    -
    -
    -PackedSequence_ = namedtuple('PackedSequence', ['data', 'batch_sizes'])
    -
    -
    -
    [docs]class PackedSequence(PackedSequence_): - """Holds the data and list of batch_sizes of a packed sequence. - - All RNN modules accept packed sequences as inputs. - - Note: - Instances of this class should never be created manually. They are meant - to be instantiated by functions like :func:`pack_padded_sequence`. - - Attributes: - data (Variable): Variable containing packed sequence - batch_sizes (list[int]): list of integers holding information about - the batch size at each sequence step - """ - pass
    - - -
    [docs]def pack_padded_sequence(input, lengths, batch_first=False): - """Packs a Variable containing padded sequences of variable length. - - Input can be of size ``TxBx*`` where T is the length of the longest sequence - (equal to ``lengths[0]``), B is the batch size, and * is any number of - dimensions (including 0). If ``batch_first`` is True ``BxTx*`` inputs are expected. - - The sequences should be sorted by length in a decreasing order, i.e. - ``input[:,0]`` should be the longest sequence, and ``input[:,B-1]`` the - shortest one. - - Note: - This function accept any input that has at least two dimensions. You - can apply it to pack the labels, and use the output of the RNN with - them to compute the loss directly. A Variable can be retrieved from - a :class:`PackedSequence` object by accessing its ``.data`` attribute. - - Arguments: - input (Variable): padded batch of variable length sequences. - lengths (list[int]): list of sequences lengths of each batch element. - batch_first (bool, optional): if True, the input is expected in BxTx* - format. - - Returns: - a :class:`PackedSequence` object - """ - if lengths[-1] <= 0: - raise ValueError("length of all samples has to be greater than 0, " - "but found an element in 'lengths' that is <=0") - if batch_first: - input = input.transpose(0, 1) - - steps = [] - batch_sizes = [] - lengths_iter = reversed(lengths) - current_length = next(lengths_iter) - batch_size = input.size(1) - if len(lengths) != batch_size: - raise ValueError("lengths array has incorrect size") - - for step, step_value in enumerate(input, 1): - steps.append(step_value[:batch_size]) - batch_sizes.append(batch_size) - - while step == current_length: - try: - new_length = next(lengths_iter) - except StopIteration: - current_length = None - break - - if current_length > new_length: # remember that new_length is the preceding length in the array - raise ValueError("lengths array has to be sorted in decreasing order") - batch_size -= 1 - current_length = new_length - if current_length is None: - break - return PackedSequence(torch.cat(steps), batch_sizes)
    - - -
    [docs]def pad_packed_sequence(sequence, batch_first=False): - """Pads a packed batch of variable length sequences. - - It is an inverse operation to :func:`pack_padded_sequence`. - - The returned Variable's data will be of size TxBx*, where T is the length - of the longest sequence and B is the batch size. If ``batch_size`` is True, - the data will be transposed into BxTx* format. - - Batch elements will be ordered decreasingly by their length. - - Arguments: - sequence (PackedSequence): batch to pad - batch_first (bool, optional): if True, the output will be in BxTx* format. - - Returns: - Tuple of Variable containing the padded sequence, and a list of lengths - of each sequence in the batch. - """ - var_data, batch_sizes = sequence - max_batch_size = batch_sizes[0] - output = var_data.data.new(len(batch_sizes), max_batch_size, *var_data.size()[1:]).zero_() - output = Variable(output) - - lengths = [] - data_offset = 0 - prev_batch_size = batch_sizes[0] - for i, batch_size in enumerate(batch_sizes): - output[i, :batch_size] = var_data[data_offset:data_offset + batch_size] - data_offset += batch_size - - dec = prev_batch_size - batch_size - if dec > 0: - lengths.extend((i,) * dec) - prev_batch_size = batch_size - lengths.extend((i + 1,) * batch_size) - lengths.reverse() - - if batch_first: - output = output.transpose(0, 1) - return output, lengths
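A round trip through the two functions above (my sketch); note the batch must already be sorted longest-first, as the docstring requires::

    >>> import torch
    >>> from torch.autograd import Variable
    >>> from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
    >>> padded = Variable(torch.randn(5, 3, 7))     # T x B x *, lengths sorted in decreasing order
    >>> lengths = [5, 3, 2]
    >>> packed = pack_padded_sequence(padded, lengths)
    >>> packed.data.size()                          # (sum(lengths), 7) == (10, 7)
    >>> unpacked, out_lengths = pad_packed_sequence(packed)
    >>> out_lengths                                 # [5, 3, 2]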
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/adadelta.html b/docs/_modules/torch/optim/adadelta.html
deleted file mode 100644
index 143301b30765..000000000000
--- a/docs/_modules/torch/optim/adadelta.html
+++ /dev/null
@@ -1,638 +0,0 @@
torch.optim.adadelta — PyTorch 0.1.11 documentation

    Source code for torch.optim.adadelta

    -from .optimizer import Optimizer
    -
    -
    -
    [docs]class Adadelta(Optimizer): - """Implements Adadelta algorithm. - - It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - rho (float, optional): coefficient used for computing a running average - of squared gradients (default: 0.9) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-6) - lr (float, optional): coefficient that scale delta before it is applied to the - parameters (default: 1.0) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - - __ https://arxiv.org/abs/1212.5701 - """ - - def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0): - defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay) - super(Adadelta, self).__init__(params, defaults) - -
    [docs] def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - state['square_avg'] = grad.new().resize_as_(grad).zero_() - state['acc_delta'] = grad.new().resize_as_(grad).zero_() - - square_avg, acc_delta = state['square_avg'], state['acc_delta'] - rho, eps = group['rho'], group['eps'] - - state['step'] += 1 - - if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) - - square_avg.mul_(rho).addcmul_(1 - rho, grad, grad) - std = square_avg.add(eps).sqrt_() - delta = acc_delta.add(eps).sqrt_().div_(std).mul_(grad) - p.data.add_(-group['lr'], delta) - acc_delta.mul_(rho).addcmul_(1 - rho, delta, delta) - - return loss
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/adagrad.html b/docs/_modules/torch/optim/adagrad.html
deleted file mode 100644
index 01ca9b6f4bc8..000000000000
--- a/docs/_modules/torch/optim/adagrad.html
+++ /dev/null
@@ -1,657 +0,0 @@
torch.optim.adagrad — PyTorch 0.1.11 documentation

    Source code for torch.optim.adagrad

    -import torch
    -
    -from .optimizer import Optimizer
    -
    -
    -
    [docs]class Adagrad(Optimizer): - """Implements Adagrad algorithm. - - It has been proposed in `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization`_. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-2) - lr_decay (float, optional): learning rate decay (default: 0) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - - .. _Adaptive Subgradient Methods for Online Learning and Stochastic Optimization: - http://jmlr.org/papers/v12/duchi11a.html - """ - - def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0): - defaults = dict(lr=lr, lr_decay=lr_decay, weight_decay=weight_decay) - super(Adagrad, self).__init__(params, defaults) - - for group in self.param_groups: - for p in group['params']: - state = self.state[p] - state['step'] = 0 - state['sum'] = p.data.new().resize_as_(p.data).zero_() - - def share_memory(self): - for group in self.param_groups: - for p in group['params']: - state = self.state[p] - state['sum'].share_memory_() - -
    [docs] def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - state['step'] += 1 - - if group['weight_decay'] != 0: - if p.grad.data.is_sparse: - raise RuntimeError("weight_decay option is not compatible with sparse gradients ") - grad = grad.add(group['weight_decay'], p.data) - - clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay']) - - if p.grad.data.is_sparse: - grad_indices = grad.indices() - grad_values = grad.values() - size = torch.Size([x for x in grad.size()]) - - def make_sparse(values): - constructor = type(p.grad.data) - if grad_indices.dim() == 0 or values.dim() == 0: - return constructor() - return constructor(grad_indices, values, size) - state['sum'].add_(make_sparse(grad_values.pow(2))) - std = state['sum'].sparse_mask(grad) - std_values = std.values().sqrt_().add_(1e-10) - p.data.add_(-clr, make_sparse(grad_values / std_values)) - else: - state['sum'].addcmul_(1, grad, grad) - std = state['sum'].sqrt().add_(1e-10) - p.data.addcdiv_(-clr, grad, std) - - return loss
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/adam.html b/docs/_modules/torch/optim/adam.html
deleted file mode 100644
index f65d014e7871..000000000000
--- a/docs/_modules/torch/optim/adam.html
+++ /dev/null
@@ -1,649 +0,0 @@
torch.optim.adam — PyTorch 0.1.11 documentation

    Source code for torch.optim.adam

    -import math
    -from .optimizer import Optimizer
    -
    -
    -
    [docs]class Adam(Optimizer): - """Implements Adam algorithm. - - It has been proposed in `Adam: A Method for Stochastic Optimization`_. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay) - super(Adam, self).__init__(params, defaults) - -
    [docs] def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = grad.new().resize_as_(grad).zero_() - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = grad.new().resize_as_(grad).zero_() - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - p.data.addcdiv_(-step_size, exp_avg, denom) - - return loss
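A minimal training-step sketch with the hyper-parameters from the docstring spelled out (mine, not from the page)::

    >>> import torch
    >>> import torch.nn as nn
    >>> from torch.autograd import Variable
    >>> model = nn.Linear(10, 1)
    >>> optimizer = torch.optim.Adam(model.parameters(), lr=1e-3,
    ...                              betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-5)
    >>> loss = model(Variable(torch.randn(16, 10))).pow(2).mean()
    >>> optimizer.zero_grad()
    >>> loss.backward()
    >>> optimizer.step()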
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/adamax.html b/docs/_modules/torch/optim/adamax.html
deleted file mode 100644
index 97ca9e865342..000000000000
--- a/docs/_modules/torch/optim/adamax.html
+++ /dev/null
@@ -1,649 +0,0 @@
torch.optim.adamax — PyTorch 0.1.11 documentation

    Source code for torch.optim.adamax

    -import torch
    -from .optimizer import Optimizer
    -
    -
    -
    [docs]class Adamax(Optimizer): - """Implements Adamax algorithm (a variant of Adam based on infinity norm). - - It has been proposed in `Adam: A Method for Stochastic Optimization`__. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 2e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - - __ https://arxiv.org/abs/1412.6980 - """ - - def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - super(Adamax, self).__init__(params, defaults) - -
    [docs] def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = grad.new().resize_as_(grad).zero_() - state['exp_inf'] = grad.new().resize_as_(grad).zero_() - - exp_avg, exp_inf = state['exp_avg'], state['exp_inf'] - beta1, beta2 = group['betas'] - eps = group['eps'] - - state['step'] += 1 - - if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) - - # Update biased first moment estimate. - exp_avg.mul_(beta1).add_(1 - beta1, grad) - # Update the exponentially weighted infinity norm. - norm_buf = torch.cat([ - exp_inf.mul_(beta2).unsqueeze(0), - grad.abs().add_(eps).unsqueeze_(0) - ], 0) - torch.max(norm_buf, 0, out=(exp_inf, exp_inf.new().long())) - exp_inf.squeeze_(0) - - bias_correction = 1 - beta1 ** state['step'] - clr = group['lr'] / bias_correction - - p.data.addcdiv_(-clr, exp_avg, exp_inf) - - return loss
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/asgd.html b/docs/_modules/torch/optim/asgd.html
deleted file mode 100644
index 682b89982fec..000000000000
--- a/docs/_modules/torch/optim/asgd.html
+++ /dev/null
@@ -1,648 +0,0 @@
torch.optim.asgd — PyTorch 0.1.11 documentation

    Source code for torch.optim.asgd

    -import math
    -from .optimizer import Optimizer
    -
    -
    -
    [docs]class ASGD(Optimizer): - """Implements Averaged Stochastic Gradient Descent. - - It has been proposed in `Acceleration of stochastic approximation by averaging`_. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-2) - lambd (float, optional): decay term (default: 1e-4) - alpha (float, optional): power for eta update (default: 0.75) - t0 (float, optional): point at which to start averaging (default: 1e6) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - - .. _Acceleration of stochastic approximation by averaging: - http://dl.acm.org/citation.cfm?id=131098 - """ - - def __init__(self, params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0): - defaults = dict(lr=lr, lambd=lambd, alpha=alpha, t0=t0, - weight_decay=weight_decay) - super(ASGD, self).__init__(params, defaults) - -
    [docs] def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - state['eta'] = group['lr'] - state['mu'] = 1 - state['ax'] = grad.new().resize_as_(grad).zero_() - - state['step'] += 1 - - if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) - - # decay term - p.data.mul_(1 - group['lambd'] * state['eta']) - - # update parameter - p.data.add_(-state['eta'], grad) - - # averaging - if state['mu'] != 1: - state['ax'].add_(p.data.sub(state['ax']).mul(state['mu'])) - else: - state['ax'].copy_(p.data) - - # update eta and mu - state['eta'] = (group['lr'] / - math.pow((1 + group['lambd'] * group['lr'] * state['step']), group['alpha'])) - state['mu'] = 1 / max(1, state['step'] - group['t0']) - - return loss
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/lbfgs.html b/docs/_modules/torch/optim/lbfgs.html
deleted file mode 100644
index b57a63d84b1d..000000000000
--- a/docs/_modules/torch/optim/lbfgs.html
+++ /dev/null
@@ -1,821 +0,0 @@
torch.optim.lbfgs — PyTorch 0.1.11 documentation

    Source code for torch.optim.lbfgs

    -import torch
    -from functools import reduce
    -from .optimizer import Optimizer
    -
    -
    -
    [docs]class LBFGS(Optimizer): - """Implements L-BFGS algorithm. - - .. warning:: - This optimizer doesn't support per-parameter options and parameter - groups (there can be only one). - - .. warning:: - Right now all parameters have to be on a single device. This will be - improved in the future. - - .. note:: - This is a very memory intensive optimizer (it requires additional - ``param_bytes * (history_size + 1)`` bytes). If it doesn't fit in memory - try reducing the history size, or use a different algorithm. - - Arguments: - lr (float): learning rate (default: 1) - max_iter (int): maximal number of iterations per optimization step - (default: 20) - max_eval (int): maximal number of function evaluations per optimization - step (default: max_iter * 1.25). - tolerance_grad (float): termination tolerance on first order optimality - (default: 1e-5). - tolerance_change (float): termination tolerance on function value/parameter - changes (default: 1e-9). - history_size (int): update history size (default: 100). - """ - - def __init__(self, params, lr=1, max_iter=20, max_eval=None, - tolerance_grad=1e-5, tolerance_change=1e-9, history_size=100, - line_search_fn=None): - if max_eval is None: - max_eval = max_iter * 5 // 4 - defaults = dict(lr=lr, max_iter=max_iter, max_eval=max_eval, - tolerance_grad=tolerance_grad, tolerance_change=tolerance_change, - history_size=history_size, line_search_fn=line_search_fn) - super(LBFGS, self).__init__(params, defaults) - - if len(self.param_groups) != 1: - raise ValueError("LBFGS doesn't support per-parameter options " - "(parameter groups)") - - self._params = self.param_groups[0]['params'] - self._numel_cache = None - - def _numel(self): - if self._numel_cache is None: - self._numel_cache = reduce(lambda total, p: total + p.numel(), self._params, 0) - return self._numel_cache - - def _gather_flat_grad(self): - views = [] - for p in self._params: - if p.grad is None: - view = p.data.new(p.data.numel()).zero_() - elif p.grad.data.is_sparse: - view = p.grad.data.to_dense().view(-1) - else: - view = p.grad.data.view(-1) - views.append(view) - return torch.cat(views, 0) - - def _add_grad(self, step_size, update): - offset = 0 - for p in self._params: - numel = p.numel() - p.data.add_(step_size, update[offset:offset + numel]) - offset += numel - assert offset == self._numel() - -
    [docs] def step(self, closure): - """Performs a single optimization step. - - Arguments: - closure (callable): A closure that reevaluates the model - and returns the loss. - """ - assert len(self.param_groups) == 1 - - group = self.param_groups[0] - lr = group['lr'] - max_iter = group['max_iter'] - max_eval = group['max_eval'] - tolerance_grad = group['tolerance_grad'] - tolerance_change = group['tolerance_change'] - line_search_fn = group['line_search_fn'] - history_size = group['history_size'] - - state = self.state['global_state'] - state.setdefault('func_evals', 0) - state.setdefault('n_iter', 0) - - # evaluate initial f(x) and df/dx - orig_loss = closure() - loss = orig_loss.data[0] - current_evals = 1 - state['func_evals'] += 1 - - flat_grad = self._gather_flat_grad() - abs_grad_sum = flat_grad.abs().sum() - - if abs_grad_sum <= tolerance_grad: - return loss - - # variables cached in state (for tracing) - d = state.get('d') - t = state.get('t') - old_dirs = state.get('old_dirs') - old_stps = state.get('old_stps') - H_diag = state.get('H_diag') - prev_flat_grad = state.get('prev_flat_grad') - prev_loss = state.get('prev_loss') - - n_iter = 0 - # optimize for a max of max_iter iterations - while n_iter < max_iter: - # keep track of nb of iterations - n_iter += 1 - state['n_iter'] += 1 - - ############################################################ - # compute gradient descent direction - ############################################################ - if state['n_iter'] == 1: - d = flat_grad.neg() - old_dirs = [] - old_stps = [] - H_diag = 1 - else: - # do lbfgs update (update memory) - y = flat_grad.sub(prev_flat_grad) - s = d.mul(t) - ys = y.dot(s) # y*s - if ys > 1e-10: - # updating memory - if len(old_dirs) == history_size: - # shift history by one (limited-memory) - old_dirs.pop(0) - old_stps.pop(0) - - # store new direction/step - old_dirs.append(s) - old_stps.append(y) - - # update scale of initial Hessian approximation - H_diag = ys / y.dot(y) # (y*y) - - # compute the approximate (L-BFGS) inverse Hessian - # multiplied by the gradient - num_old = len(old_dirs) - - if 'ro' not in state: - state['ro'] = [None] * history_size - state['al'] = [None] * history_size - ro = state['ro'] - al = state['al'] - - for i in range(num_old): - ro[i] = 1. / old_stps[i].dot(old_dirs[i]) - - # iteration in L-BFGS loop collapsed to use just one buffer - q = flat_grad.neg() - for i in range(num_old - 1, -1, -1): - al[i] = old_dirs[i].dot(q) * ro[i] - q.add_(-al[i], old_stps[i]) - - # multiply by initial Hessian - # r/d is the final direction - d = r = torch.mul(q, H_diag) - for i in range(num_old): - be_i = old_stps[i].dot(r) * ro[i] - r.add_(al[i] - be_i, old_dirs[i]) - - if prev_flat_grad is None: - prev_flat_grad = flat_grad.clone() - else: - prev_flat_grad.copy_(flat_grad) - prev_loss = loss - - ############################################################ - # compute step length - ############################################################ - # reset initial guess for step size - if state['n_iter'] == 1: - t = min(1., 1. 
/ abs_grad_sum) * lr - else: - t = lr - - # directional derivative - gtd = flat_grad.dot(d) # g * d - - # optional line search: user function - ls_func_evals = 0 - if line_search_fn is not None: - # perform line search, using user function - raise RuntimeError("line search function is not supported yet") - else: - # no line search, simply move with fixed-step - self._add_grad(t, d) - if n_iter != max_iter: - # re-evaluate function only if not in last iteration - # the reason we do this: in a stochastic setting, - # no use to re-evaluate that function here - loss = closure().data[0] - flat_grad = self._gather_flat_grad() - abs_grad_sum = flat_grad.abs().sum() - ls_func_evals = 1 - - # update func eval - current_evals += ls_func_evals - state['func_evals'] += ls_func_evals - - ############################################################ - # check conditions - ############################################################ - if n_iter == max_iter: - break - - if current_evals >= max_eval: - break - - if abs_grad_sum <= tolerance_grad: - break - - if gtd > -tolerance_change: - break - - if d.mul(t).abs_().sum() <= tolerance_change: - break - - if abs(loss - prev_loss) < tolerance_change: - break - - state['d'] = d - state['t'] = t - state['old_dirs'] = old_dirs - state['old_stps'] = old_stps - state['H_diag'] = H_diag - state['prev_flat_grad'] = prev_flat_grad - state['prev_loss'] = prev_loss - - return orig_loss
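Unlike the other optimizers, ``step`` here requires a closure, because the objective is re-evaluated several times per call; a minimal sketch (mine, same-era API)::

    >>> import torch
    >>> import torch.nn as nn
    >>> from torch.autograd import Variable
    >>> model = nn.Linear(3, 1)
    >>> x, y = Variable(torch.randn(32, 3)), Variable(torch.randn(32, 1))
    >>> optimizer = torch.optim.LBFGS(model.parameters(), lr=1.0, history_size=10)
    >>> def closure():
    ...     optimizer.zero_grad()
    ...     loss = (model(x) - y).pow(2).mean()
    ...     loss.backward()
    ...     return loss
    >>> optimizer.step(closure)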
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/optimizer.html b/docs/_modules/torch/optim/optimizer.html
deleted file mode 100644
index 0d2c220f3d3a..000000000000
--- a/docs/_modules/torch/optim/optimizer.html
+++ /dev/null
@@ -1,718 +0,0 @@
torch.optim.optimizer — PyTorch 0.1.11 documentation

    Source code for torch.optim.optimizer

    -from collections import defaultdict
    -
    -import torch
    -from copy import deepcopy
    -from itertools import chain
    -from torch.autograd import Variable
    -
    -required = object()
    -
    -
    -
    [docs]class Optimizer(object): - """Base class for all optimizers. - - Arguments: - params (iterable): an iterable of :class:`Variable` s or - :class:`dict` s. Specifies what Variables should be optimized. - defaults: (dict): a dict containing default values of optimization - options (used when a parameter group doesn't specify them). - """ - - def __init__(self, params, defaults): - if isinstance(params, Variable) or torch.is_tensor(params): - raise TypeError("params argument given to the optimizer should be " - "an iterable of Variables or dicts, but got " + - torch.typename(params)) - - self.state = defaultdict(dict) - self.param_groups = list(params) - if len(self.param_groups) == 0: - raise ValueError("optimizer got an empty parameter list") - if not isinstance(self.param_groups[0], dict): - self.param_groups = [{'params': self.param_groups}] - - param_set = set() - for group in self.param_groups: - group['params'] = list(group['params']) - group_set = set(group['params']) - if not param_set.isdisjoint(group_set): - raise ValueError("some parameters appear in more than one " - "parameter group") - param_set.update(group_set) - - for name, default in defaults.items(): - for i, group in enumerate(self.param_groups): - if default is required and name not in group: - raise ValueError("parameter group " + str(i) + " didn't " - "specify a value of required optimization parameter " + - name) - else: - group.setdefault(name, default) - - for group in self.param_groups: - for param in group['params']: - if not isinstance(param, Variable): - raise TypeError("optimizer can only optimize Variables, " - "but one of the params is " + torch.typename(param)) - if not param.requires_grad: - raise ValueError("optimizing a parameter that doesn't " - "require gradients") - if param.creator is not None: - raise ValueError("can't optimize a non-leaf Variable") - - def __getstate__(self): - return { - 'state': self.state, - 'param_groups': self.param_groups, - } - - def __setstate__(self, state): - self.__dict__.update(state) - -
    [docs] def state_dict(self): - """Returns the state of the optimizer as a :class:`dict`. - - It contains two entries: - - * state - a dict holding current optimization state. Its content - differs between optimizer classes. - * param_groups - a dict containig all parameter groups - """ - # Save ids instead of Variables - def pack_group(group): - packed = {k: v for k, v in group.items() if k != 'params'} - packed['params'] = [id(p) for p in group['params']] - return packed - param_groups = [pack_group(g) for g in self.param_groups] - # Remap state to use ids as keys - packed_state = {(id(k) if isinstance(k, Variable) else k): v - for k, v in self.state.items()} - return { - 'state': packed_state, - 'param_groups': param_groups, - }
    - -
    [docs] def load_state_dict(self, state_dict): - """Loads the optimizer state. - - Arguments: - state_dict (dict): optimizer state. Should be an object returned - from a call to :meth:`state_dict`. - """ - # deepcopy, to be consistent with module API - state_dict = deepcopy(state_dict) - # Validate the state_dict - groups = self.param_groups - saved_groups = state_dict['param_groups'] - - if len(groups) != len(saved_groups): - raise ValueError("loaded state dict has a different number of " - "parameter groups") - param_lens = (len(g['params']) for g in groups) - saved_lens = (len(g['params']) for g in saved_groups) - if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)): - raise ValueError("loaded state dict contains a parameter group " - "that doesn't match the size of optimizer's group") - - # Update the state - id_map = {old_id: p for old_id, p in - zip(chain(*(g['params'] for g in saved_groups)), - chain(*(g['params'] for g in groups)))} - state = {id_map.get(k, k): v for k, v in state_dict['state'].items()} - - # Update parameter groups, setting their 'params' value - def update_group(group, new_group): - new_group['params'] = group['params'] - return new_group - param_groups = [ - update_group(g, ng) for g, ng in zip(groups, saved_groups)] - self.__setstate__({'state': state, 'param_groups': param_groups})
    - -
    [docs] def zero_grad(self): - """Clears the gradients of all optimized :class:`Variable` s.""" - for group in self.param_groups: - for param in group['params']: - if param.grad is not None: - param.grad.data.zero_()
    - -
    [docs] def step(self, closure): - """Performs a single optimization step (parameter update). - - Arguments: - closure (callable): A closure that reevaluates the model and - returns the loss. Optional for most optimizers. - """ - raise NotImplementedError
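Checkpointing an optimizer with ``state_dict``/``load_state_dict`` as defined above, in a minimal sketch of my own::

    >>> import torch
    >>> import torch.nn as nn
    >>> from torch.autograd import Variable
    >>> model = nn.Linear(4, 2)
    >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    >>> model(Variable(torch.randn(8, 4))).sum().backward()
    >>> optimizer.step()                                   # populates per-parameter state
    >>> snapshot = optimizer.state_dict()                  # plain dict: 'state' + 'param_groups'
    >>> optimizer2 = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    >>> optimizer2.load_state_dict(snapshot)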
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/rmsprop.html b/docs/_modules/torch/optim/rmsprop.html
deleted file mode 100644
index 05e905811c41..000000000000
--- a/docs/_modules/torch/optim/rmsprop.html
+++ /dev/null
@@ -1,660 +0,0 @@
torch.optim.rmsprop — PyTorch 0.1.11 documentation

    Source code for torch.optim.rmsprop

    -from .optimizer import Optimizer
    -
    -
    -
    [docs]class RMSprop(Optimizer): - """Implements RMSprop algorithm. - - Proposed by G. Hinton in his `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_. - - The centered version first appears in `Generating Sequences - With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-2) - momentum (float, optional): momentum factor (default: 0) - alpha (float, optional): smoothing constant (default: 0.99) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - centered (bool, optional) : if True, compute the centered RMSProp, - the gradient is normalized by an estimation of its variance - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - - """ - - def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False): - defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay) - super(RMSprop, self).__init__(params, defaults) - - def __setstate__(self, state): - super(RMSprop, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('momentum', 0) - group.setdefault('centered', False) - -
    [docs] def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - state['square_avg'] = grad.new().resize_as_(grad).zero_() - if group['momentum'] > 0: - state['momentum_buffer'] = grad.new().resize_as_(grad).zero_() - if group['centered']: - state['grad_avg'] = grad.new().resize_as_(grad).zero_() - - square_avg = state['square_avg'] - alpha = group['alpha'] - - state['step'] += 1 - - if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) - - square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad) - - if group['centered']: - grad_avg = state['grad_avg'] - grad_avg.mul_(alpha).add_(1 - alpha, grad) - avg = square_avg.addcmul(-1, grad_avg, grad_avg).sqrt().add_(group['eps']) - else: - avg = square_avg.sqrt().add_(group['eps']) - - if group['momentum'] > 0: - buf = state['momentum_buffer'] - buf.mul_(group['momentum']).addcdiv_(grad, avg) - p.data.add_(-group['lr'], buf) - else: - p.data.addcdiv_(-group['lr'], grad, avg) - - return loss
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/rprop.html b/docs/_modules/torch/optim/rprop.html
deleted file mode 100644
index 5d457a1296f7..000000000000
--- a/docs/_modules/torch/optim/rprop.html
+++ /dev/null
@@ -1,643 +0,0 @@
torch.optim.rprop — PyTorch 0.1.11 documentation

    Source code for torch.optim.rprop

    -import math
    -from .optimizer import Optimizer
    -
    -
    -
    [docs]class Rprop(Optimizer): - """Implements the resilient backpropagation algorithm. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-2) - etas (Tuple[float, float], optional): pair of (etaminus, etaplis), that - are multiplicative increase and decrease factors (default: (0.5, 1.2)) - step_sizes (Tuple[float, float], optional): a pair of minimal and - maximal allowed step sizes (default: (1e-6, 50)) - """ - - def __init__(self, params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50)): - defaults = dict(lr=lr, etas=etas, step_sizes=step_sizes) - super(Rprop, self).__init__(params, defaults) - -
    [docs] def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - state['prev'] = grad.new().resize_as_(grad).zero_() - state['step_size'] = grad.new().resize_as_(grad).fill_(group['lr']) - - etaminus, etaplus = group['etas'] - step_size_min, step_size_max = group['step_sizes'] - step_size = state['step_size'] - - state['step'] += 1 - - sign = grad.mul(state['prev']).sign() - sign[sign.gt(0)] = etaplus - sign[sign.lt(0)] = etaminus - sign[sign.eq(0)] = 1 - - # update stepsizes with step size updates - step_size.mul_(sign).clamp_(step_size_min, step_size_max) - - # for dir<0, dfdx=0 - # for dir>=0 dfdx=dfdx - grad = grad.clone() - grad[sign.eq(etaminus)] = 0 - - # update parameters - p.data.addcmul_(-1, grad.sign(), step_size) - - state['prev'].copy_(grad) - - return loss
\ No newline at end of file
diff --git a/docs/_modules/torch/optim/sgd.html b/docs/_modules/torch/optim/sgd.html
deleted file mode 100644
index 166feba32011..000000000000
--- a/docs/_modules/torch/optim/sgd.html
+++ /dev/null
@@ -1,673 +0,0 @@
torch.optim.sgd — PyTorch 0.1.11 documentation

    Source code for torch.optim.sgd

    -from .optimizer import Optimizer, required
    -
    -
    -
    [docs]class SGD(Optimizer): - r"""Implements stochastic gradient descent (optionally with momentum). - - Nesterov momentum is based on the formula from - `On the importance of initialization and momentum in deep learning`__. - - Args: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float): learning rate - momentum (float, optional): momentum factor (default: 0) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - dampening (float, optional): dampening for momentum (default: 0) - nesterov (bool, optional): enables Nesterov momentum (default: False) - - Example: - >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) - >>> optimizer.zero_grad() - >>> loss_fn(model(input), target).backward() - >>> optimizer.step() - - __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf - - .. note:: - The implementation of SGD with Momentum/Nesterov subtly differs from - Sutskever et. al. and implementations in some other frameworks. - - Considering the specific case of Momentum, the update can be written as - - .. math:: - v = \rho * v + g \\ - p = p - lr * v - - where p, g, v and :math:`\rho` denote the parameters, gradient, velocity, and - momentum respectively. - - This is in constrast to Sutskever et. al. and - other frameworks which employ an update of the form - - .. math:: - v = \rho * v + lr * g \\ - p = p - v - - The Nesterov version is analogously modified. - """ - - def __init__(self, params, lr=required, momentum=0, dampening=0, - weight_decay=0, nesterov=False): - defaults = dict(lr=lr, momentum=momentum, dampening=dampening, - weight_decay=weight_decay, nesterov=nesterov) - if nesterov and (momentum <= 0 or dampening != 0): - raise ValueError("Nesterov momentum requires a momentum and zero dampening") - super(SGD, self).__init__(params, defaults) - - def __setstate__(self, state): - super(SGD, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('nesterov', False) - -
    [docs] def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - weight_decay = group['weight_decay'] - momentum = group['momentum'] - dampening = group['dampening'] - nesterov = group['nesterov'] - - for p in group['params']: - if p.grad is None: - continue - d_p = p.grad.data - if weight_decay != 0: - d_p.add_(weight_decay, p.data) - if momentum != 0: - param_state = self.state[p] - if 'momentum_buffer' not in param_state: - buf = param_state['momentum_buffer'] = d_p.clone() - else: - buf = param_state['momentum_buffer'] - buf.mul_(momentum).add_(1 - dampening, d_p) - if nesterov: - d_p = d_p.add(momentum, buf) - else: - d_p = buf - - p.data.add_(-group['lr'], d_p) - - return loss
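Per-parameter options are passed as dicts, as described by the ``Optimizer`` base class above; a sketch (mine) giving the bias its own learning rate::

    >>> import torch
    >>> import torch.nn as nn
    >>> model = nn.Linear(10, 2)
    >>> optimizer = torch.optim.SGD(
    ...     [{'params': [model.weight]},
    ...      {'params': [model.bias], 'lr': 1e-3}],   # overrides the default lr below
    ...     lr=1e-1, momentum=0.9)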
\ No newline at end of file
diff --git a/docs/_modules/torch/serialization.html b/docs/_modules/torch/serialization.html
deleted file mode 100644
index 1996d3198539..000000000000
--- a/docs/_modules/torch/serialization.html
+++ /dev/null
@@ -1,960 +0,0 @@
-    torch.serialization — PyTorch 0.1.11 documentation

    Source code for torch.serialization

    -import difflib
    -import inspect
    -import os
    -import shutil
    -import struct
    -import sys
    -import torch
    -import tarfile
    -import tempfile
    -import warnings
    -from contextlib import closing, contextmanager
    -from ._utils import _import_dotted_name
    -if sys.version_info[0] == 2:
    -    import cPickle as pickle
    -else:
    -    import pickle
    -
    -DEFAULT_PROTOCOL = 2
    -
    -LONG_SIZE = struct.Struct('=l').size
    -INT_SIZE = struct.Struct('=i').size
    -SHORT_SIZE = struct.Struct('=h').size
    -
    -MAGIC_NUMBER = 0x1950a86a20f9469cfc6c
    -PROTOCOL_VERSION = 1001
    -STORAGE_KEY_SEPARATOR = ','
    -
    -
    -class SourceChangeWarning(Warning):
    -    pass
    -
    -
    -@contextmanager
    -def mkdtemp():
    -    path = tempfile.mkdtemp()
    -    yield path
    -    shutil.rmtree(path)
    -
    -
    -_package_registry = []
    -
    -
    -def register_package(priority, tagger, deserializer):
    -    queue_elem = (priority, tagger, deserializer)
    -    _package_registry.append(queue_elem)
    -    _package_registry.sort()
    -
    -
    -def _cpu_tag(obj):
    -    if type(obj).__module__ == 'torch':
    -        return 'cpu'
    -
    -
    -def _cuda_tag(obj):
    -    if type(obj).__module__ == 'torch.cuda':
    -        return 'cuda:' + str(obj.get_device())
    -
    -
    -def _cpu_deserialize(obj, location):
    -    if location == 'cpu':
    -        return obj
    -
    -
    -def _cuda_deserialize(obj, location):
    -    if location.startswith('cuda'):
    -        device_id = max(int(location[5:]), 0)
    -        return obj.cuda(device_id)
    -
    -
    -register_package(10, _cpu_tag, _cpu_deserialize)
    -register_package(20, _cuda_tag, _cuda_deserialize)
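These two register_package calls are the extension hook that torch.load's documentation refers to. The sketch below is purely hypothetical: 'myaccel', _myaccel_tag and _myaccel_deserialize are invented names used only to show the (priority, tagger, deserializer) contract; a real backend would move the storage onto its device instead of returning the CPU copy.

def _myaccel_tag(obj):
    # Return a location string for storages this backend owns, or None so that
    # the other registered taggers get a chance to claim the storage.
    return None

def _myaccel_deserialize(obj, location):
    if location.startswith('myaccel'):
        # A real implementation would move `obj` to the device named by `location`;
        # this sketch just keeps the CPU copy.
        return obj

# Priority 30 sorts after the built-in cpu (10) and cuda (20) handlers.
register_package(30, _myaccel_tag, _myaccel_deserialize)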
    -
    -
    -def location_tag(storage):
    -    for _, tagger, _ in _package_registry:
    -        location = tagger(storage)
    -        if location:
    -            return location
    -    raise RuntimeError("don't know how to determine data location of " +
    -                       torch.typename(storage))
    -
    -
    -def default_restore_location(storage, location):
    -    for _, _, fn in _package_registry:
    -        result = fn(storage, location)
    -        if result is not None:
    -            return result
    -    raise RuntimeError("don't know how to restore data location of " +
    -                       torch.typename(storage) + " (tagged with " +
    -                       location + ")")
    -
    -
    -def normalize_storage_type(storage_type):
    -    return getattr(torch, storage_type.__name__)
    -
    -
    -def storage_to_tensor_type(storage):
    -    storage_type = type(storage)
    -    module = _import_dotted_name(storage_type.__module__)
    -    return getattr(module, storage_type.__name__.replace('Storage', 'Tensor'))
    -
    -
    -
    [docs]def save(obj, f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL): - """Saves an object to a disk file. - - See also: :ref:`recommend-saving-models` - - Args: - obj: saved object - f: a file-like object (has to implement fileno that returns a file descriptor) - or a string containing a file name - pickle_module: module used for pickling metadata and objects - pickle_protocol: can be specified to override the default protocol - """ - new_fd = False - if isinstance(f, str) or (sys.version_info[0] == 2 and isinstance(f, unicode)): - new_fd = True - f = open(f, "wb") - try: - return _save(obj, f, pickle_module, pickle_protocol) - finally: - if new_fd: - f.close()
    - - -def _save(obj, f, pickle_module, pickle_protocol): - import torch.nn as nn - serialized_container_types = {} - serialized_storages = {} - - def persistent_id(obj): - # FIXME: the docs say that persistent_id should only return a string - # but torch store returns tuples. This works only in the binary protocol - # see - # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects - # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 - if isinstance(obj, type) and issubclass(obj, nn.Module): - if obj in serialized_container_types: - return None - serialized_container_types[obj] = True - source_file = source = None - try: - source_file = inspect.getsourcefile(obj) - source = inspect.getsource(obj) - except (TypeError, IOError): - warnings.warn("Couldn't retrieve source code for container of " - "type " + obj.__name__ + ". It won't be checked " - "for correctness upon loading.") - return ('module', obj, source_file, source) - elif torch.is_storage(obj): - storage_type = normalize_storage_type(type(obj)) - root, offset = obj._root_storage() - root_key = str(root._cdata) - location = location_tag(obj) - serialized_storages[root_key] = root - is_view = obj._cdata != root._cdata - if is_view: - view_metadata = (str(obj._cdata), offset, obj.size()) - else: - view_metadata = None - - return ('storage', - storage_type, - root_key, - location, - root.size(), - view_metadata) - - return None - - sys_info = dict( - protocol_version=PROTOCOL_VERSION, - little_endian=sys.byteorder == 'little', - type_sizes=dict( - short=SHORT_SIZE, - int=INT_SIZE, - long=LONG_SIZE, - ), - ) - - pickle_module.dump(MAGIC_NUMBER, f, protocol=pickle_protocol) - pickle_module.dump(PROTOCOL_VERSION, f, protocol=pickle_protocol) - pickle_module.dump(sys_info, f, protocol=pickle_protocol) - pickler = pickle_module.Pickler(f, protocol=pickle_protocol) - pickler.persistent_id = persistent_id - pickler.dump(obj) - - serialized_storage_keys = sorted(serialized_storages.keys()) - pickle_module.dump(serialized_storage_keys, f, protocol=pickle_protocol) - f.flush() - for key in serialized_storage_keys: - serialized_storages[key]._write_file(f) - - -
    [docs]def load(f, map_location=None, pickle_module=pickle): - """Loads an object saved with :func:`torch.save` from a file. - - torch.load can dynamically remap storages to be loaded on a different device - using the map_location argument. If it's a callable, it will be called with - two arguments: storage and location tag. It's expected to either return a - storage that's been moved to a different location, or None (and the location - will be resolved using the default method). If this argument is a dict it's - expected to be a mapping from location tags used in a file, to location - tags of the current system. - - By default the location tags are 'cpu' for host tensors and 'cuda:device_id' - (e.g. 'cuda:2') for cuda tensors. User extensions can register their own - tagging and deserialization methods using register_package. - - Args: - f: a file-like object (has to implement fileno that returns a file descriptor, - and must implement seek), or a string containing a file name - map_location: a function or a dict specifying how to remap storage locations - pickle_module: module used for unpickling metadata and objects (has to match - the pickle_module used to serialize file) - - Example: - >>> torch.load('tensors.pt') - # Load all tensors onto the CPU - >>> torch.load('tensors.pt', map_location=lambda storage, loc: storage) - # Map tensors from GPU 1 to GPU 0 - >>> torch.load('tensors.pt', map_location={'cuda:1':'cuda:0'}) - """ - new_fd = False - if isinstance(f, str) or (sys.version_info[0] == 2 and isinstance(f, unicode)): - new_fd = True - f = open(f, 'rb') - try: - return _load(f, map_location, pickle_module) - finally: - if new_fd: - f.close()
    - - -def _load(f, map_location, pickle_module): - deserialized_objects = {} - - if map_location is None: - restore_location = default_restore_location - elif isinstance(map_location, dict): - def restore_location(storage, location): - location = map_location.get(location, location) - return default_restore_location(storage, location) - else: - def restore_location(storage, location): - result = map_location(storage, location) - if result is None: - result = default_restore_location(storage, location) - return result - - def _check_container_source(container_type, source_file, original_source): - current_source = inspect.getsource(container_type) - if original_source != current_source: - if container_type.dump_patches: - file_name = container_type.__name__ + '.patch' - diff = difflib.unified_diff(current_source.split('\n'), - original_source.split('\n'), - source_file, - source_file, lineterm="") - lines = '\n'.join(diff) - try: - with open(file_name, 'a+') as f: - file_size = f.seek(0, 2) - f.seek(0) - if file_size == 0: - f.write(lines) - elif file_size != len(lines) or f.read() != lines: - raise IOError - msg = ("Saved a reverse patch to " + file_name + ". " - "Run `patch -p0 < " + file_name + "` to revert your " - "changes.") - except IOError: - msg = ("Tried to save a patch, but couldn't create a " - "writable file " + file_name + ". Make sure it " - "doesn't exist and your working directory is " - "writable.") - else: - msg = ("you can retrieve the original source code by " - "accessing the object's source attribute or set " - "`torch.nn.Module.dump_patches = True` and use the " - "patch tool to revert the changes.") - msg = ("source code of class '{}' has changed. {}" - .format(torch.typename(container_type), msg)) - warnings.warn(msg, SourceChangeWarning) - - def legacy_load(f): - deserialized_objects = {} - - def persistent_load(saved_id): - if isinstance(saved_id, tuple): - # Ignore containers that don't have any sources saved - if all(saved_id[1:]): - _check_container_source(*saved_id) - return saved_id[0] - return deserialized_objects[int(saved_id)] - - with closing(tarfile.open(fileobj=f, mode='r:', format=tarfile.PAX_FORMAT)) as tar, \ - mkdtemp() as tmpdir: - - tar.extract('storages', path=tmpdir) - with open(os.path.join(tmpdir, 'storages'), 'rb', 0) as f: - num_storages = pickle_module.load(f) - for i in range(num_storages): - args = pickle_module.load(f) - key, location, storage_type = args - obj = storage_type._new_with_file(f) - obj = restore_location(obj, location) - deserialized_objects[key] = obj - - storage_views = pickle_module.load(f) - for target_cdata, root_cdata, offset, size in storage_views: - root = deserialized_objects[root_cdata] - deserialized_objects[target_cdata] = root[offset:offset + size] - - tar.extract('tensors', path=tmpdir) - with open(os.path.join(tmpdir, 'tensors'), 'rb', 0) as f: - num_tensors = pickle_module.load(f) - for i in range(num_tensors): - args = pickle_module.load(f) - key, storage_id, original_tensor_type = args - storage = deserialized_objects[storage_id] - tensor_type = storage_to_tensor_type(storage) - tensor = tensor_type._new_with_metadata_file(f, storage) - deserialized_objects[key] = tensor - - pickle_file = tar.extractfile('pickle') - unpickler = pickle_module.Unpickler(pickle_file) - unpickler.persistent_load = persistent_load - result = unpickler.load() - return result - - deserialized_objects = {} - - def persistent_load(saved_id): - assert isinstance(saved_id, tuple) - typename = saved_id[0] - data = saved_id[1:] - - 
if typename == 'module': - # Ignore containers that don't have any sources saved - if all(data[1:]): - _check_container_source(*data) - return data[0] - elif typename == 'storage': - data_type, root_key, location, size, view_metadata = data - if root_key not in deserialized_objects: - deserialized_objects[root_key] = restore_location( - data_type(size), location) - storage = deserialized_objects[root_key] - if view_metadata is not None: - view_key, offset, view_size = view_metadata - if view_key not in deserialized_objects: - deserialized_objects[view_key] = storage[offset:offset + view_size] - return deserialized_objects[view_key] - else: - return storage - else: - raise RuntimeError("Unknown saved id type: %s" % saved_id[0]) - - # try the legacy loader first, which only works if f is a tarfile - try: - return legacy_load(f) - except tarfile.TarError: - pass - - f.seek(0) - magic_number = pickle_module.load(f) - if magic_number != MAGIC_NUMBER: - raise RuntimeError("Invalid magic number; corrupt file?") - protocol_version = pickle_module.load(f) - if protocol_version != PROTOCOL_VERSION: - raise RuntimeError("Invalid protocol version: %s" % protocol_version) - - _sys_info = pickle_module.load(f) - unpickler = pickle_module.Unpickler(f) - unpickler.persistent_load = persistent_load - result = unpickler.load() - - deserialized_storage_keys = pickle_module.load(f) - - offset = f.tell() - for key in deserialized_storage_keys: - assert key in deserialized_objects - deserialized_objects[key]._set_from_file(f, offset) - offset = None - - return result -
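Putting save() and load() together, a typical round trip looks like the sketch below; the file name is a placeholder, and the two map_location forms are the dict and callable variants described in the load() docstring.

import torch

x = torch.randn(3, 5)
torch.save(x, 'tensors.pt')

y = torch.load('tensors.pt')                                                  # restore to the original device
y_cpu = torch.load('tensors.pt', map_location=lambda storage, loc: storage)  # keep everything on the CPU
y_remap = torch.load('tensors.pt', map_location={'cuda:1': 'cuda:0'})        # remap GPU 1 storages to GPU 0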
\ No newline at end of file
diff --git a/docs/_modules/torch/storage.html b/docs/_modules/torch/storage.html
deleted file mode 100644
index 8ae8fc38f325..000000000000
--- a/docs/_modules/torch/storage.html
+++ /dev/null
@@ -1,679 +0,0 @@
-    torch.storage — PyTorch 0.1.11 documentation

    Source code for torch.storage

    -import torch
    -from ._utils import _type, _cuda, _range
    -
    -
    -class _StorageBase(object):
    -    is_cuda = False
    -    is_sparse = False
    -
    -    def __str__(self):
    -        content = ' ' + '\n '.join(str(self[i]) for i in _range(len(self)))
    -        return content + '\n[{} of size {}]'.format(torch.typename(self), len(self))
    -
    -    def __repr__(self):
    -        return str(self)
    -
    -    def __iter__(self):
    -        return iter(map(lambda i: self[i], _range(self.size())))
    -
    -    def __copy__(self):
    -        return self.clone()
    -
    -    def __deepcopy__(self, memo):
    -        memo = memo.setdefault('torch', {})
    -        if self._cdata in memo:
    -            return memo[self._cdata]
    -        new_storage = self.clone()
    -        memo[self._cdata] = new_storage
    -        return new_storage
    -
    -    def __reduce__(self):
    -        return type(self), (self.tolist(),)
    -
    -    def clone(self):
    -        """Returns a copy of this storage"""
    -        return type(self)(self.size()).copy_(self)
    -
    -    def tolist(self):
    -        """Returns a list containing the elements of this storage"""
    -        return [v for v in self]
    -
    -    def cpu(self):
    -        """Returns a CPU copy of this storage if it's not already on the CPU"""
    -        return self.type(getattr(torch, self.__class__.__name__))
    -
    -    def double(self):
    -        """Casts this storage to double type"""
    -        return self.type(type(self).__module__ + '.DoubleStorage')
    -
    -    def float(self):
    -        """Casts this storage to float type"""
    -        return self.type(type(self).__module__ + '.FloatStorage')
    -
    -    def half(self):
    -        """Casts this storage to half type"""
    -        return self.type(type(self).__module__ + '.HalfStorage')
    -
    -    def long(self):
    -        """Casts this storage to long type"""
    -        return self.type(type(self).__module__ + '.LongStorage')
    -
    -    def int(self):
    -        """Casts this storage to int type"""
    -        return self.type(type(self).__module__ + '.IntStorage')
    -
    -    def short(self):
    -        """Casts this storage to short type"""
    -        return self.type(type(self).__module__ + '.ShortStorage')
    -
    -    def char(self):
    -        """Casts this storage to char type"""
    -        return self.type(type(self).__module__ + '.CharStorage')
    -
    -    def byte(self):
    -        """Casts this storage to byte type"""
    -        return self.type(type(self).__module__ + '.ByteStorage')
    -
    -    def pin_memory(self):
    -        """Copies the storage to pinned memory, if it's not already pinned."""
    -        if self.is_cuda:
    -            raise TypeError("cannot pin '{0}' only CPU memory can be pinned"
    -                            .format(self.type()))
    -        import torch.cuda
    -        allocator = torch.cuda._host_allocator()
    -        return type(self)(self.size(), allocator=allocator).copy_(self)
    -
    -    def share_memory_(self):
    -        """Moves the storage to shared memory.
    -
    -        This is a no-op for storages already in shared memory and for CUDA
    -        storages, which do not need to be moved for sharing across processes.
    -        Storages in shared memory cannot be resized.
    -
    -        Returns: self
    -        """
    -        from torch.multiprocessing import get_sharing_strategy
    -        if self.is_cuda:
    -            pass  # CUDA doesn't use POSIX shared memory
    -        elif get_sharing_strategy() == 'file_system':
    -            self._share_filename_()
    -        else:
    -            self._share_fd_()
    -        return self
    -
    -
    -_StorageBase.type = _type
    -_StorageBase.cuda = _cuda
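For reference, a short sketch of how these storage methods compose; the size and values are arbitrary, and pin_memory() assumes a CUDA-enabled build.

import torch

s = torch.FloatStorage(4)      # uninitialized storage holding 4 floats
d = s.double()                 # returns a torch.DoubleStorage copy
values = d.tolist()            # plain Python list of the elements
pinned = s.pin_memory()        # page-locked copy; only CPU storages can be pinned
s.share_memory_()              # moves the storage into shared memory, in place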
\ No newline at end of file
diff --git a/docs/_modules/torch/tensor.html b/docs/_modules/torch/tensor.html
deleted file mode 100644
index ec5a9a08a273..000000000000
--- a/docs/_modules/torch/tensor.html
+++ /dev/null
@@ -1,1021 +0,0 @@
-    torch.tensor — PyTorch 0.1.11 documentation

    Source code for torch.tensor

    -import torch
    -from . import _tensor_str
    -from ._utils import _type, _cuda, _range, _rebuild_tensor
    -import sys
    -
    -
    -class _TensorBase(object):
    -    #: bool: True if this is a CUDA tensor
    -    is_cuda = False
    -    is_sparse = False
    -
    -    def new(self, *args, **kwargs):
    -        """Constructs a new tensor of the same data type."""
    -        return self.__class__(*args, **kwargs)
    -
    -    def type_as(self, tensor):
    -        """Returns this tensor cast to the type of the given tensor.
    -
    -        This is a no-op if the tensor is already of the correct type. This is
    -        equivalent to::
    -
    -            self.type(tensor.type())
    -
    -        Params:
    -            tensor (Tensor): the tensor which has the desired type
    -        """
    -        return self.type(tensor.type())
    -
    -    def cpu(self):
    -        """Returns a CPU copy of this tensor if it's not already on the CPU"""
    -        return self.type(getattr(torch, self.__class__.__name__))
    -
    -    def double(self):
    -        """Casts this tensor to double type"""
    -        return self.type(type(self).__module__ + '.DoubleTensor')
    -
    -    def float(self):
    -        """Casts this tensor to float type"""
    -        return self.type(type(self).__module__ + '.FloatTensor')
    -
    -    def half(self):
    -        """Casts this tensor to half-precision float type"""
    -        return self.type(type(self).__module__ + '.HalfTensor')
    -
    -    def long(self):
    -        """Casts this tensor to long type"""
    -        return self.type(type(self).__module__ + '.LongTensor')
    -
    -    def int(self):
    -        """Casts this tensor to int type"""
    -        return self.type(type(self).__module__ + '.IntTensor')
    -
    -    def short(self):
    -        """Casts this tensor to short type"""
    -        return self.type(type(self).__module__ + '.ShortTensor')
    -
    -    def char(self):
    -        """Casts this tensor to char type"""
    -        return self.type(type(self).__module__ + '.CharTensor')
    -
    -    def byte(self):
    -        """Casts this tensor to byte type"""
    -        return self.type(type(self).__module__ + '.ByteTensor')
    -
    -    def is_pinned(self):
    -        """Returns true if this tensor resides in pinned memory"""
    -        storage = self.storage()
    -        return storage.is_pinned() if storage else False
    -
    -    def pin_memory(self):
    -        """Copies the tensor to pinned memory, if it's not already pinned."""
    -        if self.is_cuda:
    -            raise TypeError("cannot pin '{0}' only CPU memory can be pinned"
    -                            .format(self.type()))
    -        storage = self.storage()
    -        if storage is None:
    -            storage = (self.storage_type())()
    -        return type(self)().set_(storage.pin_memory()).view_as(self)
    -
    -    def share_memory_(self):
    -        """Moves the underlying storage to shared memory.
    -
    -        This is a no-op if the underlying storage is already in shared memory
    -        and for CUDA tensors. Tensors in shared memory cannot be resized.
    -        """
    -        self.storage().share_memory_()
    -        return self
    -
    -    def is_shared(self):
    -        """Checks if tensor is in shared memory.
    -
    -        This is always ``True`` for CUDA tensors.
    -        """
    -        return self.storage().is_shared()
    -
    -    def __deepcopy__(self, _memo):
    -        memo = _memo.setdefault('torch', {})
    -        if self._cdata in memo:
    -            return memo[self._cdata]
    -        new_storage = self.storage().__deepcopy__(_memo)
    -        new_tensor = self.new()
    -        new_tensor.set_(new_storage, self.storage_offset(), self.size(), self.stride())
    -        memo[self._cdata] = new_tensor
    -        return new_tensor
    -
    -    def __reduce__(self):
    -        # NOTE: _rebuild_tensor does not call __setstate__
    -        args = self.__getstate__()
    -        return (_rebuild_tensor, args)
    -
    -    def __getstate__(self):
    -        return (self.storage(),
    -                self.storage_offset(),
    -                tuple(self.size()),
    -                self.stride())
    -
    -    def __setstate__(self, state):
    -        self.set_(*state)
    -
    -    def __repr__(self):
    -        return str(self)
    -
    -    def __str__(self):
    -        # All strings are unicode in Python 3, while we have to encode unicode
-        # strings in Python 2. If we can't, let Python decide the best
    -        # characters to replace unicode characters with.
    -        if sys.version_info > (3,):
    -            return _tensor_str._str(self)
    -        else:
    -            if hasattr(sys.stdout, 'encoding'):
    -                return _tensor_str._str(self).encode(
    -                    sys.stdout.encoding or 'UTF-8', 'replace')
    -            else:
    -                return _tensor_str._str(self).encode('UTF-8', 'replace')
    -
    -    def __bool__(self):
    -        if self.numel() == 0:
    -            return False
    -        raise RuntimeError("bool value of non-empty " + torch.typename(self) +
    -                           " objects is ambiguous")
    -
    -    __nonzero__ = __bool__
    -
    -    def __iter__(self):
    -        return iter(map(lambda i: self.select(0, i), _range(self.size(0))))
    -
    -    def split(self, split_size, dim=0):
    -        """Splits this tensor into a tuple of tensors.
    -
    -        See :func:`torch.split`.
    -        """
    -        return torch.split(self, split_size, dim)
    -
    -    def chunk(self, n_chunks, dim=0):
    -        """Splits this tensor into a tuple of tensors.
    -
    -        See :func:`torch.chunk`.
    -        """
    -        return torch.chunk(self, n_chunks, dim)
    -
    -    def tolist(self):
-        """Returns a nested list representation of this tensor."""
    -        dim = self.dim()
    -        if dim == 1:
    -            return [v for v in self]
    -        elif dim > 0:
    -            return [subt.tolist() for subt in self]
    -        return []
    -
    -    def view_as(self, tensor):
-        """Returns this tensor viewed with the same size as the specified tensor.
    -
    -        This is equivalent to::
    -
    -                self.view(tensor.size())
    -        """
    -        return self.view(tensor.size())
    -
    -    def permute(self, *dims):
    -        """Permute the dimensions of this tensor.
    -
    -        Args:
    -            *dims (int...): The desired ordering of dimensions
    -
    -        Example:
    -            >>> x = torch.randn(2, 3, 5)
    -            >>> x.size()
    -            torch.Size([2, 3, 5])
    -            >>> x.permute(2, 0, 1).size()
    -            torch.Size([5, 2, 3])
    -        """
    -        perm = list(dims)
    -        tensor = self
    -        n_dims = tensor.dim()
    -        assert len(perm) == n_dims, 'Invalid permutation'
    -        for i, p in enumerate(perm):
    -            if p != i and p != -1:
    -                j = i
    -                while True:
    -                    assert 0 <= perm[j] and perm[j] < n_dims, 'Invalid permutation'
    -                    tensor = tensor.transpose(j, perm[j])
    -                    perm[j], j = -1, perm[j]
    -                    if perm[j] == i:
    -                        break
    -                perm[j] = -1
    -        return tensor
    -
    -    def expand(self, *sizes):
    -        """Returns a new view of the tensor with singleton dimensions expanded
    -        to a larger size.
    -
-        The tensor can also be expanded to a larger number of dimensions, and the
    -        new ones will be appended at the front.
    -
    -        Expanding a tensor does not allocate new memory, but only creates a
    -        new view on the existing tensor where a dimension of size one is
    -        expanded to a larger size by setting the ``stride`` to 0. Any dimension
    -        of size 1 can be expanded to an arbitrary value without allocating new
    -        memory.
    -
    -        Args:
    -            *sizes (torch.Size or int...): The desired expanded size
    -
    -        Example:
    -            >>> x = torch.Tensor([[1], [2], [3]])
    -            >>> x.size()
    -            torch.Size([3, 1])
    -            >>> x.expand(3, 4)
    -             1  1  1  1
    -             2  2  2  2
    -             3  3  3  3
    -            [torch.FloatTensor of size 3x4]
    -        """
    -        result = self.new()
    -        if len(sizes) == 1 and isinstance(sizes[0], torch.Size):
    -            sizes = sizes[0]
    -        else:
    -            sizes = torch.Size(sizes)
    -        src = self
    -
    -        num_unsqueezed = len(sizes) - src.dim()
    -        if src.dim() == 0:
    -            raise ValueError('can\'t expand an empty tensor')
    -        if num_unsqueezed < 0:
-            raise ValueError('the number of dimensions provided must be greater than or equal to tensor.dim()')
    -
    -        src_stride = [0] * num_unsqueezed + list(src.stride())
    -        src_size = [1] * num_unsqueezed + list(src.size())
    -        for i in range(num_unsqueezed - 1, -1, -1):
    -            # to be consistent with .unsqueeze()
    -            src_stride[i] = src_size[i + 1] * src_stride[i + 1]
    -
    -        # create a new geometry for tensor:
    -        for i, (size, target_size) in enumerate(zip(src_size, sizes)):
    -            if size == 1:
    -                if target_size == 1:
    -                    continue
    -                src_size[i] = target_size
    -                src_stride[i] = 0
    -            elif size != target_size:
    -                raise ValueError('incorrect size: only supporting singleton expansion (size=1)')
    -
    -        result.set_(src.storage(), src.storage_offset(), torch.Size(src_size),
    -                    tuple(src_stride))
    -        return result
    -
    -    def expand_as(self, tensor):
    -        """Expands this tensor to the size of the specified tensor.
    -
    -        This is equivalent to::
    -
    -            self.expand(tensor.size())
    -        """
    -        return self.expand(tensor.size())
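The stride-0 behaviour that expand() describes can be observed directly; a small sketch with arbitrary sizes:

import torch

x = torch.Tensor([[1], [2], [3]])   # size 3x1
y = x.expand(3, 4)                  # a view; no new memory is allocated

print(y.size())    # torch.Size([3, 4])
print(y.stride())  # the expanded dimension has stride 0, e.g. (1, 0)

x[0][0] = 7        # visible in every element of y's first row, since y shares x's storage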
    -
    -    def repeat(self, *sizes):
    -        """Repeats this tensor along the specified dimensions.
    -
    -        Unlike :meth:`expand`, this function copies the tensor's data.
    -
    -        Args:
    -            *sizes (torch.Size or int...): The number of times to repeat this tensor along each dimension
    -
    -        Example:
    -            >>> x = torch.Tensor([1, 2, 3])
    -            >>> x.repeat(4, 2)
    -             1  2  3  1  2  3
    -             1  2  3  1  2  3
    -             1  2  3  1  2  3
    -             1  2  3  1  2  3
    -            [torch.FloatTensor of size 4x6]
    -            >>> x.repeat(4, 2, 1).size()
    -            torch.Size([4, 2, 3])
    -        """
    -        # If args == (torch.Size,), then we need to unpack the tuple
    -        if len(sizes) == 1 and isinstance(sizes[0], torch.Size):
    -            sizes = sizes[0]
    -        repeats = list(sizes)
    -        result = self.new()
    -        src = self.contiguous()
    -
    -        if len(repeats) < src.dim():
    -            raise ValueError('Number of dimensions of repeat dims can not be '
    -                             'smaller than number of dimensions of tensor')
    -
    -        xtensor = src.new().set_(src)
    -        xsize = list(xtensor.size())
    -        for i in _range(len(repeats) - src.dim()):
    -            xsize = [1] + xsize
    -
    -        size = torch.Size([a * b for a, b in zip(xsize, repeats)])
    -        xtensor.resize_(torch.Size(xsize))
    -        result.resize_(size)
    -        urtensor = result.new(result)
    -        for i in _range(xtensor.dim()):
    -            urtensor = urtensor.unfold(i, xtensor.size(i), xtensor.size(i))
    -        for i in _range(urtensor.dim() - xtensor.dim()):
    -            xsize = [1] + xsize
    -        xtensor.resize_(torch.Size(xsize))
    -        xxtensor = xtensor.expand_as(urtensor)
    -        urtensor.copy_(xxtensor)
    -        return result
    -
    -    # TODO: add tests for operators
    -    def __add__(self, other):
    -        return self.add(other)
    -    __radd__ = __add__
    -
    -    def __iadd__(self, other):
    -        return self.add_(other)
    -
    -    def __sub__(self, other):
    -        return self.sub(other)
    -
    -    def __rsub__(self, other):
    -        return self.new().resize_as_(self).fill_(other).add_(-1, self)
    -
    -    def __isub__(self, other):
    -        return self.sub_(other)
    -
    -    def __mul__(self, other):
    -        return self.mul(other)
    -    __rmul__ = __mul__
    -
    -    def __imul__(self, other):
    -        return self.mul_(other)
    -
    -    def __matmul__(self, other):
    -        dim_self = self.dim()
    -        try:
    -            dim_other = other.dim()
    -        except AttributeError:  # not a tensor
    -            return NotImplemented
    -        if dim_self == 1 and dim_other == 1:
    -            return self.dot(other)
    -        if dim_self == 2 and dim_other == 1:
    -            return self.mv(other)
    -        if dim_self == 1 and dim_other == 2:
    -            return self.unsqueeze(0).mm(other).squeeze(0)
    -        elif dim_self == 2 and dim_other == 2:
    -            return self.mm(other)
    -        raise ValueError("both arguments to __matmul__ need to be 1D or 2D, "
    -                         "but they are {}D and {}D".format(dim_self, dim_other))
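A short sketch of the @ dispatch implemented above (the operator itself needs Python 3.5+; shapes are arbitrary):

import torch

a = torch.randn(4)       # 1D
b = torch.randn(4)       # 1D
M = torch.randn(3, 4)    # 2D
N = torch.randn(4, 5)    # 2D

a @ b    # 1D @ 1D -> scalar, dispatched to dot()
M @ a    # 2D @ 1D -> vector of size 3, dispatched to mv()
a @ N    # 1D @ 2D -> vector of size 5, via unsqueeze(0).mm(N).squeeze(0)
M @ N    # 2D @ 2D -> 3x5 matrix, dispatched to mm()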
    -
    -    def __pow__(self, other):
    -        return self.pow(other)
    -
    -    def __ipow__(self, other):
    -        return self.pow_(other)
    -
    -    def __div__(self, other):
    -        return self.div(other)
    -    __truediv__ = __div__
    -
    -    def __rdiv__(self, other):
    -        return self.new().resize_as_(self).fill_(other).div_(self)
    -    __rtruediv__ = __rdiv__
    -
    -    def __idiv__(self, other):
    -        return self.div_(other)
    -
    -    def __mod__(self, other):
    -        return self.remainder(other)
    -
    -    def __neg__(self):
    -        return self.neg()
    -
    -    def __eq__(self, other):
    -        return self.eq(other)
    -
    -    def __ne__(self, other):
    -        return self.ne(other)
    -
    -    def __lt__(self, other):
    -        return self.lt(other)
    -
    -    def __le__(self, other):
    -        return self.le(other)
    -
    -    def __gt__(self, other):
    -        return self.gt(other)
    -
    -    def __ge__(self, other):
    -        return self.ge(other)
    -
-    # TODO: add native and, or and xor in the libs
    -    def __and__(self, other):
    -        if (type(self).__name__ != 'ByteTensor' or
    -                type(other).__name__ != 'ByteTensor'):
    -            raise RuntimeError('logical operations are supported on ByteTensors only')
    -        return (self + other).eq(2)
    -
    -    def __or__(self, other):
    -        if (type(self).__name__ != 'ByteTensor' or
    -                type(other).__name__ != 'ByteTensor'):
    -            raise RuntimeError('logical operations are supported on ByteTensors only')
    -        return (self + other).gt(0)
    -
    -    def __xor__(self, other):
    -        if (type(self).__name__ != 'ByteTensor' or
    -                type(other).__name__ != 'ByteTensor'):
    -            raise RuntimeError('logical operations are supported on ByteTensors only')
    -        return (self + other).eq(1)
    -
    -    def __iand__(self, other):
    -        if (type(self).__name__ != 'ByteTensor' or
    -                type(other).__name__ != 'ByteTensor'):
    -            raise RuntimeError('logical operations are supported on ByteTensors only')
    -        return self.mul_(other)
    -
    -    def __ior__(self, other):
    -        if (type(self).__name__ != 'ByteTensor' or
    -                type(other).__name__ != 'ByteTensor'):
    -            raise RuntimeError('logical operations are supported on ByteTensors only')
    -        return self.copy_((self + other).gt(0))
    -
    -    def __ixor__(self, other):
    -        if (type(self).__name__ != 'ByteTensor' or
    -                type(other).__name__ != 'ByteTensor'):
    -            raise RuntimeError('logical operations are supported on ByteTensors only')
    -        return self.copy_((self + other).eq(1))
    -
    -    def __hash__(self):
    -        return id(self)
    -
    -
    -_TensorBase.type = _type
    -_TensorBase.cuda = _cuda
\ No newline at end of file
diff --git a/docs/_modules/torch/utils/data/dataloader.html b/docs/_modules/torch/utils/data/dataloader.html
deleted file mode 100644
index d9631761c4c5..000000000000
--- a/docs/_modules/torch/utils/data/dataloader.html
+++ /dev/null
@@ -1,849 +0,0 @@
-    torch.utils.data.dataloader — PyTorch 0.1.11 documentation

    Source code for torch.utils.data.dataloader

    -import torch
    -import torch.multiprocessing as multiprocessing
    -from .sampler import SequentialSampler, RandomSampler
    -import collections
    -import math
    -import sys
    -import traceback
    -import threading
    -if sys.version_info[0] == 2:
    -    import Queue as queue
    -    string_classes = basestring
    -else:
    -    import queue
    -    string_classes = (str, bytes)
    -
    -
    -class ExceptionWrapper(object):
    -    "Wraps an exception plus traceback to communicate across threads"
    -
    -    def __init__(self, exc_info):
    -        self.exc_type = exc_info[0]
    -        self.exc_msg = "".join(traceback.format_exception(*exc_info))
    -
    -
    -def _worker_loop(dataset, index_queue, data_queue, collate_fn):
    -    torch.set_num_threads(1)
    -    while True:
    -        r = index_queue.get()
    -        if r is None:
    -            data_queue.put(None)
    -            break
    -        idx, batch_indices = r
    -        try:
    -            samples = collate_fn([dataset[i] for i in batch_indices])
    -        except Exception:
    -            data_queue.put((idx, ExceptionWrapper(sys.exc_info())))
    -        else:
    -            data_queue.put((idx, samples))
    -
    -
    -def _pin_memory_loop(in_queue, out_queue, done_event):
    -    while True:
    -        try:
    -            r = in_queue.get()
    -        except:
    -            if done_event.is_set():
    -                return
    -            raise
    -        if r is None:
    -            break
    -        if isinstance(r[1], ExceptionWrapper):
    -            out_queue.put(r)
    -            continue
    -        idx, batch = r
    -        try:
    -            batch = pin_memory_batch(batch)
    -        except Exception:
    -            out_queue.put((idx, ExceptionWrapper(sys.exc_info())))
    -        else:
    -            out_queue.put((idx, batch))
    -
    -
    -def default_collate(batch):
    -    "Puts each data field into a tensor with outer dimension batch size"
    -    if torch.is_tensor(batch[0]):
    -        return torch.stack(batch, 0)
    -    elif type(batch[0]).__module__ == 'numpy' and type(batch[0]).__name__ == 'ndarray':
    -        return torch.stack([torch.from_numpy(b) for b in batch], 0)
    -    elif isinstance(batch[0], int):
    -        return torch.LongTensor(batch)
    -    elif isinstance(batch[0], float):
    -        return torch.DoubleTensor(batch)
    -    elif isinstance(batch[0], string_classes):
    -        return batch
    -    elif isinstance(batch[0], collections.Iterable):
    -        # if each batch element is not a tensor, then it should be a tuple
    -        # of tensors; in that case we collate each element in the tuple
    -        transposed = zip(*batch)
    -        return [default_collate(samples) for samples in transposed]
    -
    -    raise TypeError(("batch must contain tensors, numbers, or lists; found {}"
    -                     .format(type(batch[0]))))
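In practice default_collate turns a list of per-sample tensors into one stacked batch tensor and a list of Python numbers into a LongTensor/DoubleTensor; a minimal sketch with arbitrary sample values:

import torch
from torch.utils.data.dataloader import default_collate

batch = [torch.randn(3) for _ in range(4)]   # four individual samples
stacked = default_collate(batch)             # a single 4x3 tensor, via torch.stack
labels = default_collate([1, 0, 1, 1])       # Python ints -> torch.LongTensor([1, 0, 1, 1])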
    -
    -
    -def pin_memory_batch(batch):
    -    if torch.is_tensor(batch):
    -        return batch.pin_memory()
    -    elif isinstance(batch, string_classes):
    -        return batch
    -    elif isinstance(batch, collections.Iterable):
    -        return [pin_memory_batch(sample) for sample in batch]
    -    else:
    -        return batch
    -
    -
    -class DataLoaderIter(object):
    -    "Iterates once over the DataLoader's dataset, as specified by the sampler"
    -
    -    def __init__(self, loader):
    -        self.dataset = loader.dataset
    -        self.batch_size = loader.batch_size
    -        self.collate_fn = loader.collate_fn
    -        self.sampler = loader.sampler
    -        self.num_workers = loader.num_workers
    -        self.pin_memory = loader.pin_memory
    -        self.drop_last = loader.drop_last
    -        self.done_event = threading.Event()
    -
    -        self.samples_remaining = len(self.sampler)
    -        self.sample_iter = iter(self.sampler)
    -
    -        if self.num_workers > 0:
    -            self.index_queue = multiprocessing.SimpleQueue()
    -            self.data_queue = multiprocessing.SimpleQueue()
    -            self.batches_outstanding = 0
    -            self.shutdown = False
    -            self.send_idx = 0
    -            self.rcvd_idx = 0
    -            self.reorder_dict = {}
    -
    -            self.workers = [
    -                multiprocessing.Process(
    -                    target=_worker_loop,
    -                    args=(self.dataset, self.index_queue, self.data_queue, self.collate_fn))
    -                for _ in range(self.num_workers)]
    -
    -            for w in self.workers:
    -                w.daemon = True  # ensure that the worker exits on process exit
    -                w.start()
    -
    -            if self.pin_memory:
    -                in_data = self.data_queue
    -                self.data_queue = queue.Queue()
    -                self.pin_thread = threading.Thread(
    -                    target=_pin_memory_loop,
    -                    args=(in_data, self.data_queue, self.done_event))
    -                self.pin_thread.daemon = True
    -                self.pin_thread.start()
    -
    -            # prime the prefetch loop
    -            for _ in range(2 * self.num_workers):
    -                self._put_indices()
    -
    -    def __len__(self):
    -        if self.drop_last:
    -            return len(self.sampler) // self.batch_size
    -        else:
    -            return (len(self.sampler) + self.batch_size - 1) // self.batch_size
    -
    -    def __next__(self):
    -        if self.num_workers == 0:  # same-process loading
    -            if self.drop_last and self.samples_remaining < self.batch_size:
    -                raise StopIteration
    -            if self.samples_remaining == 0:
    -                raise StopIteration
    -            indices = self._next_indices()
    -            batch = self.collate_fn([self.dataset[i] for i in indices])
    -            if self.pin_memory:
    -                batch = pin_memory_batch(batch)
    -            return batch
    -
    -        # check if the next sample has already been generated
    -        if self.rcvd_idx in self.reorder_dict:
    -            batch = self.reorder_dict.pop(self.rcvd_idx)
    -            return self._process_next_batch(batch)
    -
    -        if self.batches_outstanding == 0:
    -            self._shutdown_workers()
    -            raise StopIteration
    -
    -        while True:
    -            assert (not self.shutdown and self.batches_outstanding > 0)
    -            idx, batch = self.data_queue.get()
    -            self.batches_outstanding -= 1
    -            if idx != self.rcvd_idx:
    -                # store out-of-order samples
    -                self.reorder_dict[idx] = batch
    -                continue
    -            return self._process_next_batch(batch)
    -
    -    next = __next__  # Python 2 compatibility
    -
    -    def __iter__(self):
    -        return self
    -
    -    def _next_indices(self):
    -        batch_size = min(self.samples_remaining, self.batch_size)
    -        batch = [next(self.sample_iter) for _ in range(batch_size)]
    -        self.samples_remaining -= len(batch)
    -        return batch
    -
    -    def _put_indices(self):
    -        assert self.batches_outstanding < 2 * self.num_workers
    -        if self.samples_remaining > 0:
    -            if self.samples_remaining < self.batch_size and self.drop_last:
    -                self._next_indices()
    -            else:
    -                self.index_queue.put((self.send_idx, self._next_indices()))
    -                self.batches_outstanding += 1
    -                self.send_idx += 1
    -
    -    def _process_next_batch(self, batch):
    -        self.rcvd_idx += 1
    -        self._put_indices()
    -        if isinstance(batch, ExceptionWrapper):
    -            raise batch.exc_type(batch.exc_msg)
    -        return batch
    -
    -    def __getstate__(self):
    -        # TODO: add limited pickling support for sharing an iterator
    -        # across multiple threads for HOGWILD.
    -        # Probably the best way to do this is by moving the sample pushing
    -        # to a separate thread and then just sharing the data queue
    -        # but signalling the end is tricky without a non-blocking API
-        raise NotImplementedError("DataLoaderIter cannot be pickled")
    -
    -    def _shutdown_workers(self):
    -        if not self.shutdown:
    -            self.shutdown = True
    -            self.done_event.set()
    -            for _ in self.workers:
    -                self.index_queue.put(None)
    -
    -    def __del__(self):
    -        if self.num_workers > 0:
    -            self._shutdown_workers()
    -
    -
    -
    [docs]class DataLoader(object): - """ - Data loader. Combines a dataset and a sampler, and provides - single- or multi-process iterators over the dataset. - - Arguments: - dataset (Dataset): dataset from which to load the data. - batch_size (int, optional): how many samples per batch to load - (default: 1). - shuffle (bool, optional): set to ``True`` to have the data reshuffled - at every epoch (default: False). - sampler (Sampler, optional): defines the strategy to draw samples from - the dataset. If specified, the ``shuffle`` argument is ignored. - num_workers (int, optional): how many subprocesses to use for data - loading. 0 means that the data will be loaded in the main process - (default: 0) - collate_fn (callable, optional) - pin_memory (bool, optional) - drop_last (bool, optional): set to ``True`` to drop the last incomplete batch, - if the dataset size is not divisible by the batch size. If False and - the size of dataset is not divisible by the batch size, then the last batch - will be smaller. (default: False) - """ - - def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, num_workers=0, - collate_fn=default_collate, pin_memory=False, drop_last=False): - self.dataset = dataset - self.batch_size = batch_size - self.num_workers = num_workers - self.collate_fn = collate_fn - self.pin_memory = pin_memory - self.drop_last = drop_last - - if sampler is not None: - self.sampler = sampler - elif shuffle: - self.sampler = RandomSampler(dataset) - elif not shuffle: - self.sampler = SequentialSampler(dataset) - - def __iter__(self): - return DataLoaderIter(self) - - def __len__(self): - if self.drop_last: - return len(self.sampler) // self.batch_size - else: - return (len(self.sampler) + self.batch_size - 1) // self.batch_size
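A typical way to wire these pieces together is sketched below; the TensorDataset contents are placeholders, and pin_memory=True assumes a CUDA-enabled build.

import torch
from torch.utils.data.dataset import TensorDataset
from torch.utils.data.dataloader import DataLoader

dataset = TensorDataset(torch.randn(100, 10), torch.randn(100))
loader = DataLoader(dataset,
                    batch_size=16,
                    shuffle=True,       # uses a RandomSampler under the hood
                    num_workers=2,      # two worker processes feed the index/data queues
                    pin_memory=True,    # a background thread copies batches into pinned memory
                    drop_last=False)    # the final, smaller batch is kept

for inputs, targets in loader:
    pass   # inputs is 16x10 except possibly for the last batch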
\ No newline at end of file
diff --git a/docs/_modules/torch/utils/data/dataset.html b/docs/_modules/torch/utils/data/dataset.html
deleted file mode 100644
index 7a77c8eeaa63..000000000000
--- a/docs/_modules/torch/utils/data/dataset.html
+++ /dev/null
@@ -1,610 +0,0 @@
-    torch.utils.data.dataset — PyTorch 0.1.11 documentation

    Source code for torch.utils.data.dataset

    -
    -
[docs]class Dataset(object): - """An abstract class representing a Dataset. - - All other datasets should subclass it. All subclasses should override - ``__len__``, which provides the size of the dataset, and ``__getitem__``, - which supports integer indexing in the range from 0 to len(self) exclusive. - """ - - def __getitem__(self, index): - raise NotImplementedError - - def __len__(self): - raise NotImplementedError
    - - -
    [docs]class TensorDataset(Dataset): - """Dataset wrapping data and target tensors. - - Each sample will be retrieved by indexing both tensors along the first - dimension. - - Arguments: - data_tensor (Tensor): contains sample data. - target_tensor (Tensor): contains sample targets (labels). - """ - - def __init__(self, data_tensor, target_tensor): - assert data_tensor.size(0) == target_tensor.size(0) - self.data_tensor = data_tensor - self.target_tensor = target_tensor - - def __getitem__(self, index): - return self.data_tensor[index], self.target_tensor[index] - - def __len__(self): - return self.data_tensor.size(0)
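Indexing a TensorDataset yields one (data, target) pair, which is exactly what DataLoader's collate function consumes; a minimal sketch with arbitrary values:

import torch
from torch.utils.data.dataset import TensorDataset

data = torch.randn(5, 3)     # five samples with three features each
targets = torch.randn(5)     # one target per sample
ds = TensorDataset(data, targets)

print(len(ds))   # 5
x0, y0 = ds[0]   # the first row of `data` and the matching entry of `targets`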
\ No newline at end of file
diff --git a/docs/_modules/torch/utils/data/sampler.html b/docs/_modules/torch/utils/data/sampler.html
deleted file mode 100644
index a097b4e5a826..000000000000
--- a/docs/_modules/torch/utils/data/sampler.html
+++ /dev/null
@@ -1,662 +0,0 @@
-    torch.utils.data.sampler — PyTorch 0.1.11 documentation

    Source code for torch.utils.data.sampler

    -import torch
    -
    -
    -
    [docs]class Sampler(object): - """Base class for all Samplers. - - Every Sampler subclass has to provide an __iter__ method, providing a way - to iterate over indices of dataset elements, and a __len__ method that - returns the length of the returned iterators. - """ - - def __init__(self, data_source): - pass - - def __iter__(self): - raise NotImplementedError - - def __len__(self): - raise NotImplementedError
    - - -
    [docs]class SequentialSampler(Sampler): - """Samples elements sequentially, always in the same order. - - Arguments: - data_source (Dataset): dataset to sample from - """ - - def __init__(self, data_source): - self.num_samples = len(data_source) - - def __iter__(self): - return iter(range(self.num_samples)) - - def __len__(self): - return self.num_samples
    - - -
    [docs]class RandomSampler(Sampler): - """Samples elements randomly, without replacement. - - Arguments: - data_source (Dataset): dataset to sample from - """ - - def __init__(self, data_source): - self.num_samples = len(data_source) - - def __iter__(self): - return iter(torch.randperm(self.num_samples).long()) - - def __len__(self): - return self.num_samples
    - - -
    [docs]class SubsetRandomSampler(Sampler): - """Samples elements randomly from a given list of indices, without replacement. - - Arguments: - indices (list): a list of indices - """ - - def __init__(self, indices): - self.indices = indices - - def __iter__(self): - return (self.indices[i] for i in torch.randperm(len(self.indices))) - - def __len__(self): - return len(self.indices)
    - - -
[docs]class WeightedRandomSampler(Sampler): - """Samples elements from [0,..,len(weights)-1] with given probabilities (weights). - - Arguments: - weights (list): a list of weights, not necessarily summing to one - num_samples (int): number of samples to draw - replacement (bool, optional): if ``True``, samples are drawn with replacement (default: True) - """ - - def __init__(self, weights, num_samples, replacement=True): - self.weights = torch.DoubleTensor(weights) - self.num_samples = num_samples - self.replacement = replacement - - def __iter__(self): - return iter(torch.multinomial(self.weights, self.num_samples, self.replacement)) - - def __len__(self): - return self.num_samples
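To use one of these samplers, pass it to DataLoader via the sampler argument (shuffle is then ignored, as DataLoader's docstring notes); the weights, indices and dataset below are illustrative only.

import torch
from torch.utils.data.dataset import TensorDataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler, WeightedRandomSampler

ds = TensorDataset(torch.randn(3, 4), torch.randn(3))

ws = WeightedRandomSampler([0.1, 0.1, 0.8], num_samples=10)   # 10 indices in [0, 2], drawn with replacement
rs = SubsetRandomSampler([0, 2])                              # only ever yields indices 0 and 2, in random order

loader = DataLoader(ds, batch_size=2, sampler=rs)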
\ No newline at end of file
diff --git a/docs/_modules/torch/utils/ffi.html b/docs/_modules/torch/utils/ffi.html
deleted file mode 100644
index a36230f052a5..000000000000
--- a/docs/_modules/torch/utils/ffi.html
+++ /dev/null
@@ -1,759 +0,0 @@
-    torch.utils.ffi — PyTorch 0.1.11 documentation

    Source code for torch.utils.ffi

    -import os
    -import glob
    -import tempfile
    -import shutil
    -from functools import wraps, reduce
    -from string import Template
    -import torch
    -import torch.cuda
    -from torch._utils import _accumulate
    -
    -try:
    -    import cffi
    -except ImportError:
    -    raise ImportError("torch.utils.ffi requires the cffi package")
    -
    -
    -if cffi.__version_info__ < (1, 4, 0):
    -    raise ImportError("torch.utils.ffi requires cffi version >= 1.4, but "
    -                      "got " + '.'.join(map(str, cffi.__version_info__)))
    -
    -
    -def _generate_typedefs():
    -    typedefs = []
    -    for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']:
    -        for lib in ['TH', 'THCuda']:
    -            for kind in ['Tensor', 'Storage']:
    -                python_name = t + kind
    -                if t == 'Float' and lib == 'THCuda':
    -                    th_name = 'THCuda' + kind
    -                else:
    -                    th_name = lib + t + kind
    -                th_struct = 'struct ' + th_name
    -
    -                typedefs += ['typedef {} {};'.format(th_struct, th_name)]
    -                module = torch if lib == 'TH' else torch.cuda
    -                python_class = getattr(module, python_name)
    -                _cffi_to_torch[th_struct] = python_class
    -                _torch_to_cffi[python_class] = th_struct
    -    return '\n'.join(typedefs) + '\n'
    -_cffi_to_torch = {}
    -_torch_to_cffi = {}
    -_typedefs = _generate_typedefs()
    -
    -
    -PY_MODULE_TEMPLATE = Template("""
    -from torch.utils.ffi import _wrap_function
    -from .$cffi_wrapper_name import lib as _lib, ffi as _ffi
    -
    -__all__ = []
    -def _import_symbols(locals):
    -    for symbol in dir(_lib):
    -        fn = getattr(_lib, symbol)
    -        locals[symbol] = _wrap_function(fn, _ffi)
    -        __all__.append(symbol)
    -
    -_import_symbols(locals())
    -""")
    -
    -
    -def _setup_wrapper(with_cuda):
    -    here = os.path.abspath(os.path.dirname(__file__))
    -    lib_dir = os.path.join(here, '..', '..', 'lib')
    -    include_dirs = [
    -        os.path.join(lib_dir, 'include'),
    -        os.path.join(lib_dir, 'include', 'TH'),
    -    ]
    -
    -    wrapper_source = '#include <TH/TH.h>\n'
    -    if with_cuda:
    -        import torch.cuda
    -        wrapper_source += '#include <THC/THC.h>\n'
    -        cuda_include_dirs = glob.glob('/usr/local/cuda/include')
    -        cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include')
    -        include_dirs.append(os.path.join(lib_dir, 'include', 'THC'))
    -        include_dirs.extend(cuda_include_dirs)
    -    return wrapper_source, include_dirs
    -
    -
    -def _create_module_dir(base_path, fullname):
    -    module, _, name = fullname.rpartition('.')
    -    if not module:
    -        target_dir = name
    -    else:
    -        target_dir = reduce(os.path.join, fullname.split('.'))
    -    target_dir = os.path.join(base_path, target_dir)
    -    try:
    -        os.makedirs(target_dir)
    -    except os.error:
    -        pass
    -    for dirname in _accumulate(fullname.split('.'), os.path.join):
    -        init_file = os.path.join(base_path, dirname, '__init__.py')
    -        open(init_file, 'a').close()  # Create file if it doesn't exist yet
    -    return name, target_dir
    -
    -
    -def _build_extension(ffi, cffi_wrapper_name, target_dir, verbose):
    -    try:
    -        tmpdir = tempfile.mkdtemp()
    -        libname = cffi_wrapper_name + '.so'
    -        ffi.compile(tmpdir=tmpdir, verbose=verbose, target=libname)
    -        shutil.copy(os.path.join(tmpdir, libname),
    -                    os.path.join(target_dir, libname))
    -    finally:
    -        shutil.rmtree(tmpdir)
    -
    -
    -def _make_python_wrapper(name, cffi_wrapper_name, target_dir):
    -    py_source = PY_MODULE_TEMPLATE.substitute(name=name,
    -                                              cffi_wrapper_name=cffi_wrapper_name)
    -    with open(os.path.join(target_dir, '__init__.py'), 'w') as f:
    -        f.write(py_source)
    -
    -
    -
    -def create_extension(name, headers, sources, verbose=True, with_cuda=False,
    -                     package=False, relative_to='.', **kwargs):
    -    """Creates and configures a cffi.FFI object that builds a PyTorch extension.
    -
    -    Arguments:
    -        name (str): package name. Can be a nested module e.g. ``.ext.my_lib``.
    -        headers (str or List[str]): list of headers that contain only exported
    -            functions
    -        sources (List[str]): list of sources to compile.
    -        verbose (bool, optional): if set to ``False``, no output will be printed
    -            (default: True).
    -        with_cuda (bool, optional): set to ``True`` to compile with CUDA headers
    -            (default: False)
    -        package (bool, optional): set to ``True`` to build in package mode (for modules
    -            meant to be installed as pip packages) (default: False).
    -        relative_to (str, optional): path of the build file. Required when
    -            ``package is True``. It's best to use ``__file__`` for this argument.
    -        kwargs: additional arguments that are passed to ffi to declare the
    -            extension. See `Extension API reference`_ for details.
    -
    -    .. _`Extension API reference`: https://docs.python.org/3/distutils/apiref.html#distutils.core.Extension
    -    """
    -    base_path = os.path.abspath(os.path.dirname(relative_to))
    -    name_suffix, target_dir = _create_module_dir(base_path, name)
    -    if not package:
    -        cffi_wrapper_name = '_' + name_suffix
    -    else:
    -        cffi_wrapper_name = (name.rpartition('.')[0] +
    -                             '.{0}._{0}'.format(name_suffix))
    -
    -    wrapper_source, include_dirs = _setup_wrapper(with_cuda)
    -    include_dirs.extend(kwargs.pop('include_dirs', []))
    -
    -    if isinstance(headers, str):
    -        headers = [headers]
    -    all_headers_source = ''
    -    for header in headers:
    -        with open(os.path.join(base_path, header), 'r') as f:
    -            all_headers_source += f.read() + '\n\n'
    -
    -    ffi = cffi.FFI()
    -    sources = [os.path.join(base_path, src) for src in sources]
    -    ffi.set_source(cffi_wrapper_name, wrapper_source + all_headers_source,
    -                   sources=sources,
    -                   include_dirs=include_dirs, **kwargs)
    -    ffi.cdef(_typedefs + all_headers_source)
    -
    -    _make_python_wrapper(name_suffix, '_' + name_suffix, target_dir)
    -
    -    def build():
    -        _build_extension(ffi, cffi_wrapper_name, target_dir, verbose)
    -    ffi.build = build
    -    return ffi
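For readers skimming this diff, a minimal build-script sketch for the API above may help. The file and package names (``src/my_lib.c``, ``src/my_lib.h``, ``_ext.my_lib``) are hypothetical placeholders; the snippet only illustrates how ``create_extension`` was meant to be called before its removal::

    # build.py -- hypothetical build script for a torch.utils.ffi extension
    from torch.utils.ffi import create_extension

    ffi = create_extension(
        name='_ext.my_lib',          # hypothetical package name
        headers=['src/my_lib.h'],    # header exposing only the exported functions
        sources=['src/my_lib.c'],    # C sources compiled into the extension
        relative_to=__file__,
        with_cuda=False,
    )

    if __name__ == '__main__':
        # create_extension() attaches this build() hook to the returned FFI object
        ffi.build()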
    -
    -
    -def _wrap_function(function, ffi):
    -    @wraps(function)
    -    def safe_call(*args, **kwargs):
    -        args = tuple(ffi.cast(_torch_to_cffi.get(type(arg), 'void') + '*', arg._cdata)
    -                     if torch.is_tensor(arg) or torch.is_storage(arg)
    -                     else arg
    -                     for arg in args)
    -        args = (function,) + args
    -        result = torch._C._safe_call(*args, **kwargs)
    -        if isinstance(result, ffi.CData):
    -            typeof = ffi.typeof(result)
    -            if typeof.kind == 'pointer':
    -                cdata = int(ffi.cast('uintptr_t', result))
    -                cname = typeof.item.cname
    -                if cname in _cffi_to_torch:
    -                    return _cffi_to_torch[cname](cdata=cdata)
    -        return result
    -    return safe_call
    -
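Once built, the generated package imports like any other Python module, and the wrappers produced by ``_wrap_function`` cast tensor arguments to the matching TH*/THC* pointers before calling into C. A hedged usage sketch, reusing the hypothetical names from the build script above (``my_lib_add_forward`` is an assumed C function declared in ``my_lib.h``)::

    import torch
    from _ext import my_lib   # hypothetical package produced by the build script

    a = torch.ones(3, 3)
    b = torch.ones(3, 3)
    out = torch.FloatTensor(3, 3).zero_()
    # The wrapper converts each tensor to the corresponding THFloatTensor*
    # before forwarding the call to the compiled C function.
    my_lib.my_lib_add_forward(a, b, out)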
\ No newline at end of file
diff --git a/docs/_modules/torch/utils/model_zoo.html b/docs/_modules/torch/utils/model_zoo.html
deleted file mode 100644
index 72f7589bb1cc..000000000000
--- a/docs/_modules/torch/utils/model_zoo.html
+++ /dev/null
@@ -1,682 +0,0 @@
-torch.utils.model_zoo — PyTorch 0.1.11 documentation

    Source code for torch.utils.model_zoo

    -import torch
    -
    -import hashlib
    -import os
    -import re
    -import shutil
    -import sys
    -import tempfile
    -if sys.version_info[0] == 2:
    -    from urlparse import urlparse
    -    from urllib2 import urlopen
    -else:
    -    from urllib.request import urlopen
    -    from urllib.parse import urlparse
    -try:
    -    from tqdm import tqdm
    -except ImportError:
    -    tqdm = None  # defined below
    -
    -# matches bfd8deac from resnet18-bfd8deac.pth
    -HASH_REGEX = re.compile(r'-([a-f0-9]*)\.')
    -
    -
    -
    -def load_url(url, model_dir=None):
    -    r"""Loads the Torch serialized object at the given URL.
    -
    -    If the object is already present in `model_dir`, it's deserialized and
    -    returned. The filename part of the URL should follow the naming convention
    -    ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
    -    digits of the SHA256 hash of the contents of the file. The hash is used to
    -    ensure unique names and to verify the contents of the file.
    -
    -    The default value of `model_dir` is ``$TORCH_HOME/models`` where
    -    ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
    -    overridden with the ``$TORCH_MODEL_ZOO`` environment variable.
    -
    -    Args:
    -        url (string): URL of the object to download
    -        model_dir (string, optional): directory in which to save the object
    -
    -    Example:
    -        >>> state_dict = torch.utils.model_zoo.load_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
    -
    -    """
    -    if model_dir is None:
    -        torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch'))
    -        model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models'))
    -    if not os.path.exists(model_dir):
    -        os.makedirs(model_dir)
    -    parts = urlparse(url)
    -    filename = os.path.basename(parts.path)
    -    cached_file = os.path.join(model_dir, filename)
    -    if not os.path.exists(cached_file):
    -        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
    -        hash_prefix = HASH_REGEX.search(filename).group(1)
    -        _download_url_to_file(url, cached_file, hash_prefix)
    -    return torch.load(cached_file)
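A hedged usage sketch of the function above (the cache path ``/tmp/model_cache`` is a hypothetical choice; without ``model_dir`` the function falls back to ``$TORCH_MODEL_ZOO`` or ``~/.torch/models`` as documented). The returned object is whatever ``torch.load`` yields, typically a state dict::

    import torch.utils.model_zoo as model_zoo
    import torchvision.models as models

    # Download (or reuse the cached copy of) the ResNet-18 checkpoint from the
    # docstring example, storing it in an explicit cache directory.
    state_dict = model_zoo.load_url(
        'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth',
        model_dir='/tmp/model_cache')

    # Load the weights into a matching architecture.
    model = models.resnet18()
    model.load_state_dict(state_dict)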
    -
    -
    -def _download_url_to_file(url, dst, hash_prefix):
    -    u = urlopen(url)
    -    meta = u.info()
    -    if hasattr(meta, 'getheaders'):
    -        file_size = int(meta.getheaders("Content-Length")[0])
    -    else:
    -        file_size = int(meta.get_all("Content-Length")[0])
    -
    -    f = tempfile.NamedTemporaryFile(delete=False)
    -    try:
    -        sha256 = hashlib.sha256()
    -        with tqdm(total=file_size) as pbar:
    -            while True:
    -                buffer = u.read(8192)
    -                if len(buffer) == 0:
    -                    break
    -                f.write(buffer)
    -                sha256.update(buffer)
    -                pbar.update(len(buffer))
    -
    -        f.close()
    -        digest = sha256.hexdigest()
    -        if digest[:len(hash_prefix)] != hash_prefix:
    -            raise RuntimeError('invalid hash value (expected "{}", got "{}")'
    -                               .format(hash_prefix, digest))
    -        shutil.move(f.name, dst)
    -    finally:
    -        f.close()
    -        if os.path.exists(f.name):
    -            os.remove(f.name)
    -
    -
    -if tqdm is None:
    -    # fake tqdm if it's not installed
    -    class tqdm(object):
    -
    -        def __init__(self, total):
    -            self.total = total
    -            self.n = 0
    -
    -        def update(self, n):
    -            self.n += n
    -            sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float(self.total)))
    -            sys.stderr.flush()
    -
    -        def __enter__(self):
    -            return self
    -
    -        def __exit__(self, exc_type, exc_val, exc_tb):
    -            sys.stderr.write('\n')
    -
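The ``filename-<sha256>.ext`` convention that the check above enforces can be illustrated with a small standalone sketch (it re-declares the same regex rather than importing the private module)::

    import re

    HASH_REGEX = re.compile(r'-([a-f0-9]*)\.')   # same pattern as above

    filename = 'resnet18-5c106cde.pth'
    hash_prefix = HASH_REGEX.search(filename).group(1)
    print(hash_prefix)   # -> '5c106cde'

    # _download_url_to_file() streams the download through hashlib.sha256()
    # and raises RuntimeError unless sha256(file).hexdigest() starts with
    # this prefix, so corrupted or renamed checkpoints are rejected.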
    - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_sources/autograd.rst.txt b/docs/_sources/autograd.rst.txt deleted file mode 100644 index e5b102b376c4..000000000000 --- a/docs/_sources/autograd.rst.txt +++ /dev/null @@ -1,53 +0,0 @@ -.. role:: hidden - :class: hidden-section - -Automatic differentiation package - torch.autograd -================================================== - -.. automodule:: torch.autograd -.. currentmodule:: torch.autograd - -.. autofunction:: backward - -Variable --------- - -API compatibility -^^^^^^^^^^^^^^^^^ - -Variable API is nearly the same as regular Tensor API (with the exception -of a couple in-place methods, that would overwrite inputs required for -gradient computation). In most cases Tensors can be safely replaced with -Variables and the code will remain to work just fine. Because of this, -we're not documenting all the operations on variables, and you should -refere to :class:`torch.Tensor` docs for this purpose. - -In-place operations on Variables -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Supporting in-place operations in autograd is a hard matter, and we discourage -their use in most cases. Autograd's aggressive buffer freeing and reuse makes -it very efficient and there are very few occasions when in-place operations -actually lower memory usage by any significant amount. Unless you're operating -under heavy memory pressure, you might never need to use them. - -In-place correctness checks -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -All :class:`Variable` s keep track of in-place operations applied to them, and -if the implementation detects that a variable was saved for backward in one of -the functions, but it was modified in-place afterwards, an error will be raised -once backward pass is started. This ensures that if you're using in-place -functions and not seing any errors, you can be sure that the computed gradients -are correct. - - -.. autoclass:: Variable - :members: - -:hidden:`Function` ------------------- - -.. autoclass:: Function - :members: - diff --git a/docs/_sources/cuda.rst.txt b/docs/_sources/cuda.rst.txt deleted file mode 100644 index 9f94d76779ad..000000000000 --- a/docs/_sources/cuda.rst.txt +++ /dev/null @@ -1,27 +0,0 @@ -torch.cuda -=================================== - -.. currentmodule:: torch.cuda - -.. automodule:: torch.cuda - :members: - -Communication collectives -------------------------- - -.. autofunction:: torch.cuda.comm.broadcast - -.. autofunction:: torch.cuda.comm.reduce_add - -.. autofunction:: torch.cuda.comm.scatter - -.. autofunction:: torch.cuda.comm.gather - -Streams and events ------------------- - -.. autoclass:: Stream - :members: - -.. autoclass:: Event - :members: diff --git a/docs/_sources/data.rst.txt b/docs/_sources/data.rst.txt deleted file mode 100644 index 11e736d61ab6..000000000000 --- a/docs/_sources/data.rst.txt +++ /dev/null @@ -1,12 +0,0 @@ -torch.utils.data -=================================== - -.. automodule:: torch.utils.data -.. autoclass:: Dataset -.. autoclass:: TensorDataset -.. autoclass:: DataLoader -.. autoclass:: torch.utils.data.sampler.Sampler -.. autoclass:: torch.utils.data.sampler.SequentialSampler -.. autoclass:: torch.utils.data.sampler.RandomSampler -.. autoclass:: torch.utils.data.sampler.SubsetRandomSampler -.. 
autoclass:: torch.utils.data.sampler.WeightedRandomSampler diff --git a/docs/_sources/ffi.rst.txt b/docs/_sources/ffi.rst.txt deleted file mode 100644 index ae7c0e9ddacd..000000000000 --- a/docs/_sources/ffi.rst.txt +++ /dev/null @@ -1,6 +0,0 @@ -torch.utils.ffi -=============== - -.. currentmodule:: torch.utils.ffi -.. autofunction:: create_extension - diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt deleted file mode 100644 index 4adedb7618cf..000000000000 --- a/docs/_sources/index.rst.txt +++ /dev/null @@ -1,54 +0,0 @@ -.. PyTorch documentation master file, created by - sphinx-quickstart on Fri Dec 23 13:31:47 2016. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -:github_url: https://github.com/pytorch/pytorch - -PyTorch documentation -=================================== - -PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: Notes - - notes/* - - -.. toctree:: - :maxdepth: 1 - :caption: Package Reference - - torch - tensors - storage - nn - optim - torch.autograd - torch.multiprocessing - torch.legacy - cuda - ffi - data - model_zoo - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: torchvision Reference - - torchvision/torchvision - torchvision/datasets - torchvision/models - torchvision/transforms - torchvision/utils - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` diff --git a/docs/_sources/legacy.rst.txt b/docs/_sources/legacy.rst.txt deleted file mode 100644 index bc1aad54fb2b..000000000000 --- a/docs/_sources/legacy.rst.txt +++ /dev/null @@ -1,4 +0,0 @@ -Legacy package - torch.legacy -=================================== - -.. automodule:: torch.legacy diff --git a/docs/_sources/model_zoo.rst.txt b/docs/_sources/model_zoo.rst.txt deleted file mode 100644 index 3997a369d991..000000000000 --- a/docs/_sources/model_zoo.rst.txt +++ /dev/null @@ -1,5 +0,0 @@ -torch.utils.model_zoo -=================================== - -.. automodule:: torch.utils.model_zoo -.. autofunction:: load_url diff --git a/docs/_sources/multiprocessing.rst.txt b/docs/_sources/multiprocessing.rst.txt deleted file mode 100644 index 45035a0b470e..000000000000 --- a/docs/_sources/multiprocessing.rst.txt +++ /dev/null @@ -1,88 +0,0 @@ -Multiprocessing package - torch.multiprocessing -=============================================== - -.. automodule:: torch.multiprocessing -.. currentmodule:: torch.multiprocessing - -.. warning:: - - If the main process exits abruptly (e.g. because of an incoming signal), - Python's ``multiprocessing`` sometimes fails to clean up its children. - It's a known caveat, so if you're seeing any resource leaks after - interrupting the interpreter, it probably means that this has just happened - to you. - -Strategy management -------------------- - -.. autofunction:: get_all_sharing_strategies -.. autofunction:: get_sharing_strategy -.. autofunction:: set_sharing_strategy - -Sharing CUDA tensors --------------------- - -Sharing CUDA tensors between processes is supported only in Python 3, using -a ``spawn`` or ``forkserver`` start methods. :mod:`python:multiprocessing` in -Python 2 can only create subprocesses using ``fork``, and it's not supported -by the CUDA runtime. - -.. warning:: - - CUDA API requires that the allocation exported to other processes remains - valid as long as it's used by them. 
You should be careful and ensure that - CUDA tensors you shared don't go out of scope as long as it's necessary. - This shouldn't be a problem for sharing model parameters, but passing other - kinds of data should be done with care. Note that this restriction doesn't - apply to shared CPU memory. - - -Sharing strategies ------------------- - -This section provides a brief overview into how different sharing strategies -work. Note that it applies only to CPU tensor - CUDA tensors will always use -the CUDA API, as that's the only way they can be shared. - -File descriptor - ``file_descriptor`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - -.. note:: - - This is the default strategy (except for macOS and OS X where it's not - supported). - -This strategy will use file descriptors as shared memory handles. Whenever a -storage is moved to shared memory, a file descriptor obtained from ``shm_open`` -is cached with the object, and when it's going to be sent to other processes, -the file descriptor will be transferred (e.g. via UNIX sockets) to it. The -receiver will also cache the file descriptor and ``mmap`` it, to obtain a shared -view onto the storage data. - -Note that if there will be a lot of tensors shared, this strategy will keep a -large number of file descriptors open most of the time. If your system has low -limits for the number of open file descriptors, and you can't rise them, you -should use the ``file_system`` strategy. - -File system - ``file_system`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This strategy will use file names given to ``shm_open`` to identify the shared -memory regions. This has a benefit of not requiring the implementation to cache -the file descriptors obtained from it, but at the same time is prone to shared -memory leaks. The file can't be deleted right after its creation, because other -processes need to access it to open their views. If the processes fatally -crash, or are killed, and don't call the storage destructors, the files will -remain in the system. This is very serious, because they keep using up the -memory until the system is restarted, or they're freed manually. - -To counter the problem of shared memory file leaks, :mod:`torch.multiprocessing` -will spawn a daemon named ``torch_shm_manager`` that will isolate itself from -the current process group, and will keep track of all shared memory allocations. -Once all processes connected to it exit, it will wait a moment to ensure there -will be no new connections, and will iterate over all shared memory files -allocated by the group. If it finds that any of them still exist, they will be -deallocated. We've tested this method and it prooved to be robust to various -failures. Still, if your system has high enough limits, and ``file_descriptor`` -is a supported strategy, we do not recommend switching to this one. diff --git a/docs/_sources/nn.rst.txt b/docs/_sources/nn.rst.txt deleted file mode 100644 index d1371edfc1a7..000000000000 --- a/docs/_sources/nn.rst.txt +++ /dev/null @@ -1,849 +0,0 @@ -.. role:: hidden - :class: hidden-section - -torch.nn -=================================== - -.. automodule:: torch.nn -.. currentmodule:: torch.nn - -Parameters ----------- - -.. autoclass:: Parameter - :members: - -Containers ----------------------------------- - -:hidden:`Module` -~~~~~~~~~~~~~~~~ - -.. autoclass:: Module - :members: - -:hidden:`Sequential` -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Sequential - :members: - -:hidden:`ModuleList` -~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: ModuleList - :members: - -:hidden:`ParameterList` -~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: ParameterList - :members: - -Convolution Layers ----------------------------------- - -:hidden:`Conv1d` -~~~~~~~~~~~~~~~~ - -.. autoclass:: Conv1d - :members: - -:hidden:`Conv2d` -~~~~~~~~~~~~~~~~ - -.. autoclass:: Conv2d - :members: - -:hidden:`Conv3d` -~~~~~~~~~~~~~~~~ - -.. autoclass:: Conv3d - :members: - -:hidden:`ConvTranspose1d` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: ConvTranspose1d - :members: - -:hidden:`ConvTranspose2d` -~~~~~~~~~~~~~~~~~~~~~~~~~ - - -.. autoclass:: ConvTranspose2d - :members: - -:hidden:`ConvTranspose3d` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: ConvTranspose3d - :members: - - -Pooling Layers ----------------------------------- - -:hidden:`MaxPool1d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MaxPool1d - :members: - -:hidden:`MaxPool2d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MaxPool2d - :members: - -:hidden:`MaxPool3d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MaxPool3d - :members: - -:hidden:`MaxUnpool1d` -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MaxUnpool1d - :members: - -:hidden:`MaxUnpool2d` -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MaxUnpool2d - :members: - -:hidden:`MaxUnpool3d` -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MaxUnpool3d - :members: - -:hidden:`AvgPool1d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: AvgPool1d - :members: - -:hidden:`AvgPool2d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: AvgPool2d - :members: - -:hidden:`AvgPool3d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: AvgPool3d - :members: - -:hidden:`FractionalMaxPool2d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: FractionalMaxPool2d - :members: - -:hidden:`LPPool2d` -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: LPPool2d - :members: - -:hidden:`AdaptiveMaxPool1d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: AdaptiveMaxPool1d - :members: - -:hidden:`AdaptiveMaxPool2d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: AdaptiveMaxPool2d - :members: - -:hidden:`AdaptiveAvgPool1d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: AdaptiveAvgPool1d - :members: - -:hidden:`AdaptiveAvgPool2d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: AdaptiveAvgPool2d - :members: - - -Non-linear Activations ----------------------------------- - -:hidden:`ReLU` -~~~~~~~~~~~~~~ - -.. autoclass:: ReLU - :members: - -:hidden:`ReLU6` -~~~~~~~~~~~~~~~ - -.. autoclass:: ReLU6 - :members: - -:hidden:`ELU` -~~~~~~~~~~~~~ - -.. autoclass:: ELU - :members: - -:hidden:`PReLU` -~~~~~~~~~~~~~~~ - -.. autoclass:: PReLU - :members: - -:hidden:`LeakyReLU` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: LeakyReLU - :members: - -:hidden:`Threshold` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Threshold - :members: - -:hidden:`Hardtanh` -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Hardtanh - :members: - -:hidden:`Sigmoid` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: Sigmoid - :members: - -:hidden:`Tanh` -~~~~~~~~~~~~~~ - -.. autoclass:: Tanh - :members: - -:hidden:`LogSigmoid` -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: LogSigmoid - :members: - -:hidden:`Softplus` -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Softplus - :members: - -:hidden:`Softshrink` -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Softshrink - :members: - -:hidden:`Softsign` -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Softsign - :members: - -:hidden:`Tanhshrink` -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Tanhshrink - :members: - -:hidden:`Softmin` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: Softmin - :members: - -:hidden:`Softmax` -~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: Softmax - :members: - -:hidden:`LogSoftmax` -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: LogSoftmax - :members: - - -Normalization layers ----------------------------------- - -:hidden:`BatchNorm1d` -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: BatchNorm1d - :members: - -:hidden:`BatchNorm2d` -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: BatchNorm2d - :members: - -:hidden:`BatchNorm3d` -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: BatchNorm3d - :members: - - -Recurrent layers ----------------------------------- - -:hidden:`RNN` -~~~~~~~~~~~~~ - -.. autoclass:: RNN - :members: - -:hidden:`LSTM` -~~~~~~~~~~~~~~ - -.. autoclass:: LSTM - :members: - -:hidden:`GRU` -~~~~~~~~~~~~~ - -.. autoclass:: GRU - :members: - -:hidden:`RNNCell` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: RNNCell - :members: - -:hidden:`LSTMCell` -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: LSTMCell - :members: - -:hidden:`GRUCell` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: GRUCell - :members: - -Linear layers ----------------------------------- - -:hidden:`Linear` -~~~~~~~~~~~~~~~~ - -.. autoclass:: Linear - :members: - - -Dropout layers ----------------------------------- - -:hidden:`Dropout` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: Dropout - :members: - -:hidden:`Dropout2d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Dropout2d - :members: - -:hidden:`Dropout3d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Dropout3d - :members: - - -Sparse layers ----------------------------------- - -:hidden:`Embedding` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: Embedding - :members: - -Distance functions ----------------------------------- - -:hidden:`PairwiseDistance` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: PairwiseDistance - :members: - - -Loss functions ----------------------------------- - -:hidden:`L1Loss` -~~~~~~~~~~~~~~~~ - -.. autoclass:: L1Loss - :members: - -:hidden:`MSELoss` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: MSELoss - :members: - -:hidden:`CrossEntropyLoss` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: CrossEntropyLoss - :members: - -:hidden:`NLLLoss` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: NLLLoss - :members: - -:hidden:`NLLLoss2d` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: NLLLoss2d - :members: - -:hidden:`KLDivLoss` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: KLDivLoss - :members: - -:hidden:`BCELoss` -~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: BCELoss - :members: - -:hidden:`MarginRankingLoss` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MarginRankingLoss - :members: - -:hidden:`HingeEmbeddingLoss` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: HingeEmbeddingLoss - :members: - -:hidden:`MultiLabelMarginLoss` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MultiLabelMarginLoss - :members: - -:hidden:`SmoothL1Loss` -~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: SmoothL1Loss - :members: - -:hidden:`SoftMarginLoss` -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: SoftMarginLoss - :members: - -:hidden:`MultiLabelSoftMarginLoss` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MultiLabelSoftMarginLoss - :members: - -:hidden:`CosineEmbeddingLoss` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: CosineEmbeddingLoss - :members: - -:hidden:`MultiMarginLoss` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: MultiMarginLoss - :members: - - -Vision layers ----------------- - -:hidden:`PixelShuffle` -~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: PixelShuffle - :members: - -:hidden:`UpsamplingNearest2d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: UpsamplingNearest2d - :members: - -:hidden:`UpsamplingBilinear2d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: UpsamplingBilinear2d - :members: - - -Multi-GPU layers ----------------- - -:hidden:`DataParallel` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: DataParallel - :members: - - -Utilities ---------- - -:hidden:`clip_grad_norm` -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: torch.nn.utils.clip_grad_norm - - -.. currentmodule:: torch.nn.utils.rnn - -:hidden:`PackedSequence` -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: torch.nn.utils.rnn.PackedSequence - - -:hidden:`pack_padded_sequence` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: torch.nn.utils.rnn.pack_padded_sequence - - -:hidden:`pad_packed_sequence` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: torch.nn.utils.rnn.pad_packed_sequence - - -torch.nn.functional -=================== - -.. currentmodule:: torch.nn.functional - -Convolution functions ----------------------------------- - -:hidden:`conv1d` -~~~~~~~~~~~~~~~~ - -.. autofunction:: conv1d - -:hidden:`conv2d` -~~~~~~~~~~~~~~~~ - -.. autofunction:: conv2d - -:hidden:`conv3d` -~~~~~~~~~~~~~~~~ - -.. autofunction:: conv3d - -:hidden:`conv_transpose1d` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: conv_transpose1d - -:hidden:`conv_transpose2d` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: conv_transpose2d - -:hidden:`conv_transpose3d` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: conv_transpose3d - -Pooling functions ----------------------------------- - -:hidden:`avg_pool1d` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: avg_pool1d - -:hidden:`avg_pool2d` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: avg_pool2d - -:hidden:`avg_pool3d` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: avg_pool3d - -:hidden:`max_pool1d` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: max_pool1d - -:hidden:`max_pool2d` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: max_pool2d - -:hidden:`max_pool3d` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: max_pool3d - -:hidden:`max_unpool1d` -~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: max_unpool1d - -:hidden:`max_unpool2d` -~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: max_unpool2d - -:hidden:`max_unpool3d` -~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: max_unpool3d - -:hidden:`lp_pool2d` -~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: lp_pool2d - -:hidden:`adaptive_max_pool1d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: adaptive_max_pool1d - -:hidden:`adaptive_max_pool2d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: adaptive_max_pool2d - -:hidden:`adaptive_avg_pool1d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: adaptive_avg_pool1d - -:hidden:`adaptive_avg_pool2d` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: adaptive_avg_pool2d - - -Non-linear activation functions -------------------------------- - -:hidden:`threshold` -~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: threshold - - -:hidden:`relu` -~~~~~~~~~~~~~~ - -.. autofunction:: relu - -:hidden:`hardtanh` -~~~~~~~~~~~~~~~~~~ - -.. autofunction:: hardtanh - -:hidden:`relu6` -~~~~~~~~~~~~~~~ - -.. autofunction:: relu6 - -:hidden:`elu` -~~~~~~~~~~~~~ - -.. autofunction:: elu - -:hidden:`leaky_relu` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: leaky_relu - -:hidden:`prelu` -~~~~~~~~~~~~~~~ - -.. autofunction:: prelu - -:hidden:`rrelu` -~~~~~~~~~~~~~~~ - -.. autofunction:: rrelu - -:hidden:`logsigmoid` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: logsigmoid - -:hidden:`hardshrink` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: hardshrink - -:hidden:`tanhshrink` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: tanhshrink - -:hidden:`softsign` -~~~~~~~~~~~~~~~~~~ - -.. 
autofunction:: softsign - -:hidden:`softplus` -~~~~~~~~~~~~~~~~~~ - -.. autofunction:: softplus - -:hidden:`softmin` -~~~~~~~~~~~~~~~~~ - -.. autofunction:: softmin - -:hidden:`softmax` -~~~~~~~~~~~~~~~~~ - -.. autofunction:: softmax - -:hidden:`softshrink` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: softshrink - -:hidden:`log_softmax` -~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: log_softmax - -:hidden:`tanh` -~~~~~~~~~~~~~~ - -.. autofunction:: tanh - -:hidden:`sigmoid` -~~~~~~~~~~~~~~~~~ - -.. autofunction:: sigmoid - -Normalization functions ------------------------ - -:hidden:`batch_norm` -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: batch_norm - -Linear functions ----------------- - -:hidden:`linear` -~~~~~~~~~~~~~~~~ - -.. autofunction:: linear - -Dropout functions ------------------ - -:hidden:`dropout` -~~~~~~~~~~~~~~~~~ - -.. autofunction:: dropout - -Distance functions ----------------------------------- - -:hidden:`pairwise_distance` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: pairwise_distance - - -Loss functions --------------- - -:hidden:`nll_loss` -~~~~~~~~~~~~~~~~~~ - -.. autofunction:: nll_loss - - -:hidden:`kl_div` -~~~~~~~~~~~~~~~~ - -.. autofunction:: kl_div - -:hidden:`cross_entropy` -~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: cross_entropy - -:hidden:`binary_cross_entropy` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: binary_cross_entropy - -:hidden:`smooth_l1_loss` -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: smooth_l1_loss - -Vision functions ----------------- - -:hidden:`pixel_shuffle` -~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: pixel_shuffle - -:hidden:`pad` -~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: pad - -torch.nn.init -============= - -.. currentmodule:: torch.nn.init -.. autofunction:: uniform -.. autofunction:: normal -.. autofunction:: constant -.. autofunction:: xavier_uniform -.. autofunction:: xavier_normal -.. autofunction:: kaiming_uniform -.. autofunction:: kaiming_normal -.. autofunction:: orthogonal -.. autofunction:: sparse diff --git a/docs/_sources/notes/autograd.rst.txt b/docs/_sources/notes/autograd.rst.txt deleted file mode 100644 index d560a59a54c3..000000000000 --- a/docs/_sources/notes/autograd.rst.txt +++ /dev/null @@ -1,144 +0,0 @@ -Autograd mechanics -================== - -This note will present an overview of how autograd works and records the -operations. It's not strictly necessary to understand all this, but we recommend -getting familiar with it, as it will help you write more efficient, cleaner -programs, and can aid you in debugging. - -.. _excluding-subgraphs: - -Excluding subgraphs from backward -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Every Variable has two flags: :attr:`requires_grad` and :attr:`volatile`. -They both allow for fine grained exclusion of subgraphs from gradient -computation and can increase efficiency. - -.. _excluding-requires_grad: - -``requires_grad`` -~~~~~~~~~~~~~~~~~ - -If there's a single input to an operation that requires gradient, its output -will also require gradient. Conversely, only if all inputs don't require -gradient, the output also won't require it. Backward computation is never -performed in the subgraphs, where all Variables didn't require gradients. - -.. 
code:: - - >>> x = Variable(torch.randn(5, 5)) - >>> y = Variable(torch.randn(5, 5)) - >>> z = Variable(torch.randn(5, 5), requires_grad=True) - >>> a = x + y - >>> a.requires_grad - False - >>> b = a + z - >>> b.requires_grad - True - -This is especially useful when you want to freeze part of your model, or you -know in advance that you're not going to use gradients w.r.t. some parameters. -For example if you want to finetune a pretrained CNN, it's enough to switch the -:attr:`requires_grad` flags in the frozen base, and no intermediate buffers will -be saved, until the computation gets to the last layer, where the affine -transform will use weights that require gradient, and the output of the network -will also require them. - -.. code:: - - model = torchvision.models.resnet18(pretrained=True) - for param in model.parameters(): - param.requires_grad = False - # Replace the last fully-connected layer - # Parameters of newly constructed modules have requires_grad=True by default - model.fc = nn.Linear(512, 100) - - # Optimize only the classifier - optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9) - -``volatile`` -~~~~~~~~~~~~ - -Volatile is recommended for purely inference mode, when you're sure you won't -be even calling `.backward()`. It's more efficient than any other autograd -setting - it will use the absolute minimal amount of memory to evaluate the -model. ``volatile`` also determines that ``requires_grad is False``. - -Volatile differs from :ref:`excluding-requires_grad` in how the flag propagates. -If there's even a single volatile input to an operation, its output is also -going to be volatile. Volatility spreads accross the graph much easier than -non-requiring gradient - you only need a **single** volatile leaf to have a -volatile output, while you need **all** leaves to not require gradient to -have an output the doesn't require gradient. Using volatile flag you don't -need to change any settings of your model parameters to use it for -inference. It's enough to create a volatile input, and this will ensure that -no intermediate states are saved. - -.. code:: - - >>> regular_input = Variable(torch.randn(5, 5)) - >>> volatile_input = Variable(torch.randn(5, 5), volatile=True) - >>> model = torchvision.models.resnet18(pretrained=True) - >>> model(regular_input).requires_grad - True - >>> model(volatile_input).requires_grad - False - >>> model(volatile_input).volatile - True - >>> model(volatile_input).creator is None - True - -How autograd encodes the history -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Each Variable has a ``.creator`` attribute, that points to the function, of -which it is an output. This is an entry point to a directed acyclic graph (DAG) -consisting of :class:`Function` objects as nodes, and references between them -being the edges. Every time an operation is performed, a new :class:`Function` -representing it is instantiated, its :meth:`~torch.autograd.Function.forward` -method is called, and its output :class:`Variable` s creators are set to it. -Then, by following the path from any :class:`Variable` to the leaves, it is -possible to reconstruct the sequence of operations that has created the data, -and automatically compute the gradients. - -An important thing to note is that the graph is recreated from scratch at every -iteration, and this is exactly what allows for using arbitrary Python control -flow statements, that can change the overall shape and size of the graph at -every iteration. 
You don't have to encode all possible paths before you -launch the training - what you run is what you differentiate. - -In-place operations on Variables -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Supporting in-place operations in autograd is a hard matter, and we discourage -their use in most cases. Autograd's aggressive buffer freeing and reuse makes -it very efficient and there are very few occasions when in-place operations -actually lower memory usage by any significant amount. Unless you're operating -under heavy memory pressure, you might never need to use them. - -There are two main reasons that limit the applicability of in-place operations: - -1. Overwriting values required to compute gradients. This is why variables don't - support ``log_``. Its gradient formula requires the original input, and while - it is possible to recreate it by computing the inverse operation, it is - numerically unstable, and requires additional work that often defeats the - purpose of using these functions. - -2. Every in-place operation actually requires the implementation to rewrite the - computational graph. Out-of-place versions simply allocate new objects and - keep references to the old graph, while in-place operations, require - changing the creator of all inputs to the :class:`Function` representing - this operation. This can be tricky, especially if there are many Variables - that reference the same storage (e.g. created by indexing or transposing), - and in-place functions will actually raise an error if the storage of - modified inputs is referenced by any other :class:`Variable`. - -In-place correctness checks -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Every variable keeps a version counter, that is incremented every time it's -marked dirty in any operation. When a Function saves any tensors for backward, -a version counter of their containing Variable is saved as well. Once you access -``self.saved_tensors`` it is checked, and if it's greater than the saved value -an error is raised. diff --git a/docs/_sources/notes/cuda.rst.txt b/docs/_sources/notes/cuda.rst.txt deleted file mode 100644 index 4db82e61b5c3..000000000000 --- a/docs/_sources/notes/cuda.rst.txt +++ /dev/null @@ -1,83 +0,0 @@ -.. _cuda-semantics: - -CUDA semantics -============== - -:mod:`torch.cuda` keeps track of currently selected GPU, and all CUDA tensors -you allocate will be created on it. The selected device can be changed with a -:any:`torch.cuda.device` context manager. - -However, once a tensor is allocated, you can do operations on it irrespectively -of your selected device, and the results will be always placed in on the same -device as the tensor. - -Cross-GPU operations are not allowed by default, with the only exception of -:meth:`~torch.Tensor.copy_`. Unless you enable peer-to-peer memory accesses -any attempts to launch ops on tensors spread across different devices will -raise an error. 
- -Below you can find a small example showcasing this:: - - x = torch.cuda.FloatTensor(1) - # x.get_device() == 0 - y = torch.FloatTensor(1).cuda() - # y.get_device() == 0 - - with torch.cuda.device(1): - # allocates a tensor on GPU 1 - a = torch.cuda.FloatTensor(1) - - # transfers a tensor from CPU to GPU 1 - b = torch.FloatTensor(1).cuda() - # a.get_device() == b.get_device() == 1 - - c = a + b - # c.get_device() == 1 - - z = x + y - # z.get_device() == 0 - - # even within a context, you can give a GPU id to the .cuda call - d = torch.randn(2).cuda(2) - # d.get_device() == 2 - -Best practices --------------- - -Use pinned memory buffers -^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. warning: - - This is an advanced tip. You overuse of pinned memory can cause serious - problems if you'll be running low on RAM, and you should be aware that - pinning is often an expensive operation. - -Host to GPU copies are much faster when they originate from pinned (page-locked) -memory. CPU tensors and storages expose a :meth:`~torch.Tensor.pin_memory` -method, that returns a copy of the object, with data put in a pinned region. - -Also, once you pin a tensor or storage, you can use asynchronous GPU copies. -Just pass an additional ``async=True`` argument to a :meth:`~torch.Tensor.cuda` -call. This can be used to overlap data transfers with computation. - -You can make the :class:`~torch.utils.data.DataLoader` return batches placed in -pinned memory by passing ``pin_memory=True`` to its constructor. - -.. _cuda-nn-dataparallel-instead: - -Use nn.DataParallel instead of multiprocessing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Most use cases involving batched input and multiple GPUs should default to using -:class:`~torch.nn.DataParallel` to utilize more than one GPU. Even with the GIL, -a single python process can saturate multiple GPUs. - -As of version 0.1.9, large numbers of GPUs (8+) might not be fully utilized. -However, this is a known issue that is under active development. As always, -test your use case. - -There are significant caveats to using CUDA models with -:mod:`~torch.multiprocessing`; unless care is taken to meet the data handling -requirements exactly, it is likely that your program will have incorrect or -undefined behavior. diff --git a/docs/_sources/notes/extending.rst.txt b/docs/_sources/notes/extending.rst.txt deleted file mode 100644 index 2f01efcff1b2..000000000000 --- a/docs/_sources/notes/extending.rst.txt +++ /dev/null @@ -1,169 +0,0 @@ -Extending PyTorch -================= - -In this note we'll cover ways of extending :mod:`torch.nn`, -:mod:`torch.autograd`, and writing custom C extensions utilizing our C -libraries. - -Extending :mod:`torch.autograd` -------------------------------- - -.. currentmodule:: torch.autograd - -Adding operations to :mod:`~torch.autograd` requires implementing a new -:class:`Function` subclass for each operation. Recall that :class:`Function` s -are what :mod:`~torch.autograd` uses to compute the results and gradients, and -encode the operation history. Every new function requires you to implement 3 -methods: - -- ``__init__`` (*optional*) - if your operation is parametrized by/uses - objects different than :class:`Variable` s, you should pass them as arguments - to ``__init__``. For example, ``AddConstant`` function takes a scalar to add, - while ``Transpose`` requires specifying which two dimensions to swap. If your - function doesn't require any additional parameters, you can skip it. 
-- :meth:`~Function.forward` - the code that performs the operation. It can take - as many arguments as you want, with some of them being - optional, if you specify the default values. Keep in mind that only - :class:`Variable` s will be passed in here. You can return either a single - :class:`Variable` output, or a :class:`tuple` of :class:`Variable` s if there - are multiple. Also, please refer to the docs of :class:`Function` to find - descriptions of useful methods that can be called only from - :meth:`~Function.forward`. -- :meth:`~Function.backward` - gradient formula. It will be given - as many arguments as there were outputs, with each of them representing - gradient w.r.t. that output. It should return as many :class:`Tensor` s as - there were inputs, with each of them containing the gradient w.r.t. - corresponding input. If you inputs didn't require gradient (see - :attr:`~Variable.needs_input_grad`), or it was non-differentiable, you - can return :class:`None`. Also, if you have optional arguments to - :meth:`~Variable.forward` you can return more gradients than there were - inputs, as long as they're all :any:`python:None`. - -Below you can find code for a ``Linear`` function from :mod:`torch.nn`, with -additional comments:: - - # Inherit from Function - class Linear(Function): - - # bias is an optional argument - def forward(self, input, weight, bias=None): - self.save_for_backward(input, weight, bias) - output = input.mm(weight.t()) - if bias is not None: - output += bias.unsqueeze(0).expand_as(output) - return output - - # This function has only a single output, so it gets only one gradient - def backward(self, grad_output): - # This is a pattern that is very convenient - at the top of backward - # unpack saved_tensors and initialize all gradients w.r.t. inputs to - # None. Thanks to the fact that additional trailing Nones are - # ignored, the return statement is simple even when the function has - # optional inputs. - input, weight, bias = self.saved_tensors - grad_input = grad_weight = grad_bias = None - - # These needs_input_grad checks are optional and there only to - # improve efficiency. If you want to make your code simpler, you can - # skip them. Returning gradients for inputs that don't require it is - # not an error. - if self.needs_input_grad[0]: - grad_input = grad_output.mm(weight) - if self.needs_input_grad[1]: - grad_weight = grad_output.t().mm(input) - if bias is not None and self.needs_input_grad[2]: - grad_bias = grad_output.sum(0).squeeze(0) - - return grad_input, grad_weight, grad_bias - -Now, to make it easier to use these custom ops, we recommend wrapping them in -small helper functions:: - - def linear(input, weight, bias=None): - # First braces create a Function object. Any arguments given here - # will be passed to __init__. Second braces will invoke the __call__ - # operator, that will then use forward() to compute the result and - # return it. - return Linear()(input, weight, bias) - -You probably want to check if the backward method you implemented actually -computes the derivatives of your function. It is possible by comparing with -numerical approximations using small finite differences:: - - from torch.autograd import gradcheck - - # gradchek takes a tuple of tensor as input, check if your gradient - # evaluated with these tensors are close enough to numerical - # approximations and returns True if they all verify this condition. 
- input = (Variable(torch.randn(20,20).double(), requires_grad=True),) - test = gradcheck(Linear(), input, eps=1e-6, atol=1e-4) - print(test) - -Extending :mod:`torch.nn` -------------------------- - -.. currentmodule:: torch.nn - -:mod:`~torch.nn` exports two kinds of interfaces - modules and their functional -versions. You can extend it in both ways, but we recommend using modules for -all kinds of layers, that hold any parameters or buffers, and recommend using -a functional form parameter-less operations like activation functions, pooling, -etc. - -Adding a functional version of an operation is already fully covered in the -section above. - -Adding a :class:`Module` -^^^^^^^^^^^^^^^^^^^^^^^^ - -Since :mod:`~torch.nn` heavily utilizes :mod:`~torch.autograd`, adding a new -:class:`Module` requires implementing a :class:`~torch.autograd.Function` -that performs the operation and can compute the gradient. From now on let's -assume that we want to implement a ``Linear`` module and we have the function -implementated as in the listing above. There's very little code required to -add this. Now, there are two functions that need to be implemented: - -- ``__init__`` (*optional*) - takes in arguments such as kernel sizes, numbers - of features, etc. and initializes parameters and buffers. -- :meth:`~Module.forward` - instantiates a :class:`~torch.autograd.Function` and - uses it to perform the operation. It's very similar to a functional wrapper - shown above. - -This is how a ``Linear`` module can be implemented:: - - class Linear(nn.Module): - def __init__(self, input_features, output_features, bias=True): - self.input_features = input_features - self.output_features = output_features - - # nn.Parameter is a special kind of Variable, that will get - # automatically registered as Module's parameter once it's assigned - # as an attribute. Parameters and buffers need to be registered, or - # they won't appear in .parameters() (doesn't apply to buffers), and - # won't be converted when e.g. .cuda() is called. You can use - # .register_buffer() to register buffers. - # nn.Parameters can never be volatile and, different than Variables, - # they require gradients by default. - self.weight = nn.Parameter(torch.Tensor(input_features, output_features)) - if bias: - self.bias = nn.Parameter(torch.Tensor(output_features)) - else: - # You should always register all possible parameters, but the - # optional ones can be None if you want. - self.register_parameter('bias', None) - - # Not a very smart way to initialize weights - self.weight.data.uniform_(-0.1, 0.1) - if bias is not None: - self.bias.data.uniform_(-0.1, 0.1) - - def forward(self, input): - # See the autograd section for explanation of what happens here. - return Linear()(input, self.weight, self.bias) - - -Writing custom C extensions ---------------------------- - -Coming soon. For now you can find an example at -`GitHub `_. diff --git a/docs/_sources/notes/multiprocessing.rst.txt b/docs/_sources/notes/multiprocessing.rst.txt deleted file mode 100644 index 85b7d6a5faf9..000000000000 --- a/docs/_sources/notes/multiprocessing.rst.txt +++ /dev/null @@ -1,124 +0,0 @@ -Multiprocessing best practices -============================== - -:mod:`torch.multiprocessing` is a drop in replacement for Python's -:mod:`python:multiprocessing` module. 
It supports the exact same operations, -but extends it, so that all tensors sent through a -:class:`python:multiprocessing.Queue`, will have their data moved into shared -memory and will only send a handle to another process. - -.. note:: - - When a :class:`~torch.autograd.Variable` is sent to another process, both - the :attr:`Variable.data` and :attr:`Variable.grad.data` are going to be - shared. - -This allows to implement various training methods, like Hogwild, A3C, or any -others that require asynchronous operation. - -Sharing CUDA tensors --------------------- - -Sharing CUDA tensors between processes is supported only in Python 3, using -a ``spawn`` or ``forkserver`` start methods. :mod:`python:multiprocessing` in -Python 2 can only create subprocesses using ``fork``, and it's not supported -by the CUDA runtime. - -.. warning:: - - CUDA API requires that the allocation exported to other processes remains - valid as long as it's used by them. You should be careful and ensure that - CUDA tensors you shared don't go out of scope as long as it's necessary. - This shouldn't be a problem for sharing model parameters, but passing other - kinds of data should be done with care. Note that this restriction doesn't - apply to shared CPU memory. - -See also: :ref:`cuda-nn-dataparallel-instead` - - -Best practices and tips ------------------------ - -Avoiding and fighting deadlocks -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -There are a lot of things that can go wrong when a new process is spawned, with -the most common cause of deadlocks being background threads. If there's any -thread that holds a lock or imports a module, and ``fork`` is called, it's very -likely that the subprocess will be in a corrupted state and will deadlock or -fail in a different way. Note that even if you don't, Python built in -libraries do - no need to look further than :mod:`python:multiprocessing`. -:class:`python:multiprocessing.Queue` is actually a very complex class, that -spawns multiple threads used to serialize, send and receive objects, and they -can cause aforementioned problems too. If you find yourself in such situation -try using a :class:`~python:multiprocessing.queues.SimpleQueue`, that doesn't -use any additional threads. - -We're trying our best to make it easy for you and ensure these deadlocks don't -happen but some things are out of our control. If you have any issues you can't -cope with for a while, try reaching out on forums, and we'll see if it's an -issue we can fix. - -Reuse buffers passed through a Queue -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Remember that each time you put a :class:`~torch.Tensor` into a -:class:`python:multiprocessing.Queue`, it has to be moved into shared memory. -If it's already shared, it is a no-op, otherwise it will incur an additional -memory copy that can slow down the whole process. Even if you have a pool of -processes sending data to a single one, make it send the buffers back - this -is nearly free and will let you avoid a copy when sending next batch. - -Asynchronous multiprocess training (e.g. Hogwild) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Using :mod:`torch.multiprocessing`, it is possible to train a model -asynchronously, with parameters either shared all the time, or being -periodically synchronized. In the first case, we recommend sending over the whole -model object, while in the latter, we advise to only send the -:meth:`~torch.nn.Module.state_dict`. 
- -We recommend using :class:`python:multiprocessing.Queue` for passing all kinds -of PyTorch objects between processes. It is possible to e.g. inherit the tensors -and storages already in shared memory, when using the ``fork`` start method, -however it is very bug prone and should be used with care, and only by advanced -users. Queues, even though they're sometimes a less elegant solution, will work -properly in all cases. - -.. warning:: - - You should be careful about having global statements, that are not guarded - with an ``if __name__ == '__main__'``. If a different start method than - ``fork`` is used, they will be executed in all subprocesses. - -Hogwild -~~~~~~~ - -A concrete Hogwild implementation can be found in the `examples repository`__, -but to showcase the overall structure of the code, there's also a minimal -example below as well:: - - import torch.multiprocessing as mp - from model import MyModel - - def train(model): - # Construct data_loader, optimizer, etc. - for data, labels in data_loader: - optimizer.zero_grad() - loss_fn(model(data), labels).backward() - optimizer.step() # This will update the shared parameters - - if __name__ == '__main__': - num_processes = 4 - model = MyModel() - # NOTE: this is required for the ``fork`` method to work - model.share_memory() - processes = [] - for rank in range(num_processes): - p = mp.Process(target=train, args=(model,)) - p.start() - processes.append(p) - for p in processes: - p.join() - -.. __: https://github.com/pytorch/examples/tree/master/mnist_hogwild diff --git a/docs/_sources/notes/serialization.rst.txt b/docs/_sources/notes/serialization.rst.txt deleted file mode 100644 index 46800314cf83..000000000000 --- a/docs/_sources/notes/serialization.rst.txt +++ /dev/null @@ -1,34 +0,0 @@ - -Serialization semantics -======================= - -Best practices --------------- - -.. _recommend-saving-models: - -Recommended approach for saving a model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -There are two main approaches for serializing and restoring a model. - -The first (recommended) saves and loads only the model parameters:: - - torch.save(the_model.state_dict(), PATH) - -Then later:: - - the_model = TheModelClass(*args, **kwargs) - the_model.load_state_dict(torch.load(PATH)) - -The second saves and loads the entire model:: - - torch.save(the_model, PATH) - -Then later:: - - the_model = torch.load(PATH) - -However in this case, the serialized data is bound to the specific classes -and the exact directory structure used, so it can break in various ways when -used in other projects, or after some serious refactors. diff --git a/docs/_sources/optim.rst.txt b/docs/_sources/optim.rst.txt deleted file mode 100644 index 92e3f14d1fe5..000000000000 --- a/docs/_sources/optim.rst.txt +++ /dev/null @@ -1,116 +0,0 @@ -torch.optim -=================================== - -.. automodule:: torch.optim - -How to use an optimizer ------------------------ - -To use :mod:`torch.optim` you have to construct an optimizer object, that will hold -the current state and will update the parameters based on the computed gradients. - -Constructing it -^^^^^^^^^^^^^^^ - -To construct an :class:`Optimizer` you have to give it an iterable containing the -parameters (all should be :class:`~torch.autograd.Variable` s) to optimize. Then, -you can specify optimizer-specific options such as the learning rate, weight decay, etc. 
- -Example:: - - optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9) - optimizer = optim.Adam([var1, var2], lr = 0.0001) - -Per-parameter options -^^^^^^^^^^^^^^^^^^^^^ - -:class:`Optimizer` s also support specifying per-parameter options. To do this, instead -of passing an iterable of :class:`~torch.autograd.Variable` s, pass in an iterable of -:class:`dict` s. Each of them will define a separate parameter group, and should contain -a ``params`` key, containing a list of parameters belonging to it. Other keys -should match the keyword arguments accepted by the optimizers, and will be used -as optimization options for this group. - -.. note:: - - You can still pass options as keyword arguments. They will be used as - defaults, in the groups that didn't override them. This is useful when you - only want to vary a single option, while keeping all others consistent - between parameter groups. - - -For example, this is very useful when one wants to specify per-layer learning rates:: - - optim.SGD([ - {'params': model.base.parameters()}, - {'params': model.classifier.parameters(), 'lr': 1e-3} - ], lr=1e-2, momentum=0.9) - -This means that ``model.base``'s parameters will use the default learning rate of ``1e-2``, -``model.classifier``'s parameters will use a learning rate of ``1e-3``, and a momentum of -``0.9`` will be used for all parameters - -Taking an optimization step -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -All optimizers implement a :func:`~Optimizer.step` method, that updates the -parameters. It can be used in two ways: - -``optimizer.step()`` -~~~~~~~~~~~~~~~~~~~~ - -This is a simplified version supported by most optimizers. The function can be -called once the gradients are computed using e.g. -:func:`~torch.autograd.Variable.backward`. - -Example:: - - for input, target in dataset: - optimizer.zero_grad() - output = model(input) - loss = loss_fn(output, target) - loss.backward() - optimizer.step() - -``optimizer.step(closure)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some optimization algorithms such as Conjugate Gradient and LBFGS need to -reevaluate the function multiple times, so you have to pass in a closure that -allows them to recompute your model. The closure should clear the gradients, -compute the loss, and return it. - -Example:: - - for input, target in dataset: - def closure(): - optimizer.zero_grad() - output = model(input) - loss = loss_fn(output, target) - loss.backward() - return loss - optimizer.step(closure) - -Algorithms ----------- - -.. autoclass:: Optimizer - :members: -.. autoclass:: Adadelta - :members: -.. autoclass:: Adagrad - :members: -.. autoclass:: Adam - :members: -.. autoclass:: Adamax - :members: -.. autoclass:: ASGD - :members: -.. autoclass:: LBFGS - :members: -.. autoclass:: RMSprop - :members: -.. autoclass:: Rprop - :members: -.. autoclass:: SGD - :members: diff --git a/docs/_sources/storage.rst.txt b/docs/_sources/storage.rst.txt deleted file mode 100644 index 61148916884c..000000000000 --- a/docs/_sources/storage.rst.txt +++ /dev/null @@ -1,12 +0,0 @@ -torch.Storage -=================================== - -A :class:`torch.Storage` is a contiguous, one-dimensional array of a single -data type. - -Every :class:`torch.Tensor` has a corresponding storage of the same data type. - -.. 
autoclass:: torch.FloatStorage - :members: - :undoc-members: - :inherited-members: diff --git a/docs/_sources/tensors.rst.txt b/docs/_sources/tensors.rst.txt deleted file mode 100644 index 7e3b84d79eea..000000000000 --- a/docs/_sources/tensors.rst.txt +++ /dev/null @@ -1,308 +0,0 @@ -.. currentmodule:: torch - -torch.Tensor -=================================== - -A :class:`torch.Tensor` is a multi-dimensional matrix containing elements of -a single data type. - -Torch defines seven CPU tensor types and eight GPU tensor types: - -======================== =========================== ================================ -Data type CPU tensor GPU tensor -======================== =========================== ================================ -32-bit floating point :class:`torch.FloatTensor` :class:`torch.cuda.FloatTensor` -64-bit floating point :class:`torch.DoubleTensor` :class:`torch.cuda.DoubleTensor` -16-bit floating point N/A :class:`torch.cuda.HalfTensor` -8-bit integer (unsigned) :class:`torch.ByteTensor` :class:`torch.cuda.ByteTensor` -8-bit integer (signed) :class:`torch.CharTensor` :class:`torch.cuda.CharTensor` -16-bit integer (signed) :class:`torch.ShortTensor` :class:`torch.cuda.ShortTensor` -32-bit integer (signed) :class:`torch.IntTensor` :class:`torch.cuda.IntTensor` -64-bit integer (signed) :class:`torch.LongTensor` :class:`torch.cuda.LongTensor` -======================== =========================== ================================ - -The :class:`torch.Tensor` constructor is an alias for the default tensor type -(:class:`torch.FloatTensor`). - -A tensor can be constructed from a Python :class:`list` or sequence: - -:: - - >>> torch.FloatTensor([[1, 2, 3], [4, 5, 6]]) - 1 2 3 - 4 5 6 - [torch.FloatTensor of size 2x3] - -An empty tensor can be constructed by specifying its size: - -:: - - >>> torch.IntTensor(2, 4).zero_() - 0 0 0 0 - 0 0 0 0 - [torch.IntTensor of size 2x4] - -The contents of a tensor can be accessed and modified using Python's indexing -and slicing notation: - -:: - - >>> x = torch.FloatTensor([[1, 2, 3], [4, 5, 6]]) - >>> print(x[1][2]) - 6.0 - >>> x[0][1] = 8 - >>> print(x) - 1 8 3 - 4 5 6 - [torch.FloatTensor of size 2x3] - -Each tensor has an associated :class:`torch.Storage`, which holds its data. -The tensor class provides multi-dimensional, `strided `_ -view of a storage and defines numeric operations on it. - -.. note:: - Methods which mutate a tensor are marked with an underscore suffix. - For example, :func:`torch.FloatTensor.abs_` computes the absolute value - in-place and returns the modified tensor, while :func:`torch.FloatTensor.abs` - computes the result in a new tensor. - -.. class:: Tensor() - Tensor(*sizes) - Tensor(size) - Tensor(sequence) - Tensor(ndarray) - Tensor(tensor) - Tensor(storage) - - Creates a new tensor from an optional size or data. - - If no arguments are given, an empty zero-dimensional tensor is returned. - If a :class:`numpy.ndarray`, :class:`torch.Tensor`, or :class:`torch.Storage` - is given, a new tensor that shares the same data is returned. If a Python - sequence is given, a new tensor is created from a copy of the sequence. - - .. automethod:: abs - .. automethod:: abs_ - .. automethod:: acos - .. automethod:: acos_ - .. automethod:: add - .. automethod:: add_ - .. automethod:: addbmm - .. automethod:: addbmm_ - .. automethod:: addcdiv - .. automethod:: addcdiv_ - .. automethod:: addcmul - .. automethod:: addcmul_ - .. automethod:: addmm - .. automethod:: addmm_ - .. automethod:: addmv - .. automethod:: addmv_ - .. 
automethod:: addr - .. automethod:: addr_ - .. automethod:: apply_ - .. automethod:: asin - .. automethod:: asin_ - .. automethod:: atan - .. automethod:: atan2 - .. automethod:: atan2_ - .. automethod:: atan_ - .. automethod:: baddbmm - .. automethod:: baddbmm_ - .. automethod:: bernoulli - .. automethod:: bernoulli_ - .. automethod:: bmm - .. automethod:: byte - .. automethod:: cauchy_ - .. automethod:: ceil - .. automethod:: ceil_ - .. automethod:: char - .. automethod:: chunk - .. automethod:: clamp - .. automethod:: clamp_ - .. automethod:: clone - .. automethod:: contiguous - .. automethod:: copy_ - .. automethod:: cos - .. automethod:: cos_ - .. automethod:: cosh - .. automethod:: cosh_ - .. automethod:: cpu - .. automethod:: cross - .. automethod:: cuda - .. automethod:: cumprod - .. automethod:: cumsum - .. automethod:: data_ptr - .. automethod:: diag - .. automethod:: dim - .. automethod:: dist - .. automethod:: div - .. automethod:: div_ - .. automethod:: dot - .. automethod:: double - .. automethod:: eig - .. automethod:: element_size - .. automethod:: eq - .. automethod:: eq_ - .. automethod:: equal - .. automethod:: exp - .. automethod:: exp_ - .. automethod:: expand - .. automethod:: expand_as - .. automethod:: exponential_ - .. automethod:: fill_ - .. automethod:: float - .. automethod:: floor - .. automethod:: floor_ - .. automethod:: fmod - .. automethod:: fmod_ - .. automethod:: frac - .. automethod:: frac_ - .. automethod:: gather - .. automethod:: ge - .. automethod:: ge_ - .. automethod:: gels - .. automethod:: geometric_ - .. automethod:: geqrf - .. automethod:: ger - .. automethod:: gesv - .. automethod:: gt - .. automethod:: gt_ - .. automethod:: half - .. automethod:: histc - .. automethod:: index - .. automethod:: index_add_ - .. automethod:: index_copy_ - .. automethod:: index_fill_ - .. automethod:: index_select - .. automethod:: int - .. automethod:: inverse - .. automethod:: is_contiguous - .. autoattribute:: is_cuda - :annotation: - .. automethod:: is_pinned - .. automethod:: is_set_to - .. automethod:: is_signed - .. automethod:: kthvalue - .. automethod:: le - .. automethod:: le_ - .. automethod:: lerp - .. automethod:: lerp_ - .. automethod:: log - .. automethod:: log1p - .. automethod:: log1p_ - .. automethod:: log_ - .. automethod:: log_normal_ - .. automethod:: long - .. automethod:: lt - .. automethod:: lt_ - .. automethod:: map_ - .. automethod:: masked_copy_ - .. automethod:: masked_fill_ - .. automethod:: masked_select - .. automethod:: max - .. automethod:: mean - .. automethod:: median - .. automethod:: min - .. automethod:: mm - .. automethod:: mode - .. automethod:: mul - .. automethod:: mul_ - .. automethod:: multinomial - .. automethod:: mv - .. automethod:: narrow - .. automethod:: ndimension - .. automethod:: ne - .. automethod:: ne_ - .. automethod:: neg - .. automethod:: neg_ - .. automethod:: nelement - .. automethod:: new - .. automethod:: nonzero - .. automethod:: norm - .. automethod:: normal_ - .. automethod:: numel - .. automethod:: numpy - .. automethod:: orgqr - .. automethod:: ormqr - .. automethod:: permute - .. automethod:: pin_memory - .. automethod:: potrf - .. automethod:: potri - .. automethod:: potrs - .. automethod:: pow - .. automethod:: pow_ - .. automethod:: prod - .. automethod:: pstrf - .. automethod:: qr - .. automethod:: random_ - .. automethod:: reciprocal - .. automethod:: reciprocal_ - .. automethod:: remainder - .. automethod:: remainder_ - .. automethod:: renorm - .. automethod:: renorm_ - .. 
automethod:: repeat - .. automethod:: resize_ - .. automethod:: resize_as_ - .. automethod:: round - .. automethod:: round_ - .. automethod:: rsqrt - .. automethod:: rsqrt_ - .. automethod:: scatter_ - .. automethod:: select - .. automethod:: set_ - .. automethod:: share_memory_ - .. automethod:: short - .. automethod:: sigmoid - .. automethod:: sigmoid_ - .. automethod:: sign - .. automethod:: sign_ - .. automethod:: sin - .. automethod:: sin_ - .. automethod:: sinh - .. automethod:: sinh_ - .. automethod:: size - .. automethod:: sort - .. automethod:: split - .. automethod:: sqrt - .. automethod:: sqrt_ - .. automethod:: squeeze - .. automethod:: squeeze_ - .. automethod:: std - .. automethod:: storage - .. automethod:: storage_offset - .. automethod:: storage_type - .. automethod:: stride - .. automethod:: sub - .. automethod:: sub_ - .. automethod:: sum - .. automethod:: svd - .. automethod:: symeig - .. automethod:: t - .. automethod:: t_ - .. automethod:: tan - .. automethod:: tan_ - .. automethod:: tanh - .. automethod:: tanh_ - .. automethod:: tolist - .. automethod:: topk - .. automethod:: trace - .. automethod:: transpose - .. automethod:: transpose_ - .. automethod:: tril - .. automethod:: tril_ - .. automethod:: triu - .. automethod:: triu_ - .. automethod:: trtrs - .. automethod:: trunc - .. automethod:: trunc_ - .. automethod:: type - .. automethod:: type_as - .. automethod:: unfold - .. automethod:: uniform_ - .. automethod:: unsqueeze - .. automethod:: unsqueeze_ - .. automethod:: var - .. automethod:: view - .. automethod:: view_as - .. automethod:: zero_ diff --git a/docs/_sources/torch.rst.txt b/docs/_sources/torch.rst.txt deleted file mode 100644 index 8d3759de1597..000000000000 --- a/docs/_sources/torch.rst.txt +++ /dev/null @@ -1,185 +0,0 @@ -torch -=================================== -.. automodule:: torch - -Tensors ----------------------------------- -.. autofunction:: is_tensor -.. autofunction:: is_storage -.. autofunction:: set_default_tensor_type -.. autofunction:: numel -.. autofunction:: set_printoptions - - -Creation Ops -~~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: eye -.. autofunction:: from_numpy -.. autofunction:: linspace -.. autofunction:: logspace -.. autofunction:: ones -.. autofunction:: rand -.. autofunction:: randn -.. autofunction:: randperm -.. autofunction:: arange -.. autofunction:: range -.. autofunction:: zeros - - -Indexing, Slicing, Joining, Mutating Ops -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: cat -.. autofunction:: chunk -.. autofunction:: gather -.. autofunction:: index_select -.. autofunction:: masked_select -.. autofunction:: nonzero -.. autofunction:: split -.. autofunction:: squeeze -.. autofunction:: stack -.. autofunction:: t -.. autofunction:: transpose -.. autofunction:: unbind -.. autofunction:: unsqueeze - - -Random sampling ----------------------------------- -.. autofunction:: manual_seed -.. autofunction:: initial_seed -.. autofunction:: get_rng_state -.. autofunction:: set_rng_state -.. autodata:: default_generator -.. autofunction:: bernoulli -.. autofunction:: multinomial -.. autofunction:: normal - - -Serialization ----------------------------------- -.. autofunction:: save -.. autofunction:: load - - -Parallelism ----------------------------------- -.. autofunction:: get_num_threads -.. autofunction:: set_num_threads - - -Math operations ----------------------------------- - -Pointwise Ops -~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: abs -.. autofunction:: acos -.. autofunction:: add -.. 
autofunction:: addcdiv -.. autofunction:: addcmul -.. autofunction:: asin -.. autofunction:: atan -.. autofunction:: atan2 -.. autofunction:: ceil -.. autofunction:: clamp -.. autofunction:: cos -.. autofunction:: cosh -.. autofunction:: div -.. autofunction:: exp -.. autofunction:: floor -.. autofunction:: fmod -.. autofunction:: frac -.. autofunction:: lerp -.. autofunction:: log -.. autofunction:: log1p -.. autofunction:: mul -.. autofunction:: neg -.. autofunction:: pow -.. autofunction:: reciprocal -.. autofunction:: remainder -.. autofunction:: round -.. autofunction:: rsqrt -.. autofunction:: sigmoid -.. autofunction:: sign -.. autofunction:: sin -.. autofunction:: sinh -.. autofunction:: sqrt -.. autofunction:: tan -.. autofunction:: tanh -.. autofunction:: trunc - - -Reduction Ops -~~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: cumprod -.. autofunction:: cumsum -.. autofunction:: dist -.. autofunction:: mean -.. autofunction:: median -.. autofunction:: mode -.. autofunction:: norm -.. autofunction:: prod -.. autofunction:: std -.. autofunction:: sum -.. autofunction:: var - - -Comparison Ops -~~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: eq -.. autofunction:: equal -.. autofunction:: ge -.. autofunction:: gt -.. autofunction:: kthvalue -.. autofunction:: le -.. autofunction:: lt -.. autofunction:: max -.. autofunction:: min -.. autofunction:: ne -.. autofunction:: sort -.. autofunction:: topk - - -Other Operations -~~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: cross -.. autofunction:: diag -.. autofunction:: histc -.. autofunction:: renorm -.. autofunction:: trace -.. autofunction:: tril -.. autofunction:: triu - - -BLAS and LAPACK Operations -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: addbmm -.. autofunction:: addmm -.. autofunction:: addmv -.. autofunction:: addr -.. autofunction:: baddbmm -.. autofunction:: bmm -.. autofunction:: btrifact -.. autofunction:: btrisolve -.. autofunction:: dot -.. autofunction:: eig -.. autofunction:: gels -.. autofunction:: geqrf -.. autofunction:: ger -.. autofunction:: gesv -.. autofunction:: inverse -.. autofunction:: mm -.. autofunction:: mv -.. autofunction:: orgqr -.. autofunction:: ormqr -.. autofunction:: potrf -.. autofunction:: potri -.. autofunction:: potrs -.. autofunction:: pstrf -.. autofunction:: qr -.. autofunction:: svd -.. autofunction:: symeig -.. autofunction:: trtrs - diff --git a/docs/_sources/torchvision/datasets.rst.txt b/docs/_sources/torchvision/datasets.rst.txt deleted file mode 100644 index 666203f1c81d..000000000000 --- a/docs/_sources/torchvision/datasets.rst.txt +++ /dev/null @@ -1,162 +0,0 @@ -torchvision.datasets -==================== - -The following dataset loaders are available: - -- `MNIST`_ -- `COCO (Captioning and Detection)`_ -- `LSUN Classification`_ -- `ImageFolder`_ -- `Imagenet-12`_ -- `CIFAR10 and CIFAR100`_ -- `STL10`_ - -Datasets have the API: - -- ``__getitem__`` -- ``__len__`` - They all subclass from ``torch.utils.data.Dataset`` - Hence, they can all be multi-threaded (python multiprocessing) using - standard torch.utils.data.DataLoader. - -For example: - -``torch.utils.data.DataLoader(coco_cap, batch_size=args.batchSize, shuffle=True, num_workers=args.nThreads)`` - -In the constructor, each dataset has a slightly different API as needed, -but they all take the keyword args: - -- ``transform`` - a function that takes in an image and returns a - transformed version -- common stuff like ``ToTensor``, ``RandomCrop``, etc. 
These can be - composed together with ``transforms.Compose`` (see transforms section - below) -- ``target_transform`` - a function that takes in the target and - transforms it. For example, take in the caption string and return a - tensor of word indices. - -MNIST -~~~~~ - -``dset.MNIST(root, train=True, transform=None, target_transform=None, download=False)`` - -- ``root`` : root directory of dataset where ``processed/training.pt`` and ``processed/test.pt`` exist. -- ``train`` : ``True`` = Training set, ``False`` = Test set -- ``download`` : ``True`` = downloads the dataset from the internet and puts it in root directory. If dataset already downloaded, place the processed dataset (function available in mnist.py) in the ``processed`` folder. - -COCO -~~~~ - -This requires the `COCO API to be installed`_ - -Captions: -^^^^^^^^^ - -``dset.CocoCaptions(root="dir where images are", annFile="json annotation file", [transform, target_transform])`` - -Example: - -.. code:: python - - import torchvision.datasets as dset - import torchvision.transforms as transforms - cap = dset.CocoCaptions(root = 'dir where images are', - annFile = 'json annotation file', - transform=transforms.ToTensor()) - - print('Number of samples: ', len(cap)) - img, target = cap[3] # load 4th sample - - print("Image Size: ", img.size()) - print(target) - -Output: - -:: - - Number of samples: 82783 - Image Size: (3L, 427L, 640L) - [u'A plane emitting smoke stream flying over a mountain.', - u'A plane darts across a bright blue sky behind a mountain covered in snow', - u'A plane leaves a contrail above the snowy mountain top.', - u'A mountain that has a plane flying overheard in the distance.', - u'A mountain view with a plume of smoke in the background'] - -Detection: -^^^^^^^^^^ - -``dset.CocoDetection(root="dir where images are", annFile="json annotation file", [transform, target_transform])`` - -LSUN -~~~~ - -``dset.LSUN(db_path, classes='train', [transform, target_transform])`` - -- db\_path = root directory for the database files -- ``classes`` = ``‘train’`` (all categories, training set), ``‘val’`` (all categories, validation set), ``‘test’`` (all categories, test set) -- [``‘bedroom\_train’``, ``‘church\_train’``, …] : a list of categories to load - -ImageFolder -~~~~~~~~~~~ - -A generic data loader where the images are arranged in this way: - -:: - - root/dog/xxx.png - root/dog/xxy.png - root/dog/xxz.png - - root/cat/123.png - root/cat/nsdf3.png - root/cat/asd932_.png - -``dset.ImageFolder(root="root folder path", [transform, target_transform])`` - -It has the members: - -- ``self.classes`` - The class names as a list -- ``self.class_to_idx`` - Corresponding class indices -- ``self.imgs`` - The list of (image path, class-index) tuples - -Imagenet-12 -~~~~~~~~~~~ - -This is simply implemented with an ImageFolder dataset. - -The data is preprocessed `as described -here `__ - -`Here is an -example `__. - -CIFAR -~~~~~ - -``dset.CIFAR10(root, train=True, transform=None, target_transform=None, download=False)`` - -``dset.CIFAR100(root, train=True, transform=None, target_transform=None, download=False)`` - -- ``root`` : root directory of dataset where there is folder - ``cifar-10-batches-py`` -- ``train`` : ``True`` = Training set, ``False`` = Test set -- ``download`` : ``True`` = downloads the dataset from the internet and - puts it in root directory. If dataset already downloaded, doesn't do anything. 
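As a worked illustration of the pattern described at the top of this file (a dataset constructed with a ``transform``, then wrapped in ``torch.utils.data.DataLoader``), here is a minimal sketch using the CIFAR10 loader. It is not part of the original documentation; the ``./data`` root, batch size, worker count, and normalization constants are arbitrary assumptions chosen for the example.

.. code:: python

    import torch
    import torchvision.datasets as dset
    import torchvision.transforms as transforms

    # Compose ToTensor with a Normalize step; the mean/std values are
    # illustrative placeholders, not values taken from this documentation.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])

    # root='./data' and batch_size=64 are arbitrary; download=True assumes
    # network access and does nothing if the data is already present.
    train_set = dset.CIFAR10(root='./data', train=True,
                             transform=transform, download=True)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=64,
                                               shuffle=True, num_workers=2)

    images, labels = next(iter(train_loader))
    print(images.size())   # torch.Size([64, 3, 32, 32])
    print(labels.size())   # torch.Size([64])

The same pattern applies to the other loaders listed above; only the constructor arguments change.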
- -STL10 -~~~~~ - -``dset.STL10(root, split='train', transform=None, target_transform=None, download=False)`` - -- ``root`` : root directory of dataset where there is folder ``stl10_binary`` -- ``split`` : ``'train'`` = Training set, ``'test'`` = Test set, ``'unlabeled'`` = Unlabeled set, ``'train+unlabeled'`` = Training + Unlabeled set (missing label marked as ``-1``) -- ``download`` : ``True`` = downloads the dataset from the internet and puts it in root directory. If dataset already downloaded, doesn't do anything. - -.. _MNIST: #mnist -.. _COCO (Captioning and Detection): #coco -.. _LSUN Classification: #lsun -.. _ImageFolder: #imagefolder -.. _Imagenet-12: #imagenet-12 -.. _CIFAR10 and CIFAR100: #cifar -.. _STL10: #stl10 -.. _COCO API to be installed: https://github.com/pdollar/coco/tree/master/PythonAPI \ No newline at end of file diff --git a/docs/_sources/torchvision/models.rst.txt b/docs/_sources/torchvision/models.rst.txt deleted file mode 100644 index 5bde1742f133..000000000000 --- a/docs/_sources/torchvision/models.rst.txt +++ /dev/null @@ -1,11 +0,0 @@ -torchvision.models -=================== - -.. currentmodule:: torchvision.models - - -.. automodule:: torchvision.models - :members: alexnet, resnet18, resnet34, resnet50, resnet101, resnet152, - vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19, - vgg19_bn - :undoc-members: diff --git a/docs/_sources/torchvision/torchvision.rst.txt b/docs/_sources/torchvision/torchvision.rst.txt deleted file mode 100644 index bbffba767ca0..000000000000 --- a/docs/_sources/torchvision/torchvision.rst.txt +++ /dev/null @@ -1,5 +0,0 @@ -torchvision -=================== - -The :mod:`torchvision` package consists of popular datasets, model -architectures, and common image transformations for computer vision. diff --git a/docs/_sources/torchvision/transforms.rst.txt b/docs/_sources/torchvision/transforms.rst.txt deleted file mode 100644 index 9c97f0db2d60..000000000000 --- a/docs/_sources/torchvision/transforms.rst.txt +++ /dev/null @@ -1,40 +0,0 @@ -torchvision.transforms -====================== - -.. currentmodule:: torchvision.transforms - -.. autoclass:: Compose - -Transforms on PIL.Image ------------------------ - -.. autoclass:: Scale - -.. autoclass:: CenterCrop - -.. autoclass:: RandomCrop - -.. autoclass:: RandomHorizontalFlip - -.. autoclass:: RandomSizedCrop - -.. autoclass:: Pad - -Transforms on torch.\*Tensor ----------------------------- - -.. autoclass:: Normalize - - -Conversion Transforms ---------------------- - -.. autoclass:: ToTensor - -.. autoclass:: ToPILImage - -Generic Transforms ------------------- - -.. autoclass:: Lambda - diff --git a/docs/_sources/torchvision/utils.rst.txt b/docs/_sources/torchvision/utils.rst.txt deleted file mode 100644 index 468ddf683739..000000000000 --- a/docs/_sources/torchvision/utils.rst.txt +++ /dev/null @@ -1,9 +0,0 @@ -torchvision.utils -=================== - -.. currentmodule:: torchvision.utils - -.. autofunction:: make_grid - -.. autofunction:: save_image - diff --git a/docs/_static/ajax-loader.gif b/docs/_static/ajax-loader.gif deleted file mode 100644 index 61faf8cab239..000000000000 Binary files a/docs/_static/ajax-loader.gif and /dev/null differ diff --git a/docs/_static/basic.css b/docs/_static/basic.css deleted file mode 100644 index 7ed0e58edb31..000000000000 --- a/docs/_static/basic.css +++ /dev/null @@ -1,632 +0,0 @@ -/* - * basic.css - * ~~~~~~~~~ - * - * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. 
- * :license: BSD, see LICENSE for details. - * - */ - -/* -- main layout ----------------------------------------------------------- */ - -div.clearer { - clear: both; -} - -/* -- relbar ---------------------------------------------------------------- */ - -div.related { - width: 100%; - font-size: 90%; -} - -div.related h3 { - display: none; -} - -div.related ul { - margin: 0; - padding: 0 0 0 10px; - list-style: none; -} - -div.related li { - display: inline; -} - -div.related li.right { - float: right; - margin-right: 5px; -} - -/* -- sidebar --------------------------------------------------------------- */ - -div.sphinxsidebarwrapper { - padding: 10px 5px 0 10px; -} - -div.sphinxsidebar { - float: left; - width: 230px; - margin-left: -100%; - font-size: 90%; - word-wrap: break-word; - overflow-wrap : break-word; -} - -div.sphinxsidebar ul { - list-style: none; -} - -div.sphinxsidebar ul ul, -div.sphinxsidebar ul.want-points { - margin-left: 20px; - list-style: square; -} - -div.sphinxsidebar ul ul { - margin-top: 0; - margin-bottom: 0; -} - -div.sphinxsidebar form { - margin-top: 10px; -} - -div.sphinxsidebar input { - border: 1px solid #98dbcc; - font-family: sans-serif; - font-size: 1em; -} - -div.sphinxsidebar #searchbox input[type="text"] { - width: 170px; -} - -img { - border: 0; - max-width: 100%; -} - -/* -- search page ----------------------------------------------------------- */ - -ul.search { - margin: 10px 0 0 20px; - padding: 0; -} - -ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; -} - -ul.search li a { - font-weight: bold; -} - -ul.search li div.context { - color: #888; - margin: 2px 0 0 30px; - text-align: left; -} - -ul.keywordmatches li.goodmatch a { - font-weight: bold; -} - -/* -- index page ------------------------------------------------------------ */ - -table.contentstable { - width: 90%; - margin-left: auto; - margin-right: auto; -} - -table.contentstable p.biglink { - line-height: 150%; -} - -a.biglink { - font-size: 1.3em; -} - -span.linkdescr { - font-style: italic; - padding-top: 5px; - font-size: 90%; -} - -/* -- general index --------------------------------------------------------- */ - -table.indextable { - width: 100%; -} - -table.indextable td { - text-align: left; - vertical-align: top; -} - -table.indextable ul { - margin-top: 0; - margin-bottom: 0; - list-style-type: none; -} - -table.indextable > tbody > tr > td > ul { - padding-left: 0em; -} - -table.indextable tr.pcap { - height: 10px; -} - -table.indextable tr.cap { - margin-top: 10px; - background-color: #f2f2f2; -} - -img.toggler { - margin-right: 3px; - margin-top: 3px; - cursor: pointer; -} - -div.modindex-jumpbox { - border-top: 1px solid #ddd; - border-bottom: 1px solid #ddd; - margin: 1em 0 1em 0; - padding: 0.4em; -} - -div.genindex-jumpbox { - border-top: 1px solid #ddd; - border-bottom: 1px solid #ddd; - margin: 1em 0 1em 0; - padding: 0.4em; -} - -/* -- domain module index --------------------------------------------------- */ - -table.modindextable td { - padding: 2px; - border-collapse: collapse; -} - -/* -- general body styles --------------------------------------------------- */ - -div.body p, div.body dd, div.body li, div.body blockquote { - -moz-hyphens: auto; - -ms-hyphens: auto; - -webkit-hyphens: auto; - hyphens: auto; -} - -a.headerlink { - visibility: hidden; -} - -h1:hover > a.headerlink, -h2:hover > a.headerlink, -h3:hover > a.headerlink, -h4:hover > a.headerlink, -h5:hover > 
a.headerlink, -h6:hover > a.headerlink, -dt:hover > a.headerlink, -caption:hover > a.headerlink, -p.caption:hover > a.headerlink, -div.code-block-caption:hover > a.headerlink { - visibility: visible; -} - -div.body p.caption { - text-align: inherit; -} - -div.body td { - text-align: left; -} - -.first { - margin-top: 0 !important; -} - -p.rubric { - margin-top: 30px; - font-weight: bold; -} - -img.align-left, .figure.align-left, object.align-left { - clear: left; - float: left; - margin-right: 1em; -} - -img.align-right, .figure.align-right, object.align-right { - clear: right; - float: right; - margin-left: 1em; -} - -img.align-center, .figure.align-center, object.align-center { - display: block; - margin-left: auto; - margin-right: auto; -} - -.align-left { - text-align: left; -} - -.align-center { - text-align: center; -} - -.align-right { - text-align: right; -} - -/* -- sidebars -------------------------------------------------------------- */ - -div.sidebar { - margin: 0 0 0.5em 1em; - border: 1px solid #ddb; - padding: 7px 7px 0 7px; - background-color: #ffe; - width: 40%; - float: right; -} - -p.sidebar-title { - font-weight: bold; -} - -/* -- topics ---------------------------------------------------------------- */ - -div.topic { - border: 1px solid #ccc; - padding: 7px 7px 0 7px; - margin: 10px 0 10px 0; -} - -p.topic-title { - font-size: 1.1em; - font-weight: bold; - margin-top: 10px; -} - -/* -- admonitions ----------------------------------------------------------- */ - -div.admonition { - margin-top: 10px; - margin-bottom: 10px; - padding: 7px; -} - -div.admonition dt { - font-weight: bold; -} - -div.admonition dl { - margin-bottom: 0; -} - -p.admonition-title { - margin: 0px 10px 5px 0px; - font-weight: bold; -} - -div.body p.centered { - text-align: center; - margin-top: 25px; -} - -/* -- tables ---------------------------------------------------------------- */ - -table.docutils { - border: 0; - border-collapse: collapse; -} - -table caption span.caption-number { - font-style: italic; -} - -table caption span.caption-text { -} - -table.docutils td, table.docutils th { - padding: 1px 8px 1px 5px; - border-top: 0; - border-left: 0; - border-right: 0; - border-bottom: 1px solid #aaa; -} - -table.footnote td, table.footnote th { - border: 0 !important; -} - -th { - text-align: left; - padding-right: 5px; -} - -table.citation { - border-left: solid 1px gray; - margin-left: 1px; -} - -table.citation td { - border-bottom: none; -} - -/* -- figures --------------------------------------------------------------- */ - -div.figure { - margin: 0.5em; - padding: 0.5em; -} - -div.figure p.caption { - padding: 0.3em; -} - -div.figure p.caption span.caption-number { - font-style: italic; -} - -div.figure p.caption span.caption-text { -} - -/* -- field list styles ----------------------------------------------------- */ - -table.field-list td, table.field-list th { - border: 0 !important; -} - -.field-list ul { - margin: 0; - padding-left: 1em; -} - -.field-list p { - margin: 0; -} - -/* -- other body styles ----------------------------------------------------- */ - -ol.arabic { - list-style: decimal; -} - -ol.loweralpha { - list-style: lower-alpha; -} - -ol.upperalpha { - list-style: upper-alpha; -} - -ol.lowerroman { - list-style: lower-roman; -} - -ol.upperroman { - list-style: upper-roman; -} - -dl { - margin-bottom: 15px; -} - -dd p { - margin-top: 0px; -} - -dd ul, dd table { - margin-bottom: 10px; -} - -dd { - margin-top: 3px; - margin-bottom: 10px; - margin-left: 30px; -} - 
-dt:target, .highlighted { - background-color: #fbe54e; -} - -dl.glossary dt { - font-weight: bold; - font-size: 1.1em; -} - -.optional { - font-size: 1.3em; -} - -.sig-paren { - font-size: larger; -} - -.versionmodified { - font-style: italic; -} - -.system-message { - background-color: #fda; - padding: 5px; - border: 3px solid red; -} - -.footnote:target { - background-color: #ffa; -} - -.line-block { - display: block; - margin-top: 1em; - margin-bottom: 1em; -} - -.line-block .line-block { - margin-top: 0; - margin-bottom: 0; - margin-left: 1.5em; -} - -.guilabel, .menuselection { - font-family: sans-serif; -} - -.accelerator { - text-decoration: underline; -} - -.classifier { - font-style: oblique; -} - -abbr, acronym { - border-bottom: dotted 1px; - cursor: help; -} - -/* -- code displays --------------------------------------------------------- */ - -pre { - overflow: auto; - overflow-y: hidden; /* fixes display issues on Chrome browsers */ -} - -span.pre { - -moz-hyphens: none; - -ms-hyphens: none; - -webkit-hyphens: none; - hyphens: none; -} - -td.linenos pre { - padding: 5px 0px; - border: 0; - background-color: transparent; - color: #aaa; -} - -table.highlighttable { - margin-left: 0.5em; -} - -table.highlighttable td { - padding: 0 0.5em 0 0.5em; -} - -div.code-block-caption { - padding: 2px 5px; - font-size: small; -} - -div.code-block-caption code { - background-color: transparent; -} - -div.code-block-caption + div > div.highlight > pre { - margin-top: 0; -} - -div.code-block-caption span.caption-number { - padding: 0.1em 0.3em; - font-style: italic; -} - -div.code-block-caption span.caption-text { -} - -div.literal-block-wrapper { - padding: 1em 1em 0; -} - -div.literal-block-wrapper div.highlight { - margin: 0; -} - -code.descname { - background-color: transparent; - font-weight: bold; - font-size: 1.2em; -} - -code.descclassname { - background-color: transparent; -} - -code.xref, a code { - background-color: transparent; - font-weight: bold; -} - -h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { - background-color: transparent; -} - -.viewcode-link { - float: right; -} - -.viewcode-back { - float: right; - font-family: sans-serif; -} - -div.viewcode-block:target { - margin: -1px -10px; - padding: 0 10px; -} - -/* -- math display ---------------------------------------------------------- */ - -img.math { - vertical-align: middle; -} - -div.body div.math p { - text-align: center; -} - -span.eqno { - float: right; -} - -span.eqno a.headerlink { - position: relative; - left: 0px; - z-index: 1; -} - -div.math:hover a.headerlink { - visibility: visible; -} - -/* -- printout stylesheet --------------------------------------------------- */ - -@media print { - div.document, - div.documentwrapper, - div.bodywrapper { - margin: 0 !important; - width: 100%; - } - - div.sphinxsidebar, - div.related, - div.footer, - #top-link { - display: none; - } -} \ No newline at end of file diff --git a/docs/_static/comment-bright.png b/docs/_static/comment-bright.png deleted file mode 100644 index 15e27edb12ac..000000000000 Binary files a/docs/_static/comment-bright.png and /dev/null differ diff --git a/docs/_static/comment-close.png b/docs/_static/comment-close.png deleted file mode 100644 index 4d91bcf57de8..000000000000 Binary files a/docs/_static/comment-close.png and /dev/null differ diff --git a/docs/_static/comment.png b/docs/_static/comment.png deleted file mode 100644 index dfbc0cbd512b..000000000000 Binary files a/docs/_static/comment.png and /dev/null differ diff --git 
a/docs/_static/css/badge_only.css b/docs/_static/css/badge_only.css deleted file mode 100644 index f4b46e9046c4..000000000000 --- a/docs/_static/css/badge_only.css +++ /dev/null @@ -1,2 +0,0 @@ -.fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-weight:normal;font-style:normal;src:url("../fonts/fontawesome-webfont.eot");src:url("../fonts/fontawesome-webfont.eot?#iefix") format("embedded-opentype"),url("../fonts/fontawesome-webfont.woff") format("woff"),url("../fonts/fontawesome-webfont.ttf") format("truetype"),url("../fonts/fontawesome-webfont.svg#FontAwesome") format("svg")}.fa:before{display:inline-block;font-family:FontAwesome;font-style:normal;font-weight:normal;line-height:1;text-decoration:inherit}a .fa{display:inline-block;text-decoration:inherit}li .fa{display:inline-block}li .fa-large:before,li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-0.8em}ul.fas li .fa{width:0.8em}ul.fas li .fa-large:before,ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before{content:""}.icon-book:before{content:""}.fa-caret-down:before{content:""}.icon-caret-down:before{content:""}.fa-caret-up:before{content:""}.icon-caret-up:before{content:""}.fa-caret-left:before{content:""}.icon-caret-left:before{content:""}.fa-caret-right:before{content:""}.icon-caret-right:before{content:""}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;border-top:solid 10px #343131;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;z-index:400}.rst-versions a{color:#2980B9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27AE60;*zoom:1}.rst-versions .rst-current-version:before,.rst-versions .rst-current-version:after{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book{float:left}.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#E74C3C;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#F1C40F;color:#000}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:gray;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:solid 1px #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px}.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge .fa-book{float:none}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book{float:left}.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge .rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width: 768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} -/*# sourceMappingURL=badge_only.css.map */ 
diff --git a/docs/_static/css/badge_only.css.map b/docs/_static/css/badge_only.css.map deleted file mode 100644 index a302a9f0afc7..000000000000 --- a/docs/_static/css/badge_only.css.map +++ /dev/null @@ -1,7 +0,0 @@ -{ -"version": 3, -"mappings": "CAyDA,SAAY,EACV,qBAAsB,EAAE,UAAW,EAqDrC,QAAS,EARP,IAAK,EAAE,AAAC,EACR,+BAAS,EAEP,MAAO,EAAE,IAAK,EACd,MAAO,EAAE,CAAE,EACb,cAAO,EACL,IAAK,EAAE,GAAI,EC1Gb,SAkBC,EAjBC,UAAW,ECFJ,UAAW,EDGlB,UAAW,EAHqC,KAAM,EAItD,SAAU,EAJsD,KAAM,EAapE,EAAG,EAAE,sCAAwB,EAC7B,EAAG,EAAE,8PAG2D,ECftE,SAAU,EACR,MAAO,EAAE,WAAY,EACrB,UAAW,EAAE,UAAW,EACxB,SAAU,EAAE,KAAM,EAClB,UAAW,EAAE,KAAM,EACnB,UAAW,EAAE,AAAC,EACd,cAAe,EAAE,MAAO,EAG1B,IAAK,EACH,MAAO,EAAE,WAAY,EACrB,cAAe,EAAE,MAAO,EAIxB,KAAG,EACD,MAAO,EAAE,WAAY,EACvB,sCAAiB,EAGf,IAAK,EAAE,MAAY,EAEvB,KAAM,EACJ,cAAe,EAAE,GAAI,EACrB,UAAW,EAAE,EAAG,EAChB,UAAW,EAAE,KAAM,EAEjB,YAAG,EACD,IAAK,EAAE,IAAI,EACb,oDAAiB,EAGf,aAAc,EAAE,OAAQ,EAG9B,cAAe,EACb,MAAO,EAAE,EAAO,EAElB,gBAAiB,EACf,MAAO,EAAE,EAAO,EAElB,oBAAqB,EACnB,MAAO,EAAE,EAAO,EAElB,sBAAuB,EACrB,MAAO,EAAE,EAAO,EAElB,kBAAmB,EACjB,MAAO,EAAE,EAAO,EAElB,oBAAqB,EACnB,MAAO,EAAE,EAAO,EAElB,oBAAqB,EACnB,MAAO,EAAE,EAAO,EAElB,sBAAuB,EACrB,MAAO,EAAE,EAAO,EAElB,qBAAsB,EACpB,MAAO,EAAE,EAAO,EAElB,uBAAwB,EACtB,MAAO,EAAE,EAAO,ECnElB,YAAa,EACX,OAAQ,EAAE,IAAK,EACf,KAAM,EAAE,AAAC,EACT,GAAI,EAAE,AAAC,EACP,IAAK,EC6E+B,IAAK,ED5EzC,IAAK,EEuC+B,MAAyB,EFtC7D,SAAU,EAAE,MAAkC,EAC9C,SAAU,EAAE,iBAAiC,EAC7C,UAAW,EEkDyB,sDAA2D,EFjD/F,MAAO,EC+E6B,EAAG,ED9EvC,cAAC,EACC,IAAK,EEkC6B,MAAK,EFjCvC,cAAe,EAAE,GAAI,EACvB,6BAAgB,EACd,MAAO,EAAE,GAAI,EACf,iCAAoB,EAClB,MAAO,EAAE,GAAqB,EAC9B,eAAgB,EAAE,MAAkC,EACpD,MAAO,EAAE,IAAK,EACd,SAAU,EAAE,IAAK,EACjB,QAAS,EAAE,EAAG,EACd,KAAM,EAAE,MAAO,EACf,IAAK,EEX6B,MAAM,EL4F1C,IAAK,EAAE,AAAC,EACR,iFAAS,EAEP,MAAO,EAAE,IAAK,EACd,MAAO,EAAE,CAAE,EACb,uCAAO,EACL,IAAK,EAAE,GAAI,EGrFX,qCAAG,EACD,IAAK,EEmB2B,MAAyB,EFlB3D,0CAAQ,EACN,IAAK,EAAE,GAAI,EACb,4CAAU,EACR,IAAK,EAAE,GAAI,EACb,iDAAiB,EACf,eAAgB,ECQgB,MAAI,EDPpC,IAAK,EEO2B,GAAM,EFNxC,wDAAwB,EACtB,eAAgB,EEsBgB,MAAO,EFrBvC,IAAK,ECzB2B,GAAI,ED0BxC,yCAA8B,EAC5B,MAAO,EAAE,IAAK,EAChB,gCAAmB,EACjB,QAAS,EAAE,EAAG,EACd,MAAO,EAAE,GAAqB,EAC9B,IAAK,EEJ6B,GAAY,EFK9C,MAAO,EAAE,GAAI,EACb,mCAAE,EACA,MAAO,EAAE,IAAK,EACd,KAAM,EAAE,EAAG,EACX,KAAM,EAAE,AAAC,EACT,KAAM,EAAE,KAAM,EACd,MAAO,EAAE,AAAC,EACV,SAAU,EAAE,gBAA6C,EAC3D,mCAAE,EACA,MAAO,EAAE,WAAY,EACrB,KAAM,EAAE,AAAC,EACT,qCAAC,EACC,MAAO,EAAE,WAAY,EACrB,MAAO,EAAE,EAAqB,EAC9B,IAAK,EEZyB,MAAyB,EFa7D,sBAAW,EACT,IAAK,EAAE,GAAI,EACX,KAAM,EAAE,GAAI,EACZ,IAAK,EAAE,GAAI,EACX,GAAI,EAAE,GAAI,EACV,KAAM,EAAE,GAAI,EACZ,QAAS,ECkByB,IAAK,EDjBvC,iCAAU,EACR,IAAK,EAAE,GAAI,EACb,+BAAQ,EACN,IAAK,EAAE,GAAI,EACb,oDAA+B,EAC7B,SAAU,EAAE,IAAK,EACjB,6DAAQ,EACN,IAAK,EAAE,GAAI,EACb,+DAAU,EACR,IAAK,EAAE,GAAI,EACf,2CAAoB,EAClB,IAAK,EAAE,GAAI,EACX,KAAM,EAAE,GAAI,EACZ,UAAW,EAAE,GAAI,EACjB,MAAO,EAAE,IAAuB,EAChC,MAAO,EAAE,IAAK,EACd,SAAU,EAAE,KAAM,EGhDpB,mCAAsB,EHmDxB,YAAa,EACX,IAAK,EAAE,EAAG,EACV,MAAO,EAAE,GAAI,EACb,kBAAO,EACL,MAAO,EAAE,IAAK", -"sources": ["../../../bower_components/wyrm/sass/wyrm_core/_mixin.sass","../../../bower_components/bourbon/dist/css3/_font-face.scss","../../../sass/_theme_badge_fa.sass","../../../sass/_theme_badge.sass","../../../bower_components/wyrm/sass/wyrm_core/_wy_variables.sass","../../../sass/_theme_variables.sass","../../../bower_components/neat/app/assets/stylesheets/grid/_media.scss"], -"names": [], -"file": "badge_only.css" -} diff --git a/docs/_static/css/pytorch_theme.css b/docs/_static/css/pytorch_theme.css deleted file mode 100644 index 31ba06911b7b..000000000000 --- 
a/docs/_static/css/pytorch_theme.css +++ /dev/null @@ -1,114 +0,0 @@ -body { - font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; -} - -/* Default header fonts are ugly */ -h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption { - font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; -} - -/* Use white for docs background */ -.wy-side-nav-search { - background-color: #fff; -} - -.wy-nav-content-wrap, .wy-menu li.current > a { - background-color: #fff; -} - -@media screen and (min-width: 1400px) { - .wy-nav-content-wrap { - background-color: rgba(0, 0, 0, 0.0470588); - } - - .wy-nav-content { - background-color: #fff; - } -} - -/* Fixes for mobile */ -.wy-nav-top { - background-color: #fff; - background-image: url('../img/pytorch-logo-dark.svg'); - background-repeat: no-repeat; - background-position: center; - padding: 0; - margin: 0.4045em 0.809em; - color: #333; -} - -.wy-nav-top > a { - display: none; -} - -@media screen and (max-width: 768px) { - .wy-side-nav-search>a img.logo { - height: 60px; - } -} - -/* This is needed to ensure that logo above search scales properly */ -.wy-side-nav-search a { - display: block; -} - -/* This ensures that multiple constructors will remain in separate lines. */ -.rst-content dl:not(.docutils) dt { - display: table; -} - -/* Use our red for literals (it's very similar to the original color) */ -.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal { - color: #F05732; -} - -.rst-content tt.xref, a .rst-content tt, .rst-content tt.xref, -.rst-content code.xref, a .rst-content tt, a .rst-content code { - color: #404040; -} - -/* Change link colors (except for the menu) */ - -a { - color: #F05732; -} - -a:hover { - color: #F05732; -} - - -a:visited { - color: #D44D2C; -} - -.wy-menu a { - color: #b3b3b3; -} - -.wy-menu a:hover { - color: #b3b3b3; -} - -/* Default footer text is quite big */ -footer { - font-size: 80%; -} - -footer .rst-footer-buttons { - font-size: 125%; /* revert footer settings - 1/80% = 125% */ -} - -footer p { - font-size: 100%; -} - -/* For hidden headers that appear in TOC tree */ -/* see http://stackoverflow.com/a/32363545/3343043 */ -.rst-content .hidden-section { - display: none; -} - -nav .hidden-section { - display: inherit; -} diff --git a/docs/_static/css/theme.css b/docs/_static/css/theme.css deleted file mode 100644 index 252eef538462..000000000000 --- a/docs/_static/css/theme.css +++ /dev/null @@ -1,5 +0,0 @@ -*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}audio:not([controls]){display:none}[hidden]{display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:hover,a:active{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:bold}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;color:#000;text-decoration:none}mark{background:#ff0;color:#000;font-style:italic;font-weight:bold}pre,code,.rst-content tt,.rst-content code,kbd,samp{font-family:monospace,serif;_font-family:"courier 
new",monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:before,q:after{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-0.5em}sub{bottom:-0.25em}ul,ol,dl{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure{margin:0}form{margin:0}fieldset{border:0;margin:0;padding:0}label{cursor:pointer}legend{border:0;*margin-left:-7px;padding:0;white-space:normal}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type="button"],input[type="reset"],input[type="submit"]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type="checkbox"],input[type="radio"]{box-sizing:border-box;padding:0;*width:13px;*height:13px}input[type="search"]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}input[type="search"]::-webkit-search-decoration,input[type="search"]::-webkit-search-cancel-button{-webkit-appearance:none}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}textarea{overflow:auto;vertical-align:top;resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:0.2em 0;background:#ccc;color:#000;padding:0.2em 0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none !important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media print{html,body,section{background:none !important}*{box-shadow:none !important;text-shadow:none !important;filter:none !important;-ms-filter:none !important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,.rst-content .toctree-wrapper p.caption,h3{orphans:3;widows:3}h2,.rst-content .toctree-wrapper p.caption,h3{page-break-after:avoid}}.fa:before,.wy-menu-vertical li span.toctree-expand:before,.wy-menu-vertical li.on a span.toctree-expand:before,.wy-menu-vertical li.current>a span.toctree-expand:before,.rst-content .admonition-title:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content dl dt .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content tt.download span:first-child:before,.rst-content code.download span:first-child:before,.icon:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning 
.wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-alert,.rst-content .note,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .warning,.rst-content .seealso,.rst-content .admonition-todo,.btn,input[type="text"],input[type="password"],input[type="email"],input[type="url"],input[type="date"],input[type="month"],input[type="time"],input[type="datetime"],input[type="datetime-local"],input[type="week"],input[type="number"],input[type="search"],input[type="tel"],input[type="color"],select,textarea,.wy-menu-vertical li.on a,.wy-menu-vertical li.current>a,.wy-side-nav-search>a,.wy-side-nav-search .wy-dropdown>a,.wy-nav-top a{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;content:""}.clearfix:after{clear:both}/*! - * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome - * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) - */@font-face{font-family:'FontAwesome';src:url("../fonts/fontawesome-webfont.eot?v=4.7.0");src:url("../fonts/fontawesome-webfont.eot?#iefix&v=4.7.0") format("embedded-opentype"),url("../fonts/fontawesome-webfont.woff2?v=4.7.0") format("woff2"),url("../fonts/fontawesome-webfont.woff?v=4.7.0") format("woff"),url("../fonts/fontawesome-webfont.ttf?v=4.7.0") format("truetype"),url("../fonts/fontawesome-webfont.svg?v=4.7.0#fontawesomeregular") format("svg");font-weight:normal;font-style:normal}.fa,.wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.current>a span.toctree-expand,.rst-content .admonition-title,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content dl dt .headerlink,.rst-content p.caption .headerlink,.rst-content tt.download span:first-child,.rst-content code.download span:first-child,.icon{display:inline-block;font:normal normal normal 14px/1 FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:solid 0.08em #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa.fa-pull-left,.wy-menu-vertical li span.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a span.fa-pull-left.toctree-expand,.wy-menu-vertical li.current>a span.fa-pull-left.toctree-expand,.rst-content .fa-pull-left.admonition-title,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content dl dt .fa-pull-left.headerlink,.rst-content p.caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.rst-content code.download span.fa-pull-left:first-child,.fa-pull-left.icon{margin-right:.3em}.fa.fa-pull-right,.wy-menu-vertical li 
span.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a span.fa-pull-right.toctree-expand,.wy-menu-vertical li.current>a span.fa-pull-right.toctree-expand,.rst-content .fa-pull-right.admonition-title,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content dl dt .fa-pull-right.headerlink,.rst-content p.caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.rst-content code.download span.fa-pull-right:first-child,.fa-pull-right.icon{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.wy-menu-vertical li span.pull-left.toctree-expand,.wy-menu-vertical li.on a span.pull-left.toctree-expand,.wy-menu-vertical li.current>a span.pull-left.toctree-expand,.rst-content .pull-left.admonition-title,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content dl dt .pull-left.headerlink,.rst-content p.caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.rst-content code.download span.pull-left:first-child,.pull-left.icon{margin-right:.3em}.fa.pull-right,.wy-menu-vertical li span.pull-right.toctree-expand,.wy-menu-vertical li.on a span.pull-right.toctree-expand,.wy-menu-vertical li.current>a span.pull-right.toctree-expand,.rst-content .pull-right.admonition-title,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content dl dt .pull-right.headerlink,.rst-content p.caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.rst-content code.download span.pull-right:first-child,.pull-right.icon{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s infinite linear;animation:fa-spin 2s infinite linear}.fa-pulse{-webkit-animation:fa-spin 1s infinite steps(8);animation:fa-spin 1s infinite steps(8)}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scale(-1, 1);-ms-transform:scale(-1, 1);transform:scale(-1, 1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scale(1, -1);-ms-transform:scale(1, -1);transform:scale(1, -1)}:root .fa-rotate-90,:root .fa-rotate-180,:root .fa-rotate-270,:root 
.fa-flip-horizontal,:root .fa-flip-vertical{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-remove:before,.fa-close:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-gear:before,.fa-cog:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content tt.download span:first-child:before,.rst-content code.download span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-rotate-right:before,.fa-repeat:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-photo:before,.fa-image:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content:""}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success 
.wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.rst-content .admonition-title:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-warning:before,.fa-exclamation-triangle:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-gears:before,.fa-cogs:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{content:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-save:before,.fa-floppy-o:before{content:""}.fa
-square:before{content:""}.fa-navicon:before,.fa-reorder:before,.fa-bars:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magic:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.wy-dropdown .caret:before,.icon-caret-down:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-unsorted:before,.fa-sort:before{content:""}.fa-sort-down:before,.fa-sort-desc:before{content:""}.fa-sort-up:before,.fa-sort-asc:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-legal:before,.fa-gavel:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-flash:before,.fa-bolt:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-paste:before,.fa-clipboard:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-laptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinner:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-unlink:before,.fa-chain-broken:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:
before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:""}.fa-minus-square-o:before,.wy-menu-vertical li.on a span.toctree-expand:before,.wy-menu-vertical li.current>a span.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-toggle-down:before,.fa-caret-square-o-down:before{content:""}.fa-toggle-up:before,.fa-caret-square-o-up:before{content:""}.fa-toggle-right:before,.fa-caret-square-o-right:before{content:""}.fa-euro:before,.fa-eur:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-rupee:before,.fa-inr:before{content:""}.fa-cny:before,.fa-rmb:before,.fa-yen:before,.fa-jpy:before{content:""}.fa-ruble:before,.fa-rouble:before,.fa-rub:before{content:""}.fa-won:before,.fa-krw:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-apple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-toggle-left:before,.fa-caret-square-o-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-vime
o-square:before{content:""}.fa-turkish-lira:before,.fa-try:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li span.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-institution:before,.fa-bank:before,.fa-university:before{content:""}.fa-mortar-board:before,.fa-graduation-cap:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-square:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-photo-o:before,.fa-file-picture-o:before,.fa-file-image-o:before{content:""}.fa-file-zip-o:before,.fa-file-archive-o:before{content:""}.fa-file-sound-o:before,.fa-file-audio-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-saver:before,.fa-support:before,.fa-life-ring:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-resistance:before,.fa-rebel:before{content:""}.fa-ge:before,.fa-empire:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-y-combinator-square:before,.fa-yc-square:before,.fa-hacker-news:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-send:before,.fa-paper-plane:before{content:""}.fa-send-o:before,.fa-paper-plane-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-soccer-ball-o:before,.fa-futbol-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{content:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-bell-sl
ash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-shekel:before,.fa-sheqel:before,.fa-ils:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-mars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-hotel:before,.fa-bed:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-yc:before,.fa-y-combinator:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery:before,.fa-battery-full:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-stop-o:before,.fa-hand-paper-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-registered:bef
ore{content:""}.fa-creative-commons:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-internet-explorer:before{content:""}.fa-tv:before,.fa-television:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{content:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-asl-interpreting:before,.fa-american-sign-language-interpreting:before{content:""}.fa-deafness:before,.fa-hard-of-hearing:before,.fa-deaf:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-signing:before,.fa-sign-language:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-vcard:before,.fa-address-card:before{content:""}.fa-vcard-o:before,.fa-address-card-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{content
:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer:before,.fa-thermometer-full:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:before{content:""}.fa-bathtub:before,.fa-s15:before,.fa-bath:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{content:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0, 0, 0, 0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.current>a span.toctree-expand,.rst-content .admonition-title,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content dl dt .headerlink,.rst-content p.caption .headerlink,.rst-content tt.download span:first-child,.rst-content code.download span:first-child,.icon,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context{font-family:inherit}.fa:before,.wy-menu-vertical li span.toctree-expand:before,.wy-menu-vertical li.on a span.toctree-expand:before,.wy-menu-vertical li.current>a span.toctree-expand:before,.rst-content .admonition-title:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content dl dt .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content tt.download span:first-child:before,.rst-content code.download span:first-child:before,.icon:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before{font-family:"FontAwesome";display:inline-block;font-style:normal;font-weight:normal;line-height:1;text-decoration:inherit}a .fa,a .wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li a span.toctree-expand,.wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.current>a span.toctree-expand,a .rst-content .admonition-title,.rst-content a .admonition-title,a .rst-content h1 
.headerlink,.rst-content h1 a .headerlink,a .rst-content h2 .headerlink,.rst-content h2 a .headerlink,a .rst-content h3 .headerlink,.rst-content h3 a .headerlink,a .rst-content h4 .headerlink,.rst-content h4 a .headerlink,a .rst-content h5 .headerlink,.rst-content h5 a .headerlink,a .rst-content h6 .headerlink,.rst-content h6 a .headerlink,a .rst-content dl dt .headerlink,.rst-content dl dt a .headerlink,a .rst-content p.caption .headerlink,.rst-content p.caption a .headerlink,a .rst-content tt.download span:first-child,.rst-content tt.download a span:first-child,a .rst-content code.download span:first-child,.rst-content code.download a span:first-child,a .icon{display:inline-block;text-decoration:inherit}.btn .fa,.btn .wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li .btn span.toctree-expand,.btn .wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.on a .btn span.toctree-expand,.btn .wy-menu-vertical li.current>a span.toctree-expand,.wy-menu-vertical li.current>a .btn span.toctree-expand,.btn .rst-content .admonition-title,.rst-content .btn .admonition-title,.btn .rst-content h1 .headerlink,.rst-content h1 .btn .headerlink,.btn .rst-content h2 .headerlink,.rst-content h2 .btn .headerlink,.btn .rst-content h3 .headerlink,.rst-content h3 .btn .headerlink,.btn .rst-content h4 .headerlink,.rst-content h4 .btn .headerlink,.btn .rst-content h5 .headerlink,.rst-content h5 .btn .headerlink,.btn .rst-content h6 .headerlink,.rst-content h6 .btn .headerlink,.btn .rst-content dl dt .headerlink,.rst-content dl dt .btn .headerlink,.btn .rst-content p.caption .headerlink,.rst-content p.caption .btn .headerlink,.btn .rst-content tt.download span:first-child,.rst-content tt.download .btn span:first-child,.btn .rst-content code.download span:first-child,.rst-content code.download .btn span:first-child,.btn .icon,.nav .fa,.nav .wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li .nav span.toctree-expand,.nav .wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.on a .nav span.toctree-expand,.nav .wy-menu-vertical li.current>a span.toctree-expand,.wy-menu-vertical li.current>a .nav span.toctree-expand,.nav .rst-content .admonition-title,.rst-content .nav .admonition-title,.nav .rst-content h1 .headerlink,.rst-content h1 .nav .headerlink,.nav .rst-content h2 .headerlink,.rst-content h2 .nav .headerlink,.nav .rst-content h3 .headerlink,.rst-content h3 .nav .headerlink,.nav .rst-content h4 .headerlink,.rst-content h4 .nav .headerlink,.nav .rst-content h5 .headerlink,.rst-content h5 .nav .headerlink,.nav .rst-content h6 .headerlink,.rst-content h6 .nav .headerlink,.nav .rst-content dl dt .headerlink,.rst-content dl dt .nav .headerlink,.nav .rst-content p.caption .headerlink,.rst-content p.caption .nav .headerlink,.nav .rst-content tt.download span:first-child,.rst-content tt.download .nav span:first-child,.nav .rst-content code.download span:first-child,.rst-content code.download .nav span:first-child,.nav .icon{display:inline}.btn .fa.fa-large,.btn .wy-menu-vertical li span.fa-large.toctree-expand,.wy-menu-vertical li .btn span.fa-large.toctree-expand,.btn .rst-content .fa-large.admonition-title,.rst-content .btn .fa-large.admonition-title,.btn .rst-content h1 .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.rst-content h4 .btn 
.fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.btn .rst-content dl dt .fa-large.headerlink,.rst-content dl dt .btn .fa-large.headerlink,.btn .rst-content p.caption .fa-large.headerlink,.rst-content p.caption .btn .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.rst-content tt.download .btn span.fa-large:first-child,.btn .rst-content code.download span.fa-large:first-child,.rst-content code.download .btn span.fa-large:first-child,.btn .fa-large.icon,.nav .fa.fa-large,.nav .wy-menu-vertical li span.fa-large.toctree-expand,.wy-menu-vertical li .nav span.fa-large.toctree-expand,.nav .rst-content .fa-large.admonition-title,.rst-content .nav .fa-large.admonition-title,.nav .rst-content h1 .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.nav .rst-content dl dt .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.nav .rst-content p.caption .fa-large.headerlink,.rst-content p.caption .nav .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.nav .rst-content code.download span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.nav .fa-large.icon{line-height:0.9em}.btn .fa.fa-spin,.btn .wy-menu-vertical li span.fa-spin.toctree-expand,.wy-menu-vertical li .btn span.fa-spin.toctree-expand,.btn .rst-content .fa-spin.admonition-title,.rst-content .btn .fa-spin.admonition-title,.btn .rst-content h1 .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.btn .rst-content dl dt .fa-spin.headerlink,.rst-content dl dt .btn .fa-spin.headerlink,.btn .rst-content p.caption .fa-spin.headerlink,.rst-content p.caption .btn .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.rst-content tt.download .btn span.fa-spin:first-child,.btn .rst-content code.download span.fa-spin:first-child,.rst-content code.download .btn span.fa-spin:first-child,.btn .fa-spin.icon,.nav .fa.fa-spin,.nav .wy-menu-vertical li span.fa-spin.toctree-expand,.wy-menu-vertical li .nav span.fa-spin.toctree-expand,.nav .rst-content .fa-spin.admonition-title,.rst-content .nav .fa-spin.admonition-title,.nav .rst-content h1 .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.rst-content h5 .nav 
.fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.nav .rst-content dl dt .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.nav .rst-content p.caption .fa-spin.headerlink,.rst-content p.caption .nav .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.nav .rst-content code.download span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.nav .fa-spin.icon{display:inline-block}.btn.fa:before,.wy-menu-vertical li span.btn.toctree-expand:before,.rst-content .btn.admonition-title:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content dl dt .btn.headerlink:before,.rst-content p.caption .btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.rst-content code.download span.btn:first-child:before,.btn.icon:before{opacity:0.5;-webkit-transition:opacity 0.05s ease-in;-moz-transition:opacity 0.05s ease-in;transition:opacity 0.05s ease-in}.btn.fa:hover:before,.wy-menu-vertical li span.btn.toctree-expand:hover:before,.rst-content .btn.admonition-title:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content p.caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.rst-content code.download span.btn:first-child:hover:before,.btn.icon:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .wy-menu-vertical li span.toctree-expand:before,.wy-menu-vertical li .btn-mini span.toctree-expand:before,.btn-mini .rst-content .admonition-title:before,.rst-content .btn-mini .admonition-title:before,.btn-mini .rst-content h1 .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.rst-content h4 .btn-mini .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.btn-mini .rst-content dl dt .headerlink:before,.rst-content dl dt .btn-mini .headerlink:before,.btn-mini .rst-content p.caption .headerlink:before,.rst-content p.caption .btn-mini .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.rst-content tt.download .btn-mini span:first-child:before,.btn-mini .rst-content code.download span:first-child:before,.rst-content code.download .btn-mini span:first-child:before,.btn-mini .icon:before{font-size:14px;vertical-align:-15%}.wy-alert,.rst-content .note,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .warning,.rst-content .seealso,.rst-content .admonition-todo{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.wy-alert-title,.rst-content 
.admonition-title{color:#fff;font-weight:bold;display:block;color:#fff;background:#6ab0de;margin:-12px;padding:6px 12px;margin-bottom:12px}.wy-alert.wy-alert-danger,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.admonition-todo{background:#fdf3f2}.wy-alert.wy-alert-danger .wy-alert-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .danger .wy-alert-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .danger .admonition-title,.rst-content .error .admonition-title,.rst-content .wy-alert-danger.hint .admonition-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title{background:#f29f97}.wy-alert.wy-alert-warning,.rst-content .wy-alert-warning.note,.rst-content .attention,.rst-content .caution,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.tip,.rst-content .warning,.rst-content .wy-alert-warning.seealso,.rst-content .admonition-todo{background:#ffedcc}.wy-alert.wy-alert-warning .wy-alert-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .attention .wy-alert-title,.rst-content .caution .wy-alert-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .wy-alert-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .admonition-todo .wy-alert-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .attention .admonition-title,.rst-content .caution .admonition-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .warning .admonition-title,.rst-content .wy-alert-warning.seealso .admonition-title,.rst-content .admonition-todo 
.admonition-title{background:#f0b37e}.wy-alert.wy-alert-info,.rst-content .note,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content .wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.rst-content .seealso,.rst-content .wy-alert-info.admonition-todo{background:#e7f2fa}.wy-alert.wy-alert-info .wy-alert-title,.rst-content .note .wy-alert-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.rst-content .note .admonition-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .seealso .admonition-title,.rst-content .wy-alert-info.admonition-todo .admonition-title{background:#6ab0de}.wy-alert.wy-alert-success,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.warning,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.admonition-todo{background:#dbfaf4}.wy-alert.wy-alert-success .wy-alert-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .hint .wy-alert-title,.rst-content .important .wy-alert-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .hint .admonition-title,.rst-content .important .admonition-title,.rst-content .tip .admonition-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.seealso .admonition-title,.rst-content .wy-alert-success.admonition-todo .admonition-title{background:#1abc9c}.wy-alert.wy-alert-neutral,.rst-content .wy-alert-neutral.note,.rst-content 
.wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content .wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.admonition-todo{background:#f3f6f6}.wy-alert.wy-alert-neutral .wy-alert-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert-neutral.seealso .wy-alert-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .admonition-title{color:#404040;background:#e1e4e5}.wy-alert.wy-alert-neutral a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content .wy-alert-neutral.important a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.admonition-todo a{color:#2980B9}.wy-alert p:last-child,.rst-content .note p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.rst-content .seealso p:last-child,.rst-content .admonition-todo p:last-child{margin-bottom:0}.wy-tray-container{position:fixed;bottom:0px;left:0;z-index:600}.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,0.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all 0.3s ease-in;-moz-transition:all 0.3s ease-in;transition:all 0.3s ease-in}.wy-tray-container li.wy-tray-item-success{background:#27AE60}.wy-tray-container li.wy-tray-item-info{background:#2980B9}.wy-tray-container li.wy-tray-item-warning{background:#E67E22}.wy-tray-container li.wy-tray-item-danger{background:#E74C3C}.wy-tray-container li.on{opacity:1;height:56px}@media screen and (max-width: 768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container 
li{width:100%}}button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px 12px;color:#fff;border:1px solid rgba(0,0,0,0.1);background-color:#27AE60;text-decoration:none;font-weight:normal;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;box-shadow:0px 1px 2px -1px rgba(255,255,255,0.5) inset,0px -2px 0px 0px rgba(0,0,0,0.1) inset;outline-none:false;vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all 0.1s linear;-moz-transition:all 0.1s linear;transition:all 0.1s linear}.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:0px -1px 0px 0px rgba(0,0,0,0.05) inset,0px 2px 0px 0px rgba(0,0,0,0.1) inset;padding:8px 12px 6px 12px}.btn:visited{color:#fff}.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:0.4;cursor:not-allowed;box-shadow:none}.btn-disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:0.4;cursor:not-allowed;box-shadow:none}.btn-disabled:hover,.btn-disabled:focus,.btn-disabled:active{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:0.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}.btn-info{background-color:#2980B9 !important}.btn-info:hover{background-color:#2e8ece !important}.btn-neutral{background-color:#f3f6f6 !important;color:#404040 !important}.btn-neutral:hover{background-color:#e5ebeb !important;color:#404040}.btn-neutral:visited{color:#404040 !important}.btn-success{background-color:#27AE60 !important}.btn-success:hover{background-color:#295 !important}.btn-danger{background-color:#E74C3C !important}.btn-danger:hover{background-color:#ea6153 !important}.btn-warning{background-color:#E67E22 !important}.btn-warning:hover{background-color:#e98b39 !important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f !important}.btn-link{background-color:transparent !important;color:#2980B9;box-shadow:none;border-color:transparent !important}.btn-link:hover{background-color:transparent !important;color:#409ad5 !important;box-shadow:none}.btn-link:active{background-color:transparent !important;color:#409ad5 !important;box-shadow:none}.btn-link:visited{color:#9B59B6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:before,.wy-btn-group:after{display:table;content:""}.wy-btn-group:after{clear:both}.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-active .wy-dropdown-menu{display:block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:solid 1px #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,0.1);padding:12px}.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 
12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980B9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:solid 1px #cfd7dd;margin:6px 0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search input[type="search"]{width:100%}.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980B9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;left:auto;text-align:right}.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}.wy-form-stacked select{display:block}.wy-form-aligned input,.wy-form-aligned textarea,.wy-form-aligned select,.wy-form-aligned .wy-help-inline,.wy-form-aligned label{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}fieldset{border:0;margin:0;padding:0}legend{display:block;width:100%;border:0;padding:0;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label{display:block;margin:0 0 .3125em 0;color:#333;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}.wy-control-group{margin-bottom:24px;*zoom:1;max-width:68em;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:before,.wy-control-group:after{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group:before,.wy-control-group:after{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group.wy-control-group-required>label:after{content:" *";color:#E74C3C}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds select{width:100%}.wy-control-group .wy-form-full input[type="text"],.wy-control-group .wy-form-full input[type="password"],.wy-control-group .wy-form-full input[type="email"],.wy-control-group .wy-form-full input[type="url"],.wy-control-group .wy-form-full input[type="date"],.wy-control-group .wy-form-full input[type="month"],.wy-control-group .wy-form-full input[type="time"],.wy-control-group .wy-form-full input[type="datetime"],.wy-control-group .wy-form-full input[type="datetime-local"],.wy-control-group .wy-form-full input[type="week"],.wy-control-group .wy-form-full input[type="number"],.wy-control-group .wy-form-full input[type="search"],.wy-control-group .wy-form-full input[type="tel"],.wy-control-group .wy-form-full input[type="color"],.wy-control-group .wy-form-halves input[type="text"],.wy-control-group .wy-form-halves input[type="password"],.wy-control-group .wy-form-halves input[type="email"],.wy-control-group 
.wy-form-halves input[type="url"],.wy-control-group .wy-form-halves input[type="date"],.wy-control-group .wy-form-halves input[type="month"],.wy-control-group .wy-form-halves input[type="time"],.wy-control-group .wy-form-halves input[type="datetime"],.wy-control-group .wy-form-halves input[type="datetime-local"],.wy-control-group .wy-form-halves input[type="week"],.wy-control-group .wy-form-halves input[type="number"],.wy-control-group .wy-form-halves input[type="search"],.wy-control-group .wy-form-halves input[type="tel"],.wy-control-group .wy-form-halves input[type="color"],.wy-control-group .wy-form-thirds input[type="text"],.wy-control-group .wy-form-thirds input[type="password"],.wy-control-group .wy-form-thirds input[type="email"],.wy-control-group .wy-form-thirds input[type="url"],.wy-control-group .wy-form-thirds input[type="date"],.wy-control-group .wy-form-thirds input[type="month"],.wy-control-group .wy-form-thirds input[type="time"],.wy-control-group .wy-form-thirds input[type="datetime"],.wy-control-group .wy-form-thirds input[type="datetime-local"],.wy-control-group .wy-form-thirds input[type="week"],.wy-control-group .wy-form-thirds input[type="number"],.wy-control-group .wy-form-thirds input[type="search"],.wy-control-group .wy-form-thirds input[type="tel"],.wy-control-group .wy-form-thirds input[type="color"]{width:100%}.wy-control-group .wy-form-full{float:left;display:block;margin-right:2.35765%;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{float:left;display:block;margin-right:2.35765%;width:48.82117%}.wy-control-group .wy-form-halves:last-child{margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(2n+1){clear:left}.wy-control-group .wy-form-thirds{float:left;display:block;margin-right:2.35765%;width:31.76157%}.wy-control-group .wy-form-thirds:last-child{margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}.wy-control-group.wy-control-group-no-input .wy-control{margin:6px 0 0 0;font-size:90%}.wy-control-no-input{display:inline-block;margin:6px 0 0 0;font-size:90%}.wy-control-group.fluid-input input[type="text"],.wy-control-group.fluid-input input[type="password"],.wy-control-group.fluid-input input[type="email"],.wy-control-group.fluid-input input[type="url"],.wy-control-group.fluid-input input[type="date"],.wy-control-group.fluid-input input[type="month"],.wy-control-group.fluid-input input[type="time"],.wy-control-group.fluid-input input[type="datetime"],.wy-control-group.fluid-input input[type="datetime-local"],.wy-control-group.fluid-input input[type="week"],.wy-control-group.fluid-input input[type="number"],.wy-control-group.fluid-input input[type="search"],.wy-control-group.fluid-input input[type="tel"],.wy-control-group.fluid-input input[type="color"]{width:100%}.wy-form-message-inline{display:inline-block;padding-left:0.3em;color:#666;vertical-align:middle;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}input{line-height:normal}input[type="button"],input[type="reset"],input[type="submit"]{-webkit-appearance:button;cursor:pointer;font-family:"Lato","proxima-nova","Helvetica 
Neue",Arial,sans-serif;*overflow:visible}input[type="text"],input[type="password"],input[type="email"],input[type="url"],input[type="date"],input[type="month"],input[type="time"],input[type="datetime"],input[type="datetime-local"],input[type="week"],input[type="number"],input[type="search"],input[type="tel"],input[type="color"]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border 0.3s linear;-moz-transition:border 0.3s linear;transition:border 0.3s linear}input[type="datetime-local"]{padding:.34375em .625em}input[disabled]{cursor:default}input[type="checkbox"],input[type="radio"]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type="search"]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type="search"]::-webkit-search-cancel-button,input[type="search"]::-webkit-search-decoration{-webkit-appearance:none}input[type="text"]:focus,input[type="password"]:focus,input[type="email"]:focus,input[type="url"]:focus,input[type="date"]:focus,input[type="month"]:focus,input[type="time"]:focus,input[type="datetime"]:focus,input[type="datetime-local"]:focus,input[type="week"]:focus,input[type="number"]:focus,input[type="search"]:focus,input[type="tel"]:focus,input[type="color"]:focus{outline:0;outline:thin dotted \9;border-color:#333}input.no-focus:focus{border-color:#ccc !important}input[type="file"]:focus,input[type="radio"]:focus,input[type="checkbox"]:focus{outline:thin dotted #333;outline:1px auto #129FEA}input[type="text"][disabled],input[type="password"][disabled],input[type="email"][disabled],input[type="url"][disabled],input[type="date"][disabled],input[type="month"][disabled],input[type="time"][disabled],input[type="datetime"][disabled],input[type="datetime-local"][disabled],input[type="week"][disabled],input[type="number"][disabled],input[type="search"][disabled],input[type="tel"][disabled],input[type="color"][disabled]{cursor:not-allowed;background-color:#fafafa}input:focus:invalid,textarea:focus:invalid,select:focus:invalid{color:#E74C3C;border:1px solid #E74C3C}input:focus:invalid:focus,textarea:focus:invalid:focus,select:focus:invalid:focus{border-color:#E74C3C}input[type="file"]:focus:invalid:focus,input[type="radio"]:focus:invalid:focus,input[type="checkbox"]:focus:invalid:focus{outline-color:#E74C3C}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid #ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border 0.3s linear;-moz-transition:border 0.3s linear;transition:border 0.3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}select[disabled],textarea[disabled],input[readonly],select[readonly],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type="radio"][disabled],input[type="checkbox"][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio 
input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:solid 1px #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{position:absolute;content:"";display:block;left:0;top:0;width:36px;height:12px;border-radius:4px;background:#ccc;-webkit-transition:all 0.2s ease-in-out;-moz-transition:all 0.2s ease-in-out;transition:all 0.2s ease-in-out}.wy-switch:after{position:absolute;content:"";display:block;width:18px;height:18px;border-radius:4px;background:#999;left:-3px;top:-3px;-webkit-transition:all 0.2s ease-in-out;-moz-transition:all 0.2s ease-in-out;transition:all 0.2s ease-in-out}.wy-switch span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27AE60}.wy-switch.disabled{cursor:not-allowed;opacity:0.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#E74C3C}.wy-control-group.wy-control-group-error input[type="text"],.wy-control-group.wy-control-group-error input[type="password"],.wy-control-group.wy-control-group-error input[type="email"],.wy-control-group.wy-control-group-error input[type="url"],.wy-control-group.wy-control-group-error input[type="date"],.wy-control-group.wy-control-group-error input[type="month"],.wy-control-group.wy-control-group-error input[type="time"],.wy-control-group.wy-control-group-error input[type="datetime"],.wy-control-group.wy-control-group-error input[type="datetime-local"],.wy-control-group.wy-control-group-error input[type="week"],.wy-control-group.wy-control-group-error input[type="number"],.wy-control-group.wy-control-group-error input[type="search"],.wy-control-group.wy-control-group-error input[type="tel"],.wy-control-group.wy-control-group-error input[type="color"]{border:solid 1px #E74C3C}.wy-control-group.wy-control-group-error textarea{border:solid 1px #E74C3C}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27AE60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#E74C3C}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#E67E22}.wy-inline-validate.wy-inline-validate-info .wy-input-context{color:#2980B9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) 
rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width: 480px){.wy-form button[type="submit"]{margin:0.7em 0 0}.wy-form input[type="text"],.wy-form input[type="password"],.wy-form input[type="email"],.wy-form input[type="url"],.wy-form input[type="date"],.wy-form input[type="month"],.wy-form input[type="time"],.wy-form input[type="datetime"],.wy-form input[type="datetime-local"],.wy-form input[type="week"],.wy-form input[type="number"],.wy-form input[type="search"],.wy-form input[type="tel"],.wy-form input[type="color"]{margin-bottom:0.3em;display:block}.wy-form label{margin-bottom:0.3em;display:block}.wy-form input[type="password"],.wy-form input[type="email"],.wy-form input[type="url"],.wy-form input[type="date"],.wy-form input[type="month"],.wy-form input[type="time"],.wy-form input[type="datetime"],.wy-form input[type="datetime-local"],.wy-form input[type="week"],.wy-form input[type="number"],.wy-form input[type="search"],.wy-form input[type="tel"],.wy-form input[type="color"]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:0.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0 0}.wy-form .wy-help-inline,.wy-form-message-inline,.wy-form-message{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width: 768px){.tablet-hide{display:none}}@media screen and (max-width: 480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.wy-table,.rst-content table.docutils,.rst-content table.field-list{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.wy-table caption,.rst-content table.docutils caption,.rst-content table.field-list caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.wy-table td,.rst-content table.docutils td,.rst-content table.field-list td,.wy-table th,.rst-content table.docutils th,.rst-content table.field-list th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.wy-table td:first-child,.rst-content table.docutils td:first-child,.rst-content table.field-list td:first-child,.wy-table th:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list th:first-child{border-left-width:0}.wy-table thead,.rst-content table.docutils thead,.rst-content table.field-list thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.wy-table thead th,.rst-content table.docutils thead th,.rst-content table.field-list thead th{font-weight:bold;border-bottom:solid 2px #e1e4e5}.wy-table td,.rst-content table.docutils td,.rst-content table.field-list td{background-color:transparent;vertical-align:middle}.wy-table td p,.rst-content table.docutils td p,.rst-content table.field-list td p{line-height:18px}.wy-table td p:last-child,.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child{margin-bottom:0}.wy-table .wy-table-cell-min,.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list 
.wy-table-cell-min{width:1%;padding-right:0}.wy-table .wy-table-cell-min input[type=checkbox],.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox],.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:gray;font-size:90%}.wy-table-tertiary{color:gray;font-size:80%}.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) td,.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td{background-color:#f3f6f6}.wy-table-backed{background-color:#f3f6f6}.wy-table-bordered-all,.rst-content table.docutils{border:1px solid #e1e4e5}.wy-table-bordered-all td,.rst-content table.docutils td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.wy-table-bordered-all tbody>tr:last-child td,.rst-content table.docutils tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px 0;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0 !important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980B9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9B59B6}html{height:100%;overflow-x:hidden}body{font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;font-weight:normal;color:#404040;min-height:100%;overflow-x:hidden;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#E67E22 !important}a.wy-text-warning:hover{color:#eb9950 !important}.wy-text-info{color:#2980B9 !important}a.wy-text-info:hover{color:#409ad5 !important}.wy-text-success{color:#27AE60 !important}a.wy-text-success:hover{color:#36d278 !important}.wy-text-danger{color:#E74C3C !important}a.wy-text-danger:hover{color:#ed7669 !important}.wy-text-neutral{color:#404040 !important}a.wy-text-neutral:hover{color:#595959 !important}h1,h2,.rst-content .toctree-wrapper p.caption,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:"Roboto Slab","ff-tisa-web-pro","Georgia",Arial,sans-serif}p{line-height:24px;margin:0;font-size:16px;margin-bottom:24px}h1{font-size:175%}h2,.rst-content .toctree-wrapper p.caption{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}code,.rst-content tt,.rst-content code{white-space:nowrap;max-width:100%;background:#fff;border:solid 1px #e1e4e5;font-size:75%;padding:0 5px;font-family:Consolas,"Andale Mono WT","Andale Mono","Lucida Console","Lucida Sans Typewriter","DejaVu Sans Mono","Bitstream Vera Sans Mono","Liberation Mono","Nimbus Mono L",Monaco,"Courier New",Courier,monospace;color:#E74C3C;overflow-x:auto}code.code-large,.rst-content tt.code-large{font-size:90%}.wy-plain-list-disc,.rst-content 
.section ul,.rst-content .toctree-wrapper ul,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.wy-plain-list-disc li,.rst-content .section ul li,.rst-content .toctree-wrapper ul li,article ul li{list-style:disc;margin-left:24px}.wy-plain-list-disc li p:last-child,.rst-content .section ul li p:last-child,.rst-content .toctree-wrapper ul li p:last-child,article ul li p:last-child{margin-bottom:0}.wy-plain-list-disc li ul,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li ul,article ul li ul{margin-bottom:0}.wy-plain-list-disc li li,.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,article ul li li{list-style:circle}.wy-plain-list-disc li li li,.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,article ul li li li{list-style:square}.wy-plain-list-disc li ol li,.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,article ul li ol li{list-style:decimal}.wy-plain-list-decimal,.rst-content .section ol,.rst-content ol.arabic,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.wy-plain-list-decimal li,.rst-content .section ol li,.rst-content ol.arabic li,article ol li{list-style:decimal;margin-left:24px}.wy-plain-list-decimal li p:last-child,.rst-content .section ol li p:last-child,.rst-content ol.arabic li p:last-child,article ol li p:last-child{margin-bottom:0}.wy-plain-list-decimal li ul,.rst-content .section ol li ul,.rst-content ol.arabic li ul,article ol li ul{margin-bottom:0}.wy-plain-list-decimal li ul li,.rst-content .section ol li ul li,.rst-content ol.arabic li ul li,article ol li ul li{list-style:disc}.codeblock-example{border:1px solid #e1e4e5;border-bottom:none;padding:24px;padding-top:48px;font-weight:500;background:#fff;position:relative}.codeblock-example:after{content:"Example";position:absolute;top:0px;left:0px;background:#9B59B6;color:#fff;padding:6px 12px}.codeblock-example.prettyprint-example-only{border:1px solid #e1e4e5;margin-bottom:24px}.codeblock,pre.literal-block,.rst-content .literal-block,.rst-content pre.literal-block,div[class^='highlight']{border:1px solid #e1e4e5;padding:0px;overflow-x:auto;background:#fff;margin:1px 0 24px 0}.codeblock div[class^='highlight'],pre.literal-block div[class^='highlight'],.rst-content .literal-block div[class^='highlight'],div[class^='highlight'] div[class^='highlight']{border:none;background:none;margin:0}div[class^='highlight'] td.code{width:100%}.linenodiv pre{border-right:solid 1px #e6e9ea;margin:0;padding:12px 12px;font-family:Consolas,"Andale Mono WT","Andale Mono","Lucida Console","Lucida Sans Typewriter","DejaVu Sans Mono","Bitstream Vera Sans Mono","Liberation Mono","Nimbus Mono L",Monaco,"Courier New",Courier,monospace;font-size:12px;line-height:1.5;color:#d9d9d9}div[class^='highlight'] pre{white-space:pre;margin:0;padding:12px 12px;font-family:Consolas,"Andale Mono WT","Andale Mono","Lucida Console","Lucida Sans Typewriter","DejaVu Sans Mono","Bitstream Vera Sans Mono","Liberation Mono","Nimbus Mono L",Monaco,"Courier New",Courier,monospace;font-size:12px;line-height:1.5;display:block;overflow:auto;color:#404040}@media print{.codeblock,pre.literal-block,.rst-content .literal-block,.rst-content pre.literal-block,div[class^='highlight'],div[class^='highlight'] pre{white-space:pre-wrap}}.hll{background-color:#ffc;margin:0 -12px;padding:0 
12px;display:block}.c{color:#998;font-style:italic}.err{color:#a61717;background-color:#e3d2d2}.k{font-weight:bold}.o{font-weight:bold}.cm{color:#998;font-style:italic}.cp{color:#999;font-weight:bold}.c1{color:#998;font-style:italic}.cs{color:#999;font-weight:bold;font-style:italic}.gd{color:#000;background-color:#fdd}.gd .x{color:#000;background-color:#faa}.ge{font-style:italic}.gr{color:#a00}.gh{color:#999}.gi{color:#000;background-color:#dfd}.gi .x{color:#000;background-color:#afa}.go{color:#888}.gp{color:#555}.gs{font-weight:bold}.gu{color:purple;font-weight:bold}.gt{color:#a00}.kc{font-weight:bold}.kd{font-weight:bold}.kn{font-weight:bold}.kp{font-weight:bold}.kr{font-weight:bold}.kt{color:#458;font-weight:bold}.m{color:#099}.s{color:#d14}.n{color:#333}.na{color:teal}.nb{color:#0086b3}.nc{color:#458;font-weight:bold}.no{color:teal}.ni{color:purple}.ne{color:#900;font-weight:bold}.nf{color:#900;font-weight:bold}.nn{color:#555}.nt{color:navy}.nv{color:teal}.ow{font-weight:bold}.w{color:#bbb}.mf{color:#099}.mh{color:#099}.mi{color:#099}.mo{color:#099}.sb{color:#d14}.sc{color:#d14}.sd{color:#d14}.s2{color:#d14}.se{color:#d14}.sh{color:#d14}.si{color:#d14}.sx{color:#d14}.sr{color:#009926}.s1{color:#d14}.ss{color:#990073}.bp{color:#999}.vc{color:teal}.vg{color:teal}.vi{color:teal}.il{color:#099}.gc{color:#999;background-color:#EAF2F5}.wy-breadcrumbs li{display:inline-block}.wy-breadcrumbs li.wy-breadcrumbs-aside{float:right}.wy-breadcrumbs li a{display:inline-block;padding:5px}.wy-breadcrumbs li a:first-child{padding-left:0}.wy-breadcrumbs li code,.wy-breadcrumbs li .rst-content tt,.rst-content .wy-breadcrumbs li tt{padding:5px;border:none;background:none}.wy-breadcrumbs li code.literal,.wy-breadcrumbs li .rst-content tt.literal,.rst-content .wy-breadcrumbs li tt.literal{color:#404040}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width: 480px){.wy-breadcrumbs-extra{display:none}.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:before,.wy-menu-horiz:after{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz ul,.wy-menu-horiz li{display:inline-block}.wy-menu-horiz li:hover{background:rgba(255,255,255,0.1)}.wy-menu-horiz li.divide-left{border-left:solid 1px #404040}.wy-menu-horiz li.divide-right{border-right:solid 1px #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{height:32px;display:inline-block;line-height:32px;padding:0 1.618em;margin-bottom:0;display:block;font-weight:bold;text-transform:uppercase;font-size:80%;color:#6f6f6f;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:solid 1px #404040}.wy-menu-vertical li.divide-bottom{border-bottom:solid 1px #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:gray;border-right:solid 1px #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.wy-menu-vertical li code,.wy-menu-vertical li .rst-content tt,.rst-content .wy-menu-vertical li tt{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li 
span.toctree-expand{display:block;float:left;margin-left:-1.2em;font-size:.8em;line-height:1.6em;color:#4d4d4d}.wy-menu-vertical li.on a,.wy-menu-vertical li.current>a{color:#404040;padding:.4045em 1.618em;font-weight:bold;position:relative;background:#fcfcfc;border:none;border-bottom:solid 1px #c9c9c9;border-top:solid 1px #c9c9c9;padding-left:1.618em -4px}.wy-menu-vertical li.on a:hover,.wy-menu-vertical li.current>a:hover{background:#fcfcfc}.wy-menu-vertical li.on a:hover span.toctree-expand,.wy-menu-vertical li.current>a:hover span.toctree-expand{color:gray}.wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.current>a span.toctree-expand{display:block;font-size:.8em;line-height:1.6em;color:#333}.wy-menu-vertical li.toctree-l1.current li.toctree-l2>ul,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>ul{display:none}.wy-menu-vertical li.toctree-l1.current li.toctree-l2.current>ul,.wy-menu-vertical li.toctree-l2.current li.toctree-l3.current>ul{display:block}.wy-menu-vertical li.toctree-l2.current>a{background:#c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{display:block;background:#c9c9c9;padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l2 a:hover span.toctree-expand{color:gray}.wy-menu-vertical li.toctree-l2 span.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3{font-size:.9em}.wy-menu-vertical li.toctree-l3.current>a{background:#bdbdbd;padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{display:block;background:#bdbdbd;padding:.4045em 5.663em;border-top:none;border-bottom:none}.wy-menu-vertical li.toctree-l3 a:hover span.toctree-expand{color:gray}.wy-menu-vertical li.toctree-l3 span.toctree-expand{color:#969696}.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical .local-toc li ul{display:block}.wy-menu-vertical li ul li a{margin-bottom:0;color:#b3b3b3;font-weight:normal}.wy-menu-vertical a{display:inline-block;line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#b3b3b3}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover span.toctree-expand{color:#b3b3b3}.wy-menu-vertical a:active{background-color:#2980B9;cursor:pointer;color:#fff}.wy-menu-vertical a:active span.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980B9;text-align:center;padding:.809em;display:block;color:#fcfcfc;margin-bottom:.809em}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em auto;height:45px;width:45px;background-color:#2980B9;padding:5px;border-radius:100%}.wy-side-nav-search>a,.wy-side-nav-search .wy-dropdown>a{color:#fcfcfc;font-size:100%;font-weight:bold;display:inline-block;padding:4px 6px;margin-bottom:.809em}.wy-side-nav-search>a:hover,.wy-side-nav-search .wy-dropdown>a:hover{background:rgba(255,255,255,0.1)}.wy-side-nav-search>a img.logo,.wy-side-nav-search .wy-dropdown>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search>a.icon img.logo,.wy-side-nav-search .wy-dropdown>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:normal;color:rgba(255,255,255,0.3)}.wy-nav 
.wy-menu-vertical header{color:#2980B9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980B9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:left repeat-y #fcfcfc;background-image:url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyRpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuMy1jMDExIDY2LjE0NTY2MSwgMjAxMi8wMi8wNi0xNDo1NjoyNyAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENTNiAoTWFjaW50b3NoKSIgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDoxOERBMTRGRDBFMUUxMUUzODUwMkJCOThDMEVFNURFMCIgeG1wTU06RG9jdW1lbnRJRD0ieG1wLmRpZDoxOERBMTRGRTBFMUUxMUUzODUwMkJCOThDMEVFNURFMCI+IDx4bXBNTTpEZXJpdmVkRnJvbSBzdFJlZjppbnN0YW5jZUlEPSJ4bXAuaWlkOjE4REExNEZCMEUxRTExRTM4NTAyQkI5OEMwRUU1REUwIiBzdFJlZjpkb2N1bWVudElEPSJ4bXAuZGlkOjE4REExNEZDMEUxRTExRTM4NTAyQkI5OEMwRUU1REUwIi8+IDwvcmRmOkRlc2NyaXB0aW9uPiA8L3JkZjpSREY+IDwveDp4bXBtZXRhPiA8P3hwYWNrZXQgZW5kPSJyIj8+EwrlwAAAAA5JREFUeNpiMDU0BAgwAAE2AJgB9BnaAAAAAElFTkSuQmCC);background-size:300px 1px}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980B9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:before,.wy-nav-top:after{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:bold}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980B9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,0.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:gray}footer p{margin-bottom:12px}footer span.commit code,footer span.commit .rst-content tt,.rst-content footer span.commit tt{padding:0px;font-family:Consolas,"Andale Mono WT","Andale Mono","Lucida Console","Lucida Sans Typewriter","DejaVu Sans Mono","Bitstream Vera Sans Mono","Liberation Mono","Nimbus Mono L",Monaco,"Courier 
New",Courier,monospace;font-size:1em;background:none;border:none;color:gray}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:before,.rst-footer-buttons:after{width:100%}.rst-footer-buttons:before,.rst-footer-buttons:after{display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:before,.rst-breadcrumbs-buttons:after{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:solid 1px #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:solid 1px #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:gray;font-size:90%}@media screen and (max-width: 768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-side-scroll{width:auto}.wy-side-nav-search{width:auto}.wy-menu.wy-menu-vertical{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width: 1400px){.wy-nav-content-wrap{background:rgba(0,0,0,0.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,footer,.wy-nav-side{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;border-top:solid 10px #343131;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;z-index:400}.rst-versions a{color:#2980B9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27AE60;*zoom:1}.rst-versions .rst-current-version:before,.rst-versions .rst-current-version:after{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version span.toctree-expand,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h4 .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content p.caption .headerlink,.rst-content p.caption .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-content tt.download .rst-versions .rst-current-version 
span:first-child,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .icon{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#E74C3C;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#F1C40F;color:#000}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:gray;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:solid 1px #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px}.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge .rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width: 768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content img{max-width:100%;height:auto !important}.rst-content .highlight>pre,.rst-content .linenodiv>pre{line-height:normal}.rst-content div.figure{margin-bottom:24px}.rst-content div.figure p.caption{font-style:italic}.rst-content div.figure.align-center{text-align:center}.rst-content .section>img,.rst-content .section>a>img{margin-bottom:24px}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content .note .last,.rst-content .attention .last,.rst-content .caution .last,.rst-content .danger .last,.rst-content .error .last,.rst-content .hint .last,.rst-content .important .last,.rst-content .tip .last,.rst-content .warning .last,.rst-content .seealso .last,.rst-content .admonition-todo .last{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,0.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent !important;border-color:rgba(0,0,0,0.1) !important}.rst-content .section ol.loweralpha,.rst-content .section ol.loweralpha li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha li{list-style:upper-alpha}.rst-content .section ol p,.rst-content .section ul p{margin-bottom:12px}.rst-content .line-block{margin-left:24px}.rst-content .topic-title{font-weight:bold;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0px 0px 24px 24px}.rst-content .align-left{float:left;margin:0px 24px 24px 0px}.rst-content .align-center{margin:auto;display:block}.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content .toctree-wrapper p.caption .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 
.headerlink,.rst-content dl dt .headerlink,.rst-content p.caption .headerlink{display:none;visibility:hidden;font-size:14px}.rst-content h1 .headerlink:after,.rst-content h2 .headerlink:after,.rst-content .toctree-wrapper p.caption .headerlink:after,.rst-content h3 .headerlink:after,.rst-content h4 .headerlink:after,.rst-content h5 .headerlink:after,.rst-content h6 .headerlink:after,.rst-content dl dt .headerlink:after,.rst-content p.caption .headerlink:after{visibility:visible;content:"";font-family:FontAwesome;display:inline-block}.rst-content h1:hover .headerlink,.rst-content h2:hover .headerlink,.rst-content .toctree-wrapper p.caption:hover .headerlink,.rst-content h3:hover .headerlink,.rst-content h4:hover .headerlink,.rst-content h5:hover .headerlink,.rst-content h6:hover .headerlink,.rst-content dl dt:hover .headerlink,.rst-content p.caption:hover .headerlink{display:inline-block}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:solid 1px #e1e4e5}.rst-content .sidebar p,.rst-content .sidebar ul,.rst-content .sidebar dl{font-size:90%}.rst-content .sidebar .last{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:"Roboto Slab","ff-tisa-web-pro","Georgia",Arial,sans-serif;font-weight:bold;background:#e1e4e5;padding:6px 12px;margin:-24px;margin-bottom:24px;font-size:100%}.rst-content .highlighted{background:#F1C40F;display:inline-block;font-weight:bold;padding:0 6px}.rst-content .footnote-reference,.rst-content .citation-reference{vertical-align:super;font-size:90%}.rst-content table.docutils.citation,.rst-content table.docutils.footnote{background:none;border:none;color:gray}.rst-content table.docutils.citation td,.rst-content table.docutils.citation tr,.rst-content table.docutils.footnote td,.rst-content table.docutils.footnote tr{border:none;background-color:transparent !important;white-space:normal}.rst-content table.docutils.citation td.label,.rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}.rst-content table.docutils.citation tt,.rst-content table.docutils.citation code,.rst-content table.docutils.footnote tt,.rst-content table.docutils.footnote code{color:#555}.rst-content table.field-list{border:none}.rst-content table.field-list td{border:none}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content tt,.rst-content tt,.rst-content code{color:#000;padding:2px 5px}.rst-content tt big,.rst-content tt em,.rst-content tt big,.rst-content code big,.rst-content tt em,.rst-content code em{font-size:100% !important;line-height:normal}.rst-content tt.literal,.rst-content tt.literal,.rst-content code.literal{color:#E74C3C}.rst-content tt.xref,a .rst-content tt,.rst-content tt.xref,.rst-content code.xref,a .rst-content tt,a .rst-content code{font-weight:bold;color:#404040}.rst-content a tt,.rst-content a tt,.rst-content a code{color:#2980B9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:bold}.rst-content dl p,.rst-content dl table,.rst-content dl ul,.rst-content dl ol{margin-bottom:12px !important}.rst-content dl dd{margin:0 0 12px 24px}.rst-content dl:not(.docutils){margin-bottom:24px}.rst-content dl:not(.docutils) dt{display:table;margin:6px 
0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980B9;border-top:solid 3px #6ab0de;padding:6px;position:relative}.rst-content dl:not(.docutils) dt:before{color:#6ab0de}.rst-content dl:not(.docutils) dt .headerlink{color:#404040;font-size:100% !important}.rst-content dl:not(.docutils) dl dt{margin-bottom:6px;border:none;border-left:solid 3px #ccc;background:#f0f0f0;color:#555}.rst-content dl:not(.docutils) dl dt .headerlink{color:#404040;font-size:100% !important}.rst-content dl:not(.docutils) dt:first-child{margin-top:0}.rst-content dl:not(.docutils) tt,.rst-content dl:not(.docutils) tt,.rst-content dl:not(.docutils) code{font-weight:bold}.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) tt.descclassname,.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) code.descname,.rst-content dl:not(.docutils) tt.descclassname,.rst-content dl:not(.docutils) code.descclassname{background-color:transparent;border:none;padding:0;font-size:100% !important}.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) code.descname{font-weight:bold}.rst-content dl:not(.docutils) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:bold}.rst-content dl:not(.docutils) .property{display:inline-block;padding-right:8px}.rst-content .viewcode-link,.rst-content .viewcode-back{display:inline-block;color:#27AE60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:bold}.rst-content tt.download,.rst-content code.download{background:inherit;padding:inherit;font-weight:normal;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content tt.download span:first-child,.rst-content code.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content tt.download span:first-child:before,.rst-content code.download span:first-child:before{margin-right:4px}.rst-content .guilabel{border:1px solid #7fbbe3;background:#e7f2fa;font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content .versionmodified{font-style:italic}@media screen and (max-width: 480px){.rst-content .sidebar{width:100%}}span[id*='MathJax-Span']{color:#404040}.math{text-align:center}@font-face{font-family:"Inconsolata";font-style:normal;font-weight:400;src:local("Inconsolata"),local("Inconsolata-Regular"),url(../fonts/Inconsolata-Regular.ttf) format("truetype")}@font-face{font-family:"Inconsolata";font-style:normal;font-weight:700;src:local("Inconsolata Bold"),local("Inconsolata-Bold"),url(../fonts/Inconsolata-Bold.ttf) format("truetype")}@font-face{font-family:"Lato";font-style:normal;font-weight:400;src:local("Lato Regular"),local("Lato-Regular"),url(../fonts/Lato-Regular.ttf) format("truetype")}@font-face{font-family:"Lato";font-style:normal;font-weight:700;src:local("Lato Bold"),local("Lato-Bold"),url(../fonts/Lato-Bold.ttf) format("truetype")}@font-face{font-family:"Lato";font-style:italic;font-weight:400;src:local("Lato Italic"),local("Lato-Italic"),url(../fonts/Lato-Italic.ttf) format("truetype")}@font-face{font-family:"Lato";font-style:italic;font-weight:700;src:local("Lato Bold Italic"),local("Lato-BoldItalic"),url(../fonts/Lato-BoldItalic.ttf) format("truetype")}@font-face{font-family:"Roboto Slab";font-style:normal;font-weight:400;src:local("Roboto Slab Regular"),local("RobotoSlab-Regular"),url(../fonts/RobotoSlab-Regular.ttf) 
format("truetype")}@font-face{font-family:"Roboto Slab";font-style:normal;font-weight:700;src:local("Roboto Slab Bold"),local("RobotoSlab-Bold"),url(../fonts/RobotoSlab-Bold.ttf) format("truetype")} -/*# sourceMappingURL=theme.css.map */ diff --git a/docs/_static/css/theme.css.map b/docs/_static/css/theme.css.map deleted file mode 100644 index 8f756d1a1a4a..000000000000 --- a/docs/_static/css/theme.css.map +++ /dev/null @@ -1,7 +0,0 @@ -{ -"version": 3, -"mappings": "CACE,AAAE,ECQI,iBAAoB,EDPJ,SAAU,ECY1B,cAAiB,EDZD,SAAU,EC2B1B,SAAY,ED3BI,SAAU,EEFlC,uEAAiF,EAC/E,MAAO,EAAE,IAAK,EAEhB,iBAAoB,EAClB,MAAO,EAAE,WAAY,EACrB,OAAQ,EAAE,KAAM,EAChB,IAAK,EAAE,AAAC,EAEV,oBAAqB,EACnB,MAAO,EAAE,GAAI,EAEf,OAAQ,EACN,MAAO,EAAE,GAAI,EAEf,AAAC,EDLO,iBAAoB,ECMd,SAAU,EDDhB,cAAiB,ECCX,SAAU,EDchB,SAAY,ECdN,SAAU,EAExB,GAAI,EACF,QAAS,EAAE,GAAI,EACf,uBAAwB,EAAE,GAAI,EAC9B,mBAAoB,EAAE,GAAI,EAE5B,GAAI,EACF,KAAM,EAAE,AAAC,EAEX,eAAiB,EACf,MAAO,EAAE,AAAC,EAEZ,UAAW,EACT,YAAa,EAAE,SAAU,EAE3B,OAAS,EACP,UAAW,EAAE,GAAI,EAEnB,SAAU,EACR,KAAM,EAAE,AAAC,EAEX,EAAG,EACD,SAAU,EAAE,KAAM,EAGpB,EAAG,EACD,SAAU,EAAE,GAAI,EAChB,IAAK,EAAE,GAAI,EACX,cAAe,EAAE,GAAI,EAEvB,GAAI,EACF,SAAU,EAAE,GAAI,EAChB,IAAK,EAAE,GAAI,EACX,SAAU,EAAE,KAAM,EAClB,UAAW,EAAE,GAAI,EAEnB,kDAAoB,EAClB,UAAW,EAAE,cAAgB,EAC7B,WAAY,EAAE,sBAAwB,EACtC,QAAS,EAAE,EAAG,EAEhB,EAAG,EACD,UAAW,EAAE,EAAG,EAElB,AAAC,EACC,KAAM,EAAE,GAAI,EAEd,eAAiB,EACf,MAAO,EAAE,CAAE,EACX,MAAO,EAAE,GAAI,EAEf,IAAK,EACH,QAAS,EAAE,EAAG,EAEhB,MAAQ,EACN,QAAS,EAAE,EAAG,EACd,UAAW,EAAE,AAAC,EACd,OAAQ,EAAE,OAAQ,EAClB,aAAc,EAAE,OAAQ,EAE1B,EAAG,EACD,EAAG,EAAE,KAAM,EAEb,EAAG,EACD,KAAM,EAAE,MAAO,EAEjB,OAAU,EACR,KAAM,EAAE,AAAC,EACT,MAAO,EAAE,AAAC,EACV,SAAU,EAAE,GAAI,EAChB,eAAgB,EAAE,GAAI,EAExB,CAAE,EACA,SAAU,EAAE,GAAI,EAElB,CAAE,EACA,KAAM,EAAE,AAAC,EAEX,EAAG,EACD,KAAM,EAAE,AAAC,EACT,qBAAsB,EAAE,MAAO,EAC/B,aAAc,EAAE,KAAM,EACtB,QAAS,EAAE,GAAI,EAEjB,aAAc,EACZ,OAAQ,EAAE,KAAM,EAElB,KAAM,EACJ,KAAM,EAAE,AAAC,EAEX,GAAI,EACF,KAAM,EAAE,AAAC,EAEX,OAAQ,EACN,KAAM,EAAE,AAAC,EACT,KAAM,EAAE,AAAC,EACT,MAAO,EAAE,AAAC,EAEZ,IAAK,EACH,KAAM,EAAE,MAAO,EAEjB,KAAM,EACJ,KAAM,EAAE,AAAC,EACT,WAAY,EAAE,GAAI,EAClB,MAAO,EAAE,AAAC,EACV,UAAW,EAAE,KAAM,EAErB,2BAA+B,EAC7B,QAAS,EAAE,GAAI,EACf,KAAM,EAAE,AAAC,EACT,aAAc,EAAE,OAAQ,EACxB,cAAe,EAAE,KAAM,EAEzB,WAAa,EACX,UAAW,EAAE,KAAM,EAErB,mEAAuE,EACrE,KAAM,EAAE,MAAO,EACf,iBAAkB,EAAE,KAAM,EAC1B,QAAS,EAAE,MAAO,EAEpB,+BAAiC,EAC/B,KAAM,EAAE,MAAO,EAEjB,yCAA2C,EACzC,SAAU,EAAE,SAAU,EACtB,MAAO,EAAE,AAAC,EACV,KAAM,EAAE,GAAI,EACZ,MAAO,EAAE,GAAI,EAEf,mBAAoB,EAClB,iBAAkB,EAAE,QAAS,EAC7B,cAAe,EAAE,UAAW,EAC5B,iBAAkB,EAAE,UAAW,EAC/B,SAAU,EAAE,UAAW,EAEzB,iGAAmG,EACjG,iBAAkB,EAAE,GAAI,EAE1B,+CAAiD,EAC/C,KAAM,EAAE,AAAC,EACT,MAAO,EAAE,AAAC,EAEZ,OAAQ,EACN,OAAQ,EAAE,GAAI,EACd,aAAc,EAAE,EAAG,EACnB,KAAM,EAAE,OAAQ,EAElB,IAAK,EACH,cAAe,EAAE,OAAQ,EACzB,aAAc,EAAE,AAAC,EAEnB,CAAE,EACA,aAAc,EAAE,EAAG,EAErB,WAAY,EACV,KAAM,EAAE,MAAO,EACf,SAAU,EAAE,GAAI,EAChB,IAAK,EAAE,GAAK,EACZ,MAAO,EAAE,MAAO,EAElB,EAAG,EACD,MAAO,EAAE,IAAK,EACd,KAAM,EAAE,AAAC,EACT,UAAW,EAAE,KAAM,EACnB,OAAQ,EAAE,KAAM,EAChB,eAAgB,EAAE,UAAW,EAC7B,gBAAiB,EAAE,QAAS,EAC5B,SAAU,EAAE,GAAI,EAChB,QAAS,EAAE,EAAG,EACd,WAAY,EAAE,AAAC,EAEjB,KAAM,EACJ,MAAO,EAAE,GAAI,EAEf,MAAO,EACL,MAAO,EAAE,cAAe,EACxB,SAAU,EAAE,KAAM,EAEpB,cAAe,EACb,KAAM,EAAE,AAAC,EACT,GAAI,EAAE,YAAa,EACnB,KAAM,EAAE,EAAG,EACX,KAAM,EAAE,GAAI,EACZ,OAAQ,EAAE,KAAM,EAChB,MAAO,EAAE,AAAC,EACV,OAAQ,EAAE,OAAQ,EAClB,IAAK,EAAE,EAAG,EAEZ,+DAAiE,EAC/D,GAAI,EAAE,GAAI,EACV,KAAM,EAAE,GAAI,EACZ,KAAM,EAAE,AAAC,EACT,OAAQ,EAAE,MAAO,EACjB,OAAQ,EAAE,KAAM,EAChB,IAAK,EAAE,GAAI,EAEb,SAAU,EACR,SAAU,EAAE,KAAM,EAEpB,QAAS,EACP,OAAQ,EAAE,OAAQ,EAEpB,
QAAU,EACR,QAAS,EAAE,GAAI,EAEjB,WAAY,EACV,gBAAmB,EACjB,SAAU,EAAE,cAAe,EAC7B,AAAC,EACC,SAAU,EAAE,cAAe,EAC3B,UAAW,EAAE,cAAe,EAC5B,KAAM,EAAE,cAAe,EACvB,SAAU,EAAE,cAAe,EAC7B,UAAY,EACV,cAAe,EAAE,QAAS,EAC5B,0DAA6D,EAC3D,MAAO,EAAE,CAAE,EACb,aAAe,EACb,gBAAiB,EAAE,IAAK,EAC1B,IAAK,EACH,MAAO,EAAE,iBAAkB,EAC7B,KAAO,EACL,gBAAiB,EAAE,IAAK,EAC1B,EAAG,EACD,QAAS,EAAE,cAAe,QAE1B,KAAM,EAAE,IAAK,EAEf,8CAAS,EACP,MAAO,EAAE,AAAC,EACV,KAAM,EAAE,AAAC,EACX,4CAAM,EACJ,eAAgB,EAAE,IAAK,GChM3B,ykDAAY,EACV,qBAAsB,EAAE,UAAW,EAqDrC,QAAS,EARP,IAAK,EAAE,AAAC,EACR,+BAAS,EAEP,MAAO,EAAE,IAAK,EACd,MAAO,EAAE,CAAE,EACb,cAAO,EACL,IAAK,EAAE,GAAI,EC7Gf;;;IAGG,DCAH,UAWC,CAVC,WAAW,CAAE,aAAa,CAC1B,GAAG,CAAE,+CAAgE,CACrE,GAAG,CAAE,wWAI8F,CAEnG,WAAW,CAAE,MAAM,CACnB,UAAU,CAAE,MAAM,CCVpB,kfAAmB,CACjB,OAAO,CAAE,YAAY,CACrB,IAAI,CAAE,uCAA8E,CACpF,SAAS,CAAE,OAAO,CAClB,cAAc,CAAE,IAAI,CACpB,sBAAsB,CAAE,WAAW,CACnC,uBAAuB,CAAE,SAAS,CCLpC,MAAsB,CACpB,SAAS,CAAE,SAAS,CACpB,WAAW,CAAE,KAAS,CACtB,cAAc,CAAE,IAAI,CAEtB,MAAsB,CAAE,SAAS,CAAE,GAAG,CACtC,MAAsB,CAAE,SAAS,CAAE,GAAG,CACtC,MAAsB,CAAE,SAAS,CAAE,GAAG,CACtC,MAAsB,CAAE,SAAS,CAAE,GAAG,CCVtC,MAAsB,CACpB,KAAK,CAAE,SAAW,CAClB,UAAU,CAAE,MAAM,CCDpB,MAAsB,CACpB,YAAY,CAAE,CAAC,CACf,WAAW,CCMU,SAAS,CDL9B,eAAe,CAAE,IAAI,CACrB,SAAK,CAAE,QAAQ,CAAE,QAAQ,CAE3B,MAAsB,CACpB,QAAQ,CAAE,QAAQ,CAClB,IAAI,CAAE,UAAa,CACnB,KAAK,CCDgB,SAAS,CDE9B,GAAG,CAAE,QAAU,CACf,UAAU,CAAE,MAAM,CAClB,YAAuB,CACrB,IAAI,CAAE,UAA0B,CEbpC,UAA0B,CACxB,OAAO,CAAE,gBAAgB,CACzB,MAAM,CAAE,iBAA4B,CACpC,aAAa,CAAE,IAAI,CAGrB,aAA6B,CAAE,KAAK,CAAE,IAAI,CAC1C,cAA8B,CAAE,KAAK,CAAE,KAAK,CAG1C,ksBAA8B,CAAE,YAAY,CAAE,IAAI,CAClD,ktBAA+B,CAAE,WAAW,CAAE,IAAI,CAIpD,WAAY,CAAE,KAAK,CAAE,KAAK,CAC1B,UAAW,CAAE,KAAK,CAAE,IAAI,CAGtB,kpBAAY,CAAE,YAAY,CAAE,IAAI,CAChC,kqBAAa,CAAE,WAAW,CAAE,IAAI,CCpBlC,QAAwB,CACtB,iBAAiB,CAAE,0BAA0B,CACrC,SAAS,CAAE,0BAA0B,CAG/C,SAAyB,CACvB,iBAAiB,CAAE,4BAA4B,CACvC,SAAS,CAAE,4BAA4B,CAGjD,0BASC,CARC,EAAG,CACD,iBAAiB,CAAE,YAAY,CACvB,SAAS,CAAE,YAAY,CAEjC,IAAK,CACH,iBAAiB,CAAE,cAAc,CACzB,SAAS,CAAE,cAAc,EAIrC,kBASC,CARC,EAAG,CACD,iBAAiB,CAAE,YAAY,CACvB,SAAS,CAAE,YAAY,CAEjC,IAAK,CACH,iBAAiB,CAAE,cAAc,CACzB,SAAS,CAAE,cAAc,EC5BrC,aAA8B,CCW5B,UAAU,CAAE,0DAAqE,CACjF,iBAAiB,CAAE,aAAgB,CAC/B,aAAa,CAAE,aAAgB,CAC3B,SAAS,CAAE,aAAgB,CDbrC,cAA8B,CCU5B,UAAU,CAAE,0DAAqE,CACjF,iBAAiB,CAAE,cAAgB,CAC/B,aAAa,CAAE,cAAgB,CAC3B,SAAS,CAAE,cAAgB,CDZrC,cAA8B,CCS5B,UAAU,CAAE,0DAAqE,CACjF,iBAAiB,CAAE,cAAgB,CAC/B,aAAa,CAAE,cAAgB,CAC3B,SAAS,CAAE,cAAgB,CDVrC,mBAAmC,CCcjC,UAAU,CAAE,oEAA+E,CAC3F,iBAAiB,CAAE,YAAoB,CACnC,aAAa,CAAE,YAAoB,CAC/B,SAAS,CAAE,YAAoB,CDhBzC,iBAAmC,CCajC,UAAU,CAAE,oEAA+E,CAC3F,iBAAiB,CAAE,YAAoB,CACnC,aAAa,CAAE,YAAoB,CAC/B,SAAS,CAAE,YAAoB,CDXzC,+GAIuC,CACrC,MAAM,CAAE,IAAI,CEfd,SAAyB,CACvB,QAAQ,CAAE,QAAQ,CAClB,OAAO,CAAE,YAAY,CACrB,KAAK,CAAE,GAAG,CACV,MAAM,CAAE,GAAG,CACX,WAAW,CAAE,GAAG,CAChB,cAAc,CAAE,MAAM,CAExB,yBAAyD,CACvD,QAAQ,CAAE,QAAQ,CAClB,IAAI,CAAE,CAAC,CACP,KAAK,CAAE,IAAI,CACX,UAAU,CAAE,MAAM,CAEpB,YAA4B,CAAE,WAAW,CAAE,OAAO,CAClD,YAA4B,CAAE,SAAS,CAAE,GAAG,CAC5C,WAA2B,CAAE,KAAK,CLTZ,IAAI,CMP1B,gBAAgC,CAAE,OAAO,CNwU1B,GAAO,CMvUtB,gBAAgC,CAAE,OAAO,CN2d1B,GAAO,CM1dtB,qCAAiC,CAAE,OAAO,CN0jB1B,GAAO,CMzjBvB,qBAAqC,CAAE,OAAO,CNsO1B,GAAO,CMrO3B,gBAAgC,CAAE,OAAO,CNuW1B,GAAO,CMtWtB,eAA+B,CAAE,OAAO,CNknB1B,GAAO,CMjnBrB,iBAAiC,CAAE,OAAO,CNsnB1B,GAAO,CMrnBvB,eAA+B,CAAE,OAAO,CNytB1B,GAAO,CMxtBrB,eAA+B,CAAE,OAAO,CNmR1B,GAAO,CMlRrB,mBAAmC,CAAE,OAAO,CNupB1B,GAAO,CMtpBzB,aAA6B,CAAE,OAAO,CNqpB1B,GAAO,CMppBnB,kBAAkC,CAAE,OAAO,CNspB1B,GAAO,CMrpBxB,gBAAgC,CAAE,OAAO,CNyI1B,GAAO,CMxItB,mDAEgC,CAAE,OAAO,CNqqB1B,GAAO,CMpqBtB,sBAAsC,CAAE,OAAO,CN8iB1B,GAAO,CM7iB5B,uBAAuC,CAAE,OAAO,CN
4iB1B,GAAO,CM3iB7B,oBAAoC,CAAE,OAAO,CN4f1B,GAAO,CM3f1B,iBAAiC,CAAE,OAAO,CNikB1B,GAAO,CMhkBvB,8BAC8B,CAAE,OAAO,CNgK1B,GAAO,CM/JpB,kBAAkC,CAAE,OAAO,CN+qB1B,GAAO,CM9qBxB,iCAA+B,CAAE,OAAO,CNwV1B,GAAO,CMvVrB,iBAAiC,CAAE,OAAO,CNuP1B,GAAO,CMtPvB,kBAAkC,CAAE,OAAO,CNgJ1B,GAAO,CM/IxB,eAA+B,CAAE,OAAO,CNmhB1B,GAAO,CMlhBrB,uHAAmC,CAAE,OAAO,CNgM1B,GAAO,CM/LzB,8BAA8C,CAAE,OAAO,CNY1B,GAAO,CMXpC,4BAA4C,CAAE,OAAO,CNc1B,GAAO,CMblC,gBAAgC,CAAE,OAAO,CNqW1B,GAAO,CMpWtB,wBAAwC,CAAE,OAAO,CNwe1B,GAAO,CMve9B,yCACiC,CAAE,OAAO,CNsgB1B,GAAO,CMrgBvB,kBAAkC,CAAE,OAAO,CNggB1B,GAAO,CM/fxB,mBAAmC,CAAE,OAAO,CNwY1B,GAAO,CMvYzB,eAA+B,CAAE,OAAO,CN2Y1B,GAAO,CM1YrB,eAA+B,CAAE,OAAO,CN4P1B,GAAO,CM3PrB,qBAAqC,CAAE,OAAO,CNoU1B,GAAO,CMnU3B,qBAAqC,CAAE,OAAO,CNitB1B,GAAO,CMhtB3B,sBAAsC,CAAE,OAAO,CN+sB1B,GAAO,CM9sB5B,oBAAoC,CAAE,OAAO,CNgtB1B,GAAO,CM/sB1B,iBAAiC,CAAE,OAAO,CNye1B,GAAO,CMxevB,kBAAkC,CAAE,OAAO,CNwB1B,GAAO,CMvBxB,cAA8B,CAAE,OAAO,CNymB1B,GAAO,CMxmBpB,eAA+B,CAAE,OAAO,CNymB1B,GAAO,CMxmBrB,iCAA+B,CAAE,OAAO,CNyD1B,GAAO,CMxDrB,mBAAmC,CAAE,OAAO,CNyD1B,GAAO,CMxDzB,gBAAgC,CAAE,OAAO,CN+d1B,GAAO,CM9dtB,iBAAiC,CAAE,OAAO,CN2E1B,GAAO,CM1EvB,eAA+B,CAAE,OAAO,CN0P1B,GAAO,CMzPrB,eAA+B,CAAE,OAAO,CNiD1B,GAAO,CMhDrB,iBAAiC,CAAE,OAAO,CN0V1B,GAAO,CMzVvB,sBAAsC,CAAE,OAAO,CNwmB1B,GAAO,CMvmB5B,qBAAqC,CAAE,OAAO,CNwmB1B,GAAO,CMvmB3B,qBAAqC,CAAE,OAAO,CNpC1B,GAAO,CMqC3B,uBAAuC,CAAE,OAAO,CNvC1B,GAAO,CMwC7B,sBAAsC,CAAE,OAAO,CNrC1B,GAAO,CMsC5B,wBAAwC,CAAE,OAAO,CNxC1B,GAAO,CMyC9B,eAA+B,CAAE,OAAO,CN+W1B,GAAO,CM9WrB,oCACkC,CAAE,OAAO,CN2a1B,GAAO,CM1axB,iBAAiC,CAAE,OAAO,CNsU1B,GAAO,CMrUvB,uBAAuC,CAAE,OAAO,CNkrB1B,GAAO,CMjrB7B,sDAEoC,CAAE,OAAO,CN0b1B,GAAO,CMzb1B,iBAAiC,CAAE,OAAO,CNkb1B,GAAO,CMjbvB,qBAAqC,CAAE,OAAO,CNwX1B,GAAO,CMvX3B,iBAAiC,CAAE,OAAO,CNtD1B,GAAO,CMuDvB,eAA+B,CAAE,OAAO,CNmnB1B,GAAO,CMlnBrB,0CAC0C,CAAE,OAAO,CN+a1B,GAAO,CM9ahC,yBAAyC,CAAE,OAAO,CN8f1B,GAAO,CM7f/B,yBAAyC,CAAE,OAAO,CN+E1B,GAAO,CM9E/B,iBAAiC,CAAE,OAAO,CNzB1B,GAAO,CM0BvB,wBAAwC,CAAE,OAAO,CNmjB1B,GAAO,CMljB9B,wBAAwC,CAAE,OAAO,CNqL1B,GAAO,CMpL9B,mBAAmC,CAAE,OAAO,CNlB1B,GAAO,CMmBzB,eAA+B,CAAE,OAAO,CNsb1B,GAAO,CMrbrB,gBAAgC,CAAE,OAAO,CNga1B,GAAO,CM/ZtB,eAA+B,CAAE,OAAO,CNmjB1B,GAAO,CMljBrB,kBAAkC,CAAE,OAAO,CN+N1B,GAAO,CM9NxB,uBAAuC,CAAE,OAAO,CNgL1B,GAAO,CM/K7B,uBAAuC,CAAE,OAAO,CN4iB1B,GAAO,CM3iB7B,gBAAgC,CAAE,OAAO,CN+I1B,GAAO,CM9ItB,uBAAuC,CAAE,OAAO,CNyE1B,GAAO,CMxE7B,wBAAwC,CAAE,OAAO,CNyE1B,GAAO,CMxE9B,sBAAsC,CAAE,OAAO,CNkb1B,GAAO,CMjb5B,uBAAuC,CAAE,OAAO,CNuX1B,GAAO,CMtX7B,8FAAuC,CAAE,OAAO,CN2lB1B,GAAO,CM1lB7B,+FAAuC,CAAE,OAAO,CN2D1B,GAAO,CM1D7B,0BAA0C,CAAE,OAAO,CNyb1B,GAAO,CMxbhC,sBAAsC,CAAE,OAAO,CN0S1B,GAAO,CMzS5B,qBAAqC,CAAE,OAAO,CN0G1B,GAAO,CMzG3B,yBAAyC,CAAE,OAAO,CNulB1B,GAAO,CMtlB/B,yBAAyC,CAAE,OAAO,CNuD1B,GAAO,CMtD/B,cAA8B,CAAE,OAAO,CNnC1B,GAAO,CMoCpB,qBAAqC,CAAE,OAAO,CNnD1B,GAAO,CMoD3B,sBAAsC,CAAE,OAAO,CNnD1B,GAAO,CMoD5B,mBAAmC,CAAE,OAAO,CNnD1B,GAAO,CMoDzB,qBAAqC,CAAE,OAAO,CNvD1B,GAAO,CMwD3B,wCACgC,CAAE,OAAO,CN4d1B,GAAO,CM3dtB,iBAAiC,CAAE,OAAO,CN8I1B,GAAO,CM7IvB,mBAAmC,CAAE,OAAO,CNsF1B,GAAO,CMrFzB,eAA+B,CAAE,OAAO,CN+Z1B,GAAO,CM9ZrB,gBAAgC,CAAE,OAAO,CNoW1B,GAAO,CMnWtB,mBAAmC,CAAE,OAAO,CNpD1B,GAAO,CMqDzB,gNAA6C,CAAE,OAAO,CNuI1B,GAAO,CMtInC,eAA+B,CAAE,OAAO,CNkN1B,GAAO,CMjNrB,eAA+B,CAAE,OAAO,CN0S1B,GAAO,CMzSrB,iCAA+B,CAAE,OAAO,CN6K1B,GAAO,CM5KrB,cAA8B,CAAE,OAAO,CNyI1B,GAAO,CMxIpB,oBAAoC,CAAE,OAAO,CNyI1B,GAAO,CMxI1B,kDAC+C,CAAE,OAAO,CNiI1B,GAAO,CMhIrC,gBAAgC,CAAE,OAAO,CN+Y1B,GAAO,CM9YtB,mBAAmC,CAAE,OAAO,CNA1B,GAAO,CMCzB,iBAAiC,CAAE,OAAO,CNoa1B,GAAO,CMnavB,kBAAkC,CAAE,OAAO,CNgE1B,GAAO,CM/DxB,iBAAiC,CAAE,OAAO,CN6T1B,GAAO,CM5TvB,qBAAqC,CAAE,OAAO,CNuC1B,GAAO,CMtC3B,uBAAuC,CAAE,OAAO,CNmC1B,GAAO,
CMlC7B,kBAAkC,CAAE,OAAO,CN+a1B,GAAO,CM9axB,wBAAwC,CAAE,OAAO,CNkd1B,GAAO,CMjd9B,iBAAiC,CAAE,OAAO,CN0K1B,GAAO,CMzKvB,sBAAsC,CAAE,OAAO,CN2K1B,GAAO,CM1K5B,mBAAmC,CAAE,OAAO,CN3E1B,GAAO,CM4EzB,mBAAmC,CAAE,OAAO,CN7E1B,GAAO,CM8EzB,2CACoC,CAAE,OAAO,CNlE1B,GAAO,CMmE1B,yBAAyC,CAAE,OAAO,CN+kB1B,GAAO,CM9kB/B,0BAA0C,CAAE,OAAO,CN4H1B,GAAO,CM3HhC,uBAAuC,CAAE,OAAO,CNT1B,GAAO,CMU7B,cAA8B,CAAE,OAAO,CN2Q1B,GAAO,CM1QpB,gCAC+B,CAAE,OAAO,CN6C1B,GAAO,CM5CrB,mBAAmC,CAAE,OAAO,CNkD1B,GAAO,CMjDzB,sBAAsC,CAAE,OAAO,CNsiB1B,GAAO,CMriB5B,wBAAwC,CAAE,OAAO,CNoiB1B,GAAO,CMniB9B,oBAAoC,CAAE,OAAO,CN2e1B,GAAO,CM1e1B,kBAAkC,CAAE,OAAO,CN8N1B,GAAO,CM7NxB,mBAAmC,CAAE,OAAO,CNoc1B,GAAO,CMnczB,0BAA0C,CAAE,OAAO,CNuR1B,GAAO,CMtRhC,qBAAqC,CAAE,OAAO,CN6hB1B,GAAO,CM5hB3B,wBAAwC,CAAE,OAAO,CNsG1B,GAAO,CMrG9B,kBAAkC,CAAE,OAAO,CN8b1B,GAAO,CM7bxB,iBAAiC,CAAE,OAAO,CNqjB1B,GAAO,CMpjBvB,wBAAwC,CAAE,OAAO,CNgL1B,GAAO,CM/K9B,iBAAiC,CAAE,OAAO,CNukB1B,GAAO,CMtkBvB,kBAAkC,CAAE,OAAO,CNqQ1B,GAAO,CMpQxB,gBAAgC,CAAE,OAAO,CNiW1B,GAAO,CMhWtB,mBAAmC,CAAE,OAAO,CN2d1B,GAAO,CM1dzB,qBAAqC,CAAE,OAAO,CNjD1B,GAAO,CMkD3B,uBAAuC,CAAE,OAAO,CN+V1B,GAAO,CM9V7B,kBAAkC,CAAE,OAAO,CNsjB1B,GAAO,CMrjBxB,yCACmC,CAAE,OAAO,CNgG1B,GAAO,CM/FzB,qCAAiC,CAAE,OAAO,CNoK1B,GAAO,CMnKvB,iBAAiC,CAAE,OAAO,CN0jB1B,GAAO,CMzjBvB,sBAAsC,CAAE,OAAO,CNoC1B,GAAO,CMnC5B,8BAC8B,CAAE,OAAO,CN+Y1B,GAAO,CM9YpB,gBAAgC,CAAE,OAAO,CNoM1B,GAAO,CMnMtB,mBAAmC,CAAE,OAAO,CNrD1B,GAAO,CMsDzB,eAA+B,CAAE,OAAO,CNhF1B,GAAO,CMiFrB,sBAAsC,CAAE,OAAO,CNrB1B,GAAO,CMsB5B,uBAAuC,CAAE,OAAO,CNoL1B,GAAO,CMnL7B,sBAAsC,CAAE,OAAO,CNkL1B,GAAO,CMjL5B,oBAAoC,CAAE,OAAO,CNmL1B,GAAO,CMlL1B,sBAAsC,CAAE,OAAO,CN+K1B,GAAO,CM9K5B,2DAA4C,CAAE,OAAO,CNrI1B,GAAO,CMsIlC,6DAA6C,CAAE,OAAO,CNjI1B,GAAO,CMkInC,0BAA0C,CAAE,OAAO,CNjI1B,GAAO,CMkIhC,4BAA4C,CAAE,OAAO,CNzI1B,GAAO,CM0IlC,gBAAgC,CAAE,OAAO,CN2J1B,GAAO,CM1JtB,iBAAiC,CAAE,OAAO,CN6lB1B,GAAO,CM5lBvB,gBAAgC,CAAE,OAAO,CNqe1B,GAAO,CMpetB,iBAAiC,CAAE,OAAO,CNyG1B,GAAO,CMxGvB,oBAAoC,CAAE,OAAO,CNzE1B,GAAO,CM0E1B,qBAAqC,CAAE,OAAO,CNlI1B,GAAO,CMmI3B,iCACgC,CAAE,OAAO,CNijB1B,GAAO,CMhjBtB,kDAC+B,CAAE,OAAO,CN4O1B,GAAO,CM3OrB,gBAAgC,CAAE,OAAO,CNd1B,GAAO,CMetB,gBAAgC,CAAE,OAAO,CN0G1B,GAAO,CMzGtB,kCACmC,CAAE,OAAO,CN6X1B,GAAO,CM5XzB,kCACkC,CAAE,OAAO,CN2F1B,GAAO,CM1FxB,oBAAoC,CAAE,OAAO,CN6S1B,GAAO,CM5S1B,mCACmC,CAAE,OAAO,CNqG1B,GAAO,CMpGzB,iBAAiC,CAAE,OAAO,CNgb1B,GAAO,CM/avB,qDAE+B,CAAE,OAAO,CNlI1B,GAAO,CMmIrB,kBAAkC,CAAE,OAAO,CNsO1B,GAAO,CMrOxB,kBAAkC,CAAE,OAAO,CNoO1B,GAAO,CMnOxB,wBAAwC,CAAE,OAAO,CN+b1B,GAAO,CM9b9B,oBAAoC,CAAE,OAAO,CN2gB1B,GAAO,CM1gB1B,gBAAgC,CAAE,OAAO,CNuc1B,GAAO,CMtctB,gBAAgC,CAAE,OAAO,CNyO1B,GAAO,CMxOtB,gBAAgC,CAAE,OAAO,CN6f1B,GAAO,CM5ftB,oBAAoC,CAAE,OAAO,CNmT1B,GAAO,CMlT1B,2BAA2C,CAAE,OAAO,CNoT1B,GAAO,CMnTjC,6BAA6C,CAAE,OAAO,CNgI1B,GAAO,CM/HnC,sBAAsC,CAAE,OAAO,CN4H1B,GAAO,CM3H5B,gBAAgC,CAAE,OAAO,CNqQ1B,GAAO,CMpQtB,wEAAqC,CAAE,OAAO,CNpF1B,GAAO,CMqF3B,mBAAmC,CAAE,OAAO,CN9E1B,GAAO,CM+EzB,qBAAqC,CAAE,OAAO,CNrF1B,GAAO,CMsF3B,sBAAsC,CAAE,OAAO,CNrF1B,GAAO,CMsF5B,kBAAkC,CAAE,OAAO,CNhC1B,GAAO,CMiCxB,mCAC+B,CAAE,OAAO,CN0Y1B,GAAO,CMzYrB,yCACoC,CAAE,OAAO,CN8Y1B,GAAO,CM7Y1B,sCACmC,CAAE,OAAO,CN2Y1B,GAAO,CM1YzB,mBAAmC,CAAE,OAAO,CNU1B,GAAO,CMTzB,mBAAmC,CAAE,OAAO,CNuM1B,GAAO,CMtMzB,sCAC+B,CAAE,OAAO,CNqf1B,GAAO,CMpfrB,iCACgC,CAAE,OAAO,CNoF1B,GAAO,CMnFtB,0CACqC,CAAE,OAAO,CN+a1B,GAAO,CM9a3B,oBAAoC,CAAE,OAAO,CN7C1B,GAAO,CM8C1B,qBAAqC,CAAE,OAAO,CN1C1B,GAAO,CM2C3B,gCAC+B,CAAE,OAAO,CNpI1B,GAAO,CMqIrB,kBAAkC,CAAE,OAAO,CN6W1B,GAAO,CM5WxB,mBAAmC,CAAE,OAAO,CNye1B,GAAO,CMxezB,qCACoC,CAAE,OAAO,CNrE1B,GAAO,CMsE1B,sBAAsC,CAAE,OAAO,CNqL1B,GAAO,CMpL5B,mBAAmC,CAAE,OAAO,CNG1B,GAAO,CMFzB,yBAAyC,CAAE,OAAO,CNnE1B,GAAO,CMoE/B,uBAAuC,C
AAE,OAAO,CNnE1B,GAAO,CMoE7B,kBAAkC,CAAE,OAAO,CNif1B,GAAO,CMhfxB,sBAAsC,CAAE,OAAO,CN8Y1B,GAAO,CM7Y5B,mBAAmC,CAAE,OAAO,CNyZ1B,GAAO,CMxZzB,iBAAiC,CAAE,OAAO,CN9J1B,GAAO,CM+JvB,iBAAiC,CAAE,OAAO,CNlE1B,GAAO,CMmEvB,kBAAkC,CAAE,OAAO,CN1C1B,GAAO,CM2CxB,sBAAsC,CAAE,OAAO,CN8B1B,GAAO,CM7B5B,qBAAqC,CAAE,OAAO,CN1I1B,GAAO,CM2I3B,qBAAqC,CAAE,OAAO,CNsH1B,GAAO,CMrH3B,oBAAoC,CAAE,OAAO,CNrO1B,GAAO,CMsO1B,iBAAiC,CAAE,OAAO,CN4M1B,GAAO,CM3MvB,sBAAsC,CAAE,OAAO,CNU1B,GAAO,CMT5B,eAA+B,CAAE,OAAO,CN3K1B,GAAO,CM4KrB,mBAAmC,CAAE,OAAO,CNuF1B,GAAO,CMtFzB,sBAAsC,CAAE,OAAO,CN2Q1B,GAAO,CM1Q5B,4BAA4C,CAAE,OAAO,CNrO1B,GAAO,CMsOlC,6BAA6C,CAAE,OAAO,CNrO1B,GAAO,CMsOnC,0BAA0C,CAAE,OAAO,CNrO1B,GAAO,CMsOhC,4BAA4C,CAAE,OAAO,CNzO1B,GAAO,CM0OlC,qBAAqC,CAAE,OAAO,CNrO1B,GAAO,CMsO3B,sBAAsC,CAAE,OAAO,CNrO1B,GAAO,CMsO5B,mBAAmC,CAAE,OAAO,CNrO1B,GAAO,CMsOzB,qBAAqC,CAAE,OAAO,CNzO1B,GAAO,CM0O3B,kBAAkC,CAAE,OAAO,CNpD1B,GAAO,CMqDxB,iBAAiC,CAAE,OAAO,CN4I1B,GAAO,CM3IvB,iBAAiC,CAAE,OAAO,CNwY1B,GAAO,CMvYvB,yCACiC,CAAE,OAAO,CNuM1B,GAAO,CMtMvB,mBAAmC,CAAE,OAAO,CNzG1B,GAAO,CM0GzB,qBAAqC,CAAE,OAAO,CNyQ1B,GAAO,CMxQ3B,sBAAsC,CAAE,OAAO,CNyQ1B,GAAO,CMxQ5B,kBAAkC,CAAE,OAAO,CN+V1B,GAAO,CM9VxB,iBAAiC,CAAE,OAAO,CN9G1B,GAAO,CM+GvB,sCACgC,CAAE,OAAO,CNoR1B,GAAO,CMnRtB,qBAAqC,CAAE,OAAO,CN+C1B,GAAO,CM9C3B,mBAAmC,CAAE,OAAO,CNmB1B,GAAO,CMlBzB,wBAAwC,CAAE,OAAO,CNoB1B,GAAO,CMnB9B,kBAAkC,CAAE,OAAO,CNqU1B,GAAO,CMpUxB,kBAAkC,CAAE,OAAO,CN2B1B,GAAO,CM1BxB,gBAAgC,CAAE,OAAO,CNgL1B,GAAO,CM/KtB,kBAAkC,CAAE,OAAO,CN2B1B,GAAO,CM1BxB,qBAAqC,CAAE,OAAO,CNuH1B,GAAO,CMtH3B,iBAAiC,CAAE,OAAO,CNM1B,GAAO,CMLvB,yBAAyC,CAAE,OAAO,CNI1B,GAAO,CMH/B,mBAAmC,CAAE,OAAO,CN6X1B,GAAO,CM5XzB,eAA+B,CAAE,OAAO,CNhH1B,GAAO,CMiHrB,8CACoC,CAAE,OAAO,CNuQ1B,GAAO,CMtQ1B,2EAEsC,CAAE,OAAO,CNsV1B,GAAO,CMrV5B,yBAAyC,CAAE,OAAO,CNwI1B,GAAO,CMvI/B,eAA+B,CAAE,OAAO,CNhG1B,GAAO,CMiGrB,oBAAoC,CAAE,OAAO,CNvH1B,GAAO,CMwH1B,yCACuC,CAAE,OAAO,CNtJ1B,GAAO,CMuJ7B,mBAAmC,CAAE,OAAO,CNyO1B,GAAO,CMxOzB,eAA+B,CAAE,OAAO,CN0F1B,GAAO,CMzFrB,sBAAsC,CAAE,OAAO,CN1D1B,GAAO,CM2D5B,sBAAsC,CAAE,OAAO,CNkW1B,GAAO,CMjW5B,oBAAoC,CAAE,OAAO,CN4V1B,GAAO,CM3V1B,iBAAiC,CAAE,OAAO,CNlE1B,GAAO,CMmEvB,uBAAuC,CAAE,OAAO,CNgO1B,GAAO,CM/N7B,qBAAqC,CAAE,OAAO,CN2J1B,GAAO,CM1J3B,2BAA2C,CAAE,OAAO,CN2J1B,GAAO,CM1JjC,iBAAiC,CAAE,OAAO,CNsR1B,GAAO,CMrRvB,qBAAqC,CAAE,OAAO,CN5L1B,GAAO,CM6L3B,4BAA4C,CAAE,OAAO,CNxB1B,GAAO,CMyBlC,iBAAiC,CAAE,OAAO,CNuP1B,GAAO,CMtPvB,iBAAiC,CAAE,OAAO,CN6I1B,GAAO,CM5IvB,8BAA8C,CAAE,OAAO,CN9J1B,GAAO,CM+JpC,+BAA+C,CAAE,OAAO,CN9J1B,GAAO,CM+JrC,4BAA4C,CAAE,OAAO,CN9J1B,GAAO,CM+JlC,8BAA8C,CAAE,OAAO,CNlK1B,GAAO,CMmKpC,gBAAgC,CAAE,OAAO,CN8D1B,GAAO,CM7DtB,eAA+B,CAAE,OAAO,CNrH1B,GAAO,CMsHrB,iBAAiC,CAAE,OAAO,CNvS1B,GAAO,CMwSvB,qBAAqC,CAAE,OAAO,CN2Z1B,GAAO,CM1Z3B,mBAAmC,CAAE,OAAO,CNhN1B,GAAO,CMiNzB,qBAAqC,CAAE,OAAO,CN7F1B,GAAO,CM8F3B,qBAAqC,CAAE,OAAO,CN7F1B,GAAO,CM8F3B,qBAAqC,CAAE,OAAO,CN+O1B,GAAO,CM9O3B,sBAAsC,CAAE,OAAO,CNiM1B,GAAO,CMhM5B,iBAAiC,CAAE,OAAO,CN6W1B,GAAO,CM5WvB,uBAAuC,CAAE,OAAO,CN0I1B,GAAO,CMzI7B,wIAAyC,CAAE,OAAO,CN0I1B,GAAO,CMzI/B,mBAAmC,CAAE,OAAO,CNqF1B,GAAO,CMpFzB,qBAAqC,CAAE,OAAO,CNmF1B,GAAO,CMlF3B,uBAAuC,CAAE,OAAO,CNnL1B,GAAO,CMoL7B,wBAAwC,CAAE,OAAO,CN0K1B,GAAO,CMzK9B,+BAA+C,CAAE,OAAO,CNpF1B,GAAO,CMqFrC,uBAAuC,CAAE,OAAO,CNwP1B,GAAO,CMvP7B,kBAAkC,CAAE,OAAO,CNjJ1B,GAAO,CMkJxB,qDAC8C,CAAE,OAAO,CN/M1B,GAAO,CMgNpC,iDAC4C,CAAE,OAAO,CN9M1B,GAAO,CM+MlC,uDAC+C,CAAE,OAAO,CNjN1B,GAAO,CMkNrC,8BAC8B,CAAE,OAAO,CNvG1B,GAAO,CMwGpB,cAA8B,CAAE,OAAO,CNhC1B,GAAO,CMiCpB,gCAC8B,CAAE,OAAO,CNqY1B,GAAO,CMpYpB,+BAC8B,CAAE,OAAO,CN4C1B,GAAO,CM3CpB,2DAG8B,CAAE,OAAO,CNgD1B,GAAO,CM/CpB,iDAE8B,CAAE,OAAO,CNiN1B,GAAO,CMhNpB,6BAC8B,CAAE,OAAO,CN+C1B,GAAO,CM9CpB,iCAC8B,CAAE
,OAAO,CN3P1B,GAAO,CM4PpB,eAA+B,CAAE,OAAO,CNhG1B,GAAO,CMiGrB,oBAAoC,CAAE,OAAO,CNpF1B,GAAO,CMqF1B,yBAAyC,CAAE,OAAO,CN0P1B,GAAO,CMzP/B,0BAA0C,CAAE,OAAO,CN0P1B,GAAO,CMzPhC,0BAA0C,CAAE,OAAO,CN0P1B,GAAO,CMzPhC,2BAA2C,CAAE,OAAO,CN0P1B,GAAO,CMzPjC,2BAA2C,CAAE,OAAO,CN6P1B,GAAO,CM5PjC,4BAA4C,CAAE,OAAO,CN6P1B,GAAO,CM5PlC,oBAAoC,CAAE,OAAO,CNkU1B,GAAO,CMjU1B,sBAAsC,CAAE,OAAO,CN8T1B,GAAO,CM7T5B,yBAAyC,CAAE,OAAO,CNya1B,GAAO,CMxa/B,kBAAkC,CAAE,OAAO,CNsa1B,GAAO,CMraxB,eAA+B,CAAE,OAAO,CN2Z1B,GAAO,CM1ZrB,sBAAsC,CAAE,OAAO,CN2Z1B,GAAO,CM1Z5B,uBAAuC,CAAE,OAAO,CNoa1B,GAAO,CMna7B,kBAAkC,CAAE,OAAO,CNxJ1B,GAAO,CMyJxB,yBAAyC,CAAE,OAAO,CN8P1B,GAAO,CM7P/B,oBAAoC,CAAE,OAAO,CNgB1B,GAAO,CMf1B,iBAAiC,CAAE,OAAO,CNpF1B,GAAO,CMqFvB,cAA8B,CAAE,OAAO,CN3W1B,GAAO,CM4WpB,2CAAoC,CAAE,OAAO,CN/R1B,GAAO,CMgS1B,2BAA2C,CAAE,OAAO,CN/R1B,GAAO,CMgSjC,iBAAiC,CAAE,OAAO,CN+U1B,GAAO,CM9UvB,wBAAwC,CAAE,OAAO,CN+U1B,GAAO,CM9U9B,0BAA0C,CAAE,OAAO,CNgD1B,GAAO,CM/ChC,wBAAwC,CAAE,OAAO,CNkD1B,GAAO,CMjD9B,0BAA0C,CAAE,OAAO,CN+C1B,GAAO,CM9ChC,2BAA2C,CAAE,OAAO,CN+C1B,GAAO,CM9CjC,gBAAgC,CAAE,OAAO,CNjW1B,GAAO,CMkWtB,kBAAkC,CAAE,OAAO,CNmY1B,GAAO,CMlYxB,kBAAkC,CAAE,OAAO,CN7W1B,GAAO,CM8WxB,gBAAgC,CAAE,OAAO,CNkC1B,GAAO,CMjCtB,mBAAmC,CAAE,OAAO,CN5K1B,GAAO,CM6KzB,gBAAgC,CAAE,OAAO,CNgN1B,GAAO,CM/MtB,qBAAqC,CAAE,OAAO,CNxF1B,GAAO,CMyF3B,iBAAiC,CAAE,OAAO,CN4T1B,GAAO,CM3TvB,iBAAiC,CAAE,OAAO,CNtI1B,GAAO,CMuIvB,eAA+B,CAAE,OAAO,CN6C1B,GAAO,CM5CrB,qCACmC,CAAE,OAAO,CN5D1B,GAAO,CM6DzB,gBAAgC,CAAE,OAAO,CN8P1B,GAAO,CM7PtB,iBAAiC,CAAE,OAAO,CNuE1B,GAAO,CMtEvB,kBAAkC,CAAE,OAAO,CN9W1B,GAAO,CM+WxB,cAA8B,CAAE,OAAO,CNtS1B,GAAO,CMuSpB,aAA6B,CAAE,OAAO,CNiW1B,GAAO,CMhWnB,gBAAgC,CAAE,OAAO,CNuW1B,GAAO,CMtWtB,iBAAiC,CAAE,OAAO,CN+I1B,GAAO,CM9IvB,oBAAoC,CAAE,OAAO,CNkF1B,GAAO,CMjF1B,yBAAyC,CAAE,OAAO,CN6N1B,GAAO,CM5N/B,+BAA+C,CAAE,OAAO,CN/W1B,GAAO,CMgXrC,8BAA8C,CAAE,OAAO,CNjX1B,GAAO,CMkXpC,qDAC8C,CAAE,OAAO,CNzR1B,GAAO,CM0RpC,uBAAuC,CAAE,OAAO,CNnM1B,GAAO,CMoM7B,qBAAqC,CAAE,OAAO,CNiW1B,GAAO,CMhW3B,uBAAuC,CAAE,OAAO,CNoV1B,GAAO,CMnV7B,sCAC8B,CAAE,OAAO,CN0S1B,GAAO,CMzSpB,wEAAwC,CAAE,OAAO,CN0G1B,GAAO,CMzG9B,wBAAwC,CAAE,OAAO,CN4M1B,GAAO,CM3M9B,gBAAgC,CAAE,OAAO,CNsL1B,GAAO,CMrLtB,0BAA0C,CAAE,OAAO,CNzL1B,GAAO,CM0LhC,oBAAoC,CAAE,OAAO,CNoW1B,GAAO,CMnW1B,iBAAiC,CAAE,OAAO,CN8D1B,GAAO,CM7DvB,4DAEqC,CAAE,OAAO,CN8S1B,GAAO,CM7S3B,iDACyC,CAAE,OAAO,CN1F1B,GAAO,CM2F/B,gBAAgC,CAAE,OAAO,CNsW1B,GAAO,CMrWtB,iBAAiC,CAAE,OAAO,CNlG1B,GAAO,CMmGvB,iBAAiC,CAAE,OAAO,CNgH1B,GAAO,CM/GvB,wBAAwC,CAAE,OAAO,CNiH1B,GAAO,CMhH9B,6BAA6C,CAAE,OAAO,CNyN1B,GAAO,CMxNnC,sBAAsC,CAAE,OAAO,CNuN1B,GAAO,CMtN5B,oBAAoC,CAAE,OAAO,CN/N1B,GAAO,CMgO1B,eAA+B,CAAE,OAAO,CN5N1B,GAAO,CM6NrB,wBAAwC,CAAE,OAAO,CN2E1B,GAAO,CM1E9B,yBAAyC,CAAE,OAAO,CNyE1B,GAAO,CMxE/B,iBAAiC,CAAE,OAAO,CNvN1B,GAAO,CMwNvB,iBAAiC,CAAE,OAAO,CNzC1B,GAAO,CM0CvB,mBAAmC,CAAE,OAAO,CNpC1B,GAAO,CMqCzB,cAA8B,CAAE,OAAO,CNtL1B,GAAO,CMuLpB,mBAAmC,CAAE,OAAO,CN7U1B,GAAO,CM8UzB,gBAAgC,CAAE,OAAO,CN1R1B,GAAO,CM2RtB,cAA8B,CAAE,OAAO,CNsD1B,GAAO,CMrDpB,gBAAgC,CAAE,OAAO,CNmL1B,GAAO,CMlLtB,eAA+B,CAAE,OAAO,CNrP1B,GAAO,CMsPrB,gBAAgC,CAAE,OAAO,CNrP1B,GAAO,CMsPtB,kBAAkC,CAAE,OAAO,CN7W1B,GAAO,CM8WxB,yBAAyC,CAAE,OAAO,CN7W1B,GAAO,CM8W/B,gBAAgC,CAAE,OAAO,CN0L1B,GAAO,CMzLtB,uBAAuC,CAAE,OAAO,CN0L1B,GAAO,CMzL7B,kBAAkC,CAAE,OAAO,CNyF1B,GAAO,CMxFxB,oCAC8B,CAAE,OAAO,CNzU1B,GAAO,CM0UpB,8BAC+B,CAAE,OAAO,CN+M1B,GAAO,CM9MrB,eAA+B,CAAE,OAAO,CN4P1B,GAAO,CM3PrB,kBAAkC,CAAE,OAAO,CNuK1B,GAAO,CMtKxB,qBAAqC,CAAE,OAAO,CNtP1B,GAAO,CMuP3B,qBAAqC,CAAE,OAAO,CNiK1B,GAAO,CMhK3B,mBAAmC,CAAE,OAAO,CN9P1B,GAAO,CM+PzB,qBAAqC,CAAE,OAAO,CN/L1B,GAAO,CMgM3B,sBAAsC,CAAE,OAAO,CNxL1B,GAAO,CMyL5B,uBAAuC,CAAE,OAAO,CNrM1B,GAAO,CMsM7B,4BAA4C,CAAE,OA
AO,CN/L1B,GAAO,CMgMlC,yEAEuC,CAAE,OAAO,CNxM1B,GAAO,CMyM7B,+CACyC,CAAE,OAAO,CN9M1B,GAAO,CM+M/B,+CACuC,CAAE,OAAO,CN/M1B,GAAO,CMgN7B,+CACuC,CAAE,OAAO,CNpM1B,GAAO,CMqM7B,sBAAsC,CAAE,OAAO,CNjN1B,GAAO,CMkN5B,eAA+B,CAAE,OAAO,CNuR1B,GAAO,CMtRrB,kBAAkC,CAAE,OAAO,CN5S1B,GAAO,CM6SxB,mBAAmC,CAAE,OAAO,CN9E1B,GAAO,CM+EzB,uGAIoC,CAAE,OAAO,CNnE1B,GAAO,CMoE1B,yBAAyC,CAAE,OAAO,CN/T1B,GAAO,CMgU/B,oDAEgC,CAAE,OAAO,CNqD1B,GAAO,CMpDtB,+BACiC,CAAE,OAAO,CNnQ1B,GAAO,CMoQvB,qBAAqC,CAAE,OAAO,CNzK1B,GAAO,CM0K3B,cAA8B,CAAE,OAAO,CN3K1B,GAAO,CM4KpB,0EAEsC,CAAE,OAAO,CNxJ1B,GAAO,CMyJ5B,wBAAwC,CAAE,OAAO,CN2K1B,GAAO,CM1K9B,aAA6B,CAAE,OAAO,CNiC1B,GAAO,CMhCnB,mCACiC,CAAE,OAAO,CN0Q1B,GAAO,CMzQvB,sCACsC,CAAE,OAAO,CNV1B,GAAO,CMW5B,0CACwC,CAAE,OAAO,CNX1B,GAAO,CMY9B,kBAAkC,CAAE,OAAO,CN1I1B,GAAO,CM2IxB,sBAAsC,CAAE,OAAO,CNlV1B,GAAO,CMmV5B,iBAAiC,CAAE,OAAO,CNjJ1B,GAAO,CMkJvB,oBAAoC,CAAE,OAAO,CNb1B,GAAO,CMc1B,kBAAkC,CAAE,OAAO,CN+F1B,GAAO,CM9FxB,oBAAoC,CAAE,OAAO,CNuE1B,GAAO,CMtE1B,2BAA2C,CAAE,OAAO,CNuE1B,GAAO,CMtEjC,eAA+B,CAAE,OAAO,CNzZ1B,GAAO,CM0ZrB,4CACmC,CAAE,OAAO,CN5M1B,GAAO,CM6MzB,cAA8B,CAAE,OAAO,CN0M1B,GAAO,CMzMpB,qBAAqC,CAAE,OAAO,CNxa1B,GAAO,CMya3B,eAA+B,CAAE,OAAO,CNI1B,GAAO,CMHrB,qBAAqC,CAAE,OAAO,CNuF1B,GAAO,CMtF3B,iBAAiC,CAAE,OAAO,CN2M1B,GAAO,CM1MvB,eAA+B,CAAE,OAAO,CN+Q1B,GAAO,CM9QrB,sBAAsC,CAAE,OAAO,CNzC1B,GAAO,CM0C5B,eAA+B,CAAE,OAAO,CNwP1B,GAAO,CMvPrB,qBAAqC,CAAE,OAAO,CNrZ1B,GAAO,CMsZ3B,iBAAiC,CAAE,OAAO,CNvB1B,GAAO,CMwBvB,wBAAwC,CAAE,OAAO,CN3L1B,GAAO,CM4L9B,kBAAkC,CAAE,OAAO,CN5X1B,GAAO,CM6XxB,wBAAwC,CAAE,OAAO,CNhY1B,GAAO,CMiY9B,sBAAsC,CAAE,OAAO,CNnY1B,GAAO,CMoY5B,kBAAkC,CAAE,OAAO,CNtY1B,GAAO,CMuYxB,oBAAoC,CAAE,OAAO,CNlY1B,GAAO,CMmY1B,oBAAoC,CAAE,OAAO,CNlY1B,GAAO,CMmY1B,qBAAqC,CAAE,OAAO,CN3b1B,GAAO,CM4b3B,uBAAuC,CAAE,OAAO,CN3b1B,GAAO,CM4b7B,gBAAgC,CAAE,OAAO,CN+K1B,GAAO,CM9KtB,oBAAoC,CAAE,OAAO,CNnV1B,GAAO,CMoV1B,aAA6B,CAAE,OAAO,CN9d1B,GAAO,CM+dnB,qBAAqC,CAAE,OAAO,CN5R1B,GAAO,CM6R3B,sBAAsC,CAAE,OAAO,CN/C1B,GAAO,CMgD5B,wBAAwC,CAAE,OAAO,CN9b1B,GAAO,CM+b9B,qBAAqC,CAAE,OAAO,CNtf1B,GAAO,CMuf3B,oBAAoC,CAAE,OAAO,CN/B1B,GAAO,CMgC1B,qBAAqC,CAAE,OAAO,CNzH1B,GAAO,CM0H3B,iBAAiC,CAAE,OAAO,CNvI1B,GAAO,CMwIvB,wBAAwC,CAAE,OAAO,CNvI1B,GAAO,CMwI9B,qBAAqC,CAAE,OAAO,CN4J1B,GAAO,CM3J3B,oBAAoC,CAAE,OAAO,CN4J1B,GAAO,CM3J1B,kBAAkC,CAAE,OAAO,CNxc1B,GAAO,CMycxB,cAA8B,CAAE,OAAO,CNjb1B,GAAO,CMkbpB,kBAAkC,CAAE,OAAO,CNvJ1B,GAAO,CMwJxB,oBAAoC,CAAE,OAAO,CN3gB1B,GAAO,CM4gB1B,aAA6B,CAAE,OAAO,CN7Z1B,GAAO,CM8ZnB,kDAE8B,CAAE,OAAO,CNzK1B,GAAO,CM0KpB,mBAAmC,CAAE,OAAO,CNpG1B,GAAO,CMqGzB,qBAAqC,CAAE,OAAO,CNxb1B,GAAO,CMyb3B,yBAAyC,CAAE,OAAO,CN5W1B,GAAO,CM6W/B,mBAAmC,CAAE,OAAO,CN9V1B,GAAO,CM+VzB,mBAAmC,CAAE,OAAO,CN9P1B,GAAO,CM+PzB,kBAAkC,CAAE,OAAO,CNrJ1B,GAAO,CMsJxB,iBAAiC,CAAE,OAAO,CNe1B,GAAO,CMdvB,uBAAuC,CAAE,OAAO,CN2B1B,GAAO,CM1B7B,sBAAsC,CAAE,OAAO,CNoC1B,GAAO,CMnC5B,mBAAmC,CAAE,OAAO,CNqC1B,GAAO,CMpCzB,oBAAoC,CAAE,OAAO,CN5a1B,GAAO,CM6a1B,0BAA0C,CAAE,OAAO,CN9a1B,GAAO,CM+ahC,kBAAkC,CAAE,OAAO,CN/V1B,GAAO,CMgWxB,eAA+B,CAAE,OAAO,CNoB1B,GAAO,CMnBrB,sBAAsC,CAAE,OAAO,CN8K1B,GAAO,CM7K5B,qBAAqC,CAAE,OAAO,CN/F1B,GAAO,CMgG3B,sBAAsC,CAAE,OAAO,CN6E1B,GAAO,CM5E5B,oBAAoC,CAAE,OAAO,CN9M1B,GAAO,CM+M1B,gBAAgC,CAAE,OAAO,CN+K1B,GAAO,CM9KtB,eAA+B,CAAE,OAAO,CN7H1B,GAAO,CM8HrB,kBAAkC,CAAE,OAAO,CNnH1B,GAAO,CMoHxB,0CACsC,CAAE,OAAO,CNkI1B,GAAO,CMjI5B,0BAA0C,CAAE,OAAO,CNkI1B,GAAO,CMjIhC,uBAAuC,CAAE,OAAO,CN0K1B,GAAO,CMzK7B,sBAAsC,CAAE,OAAO,CNlI1B,GAAO,CMmI5B,qBAAqC,CAAE,OAAO,CNyK1B,GAAO,CMxK3B,sBAAsC,CAAE,OAAO,CNnI1B,GAAO,CMoI5B,wBAAwC,CAAE,OAAO,CNlI1B,GAAO,CMmI9B,wBAAwC,CAAE,OAAO,CNpI1B,GAAO,CMqI9B,iBAAiC,CAAE,OAAO,CN1G1B,GAAO,CM2GvB,qBAAqC,CAAE,OAAO,CN7Q1B,GAAO,CM8Q3B,4BAA4C,CAAE,OAAO,CN1U1B,GA
AO,CM2UlC,sBAAsC,CAAE,OAAO,CNzE1B,GAAO,CM0E5B,mBAAmC,CAAE,OAAO,CNkL1B,GAAO,CMjLzB,iBAAiC,CAAE,OAAO,CNX1B,GAAO,CMYvB,oBAAoC,CAAE,OAAO,CNuJ1B,GAAO,CMtJ1B,qBAAqC,CAAE,OAAO,CNwJ1B,GAAO,CMvJ3B,+BAC8B,CAAE,OAAO,CN/f1B,GAAO,CMggBpB,kBAAkC,CAAE,OAAO,CN4J1B,GAAO,CM3JxB,gBAAgC,CAAE,OAAO,CN8G1B,GAAO,CM7GtB,iBAAiC,CAAE,OAAO,CNwD1B,GAAO,CMvDvB,iBAAiC,CAAE,OAAO,CN9I1B,GAAO,CM+IvB,qCACuC,CAAE,OAAO,CN0L1B,GAAO,CMzL7B,wBAAwC,CAAE,OAAO,CNjH1B,GAAO,CMkH9B,mBAAmC,CAAE,OAAO,CNrH1B,GAAO,CMsHzB,uBAAuC,CAAE,OAAO,CNnW1B,GAAO,CMoW7B,+DAEuC,CAAE,OAAO,CN/gB1B,GAAO,CMghB7B,sDACiD,CAAE,OAAO,CN9gB1B,GAAO,CM+gBvC,4CACuC,CAAE,OAAO,CNlhB1B,GAAO,CMmhB7B,+CAC0C,CAAE,OAAO,CNnhB1B,GAAO,CMohBhC,6CACwC,CAAE,OAAO,CNxhB1B,GAAO,CMyhB9B,wBAAwC,CAAE,OAAO,CN3I1B,GAAO,CM4I9B,mBAAmC,CAAE,OAAO,CN3O1B,GAAO,CM4OzB,uBAAuC,CAAE,OAAO,CNxI1B,GAAO,CMyI7B,yBAAyC,CAAE,OAAO,CNxI1B,GAAO,CMyI/B,sBAAsC,CAAE,OAAO,CNwB1B,GAAO,CMvB5B,wBAAwC,CAAE,OAAO,CNwB1B,GAAO,CMvB9B,iBAAiC,CAAE,OAAO,CN/d1B,GAAO,CMgevB,yBAAyC,CAAE,OAAO,CNle1B,GAAO,CMme/B,gBAAgC,CAAE,OAAO,CNpc1B,GAAO,CMqctB,wBAAwC,CAAE,OAAO,CNljB1B,GAAO,CMmjB9B,sBAAsC,CAAE,OAAO,CNxP1B,GAAO,CMyP5B,iDAC0C,CAAE,OAAO,CNzP1B,GAAO,CM0PhC,gDACyC,CAAE,OAAO,CN7P1B,GAAO,CM8P/B,+CACwC,CAAE,OAAO,CNhQ1B,GAAO,CMiQ9B,oBAAoC,CAAE,OAAO,CNrQ1B,GAAO,CMsQ1B,6CACsC,CAAE,OAAO,CNxR1B,GAAO,CMyR5B,8CACuC,CAAE,OAAO,CN7R1B,GAAO,CM8R7B,0BAA0C,CAAE,OAAO,CN1R1B,GAAO,CM2RhC,wBAAwC,CAAE,OAAO,CNpS1B,GAAO,CMqS9B,uBAAuC,CAAE,OAAO,CN3R1B,GAAO,CM4R7B,yBAAyC,CAAE,OAAO,CN/R1B,GAAO,CMgS/B,uBAAuC,CAAE,OAAO,CNjS1B,GAAO,CMkS7B,oBAAoC,CAAE,OAAO,CN+D1B,GAAO,CM9D1B,qBAAqC,CAAE,OAAO,CN/F1B,GAAO,CMgG3B,2BAA2C,CAAE,OAAO,CN/b1B,GAAO,CMgcjC,aAA6B,CAAE,OAAO,CNtU1B,GAAO,CMuUnB,oBAAoC,CAAE,OAAO,CNtU1B,GAAO,CMuU1B,sBAAsC,CAAE,OAAO,CNkE1B,GAAO,CMjE5B,wBAAwC,CAAE,OAAO,CNrK1B,GAAO,CMsK9B,+BAA+C,CAAE,OAAO,CNrK1B,GAAO,CMsKrC,qBAAqC,CAAE,OAAO,CN5U1B,GAAO,CM6U3B,sBAAsC,CAAE,OAAO,CNwH1B,GAAO,CMvH5B,iBAAiC,CAAE,OAAO,CNnF1B,GAAO,CMoFvB,iBAAiC,CAAE,OAAO,CNze1B,GAAO,CM0evB,kBAAkC,CAAE,OAAO,CN9W1B,GAAO,CM+WxB,gBAAgC,CAAE,OAAO,CNxK1B,GAAO,CMyKtB,4BAA4C,CAAE,OAAO,CNpQ1B,GAAO,CMqQlC,mCACqC,CAAE,OAAO,CNS1B,GAAO,CMR3B,iBAAiC,CAAE,OAAO,CNjd1B,GAAO,CMkdvB,gBAAgC,CAAE,OAAO,CNzoB1B,GAAO,CM0oBtB,iBAAiC,CAAE,OAAO,CN/nB1B,GAAO,CMgoBvB,0BAA0C,CAAE,OAAO,CN3hB1B,GAAO,CM4hBhC,2BAA2C,CAAE,OAAO,CN9hB1B,GAAO,CM+hBjC,2BAA2C,CAAE,OAAO,CN5hB1B,GAAO,CM6hBjC,2BAA2C,CAAE,OAAO,CNjiB1B,GAAO,CMkiBjC,mBAAmC,CAAE,OAAO,CNpR1B,GAAO,CMqRzB,kBAAkC,CAAE,OAAO,CN5N1B,GAAO,CM6NxB,oBAAoC,CAAE,OAAO,CN5N1B,GAAO,CM6N1B,gBAAgC,CAAE,OAAO,CN/N1B,GAAO,CMgOtB,cAA8B,CAAE,OAAO,CNlO1B,GAAO,CMmOpB,qBAAqC,CAAE,OAAO,CNpe1B,GAAO,CMqe3B,uBAAuC,CAAE,OAAO,CNpe1B,GAAO,CMqe7B,gBAAgC,CAAE,OAAO,CNtS1B,GAAO,CMuStB,gBAAgC,CAAE,OAAO,CNiF1B,GAAO,CMhFtB,oBAAoC,CAAE,OAAO,CNlkB1B,GAAO,CMmkB1B,oBAAoC,CAAE,OAAO,CNrX1B,GAAO,CMsX1B,uBAAuC,CAAE,OAAO,CNpI1B,GAAO,CMqI7B,eAA+B,CAAE,OAAO,CNpc1B,GAAO,CMqcrB,0BAA0C,CAAE,OAAO,CNhe1B,GAAO,CMiehC,mBAAmC,CAAE,OAAO,CNpf1B,GAAO,CMqfzB,eAA+B,CAAE,OAAO,CNlN1B,GAAO,CMmNrB,uBAAuC,CAAE,OAAO,CN1X1B,GAAO,CM2X7B,cAA8B,CAAE,OAAO,CNoD1B,GAAO,CMnDpB,uBAAuC,CAAE,OAAO,CN3J1B,GAAO,CM4J7B,mBAAmC,CAAE,OAAO,CNzN1B,GAAO,CM0NzB,iBAAiC,CAAE,OAAO,CNlH1B,GAAO,CMmHvB,uBAAuC,CAAE,OAAO,CN7L1B,GAAO,CM8L7B,yBAAyC,CAAE,OAAO,CN7L1B,GAAO,CM8L/B,sBAAsC,CAAE,OAAO,CN3C1B,GAAO,CM4C5B,wBAAwC,CAAE,OAAO,CN3C1B,GAAO,CM4C9B,uBAAuC,CAAE,OAAO,CNrG1B,GAAO,CMsG7B,0BAA0C,CAAE,OAAO,CNrG1B,GAAO,CMsGhC,kBAAkC,CAAE,OAAO,CN7U1B,GAAO,CM8UxB,oBAAoC,CAAE,OAAO,CNnlB1B,GAAO,CMolB1B,sBAAsC,CAAE,OAAO,CNnlB1B,GAAO,CMolB5B,kBAAkC,CAAE,OAAO,CN/L1B,GAAO,CMgMxB,qCAAiC,CAAE,OAAO,CNlX1B,GAAO,CMmXvB,qBAAqC,CAAE,OAAO,CNkF1B,GAAO,CMjF3B,kBAAkC,CAAE,OAAO,CNmF1B,GAAO,CM
lFxB,iBAAiC,CAAE,OAAO,CN9c1B,GAAO,CM+cvB,2BAA2C,CAAE,OAAO,CN2B1B,GAAO,CM1BjC,yBAAyC,CAAE,OAAO,CNmE1B,GAAO,CMlE/B,4BAA4C,CAAE,OAAO,CNxK1B,GAAO,CMyKlC,gBAAgC,CAAE,OAAO,CN9lB1B,GAAO,CM+lBtB,4BAA4C,CAAE,OAAO,CNtoB1B,GAAO,CMuoBlC,+BAA+C,CAAE,OAAO,CNqD1B,GAAO,CMpDrC,kBAAkC,CAAE,OAAO,CNxlB1B,GAAO,CMylBxB,sCAAsD,CAAE,OAAO,CN5oB1B,GAAO,CM6oB5C,0EAC8D,CAAE,OAAO,CN9qB1B,GAAO,CM+qBpD,8DAE+B,CAAE,OAAO,CNvf1B,GAAO,CMwfrB,gBAAgC,CAAE,OAAO,CNhY1B,GAAO,CMiYtB,kBAAkC,CAAE,OAAO,CNhY1B,GAAO,CMiYxB,2CACwC,CAAE,OAAO,CN1H1B,GAAO,CM2H9B,qBAAqC,CAAE,OAAO,CNzR1B,GAAO,CM0R3B,iBAAiC,CAAE,OAAO,CNiC1B,GAAO,CMhCvB,wBAAwC,CAAE,OAAO,CNiC1B,GAAO,CMhC9B,mBAAmC,CAAE,OAAO,CNlH1B,GAAO,CMmHzB,yBAAyC,CAAE,OAAO,CNlH1B,GAAO,CMmH/B,0BAA0C,CAAE,OAAO,CNlH1B,GAAO,CMmHhC,qBAAqC,CAAE,OAAO,CNrN1B,GAAO,CMsN3B,sBAAsC,CAAE,OAAO,CNpb1B,GAAO,CMqb5B,gBAAgC,CAAE,OAAO,CNmE1B,GAAO,CMlEtB,oBAAoC,CAAE,OAAO,CNpD1B,GAAO,CMqD1B,6DAC+C,CAAE,OAAO,CNzY1B,GAAO,CM0YrC,qCACuC,CAAE,OAAO,CN7a1B,GAAO,CM8a7B,sBAAsC,CAAE,OAAO,CNtX1B,GAAO,CMuX5B,wBAAwC,CAAE,OAAO,CNlf1B,GAAO,CMmf9B,0BAA0C,CAAE,OAAO,CNlf1B,GAAO,CMmfhC,iBAAiC,CAAE,OAAO,CNtT1B,GAAO,CMuTvB,uBAAuC,CAAE,OAAO,CNptB1B,GAAO,CMqtB7B,yBAAyC,CAAE,OAAO,CNptB1B,GAAO,CMqtB/B,wCACuC,CAAE,OAAO,CNrtB1B,GAAO,CMstB7B,4CACyC,CAAE,OAAO,CNttB1B,GAAO,CMutB/B,sBAAsC,CAAE,OAAO,CNJ1B,GAAO,CMK5B,wBAAwC,CAAE,OAAO,CNJ1B,GAAO,CMK9B,iBAAiC,CAAE,OAAO,CNH1B,GAAO,CMIvB,mBAAmC,CAAE,OAAO,CN3W1B,GAAO,CM4WzB,6CACkC,CAAE,OAAO,CN5W1B,GAAO,CM6WxB,iDACoC,CAAE,OAAO,CN7W1B,GAAO,CM8W1B,gBAAgC,CAAE,OAAO,CNtN1B,GAAO,CMuNtB,yBAAyC,CAAE,OAAO,CN3b1B,GAAO,CM4b/B,mBAAmC,CAAE,OAAO,CNtF1B,GAAO,CMuFzB,2EAE2C,CAAE,OAAO,CNxE1B,GAAO,CMyEjC,8DACqD,CAAE,OAAO,CNvE1B,GAAO,CMwE3C,oDAC2C,CAAE,OAAO,CN3E1B,GAAO,CM4EjC,uDAC8C,CAAE,OAAO,CN5E1B,GAAO,CM6EpC,qDAC4C,CAAE,OAAO,CNjF1B,GAAO,CMkFlC,iBAAiC,CAAE,OAAO,CN3K1B,GAAO,CM4KvB,iDAE+B,CAAE,OAAO,CNzrB1B,GAAO,CM0rBrB,kBAAkC,CAAE,OAAO,CNlP1B,GAAO,CMmPxB,0BAA0C,CAAE,OAAO,CNK1B,GAAO,CMJhC,0BAA0C,CAAE,OAAO,CNK1B,GAAO,CMJhC,yBAAyC,CAAE,OAAO,CNK1B,GAAO,CMJ/B,kDACuC,CAAE,OAAO,CND1B,GAAO,CME7B,sDACyC,CAAE,OAAO,CNF1B,GAAO,CMG/B,mBAAmC,CAAE,OAAO,CNxsB1B,GAAO,CMysBzB,eAA+B,CAAE,OAAO,CNpb1B,GAAO,CMqbrB,eAA+B,CAAE,OAAO,CN1hB1B,GAAO,CM2hBrB,eAA+B,CAAE,OAAO,CNxY1B,GAAO,CMyYrB,kBAAkC,CAAE,OAAO,CN/O1B,GAAO,CMgPxB,kBAAkC,CAAE,OAAO,CNziB1B,GAAO,CM0iBxB,oBAAoC,CAAE,OAAO,CNjU1B,GAAO,CMkU1B,sBAAsC,CAAE,OAAO,CN7K1B,GAAO,CM8K5B,sBAAsC,CAAE,OAAO,CNhI1B,GAAO,CMiI5B,qBAAqC,CAAE,OAAO,CNJ1B,GAAO,CMK3B,iBAAiC,CAAE,OAAO,CNxU1B,GAAO,COzcvB,QAAS,CH8BP,QAAQ,CAAE,QAAQ,CAClB,KAAK,CAAE,GAAG,CACV,MAAM,CAAE,GAAG,CACX,OAAO,CAAE,CAAC,CACV,MAAM,CAAE,IAAI,CACZ,QAAQ,CAAE,MAAM,CAChB,IAAI,CAAE,gBAAa,CACnB,MAAM,CAAE,CAAC,CAUT,kDACQ,CACN,QAAQ,CAAE,MAAM,CAChB,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,IAAI,CACZ,MAAM,CAAE,CAAC,CACT,QAAQ,CAAE,OAAO,CACjB,IAAI,CAAE,IAAI,CIvDd,swBAAK,CACH,WAAW,CAAE,OAAO,CACpB,y5BAAQ,CACN,WAAW,CC+BuB,aAAa,CD9B/C,OAAO,CAAE,YAAY,CACrB,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,MAAM,CACnB,WAAW,CAAE,CAAC,CACd,eAAe,CAAE,OAAO,CAM5B,86BAAkB,CAChB,OAAO,CAAE,YAAY,CACrB,eAAe,CAAE,OAAO,CAGxB,muEAAgB,CACd,OAAO,CAAE,MAAM,CACf,2wEAAuB,CACrB,WAAW,CAAE,KAAI,CACnB,utEAAsB,CACpB,OAAO,CAAE,YAAY,CAE3B,2iBAA2B,CACzB,OAAO,CAAE,GAAE,CjBpBL,kBAAoB,CAAE,qBAAM,CAK5B,eAAiB,CAAE,qBAAM,CAezB,UAAY,CAAE,qBAAM,CiBE5B,+nBAAiC,CAC/B,OAAO,CAAE,CAAC,CAGV,mtCAAuB,CACrB,SAAS,CAAE,IAAI,CACf,cAAc,CAAE,IAAI,CEpBxB,0PAAS,CACP,OAAO,CAAE,IAAqB,CAC9B,WAAW,CDayB,IAAI,CCZxC,aAAa,CDYuB,IAAI,CCXxC,UAAU,CAAE,OAAmB,CAEjC,8CAAe,CACb,KAAK,CCe+B,IAAM,CDd1C,WAAW,CAAE,IAAI,CACjB,OAAO,CAAE,KAAK,CACd,KAAK,CCY+B,IAAM,CDX1C,UAAU,CAAE,OAAkB,CAC9B,MAAM,CAAE,KAAsB,CAC9B,OAAO,CAAE,QAA2C,CACpD,aAAa,CAAE,IAAqB,CAEtC,0ZAAyB,CACvB,
UAAU,CAAE,OAAkB,CAC9B,mxCAAe,CACb,UAAU,CAAE,OAAiB,CACjC,kYAA0B,CACxB,UAAU,CAAE,OAAmB,CAC/B,ouCAAe,CACb,UAAU,CAAE,OAAoB,CAEpC,sYAAuB,CACrB,UAAU,CAAE,OAAmB,CAC/B,yuCAAe,CACb,UAAU,CAAE,OAAkB,CAElC,mZAA0B,CACxB,UAAU,CAAE,OAAuB,CACnC,swCAAe,CACb,UAAU,CAAE,OAAqB,CAErC,scAA0B,CACxB,UAAU,CCF0B,OAAmB,CDGvD,42CAAe,CACb,KAAK,CCpB6B,OAAW,CDqB7C,UAAU,CCHwB,OAAmB,CDIvD,8dAAC,CACC,KAAK,CCb6B,OAAK,CDe3C,sZAAsB,CACpB,aAAa,CAAE,CAAC,CAsBlB,kBAAkB,CAChB,QAAQ,CAAE,KAAK,CACf,MAAM,CAAE,GAAG,CACX,IAAI,CAAE,CAAC,CACP,OAAO,CDG6B,GAAG,CCFvC,qBAAE,CACA,OAAO,CAAE,KAAK,CACd,KAAK,CDT6B,KAAK,CCUvC,UAAU,CAAE,WAAW,CACvB,KAAK,CCrD6B,IAAM,CDsDxC,UAAU,CAAE,MAAM,CAClB,UAAU,CAAE,2BAA0B,CACtC,OAAO,CAAE,MAAmB,CAC5B,SAAS,CAAE,GAAG,CACd,OAAO,CAAE,CAAC,CACV,MAAM,CAAE,CAAC,CACT,WAAW,CAAE,IAAI,CACjB,QAAQ,CAAE,MAAM,CnB3FZ,kBAAoB,CAAE,gBAAM,CAK5B,eAAiB,CAAE,gBAAM,CAezB,UAAY,CAAE,gBAAM,CmByExB,0CAAsB,CACpB,UAAU,CC5FsB,OAAM,CD6FxC,uCAAmB,CACjB,UAAU,CC5DsB,OAAK,CD6DvC,0CAAsB,CACpB,UAAU,CDnFsB,OAAO,CCoFzC,yCAAqB,CACnB,UAAU,CDtEsB,OAAI,CCuEtC,wBAAI,CACF,OAAO,CAAE,CAAC,CACV,MAAM,CAAE,IAAI,CEhFd,oCAAsB,CFmFxB,kBAAkB,CAChB,MAAM,CAAE,IAAI,CACZ,GAAG,CAAE,CAAC,CACN,KAAK,CAAE,IAAI,CACX,qBAAE,CACA,KAAK,CAAE,IAAI,EG3FjB,MAAM,CACJ,SAAS,CAAE,IAAI,CACf,MAAM,CAAE,CAAC,CACT,cAAc,CAAE,QAAQ,CACxB,eAAe,CAAE,MAAM,CACvB,MAAM,CAAE,OAAO,CACf,WAAW,CAAE,MAAM,CACnB,kBAAkB,CAAE,MAAM,CAC1B,SAAS,CAAE,OAAO,CACpB,gDAAiD,CAC/C,MAAM,CAAE,CAAC,CACT,OAAO,CAAE,CAAC,CACZ,gBAAgB,CACd,MAAM,CAAE,OAAO,CAEjB,IAAI,CAEF,OAAO,CAAE,YAAY,CACrB,aAAa,CAAE,GAAG,CAClB,WAAW,CAAE,MAAM,CACnB,WAAW,CAAE,MAAM,CACnB,UAAU,CAAE,MAAM,CAClB,MAAM,CAAE,OAAO,CACf,SAAS,CAAE,IAAI,CACf,OAAO,CAAE,iBAA6F,CACtG,KAAK,CFf+B,IAAM,CEgB1C,MAAM,CAAE,yBAAyB,CACjC,gBAAgB,CF7CoB,OAAM,CE8C1C,eAAe,CAAE,IAAI,CACrB,WAAW,CAAE,MAAM,CACnB,WAAW,CFDyB,uDAA2D,CEE/F,UAAU,CAAE,mFAAqF,CACjG,YAAY,CAAE,KAAK,CACnB,cAAc,CAAE,MAAM,CACtB,QAAQ,CAAE,MAAM,CAChB,IAAI,CAAE,CAAC,CACP,iBAAiB,CAAE,IAAI,CtBxDjB,mBAAoB,CsByDb,IAAI,CtBpDX,gBAAiB,CsBoDV,IAAI,CtB/CX,eAAgB,CsB+CT,IAAI,CtBrCX,WAAY,CsBqCL,IAAI,CtBzDX,kBAAoB,CAAE,eAAM,CAK5B,eAAiB,CAAE,eAAM,CAezB,UAAY,CAAE,eAAM,CsByC5B,UAAU,CACR,UAAU,CAAE,OAAwB,CACpC,KAAK,CFjC+B,IAAM,CEoC1C,UAAO,CACL,UAAU,CAAE,OAAqC,CACjD,KAAK,CFtC6B,IAAM,CEuC1C,UAAO,CACL,UAAU,CAAE,OAAqC,CACjD,OAAO,CAAE,CAAC,CACZ,WAAQ,CACN,UAAU,CAAE,6EAA+E,CAC3F,OAAO,CAAE,iBAA6F,CACxG,YAAS,CACP,KAAK,CF9C6B,IAAM,CE+C1C,aAAU,CACR,gBAAgB,CAAE,IAAI,CACtB,MAAM,CAAE,2DAA2D,CACnE,MAAM,CAAE,iBAAmB,CAC3B,OAAO,CAAE,GAAG,CACZ,MAAM,CAAE,WAAW,CACnB,UAAU,CAAE,IAAI,CAEpB,aAAa,CACX,gBAAgB,CAAE,IAAI,CACtB,MAAM,CAAE,2DAA2D,CACnE,MAAM,CAAE,iBAAmB,CAC3B,OAAO,CAAE,GAAG,CACZ,MAAM,CAAE,WAAW,CACnB,UAAU,CAAE,IAAI,CAChB,4DAA0B,CACxB,gBAAgB,CAAE,IAAI,CACtB,MAAM,CAAE,2DAA2D,CACnE,MAAM,CAAE,iBAAmB,CAC3B,OAAO,CAAE,GAAI,CACb,MAAM,CAAE,WAAW,CACnB,UAAU,CAAE,IAAI,CAGpB,sBAAsB,CACpB,OAAO,CAAE,CAAC,CACV,MAAM,CAAE,CAAC,CAEX,UAAU,CACR,SAAS,CAAE,GAAG,CAEhB,SAAS,CACP,gBAAgB,CAAE,kBAAgB,CAClC,eAAO,CACL,gBAAgB,CAAE,kBAA6B,CAEnD,YAAY,CACV,gBAAgB,CAAE,kBAA2C,CAC7D,KAAK,CAAE,kBAAsB,CAC7B,kBAAO,CACL,gBAAgB,CAAE,kBAAuD,CACzE,KAAK,CF5F6B,OAAW,CE6F/C,oBAAS,CACP,KAAK,CAAE,kBAAsB,CAEjC,YAAY,CACV,gBAAgB,CAAE,kBAAiB,CACnC,kBAAO,CACL,gBAAgB,CAAE,eAA6B,CAEnD,WAAW,CACT,gBAAgB,CAAE,kBAAe,CACjC,iBAAO,CACL,gBAAgB,CAAE,kBAA4B,CAElD,YAAY,CACV,gBAAgB,CAAE,kBAAkB,CACpC,kBAAO,CACL,gBAAgB,CAAE,kBAA+B,CACrD,WAAW,CACT,gBAAgB,CJvIoB,IAAI,CIwIxC,iBAAO,CACL,gBAAgB,CAAE,kBAAoC,CAE1D,SAAS,CACP,gBAAgB,CAAE,sBAAsB,CACxC,KAAK,CF3G+B,OAAK,CE4GzC,UAAU,CAAE,IAAI,CAChB,YAAY,CAAE,sBAAsB,CACpC,eAAO,CACL,gBAAgB,CAAE,sBAAsB,CACxC,KAAK,CAAE,kBAAoC,CAC3C,UAAU,CAAE,IAAI,CAClB,gBAAQ,CACN,gBAAgB,CAAE,sBAAsB,CACxC,KAAK,CAAE,
kBAAoC,CAC3C,UAAU,CAAE,IAAI,CAClB,iBAAS,CACP,KAAK,CFtH6B,OAAO,CEwH7C,mCAAoC,CAClC,cAAc,CAAE,MAAM,CAExB,aAAa,CACX,aAAa,CJ1IuB,IAAI,ChBuExC,KAAK,CAAE,CAAC,CACR,wCAAS,CAEP,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,EAAE,CACb,mBAAO,CACL,KAAK,CAAE,IAAI,CqB3Ff,YAAY,CACV,QAAQ,CAAE,QAAQ,CAClB,OAAO,CAAE,YAAY,CAIvB,qCAAqC,CACnC,OAAO,CAAE,KAAK,CAChB,iBAAiB,CACf,QAAQ,CAAE,QAAQ,CAClB,IAAI,CAAE,CAAC,CACP,OAAO,CAAE,IAAI,CACb,KAAK,CAAE,IAAI,CACX,GAAG,CAAE,IAAI,CACT,SAAS,CAAE,IAAI,CACf,UAAU,CHW0B,OAAyB,CGV7D,OAAO,CLmD6B,GAAG,CKlDvC,MAAM,CAAE,iBAAgC,CACxC,UAAU,CAAE,2BAA0B,CACtC,OAAO,CAAE,IAAqB,CAC9B,sBAAQ,CACN,OAAO,CAAE,KAAK,CACd,KAAK,CAAE,IAAI,CACX,KAAK,CHN6B,OAAW,CGO7C,WAAW,CAAE,MAAM,CACnB,SAAS,CAAE,GAAG,CACd,OAAO,CAAE,MAAuB,CAChC,MAAM,CAAE,OAAO,CACf,4BAAO,CACL,UAAU,CHFsB,OAAK,CGGrC,KAAK,CHT2B,IAAM,CGU1C,4BAAY,CACV,UAAU,CAAE,iBAAgC,CAC5C,MAAM,CAAE,KAAuB,CACjC,2BAAW,CACT,cAAc,CAAE,IAAqB,CACrC,gDAAoB,CAClB,KAAK,CAAE,IAAI,CACf,mCAAmB,CACjB,UAAU,CAAE,OAA4B,CACxC,cAAc,CAAE,SAAS,CACzB,WAAW,CAAE,GAAG,CAChB,SAAS,CAAE,GAAG,CACd,yCAAO,CACL,UAAU,CAAE,OAA4B,CAC1C,wCAAI,CACF,KAAK,CHzB2B,IAAM,CG2B5C,6CAA6C,CAC3C,MAAM,CAAE,IAAI,CACZ,GAAG,CAAE,IAAI,CACT,IAAI,CAAE,IAAI,CACV,KAAK,CAAE,CAAC,CAGR,iDAAiB,CACf,UAAU,CH9BwB,OAAyB,CG+B3D,UAAU,CAAE,GAAG,CACjB,mDAAmB,CACjB,OAAO,CAAE,QAA2C,CACpD,yDAAO,CACL,UAAU,CHlCsB,OAAK,CGmCrC,KAAK,CHzC2B,IAAM,CG2C5C,+CAA+C,CAC7C,KAAK,CAAE,CAAC,CACR,IAAI,CAAE,IAAI,CACV,UAAU,CAAE,KAAK,CAGjB,yBAAQ,CACN,OAAO,CAAE,GAAG,CACZ,aAAa,CAAE,iBAA0B,CACzC,WAAW,CAAE,qBAAqB,CAClC,YAAY,CAAE,qBAAqB,CACnC,QAAQ,CAAE,QAAQ,CAClB,OAAO,CAAE,KAAK,CACd,GAAG,CAAE,IAAI,CACT,IAAI,CAAE,GAAG,CACT,WAAW,CAAE,IAAI,CACnB,gDAA+B,CAC7B,IAAI,CAAE,IAAI,CCtEZ,uBAAM,CACJ,OAAO,CAAE,KAAK,CAEhB,gIAA+C,CAC7C,OAAO,CAAE,YAAY,CACrB,QAAQ,CAAE,MAAM,CAChB,KAAK,CAAE,CAAC,CACR,cAAc,CAAE,MAAM,CAItB,wCAAO,CACL,OAAO,CAAE,YAAY,CACrB,cAAc,CAAE,MAAM,CACtB,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,YAA+C,CACvD,KAAK,CAAE,IAAI,CACf,4BAAW,CACT,KAAK,CAAE,IAAI,CACX,kCAAK,CACH,OAAO,CAAE,KAAK,CAChB,mCAAM,CACJ,UAAU,CAAE,GAAqB,CAEvC,QAAQ,CACN,MAAM,CAAE,CAAC,CACT,MAAM,CAAE,CAAC,CACT,OAAO,CAAE,CAAC,CACZ,MAAM,CACJ,OAAO,CAAE,KAAK,CACd,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,CAAC,CACT,OAAO,CAAE,CAAC,CACV,WAAW,CAAE,MAAM,CACnB,aAAa,CN/BuB,IAAI,CMgCxC,SAAS,CAAE,IAAI,CACf,YAAY,CAAE,IAAI,CACpB,KAAK,CACH,OAAO,CAAE,KAAK,CACd,MAAM,CAAE,aAAa,CACrB,KAAK,CNR+B,IAAU,CMS9C,SAAS,CAAE,GAAG,CAEhB,qBAAuB,CACrB,SAAS,CAAE,IAAI,CACf,MAAM,CAAE,CAAC,CACT,cAAc,CAAE,QAAQ,CACxB,eAAe,CAAE,MAAM,CAGzB,iBAAiB,CACf,aAAa,CNhDuB,IAAI,ChBuExC,KAAK,CAAE,CAAC,CuBrGR,SAAS,CCCC,IAAQ,CDChB,WAAI,CAAE,IAAI,CACV,YAAK,CAAE,IAAI,CvBkGb,KAAK,CAAE,CAAC,CACR,gDAAS,CAEP,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,EAAE,CACb,uBAAO,CACL,KAAK,CAAE,IAAI,CALb,gDAAS,CAEP,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,EAAE,CACb,uBAAO,CACL,KAAK,CAAE,IAAI,CsBzBf,uDAAyD,CACvD,OAAO,CAAE,IAAI,CACb,KAAK,CN/C+B,OAAI,CMoDxC,mGAA+C,CAC7C,cAAc,CAAE,IAAqB,CACrC,wHAAM,CACJ,KAAK,CAAE,IAAI,CAEX,0tEAAqP,CACnP,KAAK,CAAE,IAAI,CACnB,+BAA+B,CGlF3B,KAAK,CAAE,IAAsB,CAG3B,OAAO,CAAE,KAAK,CAed,YAAoB,CAAE,QAA+B,CACrD,KAAK,CAAE,IAAuC,CCnB5C,YAAoB,CAAE,CAAC,CDqBzB,0CAAa,CACX,YAAoB,CAAE,CAAC,CHgE/B,iCAAiC,CGtF7B,KAAK,CAAE,IAAsB,CAG3B,OAAO,CAAE,KAAK,CAed,YAAoB,CAAE,QAA+B,CACrD,KAAK,CAAE,SAAuC,CAE9C,4CAAa,CACX,YAAoB,CAAE,CAAC,CCA7B,iDAAwB,CACtB,YAAoB,CAAE,CAAC,CAEvB,mDAA0B,CACxB,KAAK,CALY,IAAkC,CJqEzD,iCAAiC,CG1F7B,KAAK,CAAE,IAAsB,CAG3B,OAAO,CAAE,KAAK,CAed,YAAoB,CAAE,QAA+B,CACrD,KAAK,CAAE,SAAuC,CAE9C,4CAAa,CACX,YAAoB,CAAE,CAAC,CCA7B,iDAAwB,CACtB,YAAoB,CAAE,CAAC,CAEvB,mDAA0B,CACxB,KAAK,CALY,IAAkC,CJ0EzD,uDAAuD,CACrD,MAAM,CAAE,SAA2B,CACnC,SAAS,CAAE,GAAG,CAEhB,oBAAoB,CAClB,OAAO,CAAE,YAAY,CACrB,MAAM,CAAE,SAA2B,CACnC,SAAS,
CAAE,GAAG,CAOZ,osBAAqP,CACnP,KAAK,CAAE,IAAI,CAIjB,uBAAuB,CACrB,OAAO,CAAE,YAAY,CACrB,YAAY,CAAE,KAAK,CACnB,KAAK,CAAE,IAAI,CACX,cAAc,CAAE,MAAM,CACtB,SAAS,CAAE,GAAG,CAEhB,gBAAgB,CACd,OAAO,CAAE,KAAK,CACd,KAAK,CN7H+B,IAAI,CM8HxC,SAAS,CAAE,GAAG,CACd,UAAU,CAAE,OAAO,CACnB,UAAU,CAAE,MAAM,CAClB,kBAAC,CACC,SAAS,CAAE,OAAO,CAClB,UAAU,CAAE,MAAM,CAClB,aAAa,CAAE,GAAqB,CACtC,6BAAY,CACV,aAAa,CAAE,CAAC,CA4DpB,KAAK,CACH,WAAW,CAAE,MAAM,CAGnB,6DAAmD,CACjD,kBAAkB,CAAE,MAAM,CAC1B,MAAM,CAAE,OAAO,CACf,WAAW,CJ7JuB,uDAA2D,CI8J7F,SAAS,CAAE,OAAO,CACpB,gSAAqP,CACnP,kBAAkB,CAAE,IAAI,CACxB,OAAO,CAAE,GAAqB,CAC9B,OAAO,CAAE,YAAY,CACrB,MAAM,CAAE,cAA6B,CACrC,SAAS,CAAE,GAAG,CACd,WAAW,CJrKuB,uDAA2D,CIsK7F,UAAU,CAAE,oBAAmC,CAC/C,aAAa,CAAE,CAAC,CxBxNZ,kBAAoB,CAAE,kBAAM,CAK5B,eAAiB,CAAE,kBAAM,CAezB,UAAY,CAAE,kBAAM,CwBuM1B,4BAAwB,CACtB,OAAO,CAAE,eAAkB,CAC7B,eAAW,CACT,MAAM,CAAE,OAAO,CACjB,0CAAmC,CxB/N7B,kBAAoB,CwBgOZ,UAAU,CxB3NlB,eAAiB,CwB2NT,UAAU,CxB5MlB,UAAY,CwB4MJ,UAAU,CACtB,OAAO,CAAE,CAAC,CACV,YAAY,CAAE,OAAO,CACrB,OAAO,CAAE,IAAI,CACb,MAAM,CAAE,IAAI,CACd,oBAAgB,CxBrOV,kBAAoB,CwBsOZ,UAAU,CxBjOlB,eAAiB,CwBiOT,UAAU,CxBlNlB,UAAY,CwBkNJ,UAAU,CACtB,kGAA6D,CAC3D,kBAAkB,CAAE,IAAI,CAC5B,oXAAyU,CACvU,OAAO,CAAE,CAAC,CACV,OAAO,CAAE,cAAc,CACvB,YAAY,CNxLsB,IAAU,CMyL9C,oBAAgB,CACd,YAAY,CAAE,eAA8B,CAC9C,+EAAqE,CACnE,OAAO,CAAE,gBAAsB,CAC/B,OAAO,CAAE,gBAAgB,CAC3B,4aAAiY,CAC/X,MAAM,CAAE,WAAW,CACnB,gBAAgB,CAAE,OAAmC,CAEzD,+DAAiE,CAC/D,KAAK,CNzN+B,OAAI,CM0NxC,MAAM,CAAE,iBAAc,CACxB,iFAAmF,CACjF,YAAY,CN5NwB,OAAI,CM8NxC,yHAA+G,CAC7G,aAAa,CN/NqB,OAAI,CMiO1C,oBAAoB,CAClB,OAAO,CAAE,IAAqB,CAC9B,SAAS,CAAE,IAAI,CAKjB,QAAQ,CACN,QAAQ,CAAE,IAAI,CACd,cAAc,CAAE,GAAG,CACnB,KAAK,CAAE,IAAI,CACX,WAAW,CJzNyB,uDAA2D,CI0NjG,eAAgB,CACd,OAAO,CAAE,WAAgB,CACzB,OAAO,CAAE,YAAY,CACrB,MAAM,CAAE,cAA6B,CACrC,SAAS,CAAE,GAAG,CACd,UAAU,CAAE,oBAAmC,CxBhRzC,kBAAoB,CAAE,kBAAM,CAK5B,eAAiB,CAAE,kBAAM,CAezB,UAAY,CAAE,kBAAM,CwB+P5B,MAAM,CACJ,MAAM,CAAE,cAA6B,CACrC,gBAAgB,CJvPoB,IAAM,CIwP1C,gBAAW,CACT,MAAM,CAAE,IAAI,CAChB,2BAA4B,CAC1B,OAAO,CAAE,CAAC,CACZ,uFAA2F,CACzF,MAAM,CAAE,WAAW,CACnB,gBAAgB,CAAE,OAAmC,CAKrD,8DAAuD,CACrD,MAAM,CAAE,WAAW,CACvB,sBAAuB,CACrB,MAAM,CAAE,KAAuB,CAE/B,KAAK,CJ5Q+B,OAAW,CI6Q/C,OAAO,CAAE,KAAK,CACd,kCAAK,CACH,cAAc,CAAE,QAAQ,CAI5B,uBAAuB,CACrB,OAAO,CAAE,YAAY,CACrB,QAAQ,CAAE,MAAM,CAChB,KAAK,CAAE,CAAC,CACR,cAAc,CAAE,MAAM,CAuBxB,iCAAkC,CAChC,WAAW,CAAE,MAAM,CACnB,OAAO,CAAE,GAAqB,CAC9B,qEAAiB,CACf,WAAW,CAAE,IAAI,CACjB,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,YAAY,CACrB,SAAS,CAAE,GAAG,CACd,gBAAgB,CJtSkB,OAAmB,CIuSrD,MAAM,CAAE,cAA6B,CACrC,KAAK,CN7U6B,IAAI,CM+U1C,kCAAkC,CAChC,WAAW,CAAE,CAAC,CAChB,kCAAkC,CAChC,YAAY,CAAE,CAAC,CAcjB,UAAU,CACR,QAAQ,CAAE,QAAQ,CAClB,OAAO,CAAE,KAAK,CACd,MAAM,CNjV8B,IAAI,CMkVxC,UAAU,CAAE,IAAqB,CACjC,MAAM,CAAE,OAAO,CACf,iBAAQ,CACN,QAAQ,CAAE,QAAQ,CAClB,OAAO,CAAE,EAAE,CACX,OAAO,CAAE,KAAK,CACd,IAAI,CAAE,CAAC,CACP,GAAG,CAAE,CAAC,CACN,KAAK,CAAE,IAAuB,CAC9B,MAAM,CAAE,IAAqB,CAC7B,aAAa,CAAE,GAAG,CAClB,UAAU,CN9WwB,IAAI,ClBNlC,kBAAoB,CAAE,oBAAM,CAK5B,eAAiB,CAAE,oBAAM,CAezB,UAAY,CAAE,oBAAM,CwBkW1B,gBAAO,CACL,QAAQ,CAAE,QAAQ,CAClB,OAAO,CAAE,EAAE,CACX,OAAO,CAAE,KAAK,CACd,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,IAAI,CACZ,aAAa,CAAE,GAAG,CAClB,UAAU,CNxXwB,IAAI,CMyXtC,IAAI,CAAE,IAAI,CACV,GAAG,CAAE,IAAI,CxB/XL,kBAAoB,CAAE,oBAAM,CAK5B,eAAiB,CAAE,oBAAM,CAezB,UAAY,CAAE,oBAAM,CwB6W1B,eAAI,CACF,QAAQ,CAAE,QAAQ,CAClB,IAAI,CAAE,IAAqB,CAC3B,OAAO,CAAE,KAAK,CACd,SAAS,CAAE,IAAI,CACf,KAAK,CNhY6B,IAAI,CMiYtC,WAAW,CAAE,CAAC,CAEhB,wBAAQ,CACN,UAAU,CAAE,OAAmB,CACjC,uBAAO,CACL,IAAI,CNrX8B,IAAI,CMsXtC,UAAU,CJ3YwB,OAAM,CI6Y5C,mBAAmB,CACjB,MAAM,CAAE,WAAW,CACnB,OAAO,CAAE,GAAE,CAgDX,wGAAyB,CACvB,KAAK,CNpa6B,OAAI,CMsatC
,81BAAqP,CACnP,MAAM,CAAE,iBAAc,CAC1B,iDAAQ,CACN,MAAM,CAAE,iBAAc,CAE1B,mBAAmB,CACjB,WAAW,CAAE,MAAM,CACnB,qCAAiB,CACf,OAAO,CAAE,WAAgB,CACzB,OAAO,CAAE,YAAY,CACrB,SAAS,CAAE,GAAG,CAClB,gEAAgE,CAC9D,KAAK,CJ9c+B,OAAM,CIid5C,+DAA+D,CAC7D,KAAK,CNtb+B,OAAI,CMyb1C,gEAAgE,CAC9D,KAAK,CNzc+B,OAAO,CM4c7C,6DAA6D,CAC3D,KAAK,CJxb+B,OAAK,CI8b3C,UAAU,CxBleF,iBAAoB,CAAE,aAAM,CAK5B,cAAiB,CAAE,aAAM,CAKzB,aAAgB,CAAE,aAAM,CAKxB,YAAe,CAAE,aAAM,CAKvB,SAAY,CAAE,aAAM,CwBgd5B,WAAW,CxBpeH,iBAAoB,CAAE,cAAM,CAK5B,cAAiB,CAAE,cAAM,CAKzB,aAAgB,CAAE,cAAM,CAKxB,YAAe,CAAE,cAAM,CAKvB,SAAY,CAAE,cAAM,CwBkd5B,WAAW,CxBteH,iBAAoB,CAAE,cAAM,CAK5B,cAAiB,CAAE,cAAM,CAKzB,aAAgB,CAAE,cAAM,CAKxB,YAAe,CAAE,cAAM,CAKvB,SAAY,CAAE,cAAM,CwBod5B,OAAO,CxBxeC,iBAAoB,CAAE,UAAM,CAK5B,cAAiB,CAAE,UAAM,CAKzB,aAAgB,CAAE,UAAM,CAKxB,YAAe,CAAE,UAAM,CAKvB,SAAY,CAAE,UAAM,CwBsd1B,iBAAW,CxB1eL,iBAAoB,CwB2eL,wBAAwB,CxBtevC,cAAiB,CwBseF,wBAAwB,CxBjevC,aAAgB,CwBieD,wBAAwB,CxB5dvC,YAAe,CwB4dA,wBAAwB,CxBvdvC,SAAY,CwBudG,wBAAwB,CAC7C,kBAAY,CxB5eN,iBAAoB,CwB6eL,yBAAyB,CxBxexC,cAAiB,CwBweF,yBAAyB,CxBnexC,aAAgB,CwBmeD,yBAAyB,CxB9dxC,YAAe,CwB8dA,yBAAyB,CxBzdxC,SAAY,CwBydG,yBAAyB,CAC9C,kBAAY,CxB9eN,iBAAoB,CwB+eL,yBAAyB,CxB1exC,cAAiB,CwB0eF,yBAAyB,CxBrexC,aAAgB,CwBqeD,yBAAyB,CxBhexC,YAAe,CwBgeA,yBAAyB,CxB3dxC,SAAY,CwB2dG,yBAAyB,CAEhD,yCAAyC,CAErC,8BAAqB,CACnB,MAAM,CAAE,SAAS,CAEjB,8ZAAqP,CACnP,aAAa,CAAE,KAAK,CACpB,OAAO,CAAE,KAAK,CAClB,cAAK,CACH,aAAa,CAAE,KAAK,CACpB,OAAO,CAAE,KAAK,CAEhB,kYAAqO,CACnO,aAAa,CAAE,CAAC,CAElB,wCAAuB,CACrB,aAAa,CAAE,KAAK,CACpB,UAAU,CAAE,IAAI,CAChB,OAAO,CAAE,KAAK,CACd,KAAK,CAAE,IAAI,CACb,4BAAW,CACT,MAAM,CAAE,WAAW,CACvB,iEAAmE,CACjE,OAAO,CAAE,KAAK,CACd,SAAS,CAAE,GAAG,CACd,OAAO,CAAE,KAAuB,EHnfhC,oCAAsB,CQhC1B,YAAY,CAER,OAAO,CAAE,IAAI,ER8Bb,oCAAsB,CQ5B1B,YAAY,CAER,OAAO,CAAE,IAAI,EAEjB,WAAW,CACT,KAAK,CAAE,IAAI,CAEb,YAAY,CACV,KAAK,CAAE,KAAK,CAEd,WAAW,CACT,KAAK,CAAE,IAAI,CC4Cb,mEAAS,CACP,eAAe,CAAE,QAAQ,CACzB,cAAc,CAAE,CAAC,CACjB,WAAW,CAAE,IAAI,CACjB,aAAa,CZ/BuB,IAAI,CYgCxC,2FAAO,CACL,KAAK,CAAE,IAAI,CACX,IAAI,CAAE,6BAA8B,CACpC,OAAO,CAAE,KAAK,CACd,UAAU,CAAE,MAAM,CACpB,yJAAM,CACJ,SAAS,CZjByB,GAAG,CYkBrC,MAAM,CAAE,CAAC,CACT,QAAQ,CAAE,OAAO,CACjB,OAAO,CZnB2B,QAAmC,CYoBvE,iOAA8B,CAC5B,iBAAiB,CAAE,CAAC,CACtB,qFAAK,CACH,KAAK,CAAE,IAAI,CACX,UAAU,CAAE,IAAI,CAChB,cAAc,CAAE,MAAM,CACtB,WAAW,CAAE,MAAM,CACnB,8FAAE,CACA,WAAW,CZnDqB,IAAI,CYoDpC,aAAa,CAAE,iBAA6B,CAChD,4EAAE,CACA,gBAAgB,CAAE,WAAW,CAC7B,cAAc,CAAE,MAAM,CAE1B,kFAAc,CACZ,WAAW,CAAE,IAAuB,CACpC,mHAAY,CACV,aAAa,CAAE,CAAC,CACpB,4HAA4B,CAC1B,KAAK,CAAE,EAAE,CACT,aAAa,CAAE,CAAC,CAChB,uXAA0C,CACxC,MAAM,CAAE,CAAC,CAEb,mBAAmB,CACjB,KAAK,CV9D+B,IAAY,CU+DhD,SAAS,CAAE,GAAG,CAChB,kBAAkB,CAChB,KAAK,CVjE+B,IAAY,CUkEhD,SAAS,CAAE,GAAG,CAIhB,2HAAyD,CACvD,gBAAgB,CVzDoB,OAAmB,CU2DzD,gBAAgB,CACd,gBAAgB,CV5DoB,OAAmB,CUiEzD,kDAAsB,CACpB,MAAM,CAAE,iBAA6B,CACrC,wDAAE,CACA,aAAa,CAAE,iBAA6B,CAC5C,WAAW,CAAE,iBAA6B,CAC5C,gGAAwB,CACtB,mBAAmB,CAAE,CAAC,CAE1B,kBAAkB,CAChB,MAAM,CAAE,iBAA6B,CAGrC,0BAAE,CACA,aAAa,CAAE,iBAA6B,CAC9C,8CAAwB,CACtB,mBAAmB,CAAE,CAAC,CAGxB,2CAAwB,CACtB,mBAAmB,CAAE,CAAC,CACxB,+CAAM,CACJ,YAAY,CAAE,SAAS,CACvB,aAAa,CAAE,iBAA6B,CAC9C,2CAAwB,CACtB,mBAAmB,CAAE,CAAC,CAG1B,oBAAoB,CAClB,aAAa,CZhHuB,IAAI,CYiHxC,SAAS,CAAE,IAAI,CACf,QAAQ,CAAE,IAAI,CACd,0BAAK,CACH,aAAa,CAAE,YAAY,CAC3B,2DAAM,CACJ,WAAW,CAAE,MAAM,CCzIzB,CAAC,CACC,KAAK,CX+B+B,OAAK,CW9BzC,eAAe,CAAE,IAAI,CACrB,MAAM,CAAE,OAAO,CACf,OAAO,CACL,KAAK,CbgD6B,OAAwB,Ca/C5D,SAAS,CACP,KAAK,CX0B6B,OAAO,CWA7C,IAAI,CACF,MAAM,CAAE,IAAI,CACZ,UAAU,CAAE,MAAM,CAEpB,IAAI,CACF,WAAW,CXOyB,uDAA2D,CWN/F,WAAW,CAAE,MAAM,CACnB,KAAK,CXlB+B,OAAW,CWmB/C,UAAU,CAAE,IAAI,CAChB,UAAU,CAAE,MAAM,CAClB,UAAU,CbnD0
B,OAAO,CaqD7C,aAAa,CACX,UAAU,CAAE,IAAI,CAElB,eAAe,CACb,UAAU,CAAE,MAAM,CAEpB,cAAc,CACZ,UAAU,CAAE,KAAK,CAEnB,cAAc,CACZ,SAAS,CAAE,IAAI,CAEjB,eAAe,CACb,SAAS,CAAE,IAAI,CAEjB,oBAAqB,CACnB,SAAS,CAAE,GAAG,CAEhB,eAAe,CACb,eAAe,CAAE,YAAY,CAE/B,gBAAgB,CACd,KAAK,CAAE,kBAAkB,CAC3B,uBAAuB,CACrB,KAAK,CAAE,kBAAgC,CACzC,aAAa,CACX,KAAK,CAAE,kBAAgB,CACzB,oBAAoB,CAClB,KAAK,CAAE,kBAA8B,CACvC,gBAAgB,CACd,KAAK,CAAE,kBAAiB,CAC1B,uBAAuB,CACrB,KAAK,CAAE,kBAA+B,CACxC,eAAe,CACb,KAAK,CAAE,kBAAe,CACxB,sBAAsB,CACpB,KAAK,CAAE,kBAA6B,CACtC,gBAAgB,CACd,KAAK,CAAE,kBAAsB,CAC/B,uBAAuB,CACrB,KAAK,CAAE,kBAAoC,CAkB7C,gEAAyB,CACvB,UAAU,CAAE,CAAC,CACb,WAAW,CAAE,GAAG,CAChB,WAAW,CX5DyB,0DAA8D,CW8DpG,CAAC,CACC,WAAW,Cb1FyB,IAAI,Ca2FxC,MAAM,CAAE,CAAC,CACT,SAAS,Cb/F2B,IAAI,CagGxC,aAAa,Cb7FuB,IAAI,Ca+F1C,EAAE,CACA,SAAS,CAAE,IAAI,CAEjB,0CAAE,CACA,SAAS,CAAE,IAAI,CAEjB,EAAE,CACA,SAAS,CAAE,IAAI,CAEjB,EAAE,CACA,SAAS,CAAE,IAAI,CAEjB,EAAE,CACA,SAAS,CAAE,IAAI,CAEjB,EAAE,CACA,SAAS,CAAE,IAAI,CAEjB,EAAE,CACA,OAAO,CAAE,KAAK,CACd,MAAM,CAAE,GAAG,CACX,MAAM,CAAE,CAAC,CACT,UAAU,CAAE,iBAA6B,CACzC,MAAM,CAAE,MAAmB,CAC3B,OAAO,CAAE,CAAC,CAEZ,sCAAI,CACF,WAAW,CAAE,MAAM,CACnB,SAAS,CAAE,IAAI,CACf,UAAU,CXrH0B,IAAM,CWsH1C,MAAM,CAAE,iBAAiC,CACzC,SAAS,CAAE,GAAG,CACd,OAAO,CAAE,KAAK,CACd,WAAW,CXnGyB,wMAAoN,CWoGxP,KAAK,Cb1H+B,OAAI,Ca2HxC,UAAU,CAAE,IAAI,CAChB,0CAAY,CACV,SAAS,CAAE,GAAG,CAmClB,wFAAmB,CACjB,UAAU,CAAE,IAAI,CAChB,WAAW,CbzKyB,IAAI,Ca0KxC,aAAa,Cb1KuB,IAAI,Ca2KxC,oGAAE,CACA,UAAU,CAAE,IAAI,CAChB,WAAW,Cb7KuB,IAAI,Ca8KtC,wJAAY,CACV,aAAa,CAAE,CAAC,CAClB,gHAAE,CACA,aAAa,CAAE,CAAC,CAClB,gHAAE,CACA,UAAU,CAAE,MAAM,CAClB,4HAAE,CACA,UAAU,CAAE,MAAM,CACtB,4HAAK,CACH,UAAU,CAAE,OAAO,CAEzB,iFAAsB,CACpB,UAAU,CAAE,OAAO,CACnB,WAAW,Cb3LyB,IAAI,Ca4LxC,aAAa,Cb5LuB,IAAI,Ca6LxC,6FAAE,CACA,UAAU,CAAE,OAAO,CACnB,WAAW,Cb/LuB,IAAI,CagMtC,iJAAY,CACV,aAAa,CAAE,CAAC,CAClB,yGAAE,CACA,aAAa,CAAE,CAAC,CAChB,qHAAE,CACA,UAAU,CAAE,IAAI,CCrOxB,kBAAkB,CAChB,MAAM,CAAE,iBAA6B,CACrC,aAAa,CAAE,IAAI,CACnB,OAAO,Cd6B6B,IAAI,Cc5BxC,WAAW,CAAE,IAAqB,CAClC,WAAW,CAAE,GAAG,CAChB,UAAU,CZiC0B,IAAM,CYhC1C,QAAQ,CAAE,QAAQ,CAClB,wBAAO,CACL,OAAO,CAAE,SAAS,CAClB,QAAQ,CAAE,QAAQ,CAClB,GAAG,CAAE,GAAG,CACR,IAAI,CAAE,GAAG,CACT,UAAU,CZiCwB,OAAO,CYhCzC,KAAK,CAAE,IAAoB,CAC3B,OAAO,CAAE,QAA2C,CACtD,2CAA0B,CACxB,MAAM,CAAE,iBAA6B,CACrC,aAAa,CdcqB,IAAI,CcZ1C,+GAAmC,CACjC,MAAM,CAAE,iBAA6B,CACrC,OAAO,CAAE,GAAG,CACZ,UAAU,CAAE,IAAI,CAChB,UAAU,CZe0B,IAAM,CYb1C,MAAM,CAAE,YAAyB,CACjC,gLAAuB,CACrB,MAAM,CAAE,IAAI,CACZ,UAAU,CAAE,IAAI,CAChB,MAAM,CAAE,CAAC,CAEb,+BAA+B,CAC7B,KAAK,CAAE,IAAI,CACb,cAAc,CACZ,YAAY,CAAE,iBAA0C,CACxD,MAAM,CAAE,CAAC,CACT,OAAO,CAAE,SAA2C,CACpD,WAAW,CZuByB,wMAAoN,CYtBxP,SAAS,CAAE,IAAI,CACf,WAAW,CAAE,GAAG,CAChB,KAAK,CdI+B,OAAwB,CcH9D,2BAA2B,CACzB,WAAW,CAAE,GAAG,CAChB,MAAM,CAAE,CAAC,CACT,OAAO,CAAE,SAA2C,CACpD,WAAW,CZeyB,wMAAoN,CYdxP,SAAS,CAAE,IAAI,CACf,WAAW,CAAE,GAAG,CAChB,OAAO,CAAE,KAAK,CACd,QAAQ,CAAE,IAAI,CACd,KAAK,CZhB+B,OAAW,CYoBjD,YAAY,CACV,2IAAgE,CAC9D,WAAW,CAAE,QAAQ,ECzDzB,IAAI,CACF,gBAAgB,CAAE,IAAO,CACzB,MAAM,CAAE,OAAO,CACf,OAAO,CAAE,MAAM,CACf,OAAO,CAAE,KAAK,CAChB,EAAE,CACA,KAAK,CAAE,IAAO,CACd,UAAU,CAAE,MAAM,CACpB,IAAI,CACF,KAAK,CAAE,OAAO,CACd,gBAAgB,CAAE,OAAO,CAC3B,EAAE,CACA,WAAW,CAAE,IAAI,CACnB,EAAE,CACA,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,UAAU,CAAE,MAAM,CACpB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,UAAU,CAAE,MAAM,CACpB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,WAAW,CAAE,IAAI,CACjB,UAAU,CAAE,MAAM,CACpB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,gBAAgB,CAAE,IAAO,CAC3B,MAAM,CACJ,KAAK,CAAE,IAAO,CACd,gBAAgB,CAAE,IAAO,CAC3B,GAAG,CACD,UAAU,CAAE,MAAM,CACpB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,C
ACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,gBAAgB,CAAE,IAAO,CAC3B,MAAM,CACJ,KAAK,CAAE,IAAO,CACd,gBAAgB,CAAE,IAAO,CAC3B,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,KAAK,CAAE,MAAO,CACd,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,WAAW,CAAE,IAAI,CACnB,EAAE,CACA,KAAK,CAAE,IAAO,CAChB,EAAE,CACA,KAAK,CAAE,IAAO,CAChB,EAAE,CACA,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAI,CACb,GAAG,CACD,KAAK,CAAE,OAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,KAAK,CAAE,IAAI,CACb,GAAG,CACD,KAAK,CAAE,MAAM,CACf,GAAG,CACD,KAAK,CAAE,IAAO,CACd,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,KAAK,CAAE,IAAO,CACd,WAAW,CAAE,IAAI,CACnB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAI,CACb,GAAG,CACD,KAAK,CAAE,IAAI,CACb,GAAG,CACD,WAAW,CAAE,IAAI,CACnB,EAAE,CACA,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,OAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,OAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAI,CACb,GAAG,CACD,KAAK,CAAE,IAAI,CACb,GAAG,CACD,KAAK,CAAE,IAAI,CACb,GAAG,CACD,KAAK,CAAE,IAAO,CAChB,GAAG,CACD,KAAK,CAAE,IAAI,CACX,gBAAgB,CAAE,OAAO,CCjJ3B,kBAAkB,CAChB,OAAO,CAAE,YAAY,CACrB,uCAAsB,CACpB,KAAK,CAAE,KAAK,CACd,oBAAC,CACC,OAAO,CAAE,YAAY,CACrB,OAAO,CAAE,GAAG,CACZ,gCAAa,CACX,YAAY,CAAE,CAAC,CACnB,6FAAI,CACF,OAAO,CAAE,GAAG,CACZ,MAAM,CAAE,IAAI,CACZ,UAAU,CAAE,IAAI,CAChB,qHAAS,CACP,KAAK,CdqB2B,OAAW,CcpBjD,qBAAqB,CACnB,aAAa,CAAE,CAAC,CAChB,KAAK,CdqB+B,OAAW,CcpB/C,SAAS,CAAE,GAAG,CACd,OAAO,CAAE,YAAY,CbanB,oCAAsB,CaTxB,qBAAqB,CACnB,OAAO,CAAE,IAAI,CACf,uCAAuC,CACrC,OAAO,CAAE,IAAI,EAEjB,YAAY,CACV,uCAAuC,CACrC,OAAO,CAAE,IAAI,EC9BjB,SAAS,CACP,QAAQ,CAAE,KAAK,CACf,GAAG,CCAO,OAAO,CDGjB,gBAAO,CACL,eAAe,CAAE,IAAI,CAEzB,cAAc,CjC+FZ,KAAK,CAAE,CAAC,CACR,0CAAS,CAEP,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,EAAE,CACb,oBAAO,CACL,KAAK,CAAE,IAAI,CiCnGb,mCAAM,CACJ,OAAO,CAAE,YAAY,CACvB,uBAAQ,CACN,UAAU,CAAE,qBAAoB,CAEhC,6BAAa,CACX,WAAW,CAAE,iBAAyB,CACxC,8BAAc,CACZ,YAAY,CAAE,iBAAyB,CAC3C,gBAAC,CACC,MAAM,CAAE,IAAmB,CAC3B,OAAO,CAAE,YAAY,CACrB,WAAW,CAAE,IAAmB,CAChC,OAAO,CAAE,MAAiB,CAE9B,iBAAiB,CACf,KAAK,CjBuD+B,KAAK,CiBtDzC,oDAAiB,CACf,MAAM,CAAE,IAAmB,CAC3B,OAAO,CAAE,YAAY,CACrB,WAAW,CAAE,IAAmB,CAChC,OAAO,CAAE,SAAS,CAClB,aAAa,CAAE,CAAC,CAChB,OAAO,CAAE,KAAK,CACd,WAAW,CAAE,IAAI,CACjB,cAAc,CAAE,SAAS,CACzB,SAAS,CAAE,GAAG,CACd,KAAK,CfR6B,OAAwB,CeS1D,WAAW,CAAE,MAAM,CAErB,oBAAE,CACA,aAAa,CAAE,CAAC,CAEhB,+BAAY,CACV,UAAU,CAAE,iBAAyB,CACvC,kCAAe,CACb,aAAa,CAAE,iBAAyB,CAC1C,4BAAS,CACP,UAAU,CAAE,OAA4C,CACxD,8BAAC,CACC,KAAK,CfbyB,IAAY,Cec1C,YAAY,CAAE,iBAAsD,CACpE,OAAO,CAAE,eAAyB,CAClC,oCAAO,CACL,UAAU,CAAE,OAA4C,CAC9D,mGAAI,CACF,MAAM,CAAE,IAAI,CACZ,UAAU,CAAE,OAAO,CACnB,KAAK,CAAE,OAAO,CACd,YAAY,CAAE,CAAC,CACf,aAAa,CAAE,CAAC,CAElB,wCAAmB,CACjB,OAAO,CAAE,KAAK,CACd,KAAK,CAAE,IAAI,CACX,WAAW,CAAE,MAAM,CAGnB,SAAS,CAAE,IAAI,CACf,WAAW,CAAE,KAAK,CAClB,KAAK,CAAE,OAA8B,CAGzC,wDAAuB,CACrB,KAAK,CfvC6B,OAAW,CewC7C,OAAO,CAAE,eAAmB,CAC5B,WAAW,CAAE,IAAI,CACjB,QAAQ,CAAE,QAAQ,CAClB,UAAU,CflCwB,OAAyB,CemC3D,MAAM,CAAE,IAAI,CACZ,aAAa,CAAE,iBAAsD
,CACrE,UAAU,CAAE,iBAAsD,CAClE,YAAY,CAAE,YAAY,CAE1B,oEAAO,CACL,UAAU,CfzCsB,OAAyB,Ce0CzD,4GAAmB,CACjB,KAAK,CflDyB,IAAY,CemD9C,gGAAmB,CAGjB,OAAO,CAAE,KAAK,CACd,SAAS,CAAE,IAAI,CACf,WAAW,CAAE,KAAK,CAClB,KAAK,CAAE,IAA8B,CAIvC,iHAAI,CACF,OAAO,CAAE,IAAI,CACf,iIAAc,CACZ,OAAO,CAAE,KAAK,CAGd,yCAAG,CACD,UAAU,CAAE,OAA4C,CACxD,OAAO,CAAE,eAAyB,CACpC,uDAAiB,CACf,OAAO,CAAE,KAAK,CACd,UAAU,CAAE,OAA4C,CACxD,OAAO,CAAE,eAAyB,CACtC,2DAA2B,CACzB,KAAK,Cf3E2B,IAAY,Ce4E9C,mDAAmB,CACjB,KAAK,CAAE,OAA4C,CACvD,+BAAa,CACX,SAAS,CAAE,IAAI,CAEb,yCAAG,CACD,UAAU,CAAE,OAA4C,CACxD,OAAO,CAAE,eAAyB,CACpC,uDAAiB,CACf,OAAO,CAAE,KAAK,CACd,UAAU,CAAE,OAA4C,CACxD,OAAO,CAAE,eAAyB,CAClC,UAAU,CAAE,IAAI,CAChB,aAAa,CAAE,IAAI,CACvB,2DAA2B,CACzB,KAAK,Cf3F2B,IAAY,Ce4F9C,mDAAmB,CACjB,KAAK,CAAE,OAA4C,CACvD,+BAAa,CACX,SAAS,CAAE,IAAI,CAEjB,+BAAa,CACX,OAAO,CAAE,KAAK,CAChB,uBAAK,CACH,aAAa,CAAE,CAAC,CAChB,OAAO,CAAE,IAAI,CAEb,kCAAK,CACH,OAAO,CAAE,KAAK,CAClB,4BAAU,CACR,aAAa,CAAE,CAAC,CAChB,KAAK,Cf1G6B,OAAW,Ce2G7C,WAAW,CAAE,MAAM,CACrB,mBAAC,CACC,OAAO,CAAE,YAAY,CACrB,WAAW,CAAE,IAAI,CACjB,OAAO,CAAE,eAAmB,CAC5B,OAAO,CAAE,KAAK,CACd,QAAQ,CAAE,QAAQ,CAClB,SAAS,CAAE,GAAG,CACd,KAAK,CfnH6B,OAAW,CeoH7C,yBAAO,CACL,gBAAgB,CAAE,OAAoC,CACtD,MAAM,CAAE,OAAO,CACf,6CAAmB,CACjB,KAAK,CfxHyB,OAAW,CeyH7C,0BAAQ,CACN,gBAAgB,CfnHgB,OAAK,CeoHrC,MAAM,CAAE,OAAO,CACf,KAAK,Cf3H2B,IAAM,Ce4HtC,8CAAmB,CACjB,KAAK,Cf7HyB,IAAM,Ce+H5C,mBAAmB,CACjB,OAAO,CAAE,KAAK,CACd,KAAK,CjBvF+B,KAAK,CiBwFzC,OAAO,CAAE,MAAW,CACpB,aAAa,CAAE,MAAW,CAC1B,OAAO,CjBrF6B,GAAG,CiBsFvC,gBAAgB,Cf/HoB,OAAK,CegIzC,UAAU,CAAE,MAAM,CAClB,OAAO,CAAE,MAAW,CACpB,OAAO,CAAE,KAAK,CACd,KAAK,CfpI+B,OAAyB,CeqI7D,aAAa,CAAE,MAAW,CAC1B,oCAAgB,CACd,KAAK,CAAE,IAAI,CACX,aAAa,CAAE,IAAI,CACnB,OAAO,CAAE,QAAQ,CACjB,YAAY,CAAE,OAAuB,CACvC,uBAAG,CACD,OAAO,CAAE,KAAK,CACd,MAAM,CAAE,qBAA0B,CAClC,MAAM,CAAE,IAAI,CACZ,KAAK,CAAE,IAAI,CACX,gBAAgB,Cf/IkB,OAAK,CegJvC,OAAO,CAAE,GAAG,CACZ,aAAa,CAAE,IAAI,CACrB,wDAAqB,CACnB,KAAK,CfpJ6B,OAAyB,CeqJ3D,SAAS,CAAE,IAAI,CACf,WAAW,CAAE,IAAI,CACjB,OAAO,CAAE,YAAY,CACrB,OAAO,CAAE,OAA2C,CACpD,aAAa,CAAE,MAAW,CAE1B,oEAAO,CACL,UAAU,CAAE,qBAAoB,CAClC,0EAAQ,CACN,OAAO,CAAE,KAAK,CACd,MAAM,CAAE,MAAM,CACd,MAAM,CAAE,IAAI,CACZ,KAAK,CAAE,IAAI,CACX,aAAa,CAAE,CAAC,CAChB,SAAS,CAAE,IAAI,CACf,UAAU,CAAE,WAAa,CAEzB,oFAAQ,CACN,UAAU,CAAE,KAAM,CACxB,+BAAa,CACX,UAAU,CAAE,QAAkB,CAC9B,aAAa,CAAE,MAAW,CAC1B,WAAW,CAAE,MAAM,CACnB,KAAK,CAAE,qBAAoB,CAI7B,gCAAM,CACJ,KAAK,CfhL6B,OAAK,CeiLzC,2BAAC,CACC,KAAK,CfzL6B,OAAW,Ce0L7C,iCAAO,CACL,gBAAgB,CfpLgB,OAAK,CeqLrC,KAAK,Cf3L2B,IAAM,Ce6L5C,gBAAgB,CnC3NR,kBAAoB,CAAE,eAAM,CAK5B,eAAiB,CAAE,eAAM,CAezB,UAAY,CAAE,eAAM,CmCyM1B,QAAQ,CAAE,QAAQ,CAClB,OAAO,CAAE,CAAC,CACV,KAAK,CAAE,IAAI,CACX,OAAO,CAAE,CAAC,CACV,4BAAa,CACX,IAAI,CAAE,CAAC,CACP,KAAK,CAAE,IAAI,CACX,OAAO,CAAE,CAAC,CACZ,0BAAW,CACT,KAAK,CAAE,IAAI,CACX,IAAI,CAAE,KAAK,CACX,OAAO,CAAE,CAAC,CACZ,2BAAY,CACV,KAAK,CAAE,KAAK,CACZ,IAAI,CAAE,IAAI,CACV,OAAO,CAAE,CAAC,CAGd,gBAAgB,CACd,UAAU,CAAE,qBAAuC,CACnD,gBAAgB,CAAE,2uCAA2uC,CAC7vC,eAAe,CAAE,SAAsB,CAEzC,gBAAgB,CACd,QAAQ,CAAE,QAAQ,CAClB,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,IAAI,CAEd,YAAY,CACV,QAAQ,CAAE,KAAK,CACf,GAAG,CAAE,CAAC,CACN,MAAM,CAAE,CAAC,CACT,IAAI,CAAE,CAAC,CACP,cAAc,CAAE,GAAG,CACnB,KAAK,CjBvL+B,KAAK,CiBwLzC,UAAU,CAAE,MAAM,CAClB,UAAU,CAAE,MAAM,CAClB,UAAU,CAAE,IAAI,CAChB,UAAU,CflO0B,OAAsB,CemO1D,OAAO,CjBvL6B,GAAG,CiByLzC,eAAe,CACb,KAAK,CAAE,KAAyB,CAChC,QAAQ,CAAE,QAAQ,CAClB,UAAU,CAAE,MAAM,CAClB,UAAU,CAAE,MAAM,CAClB,MAAM,CAAE,IAAI,CAEd,WAAW,CACT,OAAO,CAAE,IAAI,CACb,UAAU,Cf3O0B,OAAK,Ce4OzC,KAAK,CflP+B,IAAM,CemP1C,OAAO,CAAE,cAAuB,CAChC,QAAQ,CAAE,QAAQ,CAClB,WAAW,CAAE,IAAI,CACjB,UAAU,CAAE,MAAM,CAClB,SAAS,CAAE,IAAI,CjCvLf,KAAK,
CAAE,CAAC,CACR,oCAAS,CAEP,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,EAAE,CACb,iBAAO,CACL,KAAK,CAAE,IAAI,CiCmLb,aAAC,CACC,KAAK,Cf1P6B,IAAM,Ce2PxC,WAAW,CAAE,IAAI,CAEnB,eAAG,CACD,YAAY,CAAE,IAAqB,CACnC,MAAM,CAAE,IAAI,CACZ,KAAK,CAAE,IAAI,CACX,gBAAgB,Cf3PkB,OAAK,Ce4PvC,OAAO,CAAE,GAAG,CACZ,aAAa,CAAE,IAAI,CACrB,aAAC,CACC,SAAS,CAAE,IAAI,CACf,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,OAAO,CACf,WAAW,CAAE,OAAO,CAExB,oBAAoB,CAClB,WAAW,CjBjOyB,KAAK,CiBkOzC,UAAU,CfvQ0B,OAAyB,CewQ7D,UAAU,CAAE,IAAI,CAElB,eAAe,CACb,OAAO,CAAE,eAAmB,CAC5B,MAAM,CAAE,IAAI,CACZ,SAAS,CAAE,KAAK,CAChB,MAAM,CAAE,IAAI,CAEd,aAAa,CACX,QAAQ,CAAE,KAAK,CACf,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,IAAI,CACZ,UAAU,CAAE,eAAc,CAC1B,OAAO,CAAE,IAAI,CACb,OAAO,CAAE,GAAkB,CAC3B,gBAAI,CACF,OAAO,CAAE,KAAK,CAClB,MAAM,CACJ,KAAK,CfjS+B,IAAY,CekShD,QAAC,CACC,aAAa,CAAE,IAAqB,CACtC,6FAAgB,CACd,OAAO,CAAE,GAAG,CACZ,WAAW,Cf9QuB,wMAAoN,Ce+QtP,SAAS,CAAE,GAAG,CACd,UAAU,CAAE,IAAI,CAChB,MAAM,CAAE,IAAI,CACZ,KAAK,Cf1S6B,IAAY,Ce4SlD,mBAAmB,CjC1OjB,KAAK,CAAE,CAAC,CiC2OR,oDAAiB,CACf,KAAK,CAAE,IAAI,CjC3Ob,oDAAS,CAEP,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,EAAE,CACb,yBAAO,CACL,KAAK,CAAE,IAAI,CiCyOf,wBAAwB,CACtB,UAAU,CAAE,IAAI,CjChPhB,KAAK,CAAE,CAAC,CACR,8DAAS,CAEP,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,EAAE,CACb,8BAAO,CACL,KAAK,CAAE,IAAI,CiC8Ob,0BAAU,CACR,aAAa,CjB5TqB,IAAI,CiB6TtC,aAAa,CAAE,iBAA6B,CAC5C,cAAc,CjB9ToB,IAAI,CiB+TxC,sCAAsB,CACpB,UAAU,CAAE,iBAA6B,CACzC,WAAW,CjBjUuB,IAAI,CiBkUxC,4BAAY,CACV,SAAS,CAAE,IAAI,CACf,aAAa,CAAE,IAAqB,CACpC,OAAO,CAAE,YAAY,CACvB,wBAAQ,CACN,KAAK,CflU6B,IAAY,CemU9C,SAAS,CAAE,GAAG,CdxUd,oCAAsB,Cc4UxB,gBAAgB,CACd,UAAU,CfjUwB,OAAyB,CekU7D,WAAW,CACT,OAAO,CAAE,KAAK,CAChB,YAAY,CAER,IAAI,CAAE,MAAmB,CAG3B,kBAAO,CACL,KAAK,CAAE,GAAG,CACV,IAAI,CAAE,CAAC,CACX,eAAe,CACb,KAAK,CAAE,IAAI,CACb,mBAAmB,CACjB,KAAK,CAAE,IAAI,CACb,yBAAyB,CACvB,KAAK,CAAE,IAAI,CACb,oBAAoB,CAClB,WAAW,CAAE,CAAC,CACd,oCAAe,CACb,OAAO,CC/XD,OAAO,CDgYf,0BAAO,CACL,QAAQ,CAAE,KAAK,CACf,SAAS,CAAE,IAAI,CACf,IAAI,CAAE,GAAG,CACT,GAAG,CAAE,CAAC,CACN,MAAM,CAAE,IAAI,CACZ,QAAQ,CAAE,MAAM,EdxWlB,qCAAsB,Cc2WxB,oBAAoB,CAClB,UAAU,CAAE,gBAAe,CAC7B,eAAe,CACb,MAAM,CAAE,CAAC,CACT,UAAU,CfnWwB,OAAyB,EeqW/D,YAAY,CACV,iCAAmC,CACjC,OAAO,CAAE,IAAI,CACf,oBAAoB,CAClB,WAAW,CAAE,CAAC,EErZlB,aAAa,CACX,QAAQ,CAAE,KAAK,CACf,MAAM,CAAE,CAAC,CACT,IAAI,CAAE,CAAC,CACP,KAAK,CnB6E+B,KAAK,CmB5EzC,KAAK,CjBuC+B,OAAyB,CiBtC7D,UAAU,CAAE,OAAkC,CAC9C,UAAU,CAAE,kBAAiC,CAC7C,WAAW,CjBkDyB,uDAA2D,CiBjD/F,OAAO,CnB+E6B,GAAG,CmB9EvC,eAAC,CACC,KAAK,CjBkC6B,OAAK,CiBjCvC,eAAe,CAAE,IAAI,CACvB,8BAAgB,CACd,OAAO,CAAE,IAAI,CACf,kCAAoB,CAClB,OAAO,CAAE,IAAqB,CAC9B,gBAAgB,CAAE,OAAkC,CACpD,OAAO,CAAE,KAAK,CACd,UAAU,CAAE,KAAK,CACjB,SAAS,CAAE,GAAG,CACd,MAAM,CAAE,OAAO,CACf,KAAK,CjBX6B,OAAM,ClB4F1C,KAAK,CAAE,CAAC,CACR,kFAAS,CAEP,OAAO,CAAE,KAAK,CACd,OAAO,CAAE,EAAE,CACb,wCAAO,CACL,KAAK,CAAE,IAAI,CmCrFX,uqDAAG,CACD,KAAK,CjBmB2B,OAAyB,CiBlB3D,yFAAQ,CACN,KAAK,CAAE,IAAI,CACb,6CAAU,CACR,KAAK,CAAE,IAAI,CACb,kDAAiB,CACf,gBAAgB,CnBQgB,OAAI,CmBPpC,KAAK,CjBO2B,IAAM,CiBNxC,yDAAwB,CACtB,gBAAgB,CjBsBgB,OAAO,CiBrBvC,KAAK,CnBzB2B,IAAI,CmB0BxC,0CAA8B,CAC5B,OAAO,CAAE,KAAK,CAChB,iCAAmB,CACjB,SAAS,CAAE,GAAG,CACd,OAAO,CAAE,IAAqB,CAC9B,KAAK,CjBJ6B,IAAY,CiBK9C,OAAO,CAAE,IAAI,CACb,oCAAE,CACA,OAAO,CAAE,KAAK,CACd,MAAM,CAAE,GAAG,CACX,MAAM,CAAE,CAAC,CACT,MAAM,CAAE,MAAM,CACd,OAAO,CAAE,CAAC,CACV,UAAU,CAAE,iBAA6C,CAC3D,oCAAE,CACA,OAAO,CAAE,YAAY,CACrB,MAAM,CAAE,CAAC,CACT,sCAAC,CACC,OAAO,CAAE,YAAY,CACrB,OAAO,CAAE,GAAqB,CAC9B,KAAK,CjBZyB,OAAyB,CiBa7D,uBAAW,CACT,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,IAAI,CACZ,KAAK,CAAE,IAAI,CACX,IAAI,CAAE,IAAI,CACV,MAAM,CAAE,IAAI,CACZ,SAAS,CnBkByB,KAAK,CmBjBvC,kCAAU,CACR,KAAK,CAAE,IAAI,CACb,mEAAQ,CACN,KAAK,CAAE,IAAI,CA
Cb,qDAA+B,CAC7B,UAAU,CAAE,KAAK,CACjB,+HAAQ,CACN,KAAK,CAAE,IAAI,CACb,gEAAU,CACR,KAAK,CAAE,IAAI,CACf,4CAAoB,CAClB,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,IAAI,CACZ,WAAW,CAAE,IAAI,CACjB,OAAO,CAAE,KAAuB,CAChC,OAAO,CAAE,KAAK,CACd,UAAU,CAAE,MAAM,ChBhDpB,oCAAsB,CgBmDxB,aAAa,CACX,KAAK,CAAE,GAAG,CACV,OAAO,CAAE,IAAI,CACb,mBAAO,CACL,OAAO,CAAE,KAAK,ECtElB,gBAAG,CACD,SAAS,CAAE,IAAI,CACf,MAAM,CAAE,eAAe,CAEzB,uDAAkC,CAChC,WAAW,CAAE,MAAM,CAErB,uBAAU,CACR,aAAa,CpBOqB,IAAI,CoBNtC,iCAAS,CACP,UAAU,CAAE,MAAM,CAEtB,oCAAuB,CACrB,UAAU,CAAE,MAAM,CAGpB,qDAAoC,CAClC,aAAa,CpBFqB,IAAI,CoBaxC,uBAAU,CACR,WAAW,CpBduB,IAAI,CoBetC,WAAW,CpBfuB,IAAI,CoBgBtC,aAAa,CpBhBqB,IAAI,CoBsBtC,kTAAK,CACH,aAAa,CAAE,CAAC,CAKlB,qCAAQ,CACN,YAAY,CAAE,GAAG,CAUrB,8BAAiB,CACf,YAAY,CAAE,eAAc,CAC5B,mEAAM,CACJ,UAAU,CAAE,sBAAsB,CAClC,YAAY,CAAE,0BAAyB,CAG3C,0EAAiD,CAC/C,UAAU,CAAE,WAAW,CACzB,0EAAiD,CAC/C,UAAU,CAAE,WAAW,CAGzB,qDAA4B,CAC1B,aAAa,CAAE,IAAqB,CACtC,wBAAW,CACT,WAAW,CpBvDuB,IAAI,CoB0DxC,yBAAY,CACV,WAAW,CAAE,IAAI,CACjB,aAAa,CAAE,IAAqB,CACtC,yBAAY,CACV,KAAK,ClB3D6B,OAAW,CkB4D/C,yBAAY,CACV,KAAK,CAAE,KAAK,CACZ,MAAM,CAAE,iBAA2C,CACrD,wBAAW,CACT,KAAK,CAAE,IAAI,CACX,MAAM,CAAE,iBAA2C,CACrD,0BAAa,CACX,MAAM,CAAE,IAAI,CACZ,OAAO,CAAE,KAAK,CAMd,6RAAW,CACT,OAAO,CAAE,IAAI,CACb,UAAU,CAAE,MAAM,CAClB,SAAS,CAAE,IAAI,CAEf,mVAAO,CACL,UAAU,CAAE,OAAO,CACnB,OAAO,CAAE,GAAO,CAChB,WAAW,CAAE,WAAW,CACxB,OAAO,CAAE,YAAY,CACzB,mVAAmB,CACjB,OAAO,CAAE,YAAY,CAEzB,sBAAS,CACP,UAAU,CAAE,MAAM,CAGpB,qBAAQ,CACN,KAAK,CAAE,KAAK,CACZ,KAAK,CAAE,GAAG,CACV,OAAO,CAAE,KAAK,CACd,MAAM,CAAE,aAAuC,CAC/C,OAAO,CpBnG2B,IAAI,CoBoGtC,UAAU,ClBjFwB,OAAmB,CkBkFrD,MAAM,CAAE,iBAA+B,CAEvC,yEAAS,CACP,SAAS,CAAE,GAAG,CAChB,2BAAK,CACH,aAAa,CAAE,CAAC,CAClB,oCAAc,CACZ,OAAO,CAAE,KAAK,CACd,WAAW,ClBlFqB,0DAA8D,CkBmF9F,WAAW,CAAE,IAAI,CACjB,UAAU,ClB1FsB,OAAmB,CkB2FnD,OAAO,CAAE,QAA2C,CACpD,MAAM,CAAE,KAAkB,CAC1B,aAAa,CpBlHmB,IAAI,CoBmHpC,SAAS,CAAE,IAAI,CAEnB,yBAAY,CACV,UAAU,ClB9FwB,OAAO,CkB+FzC,OAAO,CAAE,YAAY,CACrB,WAAW,CAAE,IAAI,CACjB,OAAO,CAAE,KAAuB,CAGlC,iEAAwC,CACtC,cAAc,CAAE,KAAK,CACrB,SAAS,CAAE,GAAG,CAIhB,yEAAgD,CAC9C,UAAU,CAAE,IAAI,CAChB,MAAM,CAAE,IAAI,CACZ,KAAK,ClBhI6B,IAAY,CkBiI9C,+JAAM,CACJ,MAAM,CAAE,IAAI,CACZ,gBAAgB,CAAE,sBAAsB,CACxC,WAAW,CAAE,MAAM,CACrB,2FAAQ,CACN,YAAY,CAAE,CAAC,CACf,aAAa,CAAE,CAAC,CAChB,cAAc,CAAE,GAAG,CACrB,mKAAI,CACF,KAAK,ClBnJ2B,IAAK,CkB0JzC,6BAAgB,CAEd,MAAM,CAAE,IAAI,CACZ,gCAAE,CACA,MAAM,CAAE,IAAI,CACd,uCAAW,CACT,OAAO,CAAE,YAAY,CACvB,yCAAW,CACT,aAAa,CAAE,IAAI,CACnB,UAAU,CAAE,IAAI,CAChB,WAAW,CAAE,MAAM,CACrB,yCAAW,CACT,UAAU,CAAE,IAAI,CAGpB,iDAAQ,CAEN,KAAK,CpB7L6B,IAAI,CoB8LtC,OAAO,CAAE,OAAO,CAChB,wHAAO,CACL,SAAS,CAAE,eAAe,CAC1B,WAAW,CAAE,MAAM,CAErB,yEAAS,CACP,KAAK,CpBvK2B,OAAI,CoBwKtC,wHAAW,CACT,WAAW,CAAE,IAAI,CACjB,KAAK,ClB9K2B,OAAW,CkBgL/C,uDAAY,CACV,KAAK,ClBvK6B,OAAK,CkBwKzC,eAAE,CACA,aAAa,CpBtLqB,IAAI,CoBuLtC,kBAAE,CACA,WAAW,CAAE,IAAI,CAEnB,6EAAgB,CACd,aAAa,CAAE,eAAgC,CAEjD,kBAAE,CACA,MAAM,CAAE,aAA4C,CAMxD,8BAAiB,CACf,aAAa,CpBrMqB,IAAI,CoBuMtC,iCAAE,CACA,OAAO,CAAE,KAAK,CACd,MAAM,CAAE,KAAuB,CAC/B,SAAS,CAAE,GAAG,CACd,WAAW,CAAE,MAAM,CACnB,UAAU,CAAE,OAA0B,CACtC,KAAK,ClBhM2B,OAAK,CkBiMrC,UAAU,CAAE,iBAAoC,CAChD,OAAO,CAAE,GAAqB,CAC9B,QAAQ,CAAE,QAAQ,CAClB,wCAAQ,CACN,KAAK,CAAE,OAA0B,CACnC,6CAAW,CACT,KAAK,ClBjNyB,OAAW,CkBkNzC,SAAS,CAAE,eAAe,CAE9B,oCAAK,CACH,aAAa,CAAE,GAAqB,CACpC,MAAM,CAAE,IAAI,CACZ,WAAW,CAAE,cAAuB,CACpC,UAAU,CAAE,OAAa,CACzB,KAAK,ClBhO2B,IAAK,CkBiOrC,gDAAW,CACT,KAAK,ClB3NyB,OAAW,CkB4NzC,SAAS,CAAE,eAAe,CAC9B,6CAAc,CACZ,UAAU,CAAE,CAAC,CAEf,uGAAQ,CACN,WAAW,CAAE,IAAI,CACjB,oRAA2B,CACzB,gBAAgB,CAAE,WAAW,CAC7B,MAAM,CAAE,IAAI,CACZ,OAAO,CAAE,CAAC,CACV,SAAS,CAAE,eAAe,CAC5B,kIAAU,CACR,WAAW,CAAE,I
AAI,CAErB,wCAAS,CACP,OAAO,CAAE,YAAY,CACrB,OAAO,CAAE,KAAK,CACd,KAAK,CpBtQ2B,IAAI,CoBuQpC,WAAW,CAAE,IAAI,CACnB,wCAAS,CACP,OAAO,CAAE,YAAY,CACrB,aAAa,CAAE,GAAG,CAEtB,uDAA8B,CAC5B,OAAO,CAAE,YAAY,CACrB,KAAK,ClB7Q6B,OAAM,CkB8QxC,SAAS,CAAE,GAAG,CACd,YAAY,CpB1PsB,IAAI,CoB2PxC,2BAAc,CACZ,OAAO,CAAE,KAAK,CACd,KAAK,CAAE,KAAK,CACd,qBAAQ,CACN,aAAa,CAAE,IAAI,CACnB,WAAW,CAAE,IAAI,CAEnB,mDAAa,CACX,UAAU,CAAE,OAAO,CACnB,OAAO,CAAE,OAAO,CAChB,WAAW,CAAE,MAAM,CACnB,WAAW,CAAE,OAAO,CACpB,SAAS,CAAE,OAAO,CAClB,KAAK,CAAE,OAAO,CACd,MAAM,CAAE,OAAO,CACf,WAAW,CAAE,OAAO,CACpB,qFAAgB,CACd,sBAAsB,CAAE,oBAAoB,CAG5C,mGAAQ,CACN,YAAY,CAAE,GAAG,CACvB,sBAAS,CACP,MAAM,CAAE,iBAAuC,CAC/C,UAAU,CAAE,OAA6B,CACzC,SAAS,CAAE,GAAG,CACd,WAAW,CAAE,GAAG,CAChB,aAAa,CAAE,GAAqB,CACpC,OAAO,CAAE,SAA4C,CACrD,MAAM,CAAE,QAA2B,CACrC,6BAAgB,CACd,UAAU,CAAE,MAAM,CjB1RlB,oCAAsB,CiBgStB,qBAAQ,CACN,KAAK,CAAE,IAAI,ECjUjB,wBAAwB,CACtB,KAAK,CnBkC+B,OAAW,CmBhCjD,KAAK,CACH,UAAU,CAAE,MAAM,YCHlB,WAAW,CAAE,aAAa,CAC1B,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,GAAG,CAChB,GAAG,CAAE,0GAA4G,YAGjH,WAAW,CAAE,aAAa,CAC1B,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,GAAG,CAChB,GAAG,CAAE,yGAA2G,YAGhH,WAAW,CAAE,MAAM,CACnB,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,GAAG,CAChB,GAAG,CAAE,6FAA+F,YAGpG,WAAW,CAAE,MAAM,CACnB,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,GAAG,CAChB,GAAG,CAAE,oFAAsF,YAG3F,WAAW,CAAE,MAAM,CACnB,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,GAAG,CAChB,GAAG,CAAE,0FAA4F,YAGjG,WAAW,CAAE,MAAM,CACnB,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,GAAG,CAChB,GAAG,CAAE,uGAAyG,YAG9G,WAAW,CAAE,aAAa,CAC1B,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,GAAG,CAChB,GAAG,CAAE,gHAAkH,YAGvH,WAAW,CAAE,aAAa,CAC1B,UAAU,CAAE,MAAM,CAClB,WAAW,CAAE,GAAG,CAChB,GAAG,CAAE,uGAAyG", -"sources": ["../../../bower_components/neat/app/assets/stylesheets/grid/_grid.scss","../../../bower_components/bourbon/dist/addons/_prefixer.scss","../../../bower_components/wyrm/sass/wyrm_core/_reset.sass","../../../bower_components/wyrm/sass/wyrm_core/_mixin.sass","../../../bower_components/font-awesome/scss/font-awesome.scss","../../../bower_components/font-awesome/scss/_path.scss","../../../bower_components/font-awesome/scss/_core.scss","../../../bower_components/font-awesome/scss/_larger.scss","../../../bower_components/font-awesome/scss/_fixed-width.scss","../../../bower_components/font-awesome/scss/_list.scss","../../../bower_components/font-awesome/scss/_variables.scss","../../../bower_components/font-awesome/scss/_bordered-pulled.scss","../../../bower_components/font-awesome/scss/_animated.scss","../../../bower_components/font-awesome/scss/_rotated-flipped.scss","../../../bower_components/font-awesome/scss/_mixins.scss","../../../bower_components/font-awesome/scss/_stacked.scss","../../../bower_components/font-awesome/scss/_icons.scss","../../../bower_components/font-awesome/scss/_screen-reader.scss","../../../bower_components/wyrm/sass/wyrm_core/_font_icon_defaults.sass","../../../bower_components/wyrm/sass/wyrm_core/_wy_variables.sass","../../../bower_components/wyrm/sass/wyrm_core/_alert.sass","../../../sass/_theme_variables.sass","../../../bower_components/neat/app/assets/stylesheets/grid/_media.scss","../../../bower_components/wyrm/sass/wyrm_core/_button.sass","../../../bower_components/wyrm/sass/wyrm_core/_dropdown.sass","../../../bower_components/wyrm/sass/wyrm_core/_form.sass","../../../bower_components/neat/app/assets/stylesheets/grid/_outer-container.scss","../../../bower_components/neat/app/assets/stylesheets/settings/_grid.scss","../../../bower_components/neat/app/assets/stylesheets/grid/_span-columns.scss","../../../bower_components/wyrm/sass/wyrm_core/_neat_extra.sass","../../../b
ower_components/wyrm/sass/wyrm_core/_generic.sass","../../../bower_components/wyrm/sass/wyrm_core/_table.sass","../../../bower_components/wyrm/sass/wyrm_core/_type.sass","../../../bower_components/wyrm/sass/wyrm_addons/pygments/_pygments.sass","../../../bower_components/wyrm/sass/wyrm_addons/pygments/_pygments_light.sass","../../../sass/_theme_breadcrumbs.sass","../../../sass/_theme_layout.sass","../../../bower_components/neat/app/assets/stylesheets/grid/_private.scss","../../../sass/_theme_badge.sass","../../../sass/_theme_rst.sass","../../../sass/_theme_mathjax.sass","../../../sass/_theme_font_local.sass"], -"names": [], -"file": "theme.css" -} diff --git a/docs/_static/doctools.js b/docs/_static/doctools.js deleted file mode 100644 index 816349563588..000000000000 --- a/docs/_static/doctools.js +++ /dev/null @@ -1,287 +0,0 @@ -/* - * doctools.js - * ~~~~~~~~~~~ - * - * Sphinx JavaScript utilities for all documentation. - * - * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * - */ - -/** - * select a different prefix for underscore - */ -$u = _.noConflict(); - -/** - * make the code below compatible with browsers without - * an installed firebug like debugger -if (!window.console || !console.firebug) { - var names = ["log", "debug", "info", "warn", "error", "assert", "dir", - "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", - "profile", "profileEnd"]; - window.console = {}; - for (var i = 0; i < names.length; ++i) - window.console[names[i]] = function() {}; -} - */ - -/** - * small helper function to urldecode strings - */ -jQuery.urldecode = function(x) { - return decodeURIComponent(x).replace(/\+/g, ' '); -}; - -/** - * small helper function to urlencode strings - */ -jQuery.urlencode = encodeURIComponent; - -/** - * This function returns the parsed url parameters of the - * current request. Multiple values per key are supported, - * it will always return arrays of strings for the value parts. - */ -jQuery.getQueryParameters = function(s) { - if (typeof s == 'undefined') - s = document.location.search; - var parts = s.substr(s.indexOf('?') + 1).split('&'); - var result = {}; - for (var i = 0; i < parts.length; i++) { - var tmp = parts[i].split('=', 2); - var key = jQuery.urldecode(tmp[0]); - var value = jQuery.urldecode(tmp[1]); - if (key in result) - result[key].push(value); - else - result[key] = [value]; - } - return result; -}; - -/** - * highlight a given string on a jquery object by wrapping it in - * span elements with the given class name. - */ -jQuery.fn.highlightText = function(text, className) { - function highlight(node) { - if (node.nodeType == 3) { - var val = node.nodeValue; - var pos = val.toLowerCase().indexOf(text); - if (pos >= 0 && !jQuery(node.parentNode).hasClass(className)) { - var span = document.createElement("span"); - span.className = className; - span.appendChild(document.createTextNode(val.substr(pos, text.length))); - node.parentNode.insertBefore(span, node.parentNode.insertBefore( - document.createTextNode(val.substr(pos + text.length)), - node.nextSibling)); - node.nodeValue = val.substr(0, pos); - } - } - else if (!jQuery(node).is("button, select, textarea")) { - jQuery.each(node.childNodes, function() { - highlight(this); - }); - } - } - return this.each(function() { - highlight(this); - }); -}; - -/* - * backward compatibility for jQuery.browser - * This will be supported until firefox bug is fixed. 
- */ -if (!jQuery.browser) { - jQuery.uaMatch = function(ua) { - ua = ua.toLowerCase(); - - var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || - /(webkit)[ \/]([\w.]+)/.exec(ua) || - /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || - /(msie) ([\w.]+)/.exec(ua) || - ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || - []; - - return { - browser: match[ 1 ] || "", - version: match[ 2 ] || "0" - }; - }; - jQuery.browser = {}; - jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; -} - -/** - * Small JavaScript module for the documentation. - */ -var Documentation = { - - init : function() { - this.fixFirefoxAnchorBug(); - this.highlightSearchWords(); - this.initIndexTable(); - - }, - - /** - * i18n support - */ - TRANSLATIONS : {}, - PLURAL_EXPR : function(n) { return n == 1 ? 0 : 1; }, - LOCALE : 'unknown', - - // gettext and ngettext don't access this so that the functions - // can safely bound to a different name (_ = Documentation.gettext) - gettext : function(string) { - var translated = Documentation.TRANSLATIONS[string]; - if (typeof translated == 'undefined') - return string; - return (typeof translated == 'string') ? translated : translated[0]; - }, - - ngettext : function(singular, plural, n) { - var translated = Documentation.TRANSLATIONS[singular]; - if (typeof translated == 'undefined') - return (n == 1) ? singular : plural; - return translated[Documentation.PLURALEXPR(n)]; - }, - - addTranslations : function(catalog) { - for (var key in catalog.messages) - this.TRANSLATIONS[key] = catalog.messages[key]; - this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); - this.LOCALE = catalog.locale; - }, - - /** - * add context elements like header anchor links - */ - addContextElements : function() { - $('div[id] > :header:first').each(function() { - $('\u00B6'). - attr('href', '#' + this.id). - attr('title', _('Permalink to this headline')). - appendTo(this); - }); - $('dt[id]').each(function() { - $('\u00B6'). - attr('href', '#' + this.id). - attr('title', _('Permalink to this definition')). - appendTo(this); - }); - }, - - /** - * workaround a firefox stupidity - * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 - */ - fixFirefoxAnchorBug : function() { - if (document.location.hash) - window.setTimeout(function() { - document.location.href += ''; - }, 10); - }, - - /** - * highlight the search words provided in the url in the text - */ - highlightSearchWords : function() { - var params = $.getQueryParameters(); - var terms = (params.highlight) ? 
params.highlight[0].split(/\s+/) : []; - if (terms.length) { - var body = $('div.body'); - if (!body.length) { - body = $('body'); - } - window.setTimeout(function() { - $.each(terms, function() { - body.highlightText(this.toLowerCase(), 'highlighted'); - }); - }, 10); - $('') - .appendTo($('#searchbox')); - } - }, - - /** - * init the domain index toggle buttons - */ - initIndexTable : function() { - var togglers = $('img.toggler').click(function() { - var src = $(this).attr('src'); - var idnum = $(this).attr('id').substr(7); - $('tr.cg-' + idnum).toggle(); - if (src.substr(-9) == 'minus.png') - $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); - else - $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); - }).css('display', ''); - if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { - togglers.click(); - } - }, - - /** - * helper function to hide the search marks again - */ - hideSearchWords : function() { - $('#searchbox .highlight-link').fadeOut(300); - $('span.highlighted').removeClass('highlighted'); - }, - - /** - * make the url absolute - */ - makeURL : function(relativeURL) { - return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; - }, - - /** - * get the current relative url - */ - getCurrentURL : function() { - var path = document.location.pathname; - var parts = path.split(/\//); - $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { - if (this == '..') - parts.pop(); - }); - var url = parts.join('/'); - return path.substring(url.lastIndexOf('/') + 1, path.length - 1); - }, - - initOnKeyListeners: function() { - $(document).keyup(function(event) { - var activeElementType = document.activeElement.tagName; - // don't navigate when in search box or textarea - if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT') { - switch (event.keyCode) { - case 37: // left - var prevHref = $('link[rel="prev"]').prop('href'); - if (prevHref) { - window.location.href = prevHref; - return false; - } - case 39: // right - var nextHref = $('link[rel="next"]').prop('href'); - if (nextHref) { - window.location.href = nextHref; - return false; - } - } - } - }); - } -}; - -// quick alias for translations -_ = Documentation.gettext; - -$(document).ready(function() { - Documentation.init(); -}); \ No newline at end of file diff --git a/docs/_static/down-pressed.png b/docs/_static/down-pressed.png deleted file mode 100644 index 5756c8cad885..000000000000 Binary files a/docs/_static/down-pressed.png and /dev/null differ diff --git a/docs/_static/down.png b/docs/_static/down.png deleted file mode 100644 index 1b3bdad2ceff..000000000000 Binary files a/docs/_static/down.png and /dev/null differ diff --git a/docs/_static/file.png b/docs/_static/file.png deleted file mode 100644 index a858a410e4fa..000000000000 Binary files a/docs/_static/file.png and /dev/null differ diff --git a/docs/_static/fonts/FontAwesome.otf b/docs/_static/fonts/FontAwesome.otf deleted file mode 100644 index d4de13e832d5..000000000000 Binary files a/docs/_static/fonts/FontAwesome.otf and /dev/null differ diff --git a/docs/_static/fonts/Inconsolata-Bold.ttf b/docs/_static/fonts/Inconsolata-Bold.ttf deleted file mode 100644 index 809c1f5828f8..000000000000 Binary files a/docs/_static/fonts/Inconsolata-Bold.ttf and /dev/null differ diff --git a/docs/_static/fonts/Inconsolata-Regular.ttf b/docs/_static/fonts/Inconsolata-Regular.ttf deleted file mode 100644 index fc981ce7ad6c..000000000000 Binary files a/docs/_static/fonts/Inconsolata-Regular.ttf and /dev/null 
differ diff --git a/docs/_static/fonts/Lato-Bold.ttf b/docs/_static/fonts/Lato-Bold.ttf deleted file mode 100644 index 1d23c7066e09..000000000000 Binary files a/docs/_static/fonts/Lato-Bold.ttf and /dev/null differ diff --git a/docs/_static/fonts/Lato-Regular.ttf b/docs/_static/fonts/Lato-Regular.ttf deleted file mode 100644 index 0f3d0f837d24..000000000000 Binary files a/docs/_static/fonts/Lato-Regular.ttf and /dev/null differ diff --git a/docs/_static/fonts/RobotoSlab-Bold.ttf b/docs/_static/fonts/RobotoSlab-Bold.ttf deleted file mode 100644 index df5d1df27304..000000000000 Binary files a/docs/_static/fonts/RobotoSlab-Bold.ttf and /dev/null differ diff --git a/docs/_static/fonts/RobotoSlab-Regular.ttf b/docs/_static/fonts/RobotoSlab-Regular.ttf deleted file mode 100644 index eb52a7907362..000000000000 Binary files a/docs/_static/fonts/RobotoSlab-Regular.ttf and /dev/null differ diff --git a/docs/_static/fonts/fontawesome-webfont.eot b/docs/_static/fonts/fontawesome-webfont.eot deleted file mode 100644 index c7b00d2ba889..000000000000 Binary files a/docs/_static/fonts/fontawesome-webfont.eot and /dev/null differ diff --git a/docs/_static/fonts/fontawesome-webfont.svg b/docs/_static/fonts/fontawesome-webfont.svg deleted file mode 100644 index 8b66187fe067..000000000000 --- a/docs/_static/fonts/fontawesome-webfont.svg +++ /dev/null @@ -1,685 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/_static/fonts/fontawesome-webfont.ttf b/docs/_static/fonts/fontawesome-webfont.ttf deleted file mode 100644 index f221e50a2ef6..000000000000 Binary files a/docs/_static/fonts/fontawesome-webfont.ttf and /dev/null differ diff --git a/docs/_static/fonts/fontawesome-webfont.woff b/docs/_static/fonts/fontawesome-webfont.woff deleted file mode 100644 index 6e7483cf61b4..000000000000 Binary files a/docs/_static/fonts/fontawesome-webfont.woff and /dev/null differ diff --git a/docs/_static/fonts/fontawesome-webfont.woff2 b/docs/_static/fonts/fontawesome-webfont.woff2 deleted file mode 100644 index 7eb74fd127ee..000000000000 Binary files a/docs/_static/fonts/fontawesome-webfont.woff2 and /dev/null differ diff --git a/docs/_static/img/dynamic_graph.gif 
b/docs/_static/img/dynamic_graph.gif deleted file mode 100644 index b4f17374e034..000000000000 Binary files a/docs/_static/img/dynamic_graph.gif and /dev/null differ diff --git a/docs/_static/img/pytorch-logo-dark.png b/docs/_static/img/pytorch-logo-dark.png deleted file mode 100644 index 0288a564e227..000000000000 Binary files a/docs/_static/img/pytorch-logo-dark.png and /dev/null differ diff --git a/docs/_static/img/pytorch-logo-dark.svg b/docs/_static/img/pytorch-logo-dark.svg deleted file mode 100644 index 717a3ce942f8..000000000000 --- a/docs/_static/img/pytorch-logo-dark.svg +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - - - - - diff --git a/docs/_static/img/tensor_illustration.png b/docs/_static/img/tensor_illustration.png deleted file mode 100644 index b0039c7f3f3e..000000000000 Binary files a/docs/_static/img/tensor_illustration.png and /dev/null differ diff --git a/docs/_static/jquery-3.1.0.js b/docs/_static/jquery-3.1.0.js deleted file mode 100644 index f2fc2747874e..000000000000 --- a/docs/_static/jquery-3.1.0.js +++ /dev/null @@ -1,10074 +0,0 @@ -/*eslint-disable no-unused-vars*/ -/*! - * jQuery JavaScript Library v3.1.0 - * https://jquery.com/ - * - * Includes Sizzle.js - * https://sizzlejs.com/ - * - * Copyright jQuery Foundation and other contributors - * Released under the MIT license - * https://jquery.org/license - * - * Date: 2016-07-07T21:44Z - */ -( function( global, factory ) { - - "use strict"; - - if ( typeof module === "object" && typeof module.exports === "object" ) { - - // For CommonJS and CommonJS-like environments where a proper `window` - // is present, execute the factory and get jQuery. - // For environments that do not have a `window` with a `document` - // (such as Node.js), expose a factory as module.exports. - // This accentuates the need for the creation of a real `window`. - // e.g. var jQuery = require("jquery")(window); - // See ticket #14549 for more info. - module.exports = global.document ? - factory( global, true ) : - function( w ) { - if ( !w.document ) { - throw new Error( "jQuery requires a window with a document" ); - } - return factory( w ); - }; - } else { - factory( global ); - } - -// Pass this if window is not defined yet -} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { - -// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 -// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode -// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common -// enough that all such attempts are guarded in a try block. 
-"use strict"; - -var arr = []; - -var document = window.document; - -var getProto = Object.getPrototypeOf; - -var slice = arr.slice; - -var concat = arr.concat; - -var push = arr.push; - -var indexOf = arr.indexOf; - -var class2type = {}; - -var toString = class2type.toString; - -var hasOwn = class2type.hasOwnProperty; - -var fnToString = hasOwn.toString; - -var ObjectFunctionString = fnToString.call( Object ); - -var support = {}; - - - - function DOMEval( code, doc ) { - doc = doc || document; - - var script = doc.createElement( "script" ); - - script.text = code; - doc.head.appendChild( script ).parentNode.removeChild( script ); - } -/* global Symbol */ -// Defining this global in .eslintrc would create a danger of using the global -// unguarded in another place, it seems safer to define global only for this module - - - -var - version = "3.1.0", - - // Define a local copy of jQuery - jQuery = function( selector, context ) { - - // The jQuery object is actually just the init constructor 'enhanced' - // Need init if jQuery is called (just allow error to be thrown if not included) - return new jQuery.fn.init( selector, context ); - }, - - // Support: Android <=4.0 only - // Make sure we trim BOM and NBSP - rtrim = /^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g, - - // Matches dashed string for camelizing - rmsPrefix = /^-ms-/, - rdashAlpha = /-([a-z])/g, - - // Used by jQuery.camelCase as callback to replace() - fcamelCase = function( all, letter ) { - return letter.toUpperCase(); - }; - -jQuery.fn = jQuery.prototype = { - - // The current version of jQuery being used - jquery: version, - - constructor: jQuery, - - // The default length of a jQuery object is 0 - length: 0, - - toArray: function() { - return slice.call( this ); - }, - - // Get the Nth element in the matched element set OR - // Get the whole matched element set as a clean array - get: function( num ) { - return num != null ? - - // Return just the one element from the set - ( num < 0 ? this[ num + this.length ] : this[ num ] ) : - - // Return all the elements in a clean array - slice.call( this ); - }, - - // Take an array of elements and push it onto the stack - // (returning the new matched element set) - pushStack: function( elems ) { - - // Build a new jQuery matched element set - var ret = jQuery.merge( this.constructor(), elems ); - - // Add the old object onto the stack (as a reference) - ret.prevObject = this; - - // Return the newly-formed element set - return ret; - }, - - // Execute a callback for every element in the matched set. - each: function( callback ) { - return jQuery.each( this, callback ); - }, - - map: function( callback ) { - return this.pushStack( jQuery.map( this, function( elem, i ) { - return callback.call( elem, i, elem ); - } ) ); - }, - - slice: function() { - return this.pushStack( slice.apply( this, arguments ) ); - }, - - first: function() { - return this.eq( 0 ); - }, - - last: function() { - return this.eq( -1 ); - }, - - eq: function( i ) { - var len = this.length, - j = +i + ( i < 0 ? len : 0 ); - return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); - }, - - end: function() { - return this.prevObject || this.constructor(); - }, - - // For internal use only. - // Behaves like an Array's method, not like a jQuery method. 
- push: push, - sort: arr.sort, - splice: arr.splice -}; - -jQuery.extend = jQuery.fn.extend = function() { - var options, name, src, copy, copyIsArray, clone, - target = arguments[ 0 ] || {}, - i = 1, - length = arguments.length, - deep = false; - - // Handle a deep copy situation - if ( typeof target === "boolean" ) { - deep = target; - - // Skip the boolean and the target - target = arguments[ i ] || {}; - i++; - } - - // Handle case when target is a string or something (possible in deep copy) - if ( typeof target !== "object" && !jQuery.isFunction( target ) ) { - target = {}; - } - - // Extend jQuery itself if only one argument is passed - if ( i === length ) { - target = this; - i--; - } - - for ( ; i < length; i++ ) { - - // Only deal with non-null/undefined values - if ( ( options = arguments[ i ] ) != null ) { - - // Extend the base object - for ( name in options ) { - src = target[ name ]; - copy = options[ name ]; - - // Prevent never-ending loop - if ( target === copy ) { - continue; - } - - // Recurse if we're merging plain objects or arrays - if ( deep && copy && ( jQuery.isPlainObject( copy ) || - ( copyIsArray = jQuery.isArray( copy ) ) ) ) { - - if ( copyIsArray ) { - copyIsArray = false; - clone = src && jQuery.isArray( src ) ? src : []; - - } else { - clone = src && jQuery.isPlainObject( src ) ? src : {}; - } - - // Never move original objects, clone them - target[ name ] = jQuery.extend( deep, clone, copy ); - - // Don't bring in undefined values - } else if ( copy !== undefined ) { - target[ name ] = copy; - } - } - } - } - - // Return the modified object - return target; -}; - -jQuery.extend( { - - // Unique for each copy of jQuery on the page - expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), - - // Assume jQuery is ready without the ready module - isReady: true, - - error: function( msg ) { - throw new Error( msg ); - }, - - noop: function() {}, - - isFunction: function( obj ) { - return jQuery.type( obj ) === "function"; - }, - - isArray: Array.isArray, - - isWindow: function( obj ) { - return obj != null && obj === obj.window; - }, - - isNumeric: function( obj ) { - - // As of jQuery 3.0, isNumeric is limited to - // strings and numbers (primitives or objects) - // that can be coerced to finite numbers (gh-2662) - var type = jQuery.type( obj ); - return ( type === "number" || type === "string" ) && - - // parseFloat NaNs numeric-cast false positives ("") - // ...but misinterprets leading-number strings, particularly hex literals ("0x...") - // subtraction forces infinities to NaN - !isNaN( obj - parseFloat( obj ) ); - }, - - isPlainObject: function( obj ) { - var proto, Ctor; - - // Detect obvious negatives - // Use toString instead of jQuery.type to catch host objects - if ( !obj || toString.call( obj ) !== "[object Object]" ) { - return false; - } - - proto = getProto( obj ); - - // Objects with no prototype (e.g., `Object.create( null )`) are plain - if ( !proto ) { - return true; - } - - // Objects with prototype are plain iff they were constructed by a global Object function - Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; - return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; - }, - - isEmptyObject: function( obj ) { - - /* eslint-disable no-unused-vars */ - // See https://github.com/eslint/eslint/issues/6125 - var name; - - for ( name in obj ) { - return false; - } - return true; - }, - - type: function( obj ) { - if ( obj == null ) { - return obj + ""; - } - - // Support: Android <=2.3 
only (functionish RegExp) - return typeof obj === "object" || typeof obj === "function" ? - class2type[ toString.call( obj ) ] || "object" : - typeof obj; - }, - - // Evaluates a script in a global context - globalEval: function( code ) { - DOMEval( code ); - }, - - // Convert dashed to camelCase; used by the css and data modules - // Support: IE <=9 - 11, Edge 12 - 13 - // Microsoft forgot to hump their vendor prefix (#9572) - camelCase: function( string ) { - return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); - }, - - nodeName: function( elem, name ) { - return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); - }, - - each: function( obj, callback ) { - var length, i = 0; - - if ( isArrayLike( obj ) ) { - length = obj.length; - for ( ; i < length; i++ ) { - if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { - break; - } - } - } else { - for ( i in obj ) { - if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { - break; - } - } - } - - return obj; - }, - - // Support: Android <=4.0 only - trim: function( text ) { - return text == null ? - "" : - ( text + "" ).replace( rtrim, "" ); - }, - - // results is for internal usage only - makeArray: function( arr, results ) { - var ret = results || []; - - if ( arr != null ) { - if ( isArrayLike( Object( arr ) ) ) { - jQuery.merge( ret, - typeof arr === "string" ? - [ arr ] : arr - ); - } else { - push.call( ret, arr ); - } - } - - return ret; - }, - - inArray: function( elem, arr, i ) { - return arr == null ? -1 : indexOf.call( arr, elem, i ); - }, - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - merge: function( first, second ) { - var len = +second.length, - j = 0, - i = first.length; - - for ( ; j < len; j++ ) { - first[ i++ ] = second[ j ]; - } - - first.length = i; - - return first; - }, - - grep: function( elems, callback, invert ) { - var callbackInverse, - matches = [], - i = 0, - length = elems.length, - callbackExpect = !invert; - - // Go through the array, only saving the items - // that pass the validator function - for ( ; i < length; i++ ) { - callbackInverse = !callback( elems[ i ], i ); - if ( callbackInverse !== callbackExpect ) { - matches.push( elems[ i ] ); - } - } - - return matches; - }, - - // arg is for internal usage only - map: function( elems, callback, arg ) { - var length, value, - i = 0, - ret = []; - - // Go through the array, translating each of the items to their new values - if ( isArrayLike( elems ) ) { - length = elems.length; - for ( ; i < length; i++ ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - - // Go through every key on the object, - } else { - for ( i in elems ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - } - - // Flatten any nested arrays - return concat.apply( [], ret ); - }, - - // A global GUID counter for objects - guid: 1, - - // Bind a function to a context, optionally partially applying any - // arguments. - proxy: function( fn, context ) { - var tmp, args, proxy; - - if ( typeof context === "string" ) { - tmp = fn[ context ]; - context = fn; - fn = tmp; - } - - // Quick check to determine if target is callable, in the spec - // this throws a TypeError, but we will just return undefined. 
- if ( !jQuery.isFunction( fn ) ) { - return undefined; - } - - // Simulated bind - args = slice.call( arguments, 2 ); - proxy = function() { - return fn.apply( context || this, args.concat( slice.call( arguments ) ) ); - }; - - // Set the guid of unique handler to the same of original handler, so it can be removed - proxy.guid = fn.guid = fn.guid || jQuery.guid++; - - return proxy; - }, - - now: Date.now, - - // jQuery.support is not used in Core but other projects attach their - // properties to it so it needs to exist. - support: support -} ); - -if ( typeof Symbol === "function" ) { - jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; -} - -// Populate the class2type map -jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), -function( i, name ) { - class2type[ "[object " + name + "]" ] = name.toLowerCase(); -} ); - -function isArrayLike( obj ) { - - // Support: real iOS 8.2 only (not reproducible in simulator) - // `in` check used to prevent JIT error (gh-2145) - // hasOwn isn't used here due to false negatives - // regarding Nodelist length in IE - var length = !!obj && "length" in obj && obj.length, - type = jQuery.type( obj ); - - if ( type === "function" || jQuery.isWindow( obj ) ) { - return false; - } - - return type === "array" || length === 0 || - typeof length === "number" && length > 0 && ( length - 1 ) in obj; -} -var Sizzle = -/*! - * Sizzle CSS Selector Engine v2.3.0 - * https://sizzlejs.com/ - * - * Copyright jQuery Foundation and other contributors - * Released under the MIT license - * http://jquery.org/license - * - * Date: 2016-01-04 - */ -(function( window ) { - -var i, - support, - Expr, - getText, - isXML, - tokenize, - compile, - select, - outermostContext, - sortInput, - hasDuplicate, - - // Local document vars - setDocument, - document, - docElem, - documentIsHTML, - rbuggyQSA, - rbuggyMatches, - matches, - contains, - - // Instance-specific data - expando = "sizzle" + 1 * new Date(), - preferredDoc = window.document, - dirruns = 0, - done = 0, - classCache = createCache(), - tokenCache = createCache(), - compilerCache = createCache(), - sortOrder = function( a, b ) { - if ( a === b ) { - hasDuplicate = true; - } - return 0; - }, - - // Instance methods - hasOwn = ({}).hasOwnProperty, - arr = [], - pop = arr.pop, - push_native = arr.push, - push = arr.push, - slice = arr.slice, - // Use a stripped-down indexOf as it's faster than native - // https://jsperf.com/thor-indexof-vs-for/5 - indexOf = function( list, elem ) { - var i = 0, - len = list.length; - for ( ; i < len; i++ ) { - if ( list[i] === elem ) { - return i; - } - } - return -1; - }, - - booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped", - - // Regular expressions - - // http://www.w3.org/TR/css3-selectors/#whitespace - whitespace = "[\\x20\\t\\r\\n\\f]", - - // http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier - identifier = "(?:\\\\.|[\\w-]|[^\0-\\xa0])+", - - // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors - attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + - // Operator (capture 2) - "*([*^$|!~]?=)" + whitespace + - // "Attribute values must be CSS identifiers [capture 5] or strings [capture 3 or capture 4]" - "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + whitespace + - "*\\]", - - pseudos = ":(" + identifier + ")(?:\\((" + - // To reduce the number of selectors 
needing tokenize in the preFilter, prefer arguments: - // 1. quoted (capture 3; capture 4 or capture 5) - "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + - // 2. simple (capture 6) - "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + - // 3. anything else (capture 2) - ".*" + - ")\\)|)", - - // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter - rwhitespace = new RegExp( whitespace + "+", "g" ), - rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + whitespace + "+$", "g" ), - - rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), - rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + "*" ), - - rattributeQuotes = new RegExp( "=" + whitespace + "*([^\\]'\"]*?)" + whitespace + "*\\]", "g" ), - - rpseudo = new RegExp( pseudos ), - ridentifier = new RegExp( "^" + identifier + "$" ), - - matchExpr = { - "ID": new RegExp( "^#(" + identifier + ")" ), - "CLASS": new RegExp( "^\\.(" + identifier + ")" ), - "TAG": new RegExp( "^(" + identifier + "|[*])" ), - "ATTR": new RegExp( "^" + attributes ), - "PSEUDO": new RegExp( "^" + pseudos ), - "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + whitespace + - "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + whitespace + - "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), - "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), - // For use in libraries implementing .is() - // We use this for POS matching in `select` - "needsContext": new RegExp( "^" + whitespace + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + - whitespace + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) - }, - - rinputs = /^(?:input|select|textarea|button)$/i, - rheader = /^h\d$/i, - - rnative = /^[^{]+\{\s*\[native \w/, - - // Easily-parseable/retrievable ID or TAG or CLASS selectors - rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, - - rsibling = /[+~]/, - - // CSS escapes - // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters - runescape = new RegExp( "\\\\([\\da-f]{1,6}" + whitespace + "?|(" + whitespace + ")|.)", "ig" ), - funescape = function( _, escaped, escapedWhitespace ) { - var high = "0x" + escaped - 0x10000; - // NaN means non-codepoint - // Support: Firefox<24 - // Workaround erroneous numeric interpretation of +"0x" - return high !== high || escapedWhitespace ? - escaped : - high < 0 ? 
- // BMP codepoint - String.fromCharCode( high + 0x10000 ) : - // Supplemental Plane codepoint (surrogate pair) - String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); - }, - - // CSS string/identifier serialization - // https://drafts.csswg.org/cssom/#common-serializing-idioms - rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\x80-\uFFFF\w-]/g, - fcssescape = function( ch, asCodePoint ) { - if ( asCodePoint ) { - - // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER - if ( ch === "\0" ) { - return "\uFFFD"; - } - - // Control characters and (dependent upon position) numbers get escaped as code points - return ch.slice( 0, -1 ) + "\\" + ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; - } - - // Other potentially-special ASCII characters get backslash-escaped - return "\\" + ch; - }, - - // Used for iframes - // See setDocument() - // Removing the function wrapper causes a "Permission Denied" - // error in IE - unloadHandler = function() { - setDocument(); - }, - - disabledAncestor = addCombinator( - function( elem ) { - return elem.disabled === true; - }, - { dir: "parentNode", next: "legend" } - ); - -// Optimize for push.apply( _, NodeList ) -try { - push.apply( - (arr = slice.call( preferredDoc.childNodes )), - preferredDoc.childNodes - ); - // Support: Android<4.0 - // Detect silently failing push.apply - arr[ preferredDoc.childNodes.length ].nodeType; -} catch ( e ) { - push = { apply: arr.length ? - - // Leverage slice if possible - function( target, els ) { - push_native.apply( target, slice.call(els) ); - } : - - // Support: IE<9 - // Otherwise append directly - function( target, els ) { - var j = target.length, - i = 0; - // Can't trust NodeList.length - while ( (target[j++] = els[i++]) ) {} - target.length = j - 1; - } - }; -} - -function Sizzle( selector, context, results, seed ) { - var m, i, elem, nid, match, groups, newSelector, - newContext = context && context.ownerDocument, - - // nodeType defaults to 9, since context defaults to document - nodeType = context ? context.nodeType : 9; - - results = results || []; - - // Return early from calls with invalid selector or context - if ( typeof selector !== "string" || !selector || - nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { - - return results; - } - - // Try to shortcut find operations (as opposed to filters) in HTML documents - if ( !seed ) { - - if ( ( context ? 
context.ownerDocument || context : preferredDoc ) !== document ) { - setDocument( context ); - } - context = context || document; - - if ( documentIsHTML ) { - - // If the selector is sufficiently simple, try using a "get*By*" DOM method - // (excepting DocumentFragment context, where the methods don't exist) - if ( nodeType !== 11 && (match = rquickExpr.exec( selector )) ) { - - // ID selector - if ( (m = match[1]) ) { - - // Document context - if ( nodeType === 9 ) { - if ( (elem = context.getElementById( m )) ) { - - // Support: IE, Opera, Webkit - // TODO: identify versions - // getElementById can match elements by name instead of ID - if ( elem.id === m ) { - results.push( elem ); - return results; - } - } else { - return results; - } - - // Element context - } else { - - // Support: IE, Opera, Webkit - // TODO: identify versions - // getElementById can match elements by name instead of ID - if ( newContext && (elem = newContext.getElementById( m )) && - contains( context, elem ) && - elem.id === m ) { - - results.push( elem ); - return results; - } - } - - // Type selector - } else if ( match[2] ) { - push.apply( results, context.getElementsByTagName( selector ) ); - return results; - - // Class selector - } else if ( (m = match[3]) && support.getElementsByClassName && - context.getElementsByClassName ) { - - push.apply( results, context.getElementsByClassName( m ) ); - return results; - } - } - - // Take advantage of querySelectorAll - if ( support.qsa && - !compilerCache[ selector + " " ] && - (!rbuggyQSA || !rbuggyQSA.test( selector )) ) { - - if ( nodeType !== 1 ) { - newContext = context; - newSelector = selector; - - // qSA looks outside Element context, which is not what we want - // Thanks to Andrew Dupont for this workaround technique - // Support: IE <=8 - // Exclude object elements - } else if ( context.nodeName.toLowerCase() !== "object" ) { - - // Capture the context ID, setting it first if necessary - if ( (nid = context.getAttribute( "id" )) ) { - nid = nid.replace( rcssescape, fcssescape ); - } else { - context.setAttribute( "id", (nid = expando) ); - } - - // Prefix every selector in the list - groups = tokenize( selector ); - i = groups.length; - while ( i-- ) { - groups[i] = "#" + nid + " " + toSelector( groups[i] ); - } - newSelector = groups.join( "," ); - - // Expand context for sibling selectors - newContext = rsibling.test( selector ) && testContext( context.parentNode ) || - context; - } - - if ( newSelector ) { - try { - push.apply( results, - newContext.querySelectorAll( newSelector ) - ); - return results; - } catch ( qsaError ) { - } finally { - if ( nid === expando ) { - context.removeAttribute( "id" ); - } - } - } - } - } - } - - // All others - return select( selector.replace( rtrim, "$1" ), context, results, seed ); -} - -/** - * Create key-value caches of limited size - * @returns {function(string, object)} Returns the Object data after storing it on itself with - * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) - * deleting the oldest entry - */ -function createCache() { - var keys = []; - - function cache( key, value ) { - // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) - if ( keys.push( key + " " ) > Expr.cacheLength ) { - // Only keep the most recent entries - delete cache[ keys.shift() ]; - } - return (cache[ key + " " ] = value); - } - return cache; -} - -/** - * Mark a function for special use by Sizzle - * @param {Function} fn The function to mark - */ 
-function markFunction( fn ) { - fn[ expando ] = true; - return fn; -} - -/** - * Support testing using an element - * @param {Function} fn Passed the created element and returns a boolean result - */ -function assert( fn ) { - var el = document.createElement("fieldset"); - - try { - return !!fn( el ); - } catch (e) { - return false; - } finally { - // Remove from its parent by default - if ( el.parentNode ) { - el.parentNode.removeChild( el ); - } - // release memory in IE - el = null; - } -} - -/** - * Adds the same handler for all of the specified attrs - * @param {String} attrs Pipe-separated list of attributes - * @param {Function} handler The method that will be applied - */ -function addHandle( attrs, handler ) { - var arr = attrs.split("|"), - i = arr.length; - - while ( i-- ) { - Expr.attrHandle[ arr[i] ] = handler; - } -} - -/** - * Checks document order of two siblings - * @param {Element} a - * @param {Element} b - * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b - */ -function siblingCheck( a, b ) { - var cur = b && a, - diff = cur && a.nodeType === 1 && b.nodeType === 1 && - a.sourceIndex - b.sourceIndex; - - // Use IE sourceIndex if available on both nodes - if ( diff ) { - return diff; - } - - // Check if b follows a - if ( cur ) { - while ( (cur = cur.nextSibling) ) { - if ( cur === b ) { - return -1; - } - } - } - - return a ? 1 : -1; -} - -/** - * Returns a function to use in pseudos for input types - * @param {String} type - */ -function createInputPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for buttons - * @param {String} type - */ -function createButtonPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return (name === "input" || name === "button") && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for :enabled/:disabled - * @param {Boolean} disabled true for :disabled; false for :enabled - */ -function createDisabledPseudo( disabled ) { - // Known :disabled false positives: - // IE: *[disabled]:not(button, input, select, textarea, optgroup, option, menuitem, fieldset) - // not IE: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable - return function( elem ) { - - // Check form elements and option elements for explicit disabling - return "label" in elem && elem.disabled === disabled || - "form" in elem && elem.disabled === disabled || - - // Check non-disabled form elements for fieldset[disabled] ancestors - "form" in elem && elem.disabled === false && ( - // Support: IE6-11+ - // Ancestry is covered for us - elem.isDisabled === disabled || - - // Otherwise, assume any non-