
Commit 341cf4a

initial commit

1 parent 6d5b31b commit 341cf4a
7 files changed: +372 -1 lines changed

.gitignore

+174
@@ -0,0 +1,174 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

LICENSE

+21
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Greg DeVosNouri

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

+56-1
@@ -1 +1,56 @@
-# Scalable-Softmax
+# ScalableSoftmax

An unofficial PyTorch implementation of Scalable-Softmax (SSMax) from the paper "Scalable-Softmax Is Superior for Attention" (Nakanishi, 2025).

## Overview

ScalableSoftmax is a drop-in replacement for the standard Softmax that helps prevent attention fading in transformers by incorporating input-size scaling: the logits are multiplied by s * log(n) before the softmax, which keeps attention distributions focused even for large input sizes.

## Installation

```bash
pip install scalable-softmax
```

## Usage

```python
import torch
from scalable_softmax import ScalableSoftmax

# Initialize with default parameters
ssmax = ScalableSoftmax()

# Or customize parameters
ssmax = ScalableSoftmax(
    s=0.43,              # scaling parameter
    learn_scaling=True,  # make scaling parameter learnable
    bias=False           # whether to use bias term
)

# Apply to input tensor
x = torch.randn(32, 128)  # (batch_size, sequence_length)
output = ssmax(x)
```

## Features

- Drop-in replacement for standard softmax
- Learnable scaling parameter
- Optional bias term
- Maintains focused attention with large inputs

## Citation

```bibtex
@article{nakanishi2025scalable,
  title={Scalable-Softmax Is Superior for Attention},
  author={Nakanishi, Ken M.},
  journal={arXiv preprint arXiv:2501.19399},
  year={2025}
}
```

## License

MIT License
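To make the "maintains focused attention with large inputs" claim concrete, here is a small, illustrative sketch (not part of this commit) comparing the peak probability of standard softmax and SSMax as the input length grows; the random scores and printed numbers are assumptions for demonstration, not results from the paper.

```python
import torch
import torch.nn.functional as F

from scalable_softmax import ScalableSoftmax

# With plain softmax the peak probability collapses as n grows; SSMax's
# s * log(n) rescaling of the logits keeps the distribution peaked.
ssmax = ScalableSoftmax(s=0.43, learn_scaling=False)

torch.manual_seed(0)
for n in (128, 2048, 32768):
    scores = torch.randn(1, n)
    p_softmax = F.softmax(scores, dim=-1).max().item()
    p_ssmax = ssmax(scores, dim=-1).max().item()
    print(f"n={n:6d}  softmax peak={p_softmax:.4f}  ssmax peak={p_ssmax:.4f}")
```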

main.py

+18
@@ -0,0 +1,18 @@
import torch
from scalable_softmax import ScalableSoftmax

# Initialize with default parameters
smax = ScalableSoftmax()

# Or customize parameters
smax = ScalableSoftmax(
    s=0.43,              # scaling parameter
    learn_scaling=True,  # make scaling parameter learnable
    bias=False           # whether to use bias term
)

# Apply to input tensor
batch_size = 32
sequence_length = 128
x = torch.randn(batch_size, sequence_length)
output = smax(x)
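A small sanity check one could append to main.py (not part of the commit): because SSMax only rescales the logits by s * log(n) before the exponential, each row of the output is still a proper probability distribution.

```python
# Continuing from main.py above: SSMax outputs still sum to 1 along the
# softmax dimension, just like standard softmax.
row_sums = output.sum(dim=-1)
print(torch.allclose(row_sums, torch.ones_like(row_sums)))  # True
```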

pyproject.toml

+33
@@ -0,0 +1,33 @@
[project]
name = "scalable_softmax"
version = "0.1.0"
description = "PyTorch implementation of Scalable-Softmax for attention mechanisms"
authors = [
    { name = "Greg DeVosNouri", email = "gdevos010@gamil.com" }
]
readme = "README.md"
requires-python = ">= 3.9"
license = { file = "LICENSE" }
keywords = ["pytorch", "deep-learning", "attention", "transformer"]

classifiers = [
    'Development Status :: 4 - Beta',
    'Intended Audience :: Developers',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3.9',
]

dependencies = [
    "torch>=1.8",
]

[project.urls]
Homepage = "https://github.com/gdevos010/Scalable-Softmax"
Repository = "https://github.com/gdevos010/Scalable-Softmax"

[project.optional-dependencies]
dev = [
    "ruff"
]

src/__init__.py

Whitespace-only changes.

src/scalable_softmax.py

+70
@@ -0,0 +1,70 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ScalableSoftmax(nn.Module):
    """Scalable-Softmax (SSMax) implementation from the paper
    'Scalable-Softmax Is Superior for Attention'.

    This is a drop-in replacement for standard Softmax that helps prevent attention
    fading in transformers by incorporating input size scaling. The scaling helps
    maintain focused attention distributions even with large input sizes.

    Args:
        s (float, optional): Scaling parameter that controls attention focusing strength.
            Higher values (e.g. 1.0) produce sharper attention, lower values (e.g. 0.1)
            produce softer attention. Default: 0.43 as used in the paper.
        learn_scaling (bool, optional): If True, make the scaling parameter learnable.
            Default: True
        bias (bool, optional): If True, adds a learnable bias term. The paper found
            that while bias helps training, it can hurt length generalization.
            Default: False

    Shape:
        - Input: (*, N) where * is any number of dimensions and N is the sequence length
        - Output: Same shape as input
    """
    def __init__(self, s: float = 0.43, learn_scaling: bool = True, bias: bool = False):
        super().__init__()

        # Initialize scaling parameter
        if learn_scaling:
            self.s = nn.Parameter(torch.tensor(s, dtype=torch.float))
        else:
            self.register_buffer('s', torch.tensor(s, dtype=torch.float))

        # Optional bias parameter
        if bias:
            self.b = nn.Parameter(torch.zeros(1))
        else:
            self.b = None

    def forward(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
        """Forward pass applying SSMax along the specified dimension.

        Args:
            x (torch.Tensor): Input tensor
            dim (int): Dimension along which to apply SSMax. Default: -1

        Returns:
            torch.Tensor: Output tensor with the same shape as the input
        """
        # Size of the dimension SSMax is applied to
        n = x.size(dim)

        # Scale the logits by s * log(n), plus the bias term if enabled
        if self.b is not None:
            # Version with bias term
            x_scaled = (self.s * math.log(n) + self.b) * x
        else:
            # Standard version from the paper
            x_scaled = self.s * math.log(n) * x

        # Apply standard softmax to the scaled logits
        return F.softmax(x_scaled, dim=dim)

    def extra_repr(self) -> str:
        """String representation of module."""
        return f's={self.s.item():.3f}, bias={self.b is not None}'
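Since the module is pitched as a drop-in replacement for softmax inside attention, here is a minimal, hypothetical sketch (not part of this commit) of how it could stand in for `F.softmax` in a plain scaled dot-product attention; the function name `sdpa_with_ssmax` and the tensor shapes are illustrative assumptions.

```python
import math

import torch
from scalable_softmax import ScalableSoftmax

# Hypothetical usage sketch: swap F.softmax for ScalableSoftmax inside a
# plain scaled dot-product attention. SSMax then sees n = number of keys.
def sdpa_with_ssmax(q, k, v, ssmax):
    # q, k, v: (batch, heads, seq_len, head_dim)
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    attn = ssmax(scores, dim=-1)  # SSMax over the key dimension
    return attn @ v

ssmax = ScalableSoftmax()
q = k = v = torch.randn(2, 4, 128, 64)
out = sdpa_with_ssmax(q, k, v, ssmax)
print(out.shape)  # torch.Size([2, 4, 128, 64])
```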
