Commit ac42535

Tutorial 6: Supporting flexible mask shape
1 parent a2fe2d3 commit ac42535

File tree: 2 files changed, +49 -5 lines changed


docs/tutorial_notebooks/JAX/tutorial6/Transformers_and_MHAttention.ipynb (+25 -3)

@@ -344,6 +344,26 @@
     "With this in mind, we can implement the Multi-Head Attention module below."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper function to support different mask shapes.\n",
+    "# Output shape supports (batch_size, number of heads, seq length, seq length)\n",
+    "# If 2D: broadcasted over batch size and number of heads\n",
+    "# If 3D: broadcasted over number of heads\n",
+    "# If 4D: leave as is\n",
+    "def expand_mask(mask):\n",
+    "    assert mask.ndim >= 2, \"Mask must be at least 2-dimensional with seq_length x seq_length\"\n",
+    "    if mask.ndim == 3:\n",
+    "        mask = mask.unsqueeze(1)\n",
+    "    while mask.ndim < 4:\n",
+    "        mask = mask.unsqueeze(0)\n",
+    "    return mask"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -367,6 +387,8 @@
     "\n",
     "    def __call__(self, x, mask=None):\n",
     "        batch_size, seq_length, embed_dim = x.shape\n",
+    "        if mask is not None:\n",
+    "            mask = expand_mask(mask)\n",
     "        qkv = self.qkv_proj(x)\n",
     "        \n",
     "        # Separate Q, K, V from linear output\n",
@@ -526,7 +548,7 @@
     "encblock = EncoderBlock(input_dim=128, num_heads=4, dim_feedforward=512, dropout_prob=0.1)\n",
     "# Initialize parameters of encoder block with random key and inputs\n",
     "main_rng, init_rng, dropout_init_rng = random.split(main_rng, 3)\n",
-    "params = encblock.init({'params': init_rng, 'dropout': dropout_init_rng}, x, True)['params']\n",
+    "params = encblock.init({'params': init_rng, 'dropout': dropout_init_rng}, x, train=True)['params']\n",
     "# Apply encoder block with parameters on the inputs\n",
     "# Since dropout is stochastic, we need to pass a rng to the forward\n",
     "main_rng, dropout_apply_rng = random.split(main_rng)\n",
@@ -20341,7 +20363,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -20355,7 +20377,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.9"
+   "version": "3.10.4"
  }
 },
 "nbformat": 4,

docs/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.ipynb (+24 -2)

@@ -326,6 +326,26 @@
     "How are we applying a Multi-Head Attention layer in a neural network, where we don't have an arbitrary query, key, and value vector as input? Looking at the computation graph above, a simple but effective implementation is to set the current feature map in a NN, $X\\in\\mathbb{R}^{B\\times T\\times d_{\\text{model}}}$, as $Q$, $K$ and $V$ ($B$ being the batch size, $T$ the sequence length, $d_{\\text{model}}$ the hidden dimensionality of $X$). The consecutive weight matrices $W^{Q}$, $W^{K}$, and $W^{V}$ can transform $X$ to the corresponding feature vectors that represent the queries, keys, and values of the input. Using this approach, we can implement the Multi-Head Attention module below."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper function to support different mask shapes.\n",
+    "# Output shape supports (batch_size, number of heads, seq length, seq length)\n",
+    "# If 2D: broadcasted over batch size and number of heads\n",
+    "# If 3D: broadcasted over number of heads\n",
+    "# If 4D: leave as is\n",
+    "def expand_mask(mask):\n",
+    "    assert mask.ndim >= 2, \"Mask must be at least 2-dimensional with seq_length x seq_length\"\n",
+    "    if mask.ndim == 3:\n",
+    "        mask = mask.unsqueeze(1)\n",
+    "    while mask.ndim < 4:\n",
+    "        mask = mask.unsqueeze(0)\n",
+    "    return mask"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -358,6 +378,8 @@
     "\n",
     "    def forward(self, x, mask=None, return_attention=False):\n",
     "        batch_size, seq_length, _ = x.size()\n",
+    "        if mask is not None:\n",
+    "            mask = expand_mask(mask)\n",
     "        qkv = self.qkv_proj(x)\n",
     "        \n",
     "        # Separate Q, K, V from linear output\n",
@@ -20264,7 +20286,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -20278,7 +20300,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.9"
+   "version": "3.10.4"
  }
 },
 "nbformat": 4,
