Commit 0650b47

Authored by Svetlana Karslioglu
Merge branch 'main' into issues
2 parents: c9dea0f + a5376f7

4 files changed: 28 additions, 23 deletions


beginner_source/transformer_tutorial.py  (+5, -2)

@@ -2,7 +2,7 @@
 Language Modeling with ``nn.Transformer`` and torchtext
 ===============================================================
 
-This is a tutorial on training a sequence-to-sequence model that uses the
+This is a tutorial on training a model to predict the next word in a sequence using the
 `nn.Transformer <https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html>`__ module.
 
 The PyTorch 1.2 release includes a standard transformer module based on the
@@ -29,7 +29,9 @@
 
 ######################################################################
 # In this tutorial, we train a ``nn.TransformerEncoder`` model on a
-# language modeling task. The language modeling task is to assign a
+# language modeling task. Please note that this tutorial does not cover
+# the training of `nn.TransformerDecoder <https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html#torch.nn.TransformerDecoder>`__, as depicted in
+# the right half of the diagram above. The language modeling task is to assign a
 # probability for the likelihood of a given word (or a sequence of words)
 # to follow a sequence of words. A sequence of tokens are passed to the embedding
 # layer first, followed by a positional encoding layer to account for the order
@@ -130,6 +132,7 @@ def forward(self, x: Tensor) -> Tensor:
 # .. code-block:: bash
 #
 # %%bash
+# pip install portalocker
 # pip install torchdata
 #
 # The vocab object is built based on the train dataset and is used to numericalize
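The reworded introduction above frames the tutorial as next-word prediction with an encoder-only model. As a quick orientation for readers skimming the diff, a minimal sketch of that pipeline (embedding, positional encoding, ``nn.TransformerEncoder``, a linear projection back to the vocabulary, and a causal mask so each position only attends to earlier tokens) might look like the following. The class names and hyperparameters here are illustrative, not code from this commit.

import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Add sinusoidal position information to token embeddings."""
    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x):
        # x: [seq_len, batch_size, d_model]
        return x + self.pe[:x.size(0)]

class ToyLanguageModel(nn.Module):
    """Encoder-only LM: embedding -> positional encoding -> TransformerEncoder -> vocab logits."""
    def __init__(self, vocab_size: int, d_model: int = 64, nhead: int = 4, nlayers: int = 2):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=128)
        self.encoder = nn.TransformerEncoder(layer, nlayers)
        self.decoder = nn.Linear(d_model, vocab_size)

    def forward(self, src, src_mask):
        x = self.pos_encoder(self.embedding(src) * math.sqrt(self.d_model))
        return self.decoder(self.encoder(x, src_mask))

vocab_size, seq_len, batch_size = 100, 10, 2
model = ToyLanguageModel(vocab_size)
src = torch.randint(0, vocab_size, (seq_len, batch_size))
# Causal mask: position i may only attend to positions <= i.
causal_mask = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
logits = model(src, causal_mask)  # [seq_len, batch_size, vocab_size]
print(logits.shape)

Training would then shift the targets by one position and minimize cross-entropy between the logits at position t and the token at position t+1, which is the language modeling objective the revised wording describes.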

intermediate_source/FSDP_adavnced_tutorial.rst  (+3, -3)

@@ -75,7 +75,7 @@ highlight different available features in FSDP that are helpful for training
 large scale model above 3B parameters. Also, we cover specific features for
 Transformer based models. The code for this tutorial is available in `Pytorch
 Examples
-<https://github.com/HamidShojanazeri/examples/tree/FSDP_example/FSDP/>`__.
+<https://github.com/HamidShojanazeri/examples/tree/FSDP_example/distributed/FSDP/>`__.
 
 
 *Setup*
@@ -97,13 +97,13 @@ Please create a `data` folder, download the WikiHow dataset from `wikihowAll.csv
 `wikihowSep.cs <https://ucsb.app.box.com/s/7yq601ijl1lzvlfu4rjdbbxforzd2oag>`__,
 and place them in the `data` folder. We will use the wikihow dataset from
 `summarization_dataset
-<https://github.com/HamidShojanazeri/examples/blob/FSDP_example/FSDP/summarization_dataset.py>`__.
+<https://github.com/HamidShojanazeri/examples/blob/FSDP_example/distributed/FSDP/summarization_dataset.py>`__.
 
 Next, we add the following code snippets to a Python script “T5_training.py”.
 
 .. note::
    The full source code for this tutorial is available in `PyTorch examples
-   <https://github.com/HamidShojanazeri/examples/tree/FSDP_example/FSDP>`__.
+   <https://github.com/HamidShojanazeri/examples/tree/FSDP_example/distributed/FSDP>`__.
 
 1.3 Import necessary packages:
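
The corrected links above point at the full FSDP example that T5_training.py lives in. As a rough orientation only, the core wrapping pattern that kind of script follows can be sketched as below; this is not the tutorial's T5_training.py, it assumes a multi-GPU host launched with torchrun, and it substitutes a stand-in model for T5.

# Minimal FSDP sketch (illustrative; not code from this commit).
# Assumed launch: torchrun --nproc_per_node=<num_gpus> fsdp_sketch.py
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def main():
    dist.init_process_group("nccl")   # torchrun provides RANK/WORLD_SIZE/MASTER_ADDR
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    # Stand-in model; the tutorial shards a T5 transformer instead.
    model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024)).to(local_rank)
    model = FSDP(model)               # parameters are sharded across ranks

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    inputs = torch.randn(8, 1024, device=local_rank)
    loss = model(inputs).pow(2).mean()  # dummy loss for illustration
    loss.backward()
    optimizer.step()

    dist.destroy_process_group()

if __name__ == "__main__":
    main()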

intermediate_source/seq2seq_translation_tutorial.py  (+20, -18)

@@ -440,25 +440,27 @@ def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGT
         self.max_length = max_length
 
         self.embedding = nn.Embedding(self.output_size, self.hidden_size)
-        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
-        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
+        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        self.alignment_vector = nn.Parameter(torch.Tensor(1, hidden_size))
+        torch.nn.init.xavier_uniform_(self.alignment_vector)
         self.dropout = nn.Dropout(self.dropout_p)
-        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
+        self.gru = nn.GRU(self.hidden_size * 2, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
 
     def forward(self, input, hidden, encoder_outputs):
-        embedded = self.embedding(input).view(1, 1, -1)
+        embedded = self.embedding(input).view(1, -1)
         embedded = self.dropout(embedded)
 
-        attn_weights = F.softmax(
-            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
-        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
-                                 encoder_outputs.unsqueeze(0))
-
-        output = torch.cat((embedded[0], attn_applied[0]), 1)
-        output = self.attn_combine(output).unsqueeze(0)
+        transformed_hidden = self.fc_hidden(hidden[0])
+        expanded_hidden_state = transformed_hidden.expand(self.max_length, -1)
+        alignment_scores = torch.tanh(expanded_hidden_state +
+                                      self.fc_encoder(encoder_outputs))
+        alignment_scores = self.alignment_vector.mm(alignment_scores.T)
+        attn_weights = F.softmax(alignment_scores, dim=1)
+        context_vector = attn_weights.mm(encoder_outputs)
 
-        output = F.relu(output)
+        output = torch.cat((embedded, context_vector), 1).unsqueeze(0)
         output, hidden = self.gru(output, hidden)
 
         output = F.log_softmax(self.out(output[0]), dim=1)
@@ -761,15 +763,15 @@ def evaluateRandomly(encoder, decoder, n=10):
 #
 
 hidden_size = 256
-encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
-attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
+encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
+attn_decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
 
-trainIters(encoder1, attn_decoder1, 75000, print_every=5000)
+trainIters(encoder, attn_decoder, 75000, print_every=5000)
 
 ######################################################################
 #
 
-evaluateRandomly(encoder1, attn_decoder1)
+evaluateRandomly(encoder, attn_decoder)
 
 
 ######################################################################
@@ -787,7 +789,7 @@ def evaluateRandomly(encoder, decoder, n=10):
 #
 
 output_words, attentions = evaluate(
-    encoder1, attn_decoder1, "je suis trop froid .")
+    encoder, attn_decoder, "je suis trop froid .")
 plt.matshow(attentions.numpy())
 
 
@@ -817,7 +819,7 @@ def showAttention(input_sentence, output_words, attentions):
 
 def evaluateAndShowAttention(input_sentence):
     output_words, attentions = evaluate(
-        encoder1, attn_decoder1, input_sentence)
+        encoder, attn_decoder, input_sentence)
     print('input =', input_sentence)
     print('output =', ' '.join(output_words))
     showAttention(input_sentence, output_words, attentions)
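
The rewritten decoder above replaces the original concatenation-based attention with additive (Bahdanau-style) attention. The attention step it computes can be sketched standalone as below, with the same layer names and shapes as in the diff; the snippet is illustrative rather than code taken from the commit.

# Illustrative, standalone version of the additive-attention step introduced above
# (layer names mirror the diff; this is a sketch, not code from the commit).
import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_size, max_length = 256, 10
fc_hidden = nn.Linear(hidden_size, hidden_size, bias=False)
fc_encoder = nn.Linear(hidden_size, hidden_size, bias=False)
alignment_vector = nn.Parameter(torch.empty(1, hidden_size))
nn.init.xavier_uniform_(alignment_vector)

decoder_hidden = torch.randn(1, hidden_size)             # current decoder hidden state
encoder_outputs = torch.randn(max_length, hidden_size)   # one vector per source position

# score_i = v . tanh(W_h h + W_e e_i), computed for every encoder position i at once
expanded_hidden = fc_hidden(decoder_hidden).expand(max_length, -1)
alignment_scores = torch.tanh(expanded_hidden + fc_encoder(encoder_outputs))
alignment_scores = alignment_vector.mm(alignment_scores.T)   # [1, max_length]
attn_weights = F.softmax(alignment_scores, dim=1)            # sums to 1 over source positions
context_vector = attn_weights.mm(encoder_outputs)            # [1, hidden_size]
print(attn_weights.shape, context_vector.shape)

The context vector is a weighted sum of encoder outputs; in the revised decoder it is concatenated with the current embedding before the GRU step, which is why the diff widens the GRU input to ``self.hidden_size * 2``.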
