diff --git a/.gitignore b/.gitignore
index 515ff88..854eb76 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 .idea
 *.key
-*.pdf
 .nsml*
 *.pt
 X*
@@ -9,5 +8,7 @@ ppts
 .ipynb_checkpoints
 client_secret.json
 __pycache__/
+.py*
 tmp
 template.pdf
+*.ipynb
diff --git a/01_basics.py b/01_basics.py
index 7fa70d6..a96bbc2 100644
--- a/01_basics.py
+++ b/01_basics.py
@@ -15,22 +15,29 @@ def loss(x, y):
     y_pred = forward(x)
     return (y_pred - y) * (y_pred - y)
 
-
+# List of weights and mean squared error (MSE) for each input
 w_list = []
 mse_list = []
 
 for w in np.arange(0.0, 4.1, 0.1):
+    # Print the weight and initialize the loss
    print("w=", w)
     l_sum = 0
+
     for x_val, y_val in zip(x_data, y_data):
+        # For each input and output, calculate y_hat
+        # Compute the loss and add it to the total error
         y_pred_val = forward(x_val)
         l = loss(x_val, y_val)
         l_sum += l
         print("\t", x_val, y_val, y_pred_val, l)
-    print("MSE=", l_sum / 3)
+    # Now compute the mean squared error (MSE) for this weight
+    # Record the weight/MSE pair from this run
+    print("MSE=", l_sum / len(x_data))
     w_list.append(w)
-    mse_list.append(l_sum / 3)
+    mse_list.append(l_sum / len(x_data))
 
+# Plot it all
 plt.plot(w_list, mse_list)
 plt.ylabel('Loss')
 plt.xlabel('w')
diff --git a/02_manual_gradient.py b/02_manual_gradient.py
index 81d1924..a84e789 100644
--- a/02_manual_gradient.py
+++ b/02_manual_gradient.py
@@ -1,11 +1,11 @@
+# Training Data
 x_data = [1.0, 2.0, 3.0]
 y_data = [2.0, 4.0, 6.0]
 
 w = 1.0  # a random guess: random value
 
-# our model forward pass
-
+# our model forward pass
 def forward(x):
     return x * w
 
@@ -20,18 +20,21 @@ def loss(x, y):
 def gradient(x, y):  # d_loss/d_w
     return 2 * x * (x * w - y)
 
+
 # Before training
-print("predict (before training)", 4, forward(4))
+print("Prediction (before training)", 4, forward(4))
 
 # Training loop
 for epoch in range(10):
     for x_val, y_val in zip(x_data, y_data):
+        # Compute the derivative of the loss w.r.t. the weight
+        # Update the weight
+        # Compute the loss and print progress
         grad = gradient(x_val, y_val)
         w = w - 0.01 * grad
         print("\tgrad: ", x_val, y_val, round(grad, 2))
         l = loss(x_val, y_val)
-
     print("progress:", epoch, "w=", round(w, 2), "loss=", round(l, 2))
 
 # After training
-print("predict (after training)", "4 hours", forward(4))
+print("Predicted score (after training)", "4 hours of studying: ", forward(4))
diff --git a/03_auto_gradient.py b/03_auto_gradient.py
index fc44021..9689703 100644
--- a/03_auto_gradient.py
+++ b/03_auto_gradient.py
@@ -1,39 +1,34 @@
 import torch
-from torch.autograd import Variable
+import pdb
 
 x_data = [1.0, 2.0, 3.0]
 y_data = [2.0, 4.0, 6.0]
-
-w = Variable(torch.Tensor([1.0]), requires_grad=True)  # Any random value
+w = torch.tensor([1.0], requires_grad=True)
 
 # our model forward pass
-
-
 def forward(x):
     return x * w
 
 # Loss function
-
-
-def loss(x, y):
-    y_pred = forward(x)
-    return (y_pred - y) * (y_pred - y)
+def loss(y_pred, y_val):
+    return (y_pred - y_val) ** 2
 
 # Before training
-print("predict (before training)", 4, forward(4).data[0])
+print("Prediction (before training)", 4, forward(4).item())
 
 # Training loop
for epoch in range(10):
     for x_val, y_val in zip(x_data, y_data):
-        l = loss(x_val, y_val)
-        l.backward()
-        print("\tgrad: ", x_val, y_val, w.grad.data[0])
-        w.data = w.data - 0.01 * w.grad.data
+        y_pred = forward(x_val)  # 1) Forward pass
+        l = loss(y_pred, y_val)  # 2) Compute loss
+        l.backward()  # 3) Back propagation to compute gradients
+        print("\tgrad: ", x_val, y_val, w.grad.item())
+        w.data = w.data - 0.01 * w.grad.item()
 
         # Manually zero the gradients after updating weights
         w.grad.data.zero_()
 
-    print("progress:", epoch, l.data[0])
+    print(f"Epoch: {epoch} | Loss: {l.item()}")
 
 # After training
-print("predict (after training)", 4, forward(4).data[0])
+print("Prediction (after training)", 4, forward(4).item())
diff --git a/05_linear_regression.py b/05_linear_regression.py
index 0afd430..84238a7 100644
--- a/05_linear_regression.py
+++ b/05_linear_regression.py
@@ -1,13 +1,12 @@
-
+from torch import nn
 import torch
-from torch.autograd import Variable
-
-x_data = Variable(torch.Tensor([[1.0], [2.0], [3.0]]))
-y_data = Variable(torch.Tensor([[2.0], [4.0], [6.0]]))
+from torch import tensor
+x_data = tensor([[1.0], [2.0], [3.0]])
+y_data = tensor([[2.0], [4.0], [6.0]])
 
 
-class Model(torch.nn.Module):
+class Model(nn.Module):
     def __init__(self):
         """
         In the constructor we instantiate two nn.Linear module
         """
@@ -24,24 +23,24 @@ def forward(self, x):
         y_pred = self.linear(x)
         return y_pred
 
+
 # our model
 model = Model()
 
-
 # Construct our loss function and an Optimizer. The call to model.parameters()
 # in the SGD constructor will contain the learnable parameters of the two
 # nn.Linear modules which are members of the model.
-criterion = torch.nn.MSELoss(size_average=False)
+criterion = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
 
 # Training loop
 for epoch in range(500):
-    # Forward pass: Compute predicted y by passing x to the model
+    # 1) Forward pass: Compute predicted y by passing x to the model
     y_pred = model(x_data)
 
-    # Compute and print loss
+    # 2) Compute and print loss
     loss = criterion(y_pred, y_data)
-    print(epoch, loss.data[0])
+    print(f'Epoch: {epoch} | Loss: {loss.item()} ')
 
     # Zero gradients, perform a backward pass, and update the weights.
     optimizer.zero_grad()
@@ -50,6 +49,6 @@ def forward(self, x):
 
 # After training
-hour_var = Variable(torch.Tensor([[4.0]]))
+hour_var = tensor([[4.0]])
 y_pred = model(hour_var)
-print("predict (after training)", 4, model(hour_var).data[0][0])
+print("Prediction (after training)", 4, model(hour_var).data[0][0].item())
diff --git a/06_logistic_regression.py b/06_logistic_regression.py
index 4d16c83..db903c6 100644
--- a/06_logistic_regression.py
+++ b/06_logistic_regression.py
@@ -1,47 +1,48 @@
-
-import torch
-from torch.autograd import Variable
+from torch import tensor
+from torch import nn
+from torch import sigmoid
 import torch.nn.functional as F
+import torch.optim as optim
 
-x_data = Variable(torch.Tensor([[1.0], [2.0], [3.0], [4.0]]))
-y_data = Variable(torch.Tensor([[0.], [0.], [1.], [1.]]))
-
+# Training data and ground truth
+x_data = tensor([[1.0], [2.0], [3.0], [4.0]])
+y_data = tensor([[0.], [0.], [1.], [1.]])
 
-class Model(torch.nn.Module):
+class Model(nn.Module):
     def __init__(self):
         """
         In the constructor we instantiate nn.Linear module
         """
         super(Model, self).__init__()
-        self.linear = torch.nn.Linear(1, 1)  # One in and one out
+        self.linear = nn.Linear(1, 1)  # One in and one out
 
     def forward(self, x):
         """
         In the forward function we accept a Variable of input data and we must
         return a Variable of output data.
         """
-        y_pred = F.sigmoid(self.linear(x))
+        y_pred = sigmoid(self.linear(x))
         return y_pred
 
+
 # our model
 model = Model()
 
-
 # Construct our loss function and an Optimizer. The call to model.parameters()
 # in the SGD constructor will contain the learnable parameters of the two
 # nn.Linear modules which are members of the model.
-criterion = torch.nn.BCELoss(size_average=True)
-optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+criterion = nn.BCELoss(reduction='mean')
+optimizer = optim.SGD(model.parameters(), lr=0.01)
 
 # Training loop
 for epoch in range(1000):
-    # Forward pass: Compute predicted y by passing x to the model
+    # Forward pass: Compute predicted y by passing x to the model
     y_pred = model(x_data)
 
     # Compute and print loss
     loss = criterion(y_pred, y_data)
-    print(epoch, loss.data[0])
+    print(f'Epoch {epoch + 1}/1000 | Loss: {loss.item():.4f}')
 
     # Zero gradients, perform a backward pass, and update the weights.
     optimizer.zero_grad()
@@ -49,7 +50,8 @@ def forward(self, x):
     optimizer.step()
 
 # After training
-hour_var = Variable(torch.Tensor([[1.0]]))
-print("predict 1 hour ", 1.0, model(hour_var).data[0][0] > 0.5)
-hour_var = Variable(torch.Tensor([[7.0]]))
-print("predict 7 hours", 7.0, model(hour_var).data[0][0] > 0.5)
+print(f'\nLet\'s predict the hours needed to score above 50%\n{"=" * 50}')
+hour_var = model(tensor([[1.0]]))
+print(f'Prediction for 1 hour of study: {hour_var.item():.4f} | Above 50%: {hour_var.item() > 0.5}')
+hour_var = model(tensor([[7.0]]))
+print(f'Prediction for 7 hours of study: {hour_var.item():.4f} | Above 50%: {hour_var.item() > 0.5}')
diff --git a/07_diabets_logistic.py b/07_diabets_logistic.py
index 14535cf..0a510ac 100644
--- a/07_diabets_logistic.py
+++ b/07_diabets_logistic.py
@@ -1,28 +1,23 @@
-
-import torch
-from torch.autograd import Variable
+from torch import nn, optim, from_numpy
 import numpy as np
 
 xy = np.loadtxt('./data/diabetes.csv.gz',
                 delimiter=',', dtype=np.float32)
-x_data = Variable(torch.from_numpy(xy[:, 0:-1]))
-y_data = Variable(torch.from_numpy(xy[:, [-1]]))
-
-print(x_data.data.shape)
-print(y_data.data.shape)
+x_data = from_numpy(xy[:, 0:-1])
+y_data = from_numpy(xy[:, [-1]])
+print(f'X\'s shape: {x_data.shape} | Y\'s shape: {y_data.shape}')
 
 
-class Model(torch.nn.Module):
-
+class Model(nn.Module):
     def __init__(self):
         """
         In the constructor we instantiate two nn.Linear module
         """
         super(Model, self).__init__()
-        self.l1 = torch.nn.Linear(8, 6)
-        self.l2 = torch.nn.Linear(6, 4)
-        self.l3 = torch.nn.Linear(4, 1)
+        self.l1 = nn.Linear(8, 6)
+        self.l2 = nn.Linear(6, 4)
+        self.l3 = nn.Linear(4, 1)
 
-        self.sigmoid = torch.nn.Sigmoid()
+        self.sigmoid = nn.Sigmoid()
 
     def forward(self, x):
         """
@@ -35,6 +30,7 @@ def forward(self, x):
         y_pred = self.sigmoid(self.l3(out2))
         return y_pred
 
+
 # our model
 model = Model()
 
@@ -42,17 +38,17 @@ def forward(self, x):
 # Construct our loss function and an Optimizer. The call to model.parameters()
 # in the SGD constructor will contain the learnable parameters of the two
 # nn.Linear modules which are members of the model.
-criterion = torch.nn.BCELoss(size_average=True)
-optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+criterion = nn.BCELoss(reduction='mean')
+optimizer = optim.SGD(model.parameters(), lr=0.1)
 
 # Training loop
 for epoch in range(100):
-    # Forward pass: Compute predicted y by passing x to the model
+    # Forward pass: Compute predicted y by passing x to the model
     y_pred = model(x_data)
 
     # Compute and print loss
     loss = criterion(y_pred, y_data)
-    print(epoch, loss.data[0])
+    print(f'Epoch: {epoch + 1}/100 | Loss: {loss.item():.4f}')
 
     # Zero gradients, perform a backward pass, and update the weights.
     optimizer.zero_grad()
diff --git a/08_1_dataset_loader.py b/08_1_dataset_loader.py
index aba0797..2921894 100644
--- a/08_1_dataset_loader.py
+++ b/08_1_dataset_loader.py
@@ -1,11 +1,9 @@
 # References
 # https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/01-basics/pytorch_basics/main.py
 # http://pytorch.org/tutorials/beginner/data_loading_tutorial.html#dataset-class
-import torch
-import numpy as np
-from torch.autograd import Variable
 from torch.utils.data import Dataset, DataLoader
-
+from torch import from_numpy, tensor
+import numpy as np
 
 class DiabetesDataset(Dataset):
     """ Diabetes dataset."""
@@ -15,8 +13,8 @@ def __init__(self):
         xy = np.loadtxt('./data/diabetes.csv.gz',
                         delimiter=',', dtype=np.float32)
         self.len = xy.shape[0]
-        self.x_data = torch.from_numpy(xy[:, 0:-1])
-        self.y_data = torch.from_numpy(xy[:, [-1]])
+        self.x_data = from_numpy(xy[:, 0:-1])
+        self.y_data = from_numpy(xy[:, [-1]])
 
     def __getitem__(self, index):
         return self.x_data[index], self.y_data[index]
@@ -37,7 +35,7 @@ def __len__(self):
         inputs, labels = data
 
         # wrap them in Variable
-        inputs, labels = Variable(inputs), Variable(labels)
+        inputs, labels = tensor(inputs), tensor(labels)
 
         # Run your training process
-        print(epoch, i, "inputs", inputs.data, "labels", labels.data)
+        print(f'Epoch: {epoch} | Batch: {i} | Inputs {inputs.data} | Labels {labels.data}')
diff --git a/08_2_dataset_loade_logistic.py b/08_2_dataset_loade_logistic.py
index 43ba9f4..ec43dbf 100644
--- a/08_2_dataset_loade_logistic.py
+++ b/08_2_dataset_loade_logistic.py
@@ -1,22 +1,20 @@
 # References
 # https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/01-basics/pytorch_basics/main.py
 # http://pytorch.org/tutorials/beginner/data_loading_tutorial.html#dataset-class
-import torch
-import numpy as np
-from torch.autograd import Variable
 from torch.utils.data import Dataset, DataLoader
+from torch import nn, from_numpy, optim
+import numpy as np
 
 
 class DiabetesDataset(Dataset):
     """ Diabetes dataset."""
-
     # Initialize your data, download, etc.
     def __init__(self):
         xy = np.loadtxt('./data/diabetes.csv.gz',
                         delimiter=',', dtype=np.float32)
         self.len = xy.shape[0]
-        self.x_data = torch.from_numpy(xy[:, 0:-1])
-        self.y_data = torch.from_numpy(xy[:, [-1]])
+        self.x_data = from_numpy(xy[:, 0:-1])
+        self.y_data = from_numpy(xy[:, [-1]])
 
     def __getitem__(self, index):
         return self.x_data[index], self.y_data[index]
@@ -32,18 +30,18 @@ def __len__(self):
                           num_workers=2)
 
 
-class Model(torch.nn.Module):
+class Model(nn.Module):
 
     def __init__(self):
         """
         In the constructor we instantiate two nn.Linear module
         """
         super(Model, self).__init__()
-        self.l1 = torch.nn.Linear(8, 6)
-        self.l2 = torch.nn.Linear(6, 4)
-        self.l3 = torch.nn.Linear(4, 1)
+        self.l1 = nn.Linear(8, 6)
+        self.l2 = nn.Linear(6, 4)
+        self.l3 = nn.Linear(4, 1)
 
-        self.sigmoid = torch.nn.Sigmoid()
+        self.sigmoid = nn.Sigmoid()
 
     def forward(self, x):
         """
@@ -56,15 +54,15 @@ def forward(self, x):
         y_pred = self.sigmoid(self.l3(out2))
         return y_pred
 
+
 # our model
 model = Model()
 
-
 # Construct our loss function and an Optimizer. The call to model.parameters()
 # in the SGD constructor will contain the learnable parameters of the two
 # nn.Linear modules which are members of the model.
-criterion = torch.nn.BCELoss(size_average=True)
-optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+criterion = nn.BCELoss(reduction='sum')
+optimizer = optim.SGD(model.parameters(), lr=0.1)
 
 # Training loop
 for epoch in range(2):
@@ -72,15 +70,12 @@ def forward(self, x):
         # get the inputs
         inputs, labels = data
 
-        # wrap them in Variable
-        inputs, labels = Variable(inputs), Variable(labels)
-
         # Forward pass: Compute predicted y by passing x to the model
         y_pred = model(inputs)
 
         # Compute and print loss
         loss = criterion(y_pred, labels)
-        print(epoch, i, loss.data[0])
+        print(f'Epoch {epoch + 1} | Batch: {i+1} | Loss: {loss.item():.4f}')
 
         # Zero gradients, perform a backward pass, and update the weights.
         optimizer.zero_grad()
diff --git a/09_01_softmax_loss.py b/09_01_softmax_loss.py
index aed1267..ffea7a9 100644
--- a/09_01_softmax_loss.py
+++ b/09_01_softmax_loss.py
@@ -1,23 +1,16 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torchvision import datasets, transforms
-from torch.autograd import Variable
-
+from torch import nn, tensor, max
+import numpy as np
 # Cross entropy example
-import numpy as np
 # One hot
 # 0: 1 0 0
 # 1: 0 1 0
 # 2: 0 0 1
 Y = np.array([1, 0, 0])
-
 Y_pred1 = np.array([0.7, 0.2, 0.1])
 Y_pred2 = np.array([0.1, 0.3, 0.6])
-print("loss1 = ", np.sum(-Y * np.log(Y_pred1)))
-print("loss2 = ", np.sum(-Y * np.log(Y_pred2)))
+print(f'Loss1: {np.sum(-Y * np.log(Y_pred1)):.4f}')
+print(f'Loss2: {np.sum(-Y * np.log(Y_pred2)):.4f}')
 
 # Softmax + CrossEntropy (logSoftmax + NLLLoss)
 loss = nn.CrossEntropyLoss()
@@ -25,38 +18,35 @@
 # target is of size nBatch
 # each element in target has to have 0 <= value < nClasses (0-2)
 # Input is class, not one-hot
-Y = Variable(torch.LongTensor([0]), requires_grad=False)
+Y = tensor([0], requires_grad=False)
 
 # input is of size nBatch x nClasses = 1 x 4
 # Y_pred are logits (not softmax)
-Y_pred1 = Variable(torch.Tensor([[2.0, 1.0, 0.1]]))
-Y_pred2 = Variable(torch.Tensor([[0.5, 2.0, 0.3]]))
+Y_pred1 = tensor([[2.0, 1.0, 0.1]])
+Y_pred2 = tensor([[0.5, 2.0, 0.3]])
 
 l1 = loss(Y_pred1, Y)
 l2 = loss(Y_pred2, Y)
 
-print("PyTorch Loss1 = ", l1.data, "\nPyTorch Loss2=", l2.data)
-
-print("Y_pred1=", torch.max(Y_pred1.data, 1)[1])
-print("Y_pred2=", torch.max(Y_pred2.data, 1)[1])
+print(f'PyTorch Loss1: {l1.item():.4f} \nPyTorch Loss2: {l2.item():.4f}')
+print(f'Y_pred1: {max(Y_pred1.data, 1)[1].item()}')
+print(f'Y_pred2: {max(Y_pred2.data, 1)[1].item()}')
 
 # target is of size nBatch
 # each element in target has to have 0 <= value < nClasses (0-2)
 # Input is class, not one-hot
-Y = Variable(torch.LongTensor([2, 0, 1]), requires_grad=False)
+Y = tensor([2, 0, 1], requires_grad=False)
 
 # input is of size nBatch x nClasses = 2 x 4
 # Y_pred are logits (not softmax)
-Y_pred1 = Variable(torch.Tensor([[0.1, 0.2, 0.9],
-                                 [1.1, 0.1, 0.2],
-                                 [0.2, 2.1, 0.1]]))
-
-Y_pred2 = Variable(torch.Tensor([[0.8, 0.2, 0.3],
-                                 [0.2, 0.3, 0.5],
-                                 [0.2, 0.2, 0.5]]))
+Y_pred1 = tensor([[0.1, 0.2, 0.9],
+                  [1.1, 0.1, 0.2],
+                  [0.2, 2.1, 0.1]])
+Y_pred2 = tensor([[0.8, 0.2, 0.3],
+                  [0.2, 0.3, 0.5],
+                  [0.2, 0.2, 0.5]])
 
 l1 = loss(Y_pred1, Y)
 l2 = loss(Y_pred2, Y)
-
-print("Batch Loss1 = ", l1.data, "\nBatch Loss2=", l2.data)
+print(f'Batch Loss1: {l1.item():.4f} \nBatch Loss2: {l2.item():.4f}')
diff --git a/09_2_softmax_mnist.py b/09_2_softmax_mnist.py
index 48d2483..9e035a3 100644
--- a/09_2_softmax_mnist.py
+++ b/09_2_softmax_mnist.py
@@ -1,14 +1,15 @@
 # https://github.com/pytorch/examples/blob/master/mnist/main.py
 from __future__ import print_function
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
+from torch import nn, optim, cuda
+from torch.utils import data
 from torchvision import datasets, transforms
-from torch.autograd import Variable
+import torch.nn.functional as F
+import time
 
 # Training settings
 batch_size = 64
+device = 'cuda' if cuda.is_available() else 'cpu'
+print(f'Training MNIST Model on {device}\n{"=" * 44}')
 
 # MNIST Dataset
 train_dataset = datasets.MNIST(root='./mnist_data/',
@@ -21,11 +22,11 @@
                                transform=transforms.ToTensor())
 
 # Data Loader (Input Pipeline)
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+train_loader = data.DataLoader(dataset=train_dataset,
                                            batch_size=batch_size,
                                            shuffle=True)
 
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+test_loader = data.DataLoader(dataset=test_dataset,
                                           batch_size=batch_size,
                                           shuffle=False)
 
@@ -50,7 +51,7 @@ def forward(self, x):
 
 model = Net()
-
+model.to(device)
 criterion = nn.CrossEntropyLoss()
 optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
 
@@ -58,16 +59,16 @@ def forward(self, x):
 def train(epoch):
     model.train()
     for batch_idx, (data, target) in enumerate(train_loader):
-        data, target = Variable(data), Variable(target)
+        data, target = data.to(device), target.to(device)
         optimizer.zero_grad()
         output = model(data)
         loss = criterion(output, target)
         loss.backward()
         optimizer.step()
         if batch_idx % 10 == 0:
-            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+            print('Train Epoch: {} | Batch Status: {}/{} ({:.0f}%) | Loss: {:.6f}'.format(
                 epoch, batch_idx * len(data), len(train_loader.dataset),
-                100. * batch_idx / len(train_loader), loss.data[0]))
+                100. * batch_idx / len(train_loader), loss.item()))
 
 
 def test():
@@ -75,20 +76,30 @@ def test():
     test_loss = 0
     correct = 0
     for data, target in test_loader:
-        data, target = Variable(data, volatile=True), Variable(target)
+        data, target = data.to(device), target.to(device)
         output = model(data)
         # sum up batch loss
-        test_loss += criterion(output, target).data[0]
+        test_loss += criterion(output, target).item()
         # get the index of the max
         pred = output.data.max(1, keepdim=True)[1]
         correct += pred.eq(target.data.view_as(pred)).cpu().sum()
 
     test_loss /= len(test_loader.dataset)
-    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
-        test_loss, correct, len(test_loader.dataset),
-        100. * correct / len(test_loader.dataset)))
+    print(f'===========================\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '
+          f'({100. * correct / len(test_loader.dataset):.0f}%)')
+
+
+if __name__ == '__main__':
+    since = time.time()
+    for epoch in range(1, 10):
+        epoch_start = time.time()
+        train(epoch)
+        m, s = divmod(time.time() - epoch_start, 60)
+        print(f'Training time: {m:.0f}m {s:.0f}s')
+        test()
+        m, s = divmod(time.time() - epoch_start, 60)
+        print(f'Testing time: {m:.0f}m {s:.0f}s')
+    m, s = divmod(time.time() - since, 60)
+    print(f'Total Time: {m:.0f}m {s:.0f}s\nModel was trained on {device}!')
 
-for epoch in range(1, 10):
-    train(epoch)
-    test()
diff --git a/10_1_cnn_mnist.py b/10_1_cnn_mnist.py
index 547c477..3f851f9 100644
--- a/10_1_cnn_mnist.py
+++ b/10_1_cnn_mnist.py
@@ -66,7 +66,7 @@ def train(epoch):
         if batch_idx % 10 == 0:
             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                 epoch, batch_idx * len(data), len(train_loader.dataset),
-                100. * batch_idx / len(train_loader), loss.data[0]))
+                100. * batch_idx / len(train_loader), loss.item()))
 
 
 def test():
@@ -77,7 +77,7 @@ def test():
         data, target = Variable(data, volatile=True), Variable(target)
         output = model(data)
         # sum up batch loss
-        test_loss += F.nll_loss(output, target, size_average=False).data[0]
+        test_loss += F.nll_loss(output, target, size_average=False).item()
         # get the index of the max log-probability
         pred = output.data.max(1, keepdim=True)[1]
         correct += pred.eq(target.data.view_as(pred)).cpu().sum()
diff --git a/12_1_rnn_basics.py b/12_1_rnn_basics.py
index 2ec10a2..d26ceb0 100644
--- a/12_1_rnn_basics.py
+++ b/12_1_rnn_basics.py
@@ -11,9 +11,8 @@
 # One cell RNN input_dim (4) -> output_dim (2). sequence: 5
 cell = nn.RNN(input_size=4, hidden_size=2, batch_first=True)
 
-# (num_layers * num_directions, batch, hidden_size)
-# (batch, num_layers * num_directions, hidden_size) for batch_first=True
-hidden = (Variable(torch.randn(1, 1, 2)))
+# (num_layers * num_directions, batch, hidden_size) whether batch_first=True or False
+hidden = Variable(torch.randn(1, 1, 2))
 
 # Propagate input through RNN
 # Input: (batch, seq_len, input_size) when batch_first=True
@@ -32,6 +31,9 @@
 print("sequence input size", inputs.size(), "out size", out.size())
 
+# hidden : (num_layers * num_directions, batch, hidden_size) whether batch_first=True or False
+hidden = Variable(torch.randn(1, 3, 2))
+
 # One cell RNN input_dim (4) -> output_dim (2). sequence: 5, batch 3
 # 3 batches 'hello', 'eolll', 'lleel'
 # rank = (3, 5, 4)
@@ -50,7 +52,7 @@
 cell = nn.RNN(input_size=4, hidden_size=2)
 
 # The given dimensions dim0 and dim1 are swapped.
-inputs = inputs.transpose(3, dim1=1, dim2=2)
+inputs = inputs.transpose(dim0=0, dim1=1)
 # Propagate input through RNN
 # Input: (seq_len, batch_size, input_size) when batch_first=False (default)
 # S x B x I
diff --git a/12_2_hello_rnn.py b/12_2_hello_rnn.py
index 848a43a..d196472 100644
--- a/12_2_hello_rnn.py
+++ b/12_2_hello_rnn.py
@@ -44,14 +44,14 @@ def forward(self, hidden, x):
 
         # Propagate input through RNN
         # Input: (batch, seq_len, input_size)
-        # hidden: (batch, num_layers * num_directions, hidden_size)
+        # hidden: (num_layers * num_directions, batch, hidden_size)
         out, hidden = self.rnn(x, hidden)
         return hidden, out.view(-1, num_classes)
 
     def init_hidden(self):
         # Initialize hidden and cell states
-        # (batch, num_layers * num_directions, hidden_size) for batch_first=True
-        return Variable(torch.zeros(batch_size, num_layers, hidden_size))
+        # (num_layers * num_directions, batch, hidden_size)
+        return Variable(torch.zeros(num_layers, batch_size, hidden_size))
 
 
 # Instantiate RNN model
@@ -75,9 +75,9 @@ def init_hidden(self):
         hidden, output = model(hidden, input)
         val, idx = output.max(1)
         sys.stdout.write(idx2char[idx.data[0]])
-        loss += criterion(output, label)
+        loss += criterion(output, torch.LongTensor([label]))
 
-    print(", epoch: %d, loss: %1.3f" % (epoch + 1, loss.data[0]))
+    print(", epoch: %d, loss: %1.3f" % (epoch + 1, loss.item()))
     loss.backward()
     optimizer.step()
diff --git a/12_3_hello_rnn_seq.py b/12_3_hello_rnn_seq.py
index 29f90d5..5a0c2f2 100644
--- a/12_3_hello_rnn_seq.py
+++ b/12_3_hello_rnn_seq.py
@@ -46,16 +46,16 @@ def __init__(self, num_classes, input_size, hidden_size, num_layers):
 
     def forward(self, x):
         # Initialize hidden and cell states
-        # (batch, num_layers * num_directions, hidden_size) for batch_first=True
+        # (num_layers * num_directions, batch, hidden_size) for batch_first=True
         h_0 = Variable(torch.zeros(
-            x.size(0), self.num_layers, self.hidden_size))
+            self.num_layers, x.size(0), self.hidden_size))
 
         # Reshape input
         x.view(x.size(0), self.sequence_length, self.input_size)
 
         # Propagate input through RNN
         # Input: (batch, seq_len, input_size)
-        # h_0: (batch, num_layers * num_directions, hidden_size)
+        # h_0: (num_layers * num_directions, batch, hidden_size)
         out, _ = self.rnn(x, h_0)
         return out.view(-1, num_classes)
diff --git a/12_4_hello_rnn_emb.py b/12_4_hello_rnn_emb.py
index 71fa85c..aa783a1 100644
--- a/12_4_hello_rnn_emb.py
+++ b/12_4_hello_rnn_emb.py
@@ -27,8 +27,10 @@
 class Model(nn.Module):
 
-    def __init__(self):
+    def __init__(self, num_layers, hidden_size):
         super(Model, self).__init__()
+        self.num_layers = num_layers
+        self.hidden_size = hidden_size
         self.embedding = nn.Embedding(input_size, embedding_size)
         self.rnn = nn.RNN(input_size=embedding_size,
                           hidden_size=5, batch_first=True)
@@ -36,22 +38,22 @@ def __init__(self):
 
     def forward(self, x):
         # Initialize hidden and cell states
-        # (batch, num_layers * num_directions, hidden_size) for batch_first=True
+        # (num_layers * num_directions, batch, hidden_size)
         h_0 = Variable(torch.zeros(
-            x.size(0), num_layers, hidden_size))
+            self.num_layers, x.size(0), self.hidden_size))
 
         emb = self.embedding(x)
         emb = emb.view(batch_size, sequence_length, -1)
 
         # Propagate embedding through RNN
         # Input: (batch, seq_len, embedding_size)
-        # h_0: (batch, num_layers * num_directions, hidden_size)
+        # h_0: (num_layers * num_directions, batch, hidden_size)
         out, _ = self.rnn(emb, h_0)
         return self.fc(out.view(-1, num_classes))
 
 
 # Instantiate RNN model
-model = Model()
+model = Model(num_layers, hidden_size)
 print(model)
 
 # Set loss and optimizer function
@@ -69,7 +71,7 @@ def forward(self, x):
     _, idx = outputs.max(1)
     idx = idx.data.numpy()
     result_str = [idx2char[c] for c in idx.squeeze()]
-    print("epoch: %d, loss: %1.3f" % (epoch + 1, loss.data[0]))
+    print("epoch: %d, loss: %1.3f" % (epoch + 1, loss.item()))
     print("Predicted string: ", ''.join(result_str))
 
 print("Learning finished!")
diff --git a/requirements.txt b/requirements.txt
index 0d2815e..db9a191 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 #nonsml: digitalgenius/ubuntu-pytorch
 #varunagrawal/pytorch
-httplib2==0.10.3
+httplib2==0.18.0
 matplotlib==2.0.0
 numpy==1.13.3
 torch