diff --git a/README.md b/README.md
index 0f11bd35..a866454f 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@
 This is the PyTorch implementation of the [RotatE](https://openreview.net/forum?id=HkgEQnRqYQ) model for knowledge graph embedding (KGE). We provide a toolkit that gives state-of-the-art performance of several popular KGE models. The toolkit is quite efficient, which is able to train a large KGE model within a few hours on a single GPU.
 
+A faster multi-GPU implementation of RotatE and other KGE models is available in [GraphVite](https://github.com/DeepGraphLearning/graphvite).
+
 **Implemented features**
 
 Models:
 
diff --git a/best_config.sh b/best_config.sh
index f509de3a..5dad806b 100755
--- a/best_config.sh
+++ b/best_config.sh
@@ -41,11 +41,11 @@ bash run.sh train ComplEx countries_S3 0 0 512 64 1000 1.0 1.0 0.000002 40000 8
 #
 # Best Configuration for DistMult
 #
-bash run.sh train DistMult FB15k 0 0 1024 256 1000 500.0 1.0 0.001 150000 16 -de -dr -r 0.000002
-bash run.sh train DistMult FB15k-237 0 0 1024 256 1000 200.0 1.0 0.001 100000 16 -de -dr -r 0.00001
-bash run.sh train DistMult wn18 0 0 512 1024 500 200.0 1.0 0.001 80000 8 -de -dr -r 0.00001
-bash run.sh train DistMult wn18rr 0 0 512 1024 500 200.0 1.0 0.002 80000 8 -de -dr -r 0.000005
-bash run.sh train DistMult countries_S1 0 0 512 64 1000 1.0 1.0 0.000002 40000 8 -de -dr -r 0.0005 --countries
-bash run.sh train DistMult countries_S2 0 0 512 64 1000 1.0 1.0 0.000002 40000 8 -de -dr -r 0.0005 --countries
-bash run.sh train DistMult countries_S3 0 0 512 64 1000 1.0 1.0 0.000002 40000 8 -de -dr -r 0.0005 --countries
-#
\ No newline at end of file
+bash run.sh train DistMult FB15k 0 0 1024 256 2000 500.0 1.0 0.001 150000 16 -r 0.000002
+bash run.sh train DistMult FB15k-237 0 0 1024 256 2000 200.0 1.0 0.001 100000 16 -r 0.00001
+bash run.sh train DistMult wn18 0 0 512 1024 1000 200.0 1.0 0.001 80000 8 -r 0.00001
+bash run.sh train DistMult wn18rr 0 0 512 1024 1000 200.0 1.0 0.002 80000 8 -r 0.000005
+bash run.sh train DistMult countries_S1 0 0 512 64 2000 1.0 1.0 0.000002 40000 8 -r 0.0005 --countries
+bash run.sh train DistMult countries_S2 0 0 512 64 2000 1.0 1.0 0.000002 40000 8 -r 0.0005 --countries
+bash run.sh train DistMult countries_S3 0 0 512 64 2000 1.0 1.0 0.000002 40000 8 -r 0.0005 --countries
+#
diff --git a/codes/dataloader.py b/codes/dataloader.py
index 70d43a25..ed3f3492 100644
--- a/codes/dataloader.py
+++ b/codes/dataloader.py
@@ -59,8 +59,8 @@ def __getitem__(self, idx):
 
         negative_sample = np.concatenate(negative_sample_list)[:self.negative_sample_size]
 
-        negative_sample = torch.from_numpy(negative_sample)
-
+        negative_sample = torch.LongTensor(negative_sample)
+
         positive_sample = torch.LongTensor(positive_sample)
 
         return positive_sample, negative_sample, subsampling_weight, self.mode
@@ -181,4 +181,4 @@ def one_shot_iterator(dataloader):
         '''
         while True:
             for data in dataloader:
-                yield data
\ No newline at end of file
+                yield data
diff --git a/codes/model.py b/codes/model.py
index 30762313..2459e71a 100644
--- a/codes/model.py
+++ b/codes/model.py
@@ -75,7 +75,7 @@ def forward(self, sample, mode='single'):
         In the 'head-batch' or 'tail-batch' mode, sample consists two part.
         The first part is usually the positive sample.
         And the second part is the entities in the negative samples.
-        Becuase negative samples and positive samples usually share two elements
+        Because negative samples and positive samples usually share two elements
         in their triple ((head, relation) or (relation, tail)).
         '''
 
@@ -267,7 +267,7 @@ def train_step(model, optimizer, train_iterator, args):
         negative_score = model((positive_sample, negative_sample), mode=mode)
 
         if args.negative_adversarial_sampling:
-            #In self-negative sampling, we do not apply back-propagation on the sampling weight
+            #In self-adversarial sampling, we do not apply back-propagation on the sampling weight
             negative_score = (F.softmax(negative_score * args.adversarial_temperature, dim = 1).detach()
                               * F.logsigmoid(-negative_score)).sum(dim = 1)
         else:
@@ -278,8 +278,8 @@ def train_step(model, optimizer, train_iterator, args):
         positive_score = F.logsigmoid(positive_score).squeeze(dim = 1)
 
         if args.uni_weight:
-            positive_sample_loss = positive_score.mean()
-            negative_sample_loss = negative_score.mean()
+            positive_sample_loss = - positive_score.mean()
+            negative_sample_loss = - negative_score.mean()
         else:
             positive_sample_loss = - (subsampling_weight * positive_score).sum()/subsampling_weight.sum()
             negative_sample_loss = - (subsampling_weight * negative_score).sum()/subsampling_weight.sum()
diff --git a/codes/run.py b/codes/run.py
index 9cc7d2e9..457c6fdf 100644
--- a/codes/run.py
+++ b/codes/run.py
@@ -284,7 +284,6 @@ def main(args):
 
     logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
-    logging.info('learning_rate = %d' % current_learning_rate)
     logging.info('batch_size = %d' % args.batch_size)
     logging.info('negative_adversarial_sampling = %d' % args.negative_adversarial_sampling)
     logging.info('hidden_dim = %d' % args.hidden_dim)
@@ -296,6 +295,8 @@ def main(args):
     # Set valid dataloader as it would be evaluated during training
 
     if args.do_train:
+        logging.info('learning_rate = %d' % current_learning_rate)
+
         training_logs = []
 
         #Training Loop
@@ -357,4 +358,4 @@ def main(args):
         log_metrics('Test', step, metrics)
 
 if __name__ == '__main__':
-    main(parse_args())
\ No newline at end of file
+    main(parse_args())
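Note on the `uni_weight` sign fix and the self-adversarial comment in the `codes/model.py` hunks above: `F.logsigmoid` always returns values <= 0, so both loss terms need a leading minus to give a non-negative quantity that gradient descent can minimize; the non-uniform (`subsampling_weight`) branch already had it. Below is a minimal standalone sketch, not part of the diff, of how the loss reads after this change; the function name, tensor shapes, and temperature value are illustrative assumptions.

```python
import torch
import torch.nn.functional as F

def self_adversarial_loss(positive_score, negative_score, adversarial_temperature=1.0):
    # Self-adversarial weighting: harder negatives get larger weights,
    # detached so no gradient flows through the sampling weights.
    weights = F.softmax(negative_score * adversarial_temperature, dim=1).detach()
    negative_score = (weights * F.logsigmoid(-negative_score)).sum(dim=1)
    positive_score = F.logsigmoid(positive_score)

    # uni_weight branch after the fix: the leading minus turns log-likelihoods
    # (always <= 0) into a non-negative loss to minimize.
    positive_sample_loss = -positive_score.mean()
    negative_sample_loss = -negative_score.mean()

    # Average of the two terms, as train_step does.
    return (positive_sample_loss + negative_sample_loss) / 2

# Example: 16 positive triples, 128 negatives per triple (assumed shapes).
pos = torch.randn(16)
neg = torch.randn(16, 128)
print(self_adversarial_loss(pos, neg).item())
```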