Commit a73d9c3

Support for Python 3, PyTorch 1.3+, CPU, ship weights
1 parent 8c5862e commit a73d9c3

8 files changed: 39 additions & 9 deletions

README.md

Lines changed: 13 additions & 0 deletions

@@ -1,5 +1,18 @@
 # ImageCaptioning.pytorch
 
+This is a fork of [Ruotian Luo's ImageCaptioning repo](https://github.com/ruotianluo/ImageCaptioning.pytorch), adapted for the Deep Learning with PyTorch book (Manning).
+
+Notable changes:
+
+* Python 3.6+
+* PyTorch 1.3+
+* CPU and GPU support
+* a set of weights is provided in the repo to facilitate getting up to speed
+
+Following are the original notes.
+
+# ImageCaptioning.pytorch
+
 This is an image captioning codebase in PyTorch. If you are familiar with neuraltalk2, here are the differences compared to neuraltalk2.
 - Instead of using random split, we use [karpathy's train-val-test split](http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip).
 - Instead of including the convnet in the model, we use preprocessed features. (finetuneable cnn version is in the branch **with_finetune**)
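
The "CPU and GPU support" bullet is implemented throughout the diffs below with a single device-selection idiom. A minimal sketch of that pattern (the `nn.Linear` model here is just a stand-in):

```python
import torch
import torch.nn as nn

# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = nn.Linear(10, 2).to(device=device)   # hypothetical model, moved once at setup
x = torch.randn(4, 10).to(device=device)     # inputs moved the same way
y = model(x)                                  # runs on whichever device was selected
```

Because `.to()` accepts `'cpu'` as well as `'cuda'`, the same line works on machines without a GPU, which the original `.cuda()` calls did not.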
Two binary files added (85.8 MB and 84.6 MB), contents not shown; presumably the shipped weight chunks that eval.py below reassembles.

dataloaderraw.py

Lines changed: 4 additions & 2 deletions

@@ -22,6 +22,8 @@
 from misc.resnet_utils import myResnet
 import misc.resnet
 
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
 class DataLoaderRaw():
 
     def __init__(self, opt):
@@ -37,7 +39,7 @@ def __init__(self, opt):
         self.my_resnet = getattr(misc.resnet, self.cnn_model)()
         self.my_resnet.load_state_dict(torch.load('./data/imagenet_weights/'+self.cnn_model+'.pth'))
         self.my_resnet = myResnet(self.my_resnet)
-        self.my_resnet.cuda()
+        self.my_resnet.to(device=device)
         self.my_resnet.eval()
 
 
@@ -108,7 +110,7 @@ def get_batch(self, split, batch_size=None):
                 img = np.concatenate((img, img, img), axis=2)
 
             img = img.astype('float32')/255.0
-            img = torch.from_numpy(img.transpose([2, 0, 1])).cuda()
+            img = torch.from_numpy(img.transpose([2, 0, 1])).to(device=device)
             with torch.no_grad():
                 img = Variable(preprocess(img))
                 tmp_fc, tmp_att = self.my_resnet(img)
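
The `get_batch` hunk above converts a raw image to a normalized CHW tensor on the chosen device before running the feature extractor. A rough, self-contained sketch of that path, using a plain torchvision ResNet as a stand-in for the repo's `myResnet` wrapper (the names and shapes here are made up):

```python
import numpy as np
import torch
from torchvision import models, transforms

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ImageNet normalization, standing in for the `preprocess` used in dataloaderraw.py
preprocess = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                  std=[0.229, 0.224, 0.225])
cnn = models.resnet101().to(device=device).eval()    # illustrative, randomly initialized

def image_to_input(img):
    """Turn an HxWxC uint8 array into a normalized CHW float tensor on `device`."""
    if len(img.shape) == 2:                           # grayscale: repeat channel to fake RGB
        img = img[:, :, np.newaxis]
        img = np.concatenate((img, img, img), axis=2)
    img = img.astype('float32') / 255.0               # scale to [0, 1]
    img = torch.from_numpy(img.transpose([2, 0, 1])).to(device=device)  # HWC -> CHW
    return preprocess(img)

with torch.no_grad():                                 # inference only, no autograd graph
    img = image_to_input(np.zeros((256, 256, 3), dtype=np.uint8))
    out = cnn(img.unsqueeze(0))                       # add a batch dimension
```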

eval.py

Lines changed: 13 additions & 3 deletions

@@ -18,6 +18,7 @@
 import argparse
 import misc.utils as utils
 import torch
+import shutil
 
 # Input arguments and options
 parser = argparse.ArgumentParser()
@@ -73,9 +74,19 @@
 
 opt = parser.parse_args()
 
+cnn_model_weights = './data/imagenet_weights/' + opt.cnn_model + '.pth'
+if not os.path.isfile(cnn_model_weights):
+    chunk1 = './data/imagenet_weights/' + opt.cnn_model + 'a.pth'
+    chunk2 = './data/imagenet_weights/' + opt.cnn_model + 'b.pth'
+    with open(cnn_model_weights,'wb') as destination:
+        with open(chunk1,'rb') as source:
+            shutil.copyfileobj(source, destination)
+        with open(chunk2,'rb') as source:
+            shutil.copyfileobj(source, destination)
+
 # Load infos
 with open(opt.infos_path, 'rb') as f:
-    infos = cPickle.load(f)
+    infos = cPickle.load(f, encoding='latin1')
 
 # override and collect parameters
 if len(opt.input_fc_dir) == 0:
@@ -100,8 +111,7 @@
 
 # Setup the model
 model = models.setup(opt)
-model.load_state_dict(torch.load(opt.model))
-model.cuda()
+model.load_state_dict(torch.load(opt.model, map_location=torch.device(device)))
 model.eval()
 crit = utils.LanguageModelCriterion()
 
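
Three things change in eval.py: the shipped CNN weights arrive as two chunks that get concatenated back into one `.pth` file (presumably to stay under GitHub's per-file size limit), the Python 2 pickle of `infos` is read with `encoding='latin1'`, and the caption model checkpoint is loaded with `map_location` so GPU-saved tensors can be read on a CPU-only machine. A small sketch of the same three moves, with hypothetical file names:

```python
import os
import pickle
import shutil
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

weights = './data/imagenet_weights/resnet101.pth'       # hypothetical path
if not os.path.isfile(weights):
    # The repo ships '<name>a.pth' and '<name>b.pth'; byte-wise concatenation
    # restores the original file.
    with open(weights, 'wb') as destination:
        for chunk in (weights[:-4] + 'a.pth', weights[:-4] + 'b.pth'):
            with open(chunk, 'rb') as source:
                shutil.copyfileobj(source, destination)

# encoding='latin1' lets Python 3 read a pickle written by Python 2's cPickle
with open('./data/infos.pkl', 'rb') as f:               # hypothetical path
    infos = pickle.load(f, encoding='latin1')

# map_location remaps CUDA storages, so the checkpoint also loads without a GPU
state_dict = torch.load('./data/model.pth',             # hypothetical path
                        map_location=torch.device(device))
```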

eval_utils.py

Lines changed: 4 additions & 2 deletions

@@ -16,6 +16,8 @@
 import sys
 import misc.utils as utils
 
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
 def language_eval(dataset, preds, model_id, split):
     import sys
     if 'coco' in dataset:
@@ -86,7 +88,7 @@ def eval_split(model, crit, loader, eval_kwargs={}):
             # forward the model to get loss
             tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
             with torch.no_grad():
-                tmp = [Variable(torch.from_numpy(_)).cuda() for _ in tmp]
+                tmp = [Variable(torch.from_numpy(_)).to(device=device) for _ in tmp]
                 fc_feats, att_feats, labels, masks = tmp
 
                 loss = crit(model(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:]).item()
@@ -98,7 +100,7 @@ def eval_split(model, crit, loader, eval_kwargs={}):
         tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
             data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
         with torch.no_grad():
-            tmp = [Variable(torch.from_numpy(_)).cuda() for _ in tmp]
+            tmp = [Variable(torch.from_numpy(_)).to(device=device) for _ in tmp]
             fc_feats, att_feats = tmp
         # forward the model to also get generated samples for each image
         seq, _ = model.sample(fc_feats, att_feats, eval_kwargs)
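
Both hunks above apply the same conversion: numpy feature arrays become tensors on the chosen device inside a `torch.no_grad()` block (the `Variable` wrapper is a holdover from pre-0.4 PyTorch and is kept here only for compatibility). A tiny sketch with made-up arrays:

```python
import numpy as np
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Stand-ins for data['fc_feats'] and data['labels']
fc_feats_np = np.random.rand(5, 2048).astype('float32')
labels_np = np.zeros((5, 18), dtype='int64')

with torch.no_grad():   # evaluation only, so no autograd bookkeeping
    fc_feats, labels = [torch.from_numpy(a).to(device=device)
                        for a in (fc_feats_np, labels_np)]
```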

models/CaptionModel.py

Lines changed: 2 additions & 1 deletion

@@ -15,6 +15,7 @@
 from torch.autograd import *
 import misc.utils as utils
 
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 class CaptionModel(nn.Module):
     def __init__(self):
@@ -121,7 +122,7 @@ def beam_step(logprobsf, beam_size, t, beam_seq, beam_seq_logprobs, beam_logprob
 
             # encode as vectors
             it = beam_seq[t]
-            logprobs, state = self.get_logprobs_state(Variable(it.cuda()), *(args + (state,)))
+            logprobs, state = self.get_logprobs_state(Variable(it.to(device=device)), *(args + (state,)))
 
         done_beams = sorted(done_beams, key=lambda x: -x['p'])[:beam_size]
         return done_beams
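
For context on the changed line: `beam_step` is part of a beam search that keeps the `beam_size` highest-scoring partial captions at each step and feeds each survivor's last token (now moved to `device`) back through the model. A stripped-down illustration of that bookkeeping, with a made-up `next_logprobs` callable standing in for `get_logprobs_state` (the real code also threads RNN state and per-step log-prob histories through):

```python
import torch

def simple_beam_search(next_logprobs, beam_size, max_len, vocab_size):
    """Keep the beam_size partial sequences with the highest cumulative log-prob."""
    seqs = torch.zeros(beam_size, 0, dtype=torch.long)    # one growing row per beam
    scores = torch.zeros(beam_size)                        # cumulative log-probs
    for t in range(max_len):
        logprobs = next_logprobs(seqs)                     # (beam_size, vocab_size)
        if t == 0:
            # all beams are identical at the first step, so expand just one of them
            scores, words = logprobs[0].topk(beam_size)
            beams = torch.zeros(beam_size, dtype=torch.long)
        else:
            # score every (beam, next word) pair, keep the best beam_size of them
            cand = (scores.unsqueeze(1) + logprobs).view(-1)
            scores, flat = cand.topk(beam_size)
            beams, words = flat // vocab_size, flat % vocab_size
        seqs = torch.cat([seqs[beams], words.unsqueeze(1)], dim=1)
    return seqs, scores

# Toy usage: a uniform random "model" over a 10-word vocabulary
dummy = lambda seqs: torch.log_softmax(torch.randn(seqs.size(0), 10), dim=-1)
seqs, scores = simple_beam_search(dummy, beam_size=3, max_len=5, vocab_size=10)
```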

models/FCModel.py

Lines changed: 3 additions & 1 deletion

@@ -10,6 +10,8 @@
 
 from .CaptionModel import CaptionModel
 
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
 class LSTMCore(nn.Module):
     def __init__(self, opt):
         super(LSTMCore, self).__init__()
@@ -176,7 +178,7 @@ def sample(self, fc_feats, att_feats, opt={}):
                 else:
                     # scale logprobs by temperature
                     prob_prev = torch.exp(torch.div(logprobs.data, temperature)).cpu()
-                it = torch.multinomial(prob_prev, 1).cuda()
+                it = torch.multinomial(prob_prev, 1).to(device=device)
                 sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions
                 it = it.view(-1).long() # and flatten indices for downstream processing
 
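
The changed line is the stochastic branch of `sample`: log-probabilities are scaled by a temperature, exponentiated, and a word index is drawn with `torch.multinomial` (on the CPU, as in the original, then moved to `device`). A minimal sketch with made-up logits and vocabulary size:

```python
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fake batch of 2 with a 1000-word vocabulary
logprobs = torch.log_softmax(torch.randn(2, 1000, device=device), dim=1)

temperature = 0.8                                        # <1 sharpens, >1 flattens the distribution
prob_prev = torch.exp(logprobs / temperature).cpu()      # multinomial accepts unnormalized weights
it = torch.multinomial(prob_prev, 1).to(device=device)   # one sampled word index per row
sampleLogprobs = logprobs.gather(1, it)                  # log-prob of each sampled word
it = it.view(-1).long()                                  # flatten for the next decoding step
```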
