@@ -226,6 +226,7 @@ def parse_args():
     parser.add_argument("--num-samples", type=int, default=2, help="number of sampling runs per configuration")
     parser.add_argument("--config", type=str, default="config/default.json", help="configuration file path")
     parser.add_argument("--half", action="store_true", default=False, help="use float16")
+    parser.add_argument("--int8", action="store_true", default=False, help="use int8 quantization")
     parser.add_argument("--skip-special-tokens", action="store_true", default=False)
     parser.add_argument("--model-type", type=str, default="CausalLM", help="CausalLM, T5Conditional, LLaMA")
     parser.add_argument("--max-input-len", type=int, help="max token counts for input")
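A minimal sketch of how the new flag behaves, assuming nothing beyond argparse (only the `--int8` definition is taken from the diff; the bare-bones parser around it is illustrative):

```python
# store_true flags default to False and flip to True only when passed,
# so --int8 is strictly opt-in.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--int8", action="store_true", default=False, help="use int8 quantization")

print(parser.parse_args([]).int8)          # False
print(parser.parse_args(["--int8"]).int8)  # True
```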
@@ -247,6 +248,10 @@ def main():
     print("Using pytorch version {}".format(torch.__version__))
 
     args = parse_args()
+    if args.int8 and not torch.cuda.is_available():
+        print("Warning: --int8 argument passed but cuda is not available. Ignoring --int8.")
+        args.int8 = False
+
     print("Args:", args)
 
     torch.set_num_threads(args.num_threads)
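The guard downgrades gracefully rather than failing later: 8-bit quantization via bitsandbytes needs a CUDA device, so requesting it on a CPU-only machine would otherwise surface as an opaque error during model loading. A standalone sketch of the pattern (the `Args` class is an illustrative stand-in for the parsed argparse namespace):

```python
import torch

class Args:  # stand-in for the argparse namespace
    int8 = True

args = Args()
if args.int8 and not torch.cuda.is_available():
    print("Warning: --int8 argument passed but cuda is not available. Ignoring --int8.")
    args.int8 = False
```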
@@ -265,17 +270,23 @@ def main():
     model_name = args.model_name
     print(f"Loading model: {model_name}")
 
+    model_args = {}
+    if args.int8:
+        # these will break model.to(device) later in the script so a conditional check is needed
+        model_args["load_in_8bit"] = args.int8
+        model_args["device_map"] = "auto"
+
     if args.model_type.lower() == "causallm" or args.model_type.lower() == "llama":
         from transformers import AutoModelForCausalLM
 
         tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=args.auth_token)
-        model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=args.auth_token)
+        model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=args.auth_token, **model_args)
         skip_input_tokens = True
     elif args.model_type.lower() == "t5conditional":
         from transformers import T5ForConditionalGeneration
 
         tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=args.auth_token)
-        model = T5ForConditionalGeneration.from_pretrained(model_name, use_auth_token=args.auth_token)
+        model = T5ForConditionalGeneration.from_pretrained(model_name, use_auth_token=args.auth_token, **model_args)
         skip_input_tokens = False
     else:
         raise RuntimeError("Invalid model_type specified")
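A self-contained sketch of the 8-bit loading path these kwargs enable (the model id is a placeholder; `load_in_8bit=True` requires the bitsandbytes package, accelerate, and a CUDA GPU):

```python
# Minimal sketch, assuming bitsandbytes and accelerate are installed.
# With load_in_8bit=True, from_pretrained quantizes the linear layers to int8;
# device_map="auto" lets accelerate place the weights, so no .to(device) follows.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
)
print(model.device)  # weights already live on the device chosen by accelerate
```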
@@ -293,7 +304,10 @@ def main():
     model.eval()
     if args.half:
         model = model.half()
-    model = model.to(device)
+
+    # int8 models (load_in_8bit=True + device_map="auto") will cause this method to error
+    if not args.int8:
+        model = model.to(device)
 
     print(f"Loading prompts file: {args.prompts}")
     prompts = load_jsonl(input_file_path=args.prompts)
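The conditional move is the flip side of `device_map="auto"`: an 8-bit model is already placed on GPU at load time, and transformers raises an error when `.to()` is called on a quantized model, so only the fp32/fp16 paths still need explicit placement. A hedged sketch of the resulting placement and generation logic, reusing `args`, `model`, and `tokenizer` from the sketches above:

```python
import torch

# Quantized models skip .to(device) entirely; moving them is rejected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if not args.int8:
    model = model.to(device)  # safe for fp32/fp16 weights only

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```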