Commit 7d47557

fix: config.torch_dtype in LlamaModel_fast_forward_inference (#2091)
* fix: config.torch_dtype in LlamaModel_fast_forward_inference
* Update llama.py
* update for consistency

Co-authored-by: Daniel Han <danielhanchen@gmail.com>
1 parent cca0d38 commit 7d47557

1 file changed (+2, −8 lines)

unsloth/models/llama.py

Lines changed: 2 additions & 8 deletions
@@ -652,13 +652,7 @@ def LlamaModel_fast_forward(
     if inputs_embeds is None:
         inputs_embeds = self.embed_tokens(input_ids)

-    # inputs_embeds = inputs_embeds.to(self.config.torch_dtype)
-    torch_dtype = __DTYPE_MAP.get(self.config.torch_dtype, None)
-    if torch_dtype is not None:
-        inputs_embeds = inputs_embeds.to(torch_dtype)
-    else:
-        raise TypeError("Unsloth: torch_dtype for models is not bfloat16, float16 or float32!")
-    pass
+    inputs_embeds = inputs_embeds.to(_get_dtype(self.config.torch_dtype))

     # Normalized from Gemma
     IS_GEMMA = self.config.model_type.startswith("gemma")
@@ -924,7 +918,7 @@ def LlamaModel_fast_forward_inference(
     mlp_size = self.config.intermediate_size

     X = self.model.embed_tokens(input_ids)
-    X = X.to(self.config.torch_dtype)
+    X = X.to(_get_dtype(self.config.torch_dtype))
     bsz, q_len, hd = X.shape
     assert(q_len == 1)
     # Get saved buffers to reduce memory movement
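Both hunks replace a direct use of `self.config.torch_dtype` with a call to `_get_dtype(...)`, which is what the "update for consistency" bullet refers to: the training path previously did an inline `__DTYPE_MAP.get(...)` lookup while the inference path passed the config value straight to `Tensor.to`, and a config round-trip can leave `torch_dtype` as a string such as "bfloat16" rather than a `torch.dtype`. The real `_get_dtype` helper lives elsewhere in Unsloth and is not shown in this diff; the sketch below is only an assumed approximation of its behavior, inferred from the removed `__DTYPE_MAP` lookup and its `TypeError` message.

```python
import torch

# Hypothetical sketch only — the actual _get_dtype is defined inside Unsloth.
# Assumption: it normalizes config.torch_dtype, which may be either a string
# (e.g. "bfloat16") or a torch.dtype, into a torch.dtype, and rejects anything
# outside {float16, bfloat16, float32}.
_DTYPE_MAP = {
    "float32"      : torch.float32,
    torch.float32  : torch.float32,
    "float16"      : torch.float16,
    torch.float16  : torch.float16,
    "bfloat16"     : torch.bfloat16,
    torch.bfloat16 : torch.bfloat16,
}

def _get_dtype(dtype):
    # Return the canonical torch.dtype, or raise the same kind of error
    # the removed inline code raised for unsupported dtypes.
    try:
        return _DTYPE_MAP[dtype]
    except KeyError:
        raise TypeError(f"Unsloth: {dtype} is not bfloat16, float16 or float32!")

# Usage mirroring the patched lines: works whether the config stored a
# string or a torch.dtype.
x = torch.zeros(2, 2)
x = x.to(_get_dtype("bfloat16"))
```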
