@@ -89,6 +89,7 @@ def GraniteAttention_fast_forward(
     n_groups   = self.num_key_value_groups
     n_kv_heads = self.config.num_key_value_heads
     head_dim   = self.head_dim
+    dropout_p  = self.config.attention_dropout if self.training else 0
     assert(n_kv_heads * n_groups == n_heads)
 
     Q, K, V = self.apply_qkv(self, hidden_states)
@@ -135,15 +136,15 @@ def GraniteAttention_fast_forward(
             Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
         pass
 
-        A = xformers_attention(Q, K, V, attn_bias = causal_mask, scale = self.scaling)
+        A = xformers_attention(Q, K, V, attn_bias = causal_mask, scale = self.scaling, p = dropout_p)
         A = A.view(bsz, q_len, n_heads, head_dim)
 
     elif HAS_FLASH_ATTENTION and attention_mask is None:
         Q = Q.transpose(1, 2)
         K = K.transpose(1, 2)
         V = V.transpose(1, 2)
         window = (kv_seq_len, kv_seq_len)
-        A = flash_attn_func(Q, K, V, causal = True, window_size = window, softmax_scale = self.scaling)
+        A = flash_attn_func(Q, K, V, causal = True, window_size = window, softmax_scale = self.scaling, dropout_p = dropout_p)
     else:
         # Grouped query attention
         # if n_groups != 1:
@@ -157,7 +158,7 @@ def GraniteAttention_fast_forward(
         Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
         # Needs (batch_size, n_heads, seq_len, head_dim)
         # is_casual and attention_mask must not be both set!
-        A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, scale = self.scaling, is_causal = False)
+        A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, scale = self.scaling, is_causal = False, dropout_p = dropout_p)
         # Go back to (batch_size, seq_len, n_heads, head_dim)
         A = A.transpose(1, 2).contiguous()
     pass
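
The change above follows one pattern in all three attention backends: read the dropout probability from the model config, zero it out when the module is not in training mode, and forward it to the attention kernel (xformers takes it as p, flash-attn and PyTorch SDPA as dropout_p). Below is a minimal, self-contained sketch of that pattern using only torch.nn.functional.scaled_dot_product_attention, the same fallback path touched in the last hunk. The TinyAttention class and its attention_dropout argument are illustrative stand-ins, not the actual Granite module or config, and the scale keyword assumes PyTorch 2.1 or newer.

# Sketch: train-only attention dropout, as in the diff above.
# Names here are illustrative; only the dropout gating pattern is the point.
import torch
import torch.nn.functional as F

class TinyAttention(torch.nn.Module):
    def __init__(self, head_dim: int, attention_dropout: float = 0.1):
        super().__init__()
        self.attention_dropout = attention_dropout
        self.scaling = head_dim ** -0.5

    def forward(self, Q, K, V, attention_mask = None):
        # Dropout is applied only while training; evaluation uses p = 0.
        dropout_p = self.attention_dropout if self.training else 0.0
        # Q, K, V are (batch_size, n_heads, seq_len, head_dim).
        return F.scaled_dot_product_attention(
            Q, K, V,
            attn_mask  = attention_mask,
            dropout_p  = dropout_p,
            scale      = self.scaling,
            is_causal  = False,
        )

if __name__ == "__main__":
    attn = TinyAttention(head_dim = 64)
    q = k = v = torch.randn(1, 8, 16, 64)
    attn.train()
    out_train = attn(q, k, v)   # dropout active
    attn.eval()
    out_eval  = attn(q, k, v)   # deterministic, dropout_p = 0
    print(out_train.shape, out_eval.shape)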