
Commit d8c58fb

danielhanchen, tajimagrp, mosama1994, Erland366, and shimmyshimmer authored
Fix Mistral, Qwen (#1565)
* use exact model name
* Update save.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* print
* Update _utils.py
* Update _utils.py
* Update llama.py
* Update _utils.py
* Update vision.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update loader.py
* accurate_accumulation
* Update loader.py
* Update loader.py
* Update _utils.py
* Update loader.py
* Update loader.py
* Update loader.py
* Update loader.py
* Update pyproject.toml
* Update __init__.py
* Update pyproject.toml
* Update __init__.py
* Update __init__.py
* Fix Triton heuristics triton-lang/triton#5224
* Update __init__.py
* Update __init__.py
* Update __init__.py
* Update __init__.py
* Xformers
* Update loader.py
* Update loader.py
* Rewind
* Update _utils.py
* Update _utils.py
* requires grad
* Update loader.py
* Update _utils.py
* Update loader.py
* changing model to base_model if peft model is already used
* Improve debugging experience (#1512)
* Create CONTRIBUTING.md (#1472) Creating contributing guidelines
* Update CONTRIBUTING.md improved sentence
* Improve logging control in `unsloth_compile_transformers` by conditionally redirecting stdout based on UNSLOTH_DISABLE_LOGGER environment variable
---------
Co-authored-by: Michael Han <107991372+shimmyshimmer@users.noreply.github.com>
Co-authored-by: Nino Risteski <95188570+NinoRisteski@users.noreply.github.com>
* Update loader.py
* Update llama.py
* Update llama.py
* Revert "Update llama.py" This reverts commit b7ddf96.
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Update llama.py
* Auto change is_bfloat16_supported
* Update llama.py
* Force data-type
* Update llama.py
* All attention refactor fix (#1491)
* change initilization of n_heads, n_kv_heads, hidden_size in llama.py
* do the same for cohere, mistral, gemma2, granite
* do the same for flexattention,cohere, mistral, granite
* Update llama.py
* Update llama.py
* Update granite to work with latest post_patch methods (#1502)
* Update granite to work with latest post_patch methods
* Pass position_embeddings for granite even if transformers<4.47
* Update llama.py
---------
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
* Minor fixes for granite models (#1503)
* Update granite.py Grab residual multiplier directly from layer
* Update llama.py Version should read >= 4.47.1 as that is the version requiring the changes
* Update granite.py
* Update llama.py
---------
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
* support modelscope models and datasets (#1481)
* support modelscope
* change modelscope args
* remove useless import
* remove useless import
* fix
* wip
* fix
* remove useless code
* add readme
* add some comments
* change print to raise error
* update comment
* Update loader.py
---------
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
* Merge branch 'main' into nightly
* Phi 4
* Update llama.py
* Torch.Cuda Is Available Condition and Warning (#1545)
* check for torch.cuda and triton if available on my machine(mac m3) the cuda were not available
* Update pyproject.toml
* Update __init__.py
---------
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
* Update mistral.py
* Update mistral.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Update _utils.py
* Fix
* Bug fixes
* Update mapper.py
* Add dropout to granite to match HF's implementation (#1557) Signed-off-by: datta0 <venkatadattasainimmaturi@gmail.com>
* Update llama.py
* Update llama.py
* Bug fixes
* fix: flash_attn_detection_error (#1556)
* fix: flash_attn_detection_error
* Update _utils.py
---------
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
---------
Signed-off-by: datta0 <venkatadattasainimmaturi@gmail.com>
Co-authored-by: Itsuro Tajima <tajima@georepublic.de>
Co-authored-by: Muhammad Osama <muhammadosama1994@gmail.com>
Co-authored-by: Edd <68678137+Erland366@users.noreply.github.com>
Co-authored-by: Michael Han <107991372+shimmyshimmer@users.noreply.github.com>
Co-authored-by: Nino Risteski <95188570+NinoRisteski@users.noreply.github.com>
Co-authored-by: Kareem <81531392+KareemMusleh@users.noreply.github.com>
Co-authored-by: Datta Nimmaturi <datta.nimmaturi@nutanix.com>
Co-authored-by: Z <coffeevampirebusiness@gmail.com>
Co-authored-by: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
Co-authored-by: AminWhat <88392440+aminwhat@users.noreply.github.com>
Co-authored-by: Zhe Zhang <2631992879@qq.com>
1 parent d6982c1 commit d8c58fb

File tree

7 files changed, +29 -16 lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ triton = [
     "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.1.0-windows.post5/triton-3.1.0-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'",
 ]
 huggingface = [
-    "unsloth_zoo>=2025.1.2",
+    "unsloth_zoo>=2025.1.4",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
@@ -285,7 +285,7 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3",
 ]
 colab-new = [
-    "unsloth_zoo>=2025.1.2",
+    "unsloth_zoo>=2025.1.4",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",

unsloth/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -86,6 +86,10 @@
     del os.environ["PYTORCH_CUDA_ALLOC_CONF"]
 pass
 
+# First check if CUDA is available ie a NVIDIA GPU is seen
+if not torch.cuda.is_available():
+    raise NotImplementedError("Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!")
+
 # Fix Xformers performance issues since 0.0.25
 import importlib.util
 from pathlib import Path
@@ -194,7 +198,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # Check for unsloth_zoo
 try:
     unsloth_zoo_version = importlib_version("unsloth_zoo")
-    if Version(unsloth_zoo_version) < Version("2025.1.2"):
+    if Version(unsloth_zoo_version) < Version("2025.1.4"):
         try:
             os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")
         except:
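The new block turns a missing NVIDIA GPU into an immediate, explicit failure at import time instead of a confusing CUDA error later on. A standalone sketch of the same guard (assumes only that `torch` is installed):

```python
# Standalone sketch of the guard added above.
import torch

if not torch.cuda.is_available():
    # Fail fast with an explicit message instead of a later, harder-to-read CUDA error.
    raise NotImplementedError("Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!")

device = torch.device("cuda:0")
print(f"Using {torch.cuda.get_device_name(device)}")
```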

unsloth/models/_utils.py

Lines changed: 9 additions & 3 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "2025.1.5"
+__version__ = "2025.1.6"
 
 __all__ = [
     "SUPPORTS_BFLOAT16",
@@ -285,7 +285,11 @@ def _is_openai_available(): return False
 if _is_package_available("flash_attn"):
     # Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl"
     try:
-        from flash_attn.flash_attn_interface import flash_attn_cuda
+        try:
+            # See https://github.com/unslothai/unsloth/issues/1437
+            from flash_attn.flash_attn_interface import flash_attn_gpu
+        except:
+            from flash_attn.flash_attn_interface import flash_attn_cuda
         HAS_FLASH_ATTENTION = True
 
         # Also check for softcapping
@@ -843,7 +847,9 @@ def patch_linear_scaling(
         "self.rotary_emb = .+?\)", function,
         flags = re.DOTALL | re.MULTILINE,
     )
-    if len(rotary_emb) == 0: return None, function
+    if len(rotary_emb) == 0:
+        return None, exec_code + "\n\n" + function
+
     rotary_emb = rotary_emb[0]
     function = function.replace(rotary_emb, fix_rope_function, 1)
     function = exec_code + "\n\n" + function
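Newer flash-attn wheels expose the compiled binding as `flash_attn_gpu` rather than `flash_attn_cuda` (see unslothai/unsloth#1437), so detection now probes the new name first and falls back to the old one. A self-contained sketch of that probe (hedged: the version notes in the comments are approximate, not taken from the diff):

```python
# Sketch of the detection logic above: probe the compiled flash-attn binding under its
# newer name first, then the legacy one, and only then give up.
HAS_FLASH_ATTENTION = False
try:
    try:
        # Newer flash-attn builds renamed the compiled binding (unslothai/unsloth#1437).
        from flash_attn.flash_attn_interface import flash_attn_gpu
    except ImportError:
        # Older builds still export the CUDA-suffixed name.
        from flash_attn.flash_attn_interface import flash_attn_cuda
    HAS_FLASH_ATTENTION = True
except Exception:
    # flash-attn is absent or was built against an incompatible torch
    # (e.g. "undefined symbol" linking errors); fall back to SDPA / xformers.
    pass

print("flash-attn usable:", HAS_FLASH_ATTENTION)
```

The `patch_linear_scaling` tweak in the last hunk is a related robustness fix: when no `rotary_emb` assignment is found, the function now still returns `exec_code` prepended to the untouched source, presumably so the caller keeps the preamble it needs when that source is later `exec`'d.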

unsloth/models/granite.py

Lines changed: 4 additions & 3 deletions
@@ -89,6 +89,7 @@ def GraniteAttention_fast_forward(
     n_groups = self.num_key_value_groups
     n_kv_heads = self.config.num_key_value_heads
     head_dim = self.head_dim
+    dropout_p = self.config.attention_dropout if self.training else 0
     assert(n_kv_heads * n_groups == n_heads)
 
     Q, K, V = self.apply_qkv(self, hidden_states)
@@ -135,15 +136,15 @@ def GraniteAttention_fast_forward(
             Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
         pass
 
-        A = xformers_attention(Q, K, V, attn_bias = causal_mask, scale=self.scaling)
+        A = xformers_attention(Q, K, V, attn_bias = causal_mask, scale=self.scaling, p=dropout_p)
         A = A.view(bsz, q_len, n_heads, head_dim)
 
     elif HAS_FLASH_ATTENTION and attention_mask is None:
         Q = Q.transpose(1, 2)
         K = K.transpose(1, 2)
         V = V.transpose(1, 2)
         window = (kv_seq_len, kv_seq_len)
-        A = flash_attn_func(Q, K, V, causal = True, window_size = window, softmax_scale=self.scaling)
+        A = flash_attn_func(Q, K, V, causal = True, window_size = window, softmax_scale=self.scaling, dropout_p=dropout_p)
     else:
         # Grouped query attention
         # if n_groups != 1:
@@ -157,7 +158,7 @@ def GraniteAttention_fast_forward(
         Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
         # Needs (batch_size, n_heads, seq_len, head_dim)
         # is_casual and attention_mask must not be both set!
-        A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, scale = self.scaling, is_causal = False)
+        A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, scale = self.scaling, is_causal = False, dropout_p=dropout_p)
         # Go back to (batch_size, seq_len, n_heads, head_dim)
         A = A.transpose(1, 2).contiguous()
     pass
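The Granite forward path previously ignored `attention_dropout`; the patch derives `dropout_p` from the config and threads it through the xformers, flash-attention, and SDPA branches, applying it only while training. A minimal sketch of that pattern using PyTorch's built-in `scaled_dot_product_attention` (`TinyAttention` is a hypothetical stand-in, not Unsloth's class):

```python
import torch
import torch.nn.functional as F

class TinyAttention(torch.nn.Module):
    """Hypothetical stand-in showing how attention dropout is gated on training mode."""
    def __init__(self, attention_dropout: float = 0.1):
        super().__init__()
        self.attention_dropout = attention_dropout

    def forward(self, Q, K, V):
        # Mirror the patch: dropout only while training, 0.0 at eval/inference time.
        dropout_p = self.attention_dropout if self.training else 0.0
        return F.scaled_dot_product_attention(Q, K, V, is_causal=True, dropout_p=dropout_p)

attn = TinyAttention()
x = torch.randn(1, 4, 8, 16)             # (batch, n_heads, seq_len, head_dim)
attn.train(); out_train = attn(x, x, x)  # dropout active
attn.eval();  out_eval  = attn(x, x, x)  # dropout_p = 0.0
```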

unsloth/models/llama.py

Lines changed: 6 additions & 2 deletions
@@ -636,6 +636,7 @@ def LlamaModel_fast_forward(
     IS_GEMMA2 = self.config.model_type.startswith("gemma2")
     IS_COHERE = self.config.model_type.startswith("cohere")
     IS_GRANITE = self.config.model_type.startswith("granite")
+
     train_embed_tokens = self.embed_tokens.weight.requires_grad
 
     if IS_GEMMA:
@@ -664,7 +665,7 @@ def LlamaModel_fast_forward(
 
     # Fix up attention mask by setting elements to 0
     # Specifically for DPO
-    if self._has_no_labels and (attention_mask is not None) and (past_key_values is None) and \
+    if getattr(self, "_has_no_labels", False) is True and (attention_mask is not None) and (past_key_values is None) and \
         (not train_embed_tokens):
         # Careful for inference the attention_mask is size (1, kv_seq_len)
         # Whilst the input_embeds is size (1, 1, 4096)
@@ -792,9 +793,12 @@ def LlamaModel_fast_forward(
         pass
     pass
 
-    if IS_ATTENTION_REFACTOR and not hasattr(self.layers[0].self_attn, "rotary_emb"):
+    if (IS_ATTENTION_REFACTOR and (hasattr(self, "rotary_emb") or not hasattr(self.layers[0].self_attn, "rotary_emb"))) or IS_GRANITE:
         # Transformers main has made it mandatory to pass position_embeddings
         # https://github.com/huggingface/transformers/pull/34858
+        # Also, transformers 4.45.0 supports granite but with the attention refactor (it always had the refactor)
+        # unsloth's check for granite too has "version >= 4.45.0 (rightly so)".
+        # so let granite always use the attention refactor implementation.
         position_embeddings = self.rotary_emb(hidden_states, position_ids, self.config.max_position_embeddings)
     else:
         position_embeddings = None
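Two things change here: the DPO mask fix now reads `_has_no_labels` defensively with `getattr`, so models that never set the flag no longer raise `AttributeError`, and the model-level `position_embeddings` path is taken whenever the attention refactor applies or the model is Granite. A hedged sketch restating the new condition as a helper (names follow the diff; this is an illustration, not a drop-in replacement for Unsloth's code):

```python
def choose_position_embeddings(model, hidden_states, position_ids,
                               is_attention_refactor: bool, is_granite: bool):
    # Same condition as the diff: use the model-level rotary embedding whenever the
    # attention refactor applies (or the per-layer rotary_emb is gone), and always for Granite.
    use_model_level_rope = (
        is_attention_refactor
        and (hasattr(model, "rotary_emb") or not hasattr(model.layers[0].self_attn, "rotary_emb"))
    ) or is_granite
    if use_model_level_rope:
        # transformers' refactor expects (cos, sin) computed once and passed to every decoder layer.
        return model.rotary_emb(hidden_states, position_ids, model.config.max_position_embeddings)
    return None
```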

unsloth/models/mapper.py

Lines changed: 2 additions & 4 deletions
@@ -471,20 +471,18 @@
         "meta-llama/Llama-3.2-11B-Vision-Instruct",
         "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
     ),
-    "unsloth/Llama-3.2-90B-Vision-Instruct-unsloth-bnb-4bit" : (
+    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit" : (
         "unsloth/Llama-3.2-90B-Vision-Instruct",
         "meta-llama/Llama-3.2-90B-Vision-Instruct",
-        "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",
     ),
     "unsloth/Llama-3.2-11B-Vision-unsloth-bnb-4bit" : (
         "unsloth/Llama-3.2-11B-Vision",
         "meta-llama/Llama-3.2-11B-Vision",
         "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
     ),
-    "unsloth/Llama-3.2-90B-Vision-unsloth-bnb-4bit" : (
+    "unsloth/Llama-3.2-90B-Vision-bnb-4bit" : (
         "unsloth/Llama-3.2-90B-Vision",
         "meta-llama/Llama-3.2-90B-Vision",
-        "unsloth/Llama-3.2-90B-Vision-bnb-4bit",
     ),
     "unsloth/Pixtral-12B-2409-unsloth-bnb-4bit" : (
         "unsloth/Pixtral-12B-2409",

unsloth/models/mistral.py

Lines changed: 1 addition & 1 deletion
@@ -304,7 +304,7 @@ def pre_patch():
             attention_module = MistralAttention,
         )
         # Just for Mistral Nemo models!
-        if function is not None:
+        if function is not None and init_name is not None:
             function = patch_mistral_nemo_attention(function)
         # if True:#init_name is not None:
         exec(function, globals())
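The Mistral Nemo patch is now applied only when both the captured forward source and the `__init__` name were found. A small sketch of the tightened guard, with hypothetical stand-ins for the real patcher:

```python
def maybe_patch_nemo(function, init_name, patch_mistral_nemo_attention):
    # Hypothetical helper: apply the Nemo-specific rewrite only when both the patched
    # forward source *and* the captured __init__ name were actually found.
    if function is not None and init_name is not None:
        return patch_mistral_nemo_attention(function)
    return function  # otherwise leave the generated source untouched
```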
