From 7eb34655a5d335db22a67aeb340d0a522aeef7e6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 11:07:32 -0700 Subject: [PATCH 001/147] Update __init__.py --- unsloth/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 6a2d999b4..ea2fe7685 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -61,10 +61,10 @@ pass # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) -keynames = "\n" + "\n".join(os.environ.keys()) -if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: - os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" -pass +# keynames = "\n" + "\n".join(os.environ.keys()) +# if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: +# os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" +# pass # We support Pytorch 2 # Fixes https://github.com/unslothai/unsloth/issues/38 From 54dfb1a9e163dfb2e11c7c46ac182bb22849e6a3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 12:06:38 -0700 Subject: [PATCH 002/147] dynamic RoPE --- unsloth/__init__.py | 8 ++++---- unsloth/models/llama.py | 28 ++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index ea2fe7685..6a2d999b4 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -61,10 +61,10 @@ pass # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) -# keynames = "\n" + "\n".join(os.environ.keys()) -# if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: -# os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" -# pass +keynames = "\n" + "\n".join(os.environ.keys()) +if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" +pass # We support Pytorch 2 # Fixes https://github.com/unslothai/unsloth/issues/38 diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 9bea364ca..0fcfe2a27 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -335,6 +335,9 @@ def LlamaAttention_fast_forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] + # Extend RoPE dynamically to fit in VRAM + self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) + if position_ids is None: cos = self.rotary_emb.cos_cached sin = self.rotary_emb.sin_cached @@ -971,19 +974,21 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, self.max_position_embeddings) # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()) + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) pass def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
- self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() freqs = torch.outer(t, inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -994,14 +999,21 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: + if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), ) pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass pass @@ -1016,11 +1028,11 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s pass def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() t = t / self.scaling_factor freqs = torch.outer(t, inv_freq) From 6c8618c75443a08c4ec0304cc54acfae74b2ddcd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 12:08:49 -0700 Subject: [PATCH 003/147] Update mistral.py --- unsloth/models/mistral.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 6eb3fccfa..d7376d952 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -78,6 +78,9 @@ def MistralAttention_fast_forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] + # Extend RoPE dynamically to fit in VRAM + self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) + if position_ids is None: cos = self.rotary_emb.cos_cached sin = self.rotary_emb.sin_cached From a56b2d45c9fcaba1aefb3dd09dacf904362bff59 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 12:33:22 -0700 Subject: [PATCH 004/147] Update llama.py --- unsloth/models/llama.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0fcfe2a27..ee261b0f7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1152,6 +1152,12 @@ def from_pretrained( f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. 
FA2 = {HAS_FLASH_ATTENTION}]\n"\ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' print(statistics) + + # Warn about fast transfers + if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": + logger.warning_once("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") + pass + model_patcher.pre_patch() get_statistics() # For debugging - we use a download counter to see if environments are not breaking From 40aeb2629e90615a359c2b8870112a826ef5baa5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 13:25:30 -0700 Subject: [PATCH 005/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index dc0c7da85..060c1ccae 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -38,6 +38,17 @@ IGNORED_TOKENIZER_CHECKING = frozenset(( "CodeLlamaTokenizerFast", "CodeLlamaTokenizer", + "" +)) + + +IGNORED_TOKENIZER_NAMES = frozenset(( + "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", + "unsloth/Mistral-Nemo-Instruct-2407", + "mistralai/Mistral-Nemo-Instruct-2407", + "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", + "unsloth/Mistral-Nemo-Base-2407", + "mistralai/Mistral-Nemo-Base-2407", )) # Check environments @@ -488,7 +499,7 @@ def load_correct_tokenizer( cache_dir = cache_dir, ) - if slow_tokenizer is not None: + if tokenizer_name not in IGNORED_TOKENIZER_NAMES and slow_tokenizer is not None: if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"): fast_tokenizer.add_bos_token = slow_tokenizer.add_bos_token if hasattr(fast_tokenizer, "add_eos_token") and hasattr(slow_tokenizer, "add_eos_token"): From fbf6cc747ef52812f5593a2daf96e55b5c200514 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 13:33:13 -0700 Subject: [PATCH 006/147] Update mistral.py --- unsloth/models/mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index d7376d952..b2531056a 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -161,7 +161,7 @@ def MistralAttention_fast_forward( A = A.transpose(1, 2).contiguous() pass - attn_output = A.reshape(bsz, q_len, self.hidden_size) + attn_output = A.reshape(bsz, q_len, n_heads*head_dim) attn_output = self.apply_o(self, attn_output) attn_weights = None return attn_output, attn_weights, past_key_value From 983c2b601aa3418cac25011317e08b454bde2c31 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 14:31:35 -0700 Subject: [PATCH 007/147] Update llama.py --- unsloth/models/llama.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ee261b0f7..ba45bbbfb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -158,6 +158,14 @@ def LlamaAttention_fast_forward_inference( self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0") self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0") self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0") + + # Mistral Nemo 12b has weird dimensions + if attention_size != self.hidden_size: + self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0") + else: + self.temp_O = self.temp_QA[1][:,:,:self.hidden_size] + pass + self.attention = torch.empty((bsz, n_heads, 
1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0") self.scalar = 1.0 / math_sqrt(self.head_dim) self.half_head_dim = head_dim // 2 @@ -239,7 +247,7 @@ def LlamaAttention_fast_forward_inference( pass A = A.transpose(1, 2) A = A.reshape(bsz, 1, attention_size) - A = fast_linear_forward(self.o_proj, A, out = self.temp_QA[1][:,:,:self.hidden_size]) + A = fast_linear_forward(self.o_proj, A, out = self.temp_O) return A, (Kn, Vn) pass @@ -1152,7 +1160,7 @@ def from_pretrained( f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' print(statistics) - + # Warn about fast transfers if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": logger.warning_once("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") From ed56977a8c4b850d984517bc2da29319d20cc4c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 14:43:03 -0700 Subject: [PATCH 008/147] Update __init__.py --- unsloth/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 6a2d999b4..464068154 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -89,7 +89,7 @@ def is_bf16_supported(including_emulation = False): return old_is_bf16_supported(including_emulation) torch.cuda.is_bf16_supported = is_bf16_supported else: - def is_bf16_supported(): SUPPORTS_BFLOAT16 + def is_bf16_supported(): return SUPPORTS_BFLOAT16 torch.cuda.is_bf16_supported = is_bf16_supported pass From 2a251ec5948ea44d7332ab8b053b83276ae237ca Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 18:18:09 -0700 Subject: [PATCH 009/147] Update flex_attention.py --- unsloth/kernels/flex_attention.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/unsloth/kernels/flex_attention.py b/unsloth/kernels/flex_attention.py index 1eb248699..a992a0238 100644 --- a/unsloth/kernels/flex_attention.py +++ b/unsloth/kernels/flex_attention.py @@ -25,18 +25,23 @@ } # Flex Attention supported from torch 2.5 onwards only -import torch.nn.attention -if hasattr(torch.nn.attention, "flex_attention"): - import torch.nn.attention.flex_attention - from torch.nn.attention.flex_attention import flex_attention - from torch.nn.attention.flex_attention import create_block_mask - FLEX_ATTENTION_PADDING = getattr( - torch.nn.attention.flex_attention, - "_DEFAULT_SPARSE_BLOCK_SIZE", - 1, - ) - flex_attention = torch.compile(flex_attention, dynamic = False) - HAS_FLEX_ATTENTION = True +import torch.nn +if hasattr(torch.nn, "attention"): + import torch.nn.attention + if hasattr(torch.nn.attention, "flex_attention"): + import torch.nn.attention.flex_attention + from torch.nn.attention.flex_attention import flex_attention + from torch.nn.attention.flex_attention import create_block_mask + FLEX_ATTENTION_PADDING = getattr( + torch.nn.attention.flex_attention, + "_DEFAULT_SPARSE_BLOCK_SIZE", + 1, + ) + flex_attention = torch.compile(flex_attention, dynamic = False) + HAS_FLEX_ATTENTION = True + else: + HAS_FLEX_ATTENTION = False + pass else: HAS_FLEX_ATTENTION = False pass From 477793753f6aa4100b785a1a3557f34c3223cbcd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 21:57:33 -0700 Subject: [PATCH 010/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 
ba45bbbfb..212767b39 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1163,7 +1163,7 @@ def from_pretrained( # Warn about fast transfers if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": - logger.warning_once("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") + print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") pass model_patcher.pre_patch() From 152450462475b7621164be944fbe2945a26cddd0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 22:07:23 -0700 Subject: [PATCH 011/147] Update llama.py --- unsloth/models/llama.py | 53 ++++++++--------------------------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 212767b39..1ac96a4f2 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -158,14 +158,6 @@ def LlamaAttention_fast_forward_inference( self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0") self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0") self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0") - - # Mistral Nemo 12b has weird dimensions - if attention_size != self.hidden_size: - self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0") - else: - self.temp_O = self.temp_QA[1][:,:,:self.hidden_size] - pass - self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0") self.scalar = 1.0 / math_sqrt(self.head_dim) self.half_head_dim = head_dim // 2 @@ -247,7 +239,7 @@ def LlamaAttention_fast_forward_inference( pass A = A.transpose(1, 2) A = A.reshape(bsz, 1, attention_size) - A = fast_linear_forward(self.o_proj, A, out = self.temp_O) + A = fast_linear_forward(self.o_proj, A, out = self.temp_QA[1][:,:,:self.hidden_size]) return A, (Kn, Vn) pass @@ -343,9 +335,6 @@ def LlamaAttention_fast_forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - # Extend RoPE dynamically to fit in VRAM - self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) - if position_ids is None: cos = self.rotary_emb.cos_cached sin = self.rotary_emb.sin_cached @@ -673,12 +662,6 @@ def LlamaModel_fast_forward( offloaded_gradient_checkpointing = True pass - # Check for Flex Attention - # if IS_GEMMA2 and HAS_FLEX_ATTENTION: - # if not (seq_length % FLEX_ATTENTION_PADDING == 0): - # USE_FLEX_ATTENTION = True - - # Gemma2 has alternating SWA and global attn if IS_GEMMA2 and not hasattr(self, "SWA_mask"): n = self.config.max_position_embeddings @@ -982,21 +965,19 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this - self.current_rope_size = min(4 * 8192, self.max_position_embeddings) # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) + self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()) pass def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
- self.current_rope_size = seq_len + self.max_seq_len_cached = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() freqs = torch.outer(t, inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -1007,21 +988,14 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.current_rope_size: + if seq_len > self.max_seq_len_cached: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( - self.cos_cached[:seq_len].to(dtype = x.dtype), - self.sin_cached[:seq_len].to(dtype = x.dtype), + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), ) pass - - def extend_rope_embedding(self, x, seq_len): - if seq_len <= self.current_rope_size: return - # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 - self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) - pass pass @@ -1036,11 +1010,11 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s pass def _set_cos_sin_cache(self, seq_len, device, dtype): - self.current_rope_size = seq_len + self.max_seq_len_cached = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() t = t / self.scaling_factor freqs = torch.outer(t, inv_freq) @@ -1160,12 +1134,6 @@ def from_pretrained( f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' print(statistics) - - # Warn about fast transfers - if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": - print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") - pass - model_patcher.pre_patch() get_statistics() # For debugging - we use a download counter to see if environments are not breaking @@ -2113,5 +2081,4 @@ def for_training(model, use_gradient_checkpointing = True): internal_model._saved_temp_tokenizer.padding_side = "right" pass pass -pass - +pass \ No newline at end of file From c1d349370411f2c7861d5967c0c4a2ca59935670 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 22:53:23 -0700 Subject: [PATCH 012/147] Mistral Nemo --- unsloth/models/_utils.py | 31 ++++++++++++++++++--- unsloth/models/llama.py | 58 ++++++++++++++++++++++++++++++++------- unsloth/models/mistral.py | 22 ++++++++++++++- 3 files changed, 96 insertions(+), 15 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 025daec13..466a5fee7 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -65,8 +65,26 @@ # ============================================= # Edits all Config files to enable RoPE Scaling for all models -from transformers import PretrainedConfig +# Transformers had to update for Mistral Nemo 12b since Attention is (5120, 4096) now. 
+def patch_mistral_nemo_config(config): + if "head_dim (" not in config: + add_head_dim = "If it is not specified, will default to `8`.\n"\ + " head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):\n"\ + " The attention head dimension." + config = config.replace("If it is not specified, will default to `8`.", add_head_dim) + + add_head_dim = "num_key_value_heads=8,\n head_dim=None," + config = config.replace("num_key_value_heads=8,", add_head_dim) + + add_head_dim = "self.sliding_window = sliding_window\n self.head_dim = head_dim or hidden_size // num_attention_heads\n" + config = config.replace("self.sliding_window = sliding_window", add_head_dim) + pass + return config +pass + +from transformers import __version__ as transformers_version +from transformers import PretrainedConfig model_architectures = ["llama", "mistral", "gemma", "gemma2", "qwen2",] for model_name in model_architectures: @@ -87,8 +105,14 @@ r"\n self.rope_scaling = rope_scaling\n", config, ) - exec(config, globals()) + # Just for Mistral Nemo + if model_name == "mistral": + if Version(transformers_version) <= Version("4.42.4"): + config = patch_mistral_nemo_config(config) + pass + + exec(config, globals()) exec(f"import {config_filepath}", globals()) exec(f"{config_filepath}.{config_filename} = {config_filename}", globals()) pass @@ -97,7 +121,6 @@ # ============================================= # torch.cuda.amp.custom_fwd is deprecated >= 2.4 import torch -from packaging.version import Version if Version(torch.__version__) < Version("2.4.0"): torch_amp_custom_fwd = torch.cuda.amp.custom_fwd torch_amp_custom_bwd = torch.cuda.amp.custom_bwd @@ -748,7 +771,7 @@ def patch_linear_scaling( "self.rotary_emb = .+?\)", function, flags = re.DOTALL | re.MULTILINE, ) - if len(rotary_emb) == 0: return + if len(rotary_emb) == 0: return None, function rotary_emb = rotary_emb[0] function = function.replace(rotary_emb, fix_rope_function, 1) function = exec_code + "\n\n" + function diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 1ac96a4f2..ca4e65159 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -158,6 +158,14 @@ def LlamaAttention_fast_forward_inference( self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0") self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0") self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0") + + # Mistral Nemo 12b has weird dimensions + if attention_size != self.hidden_size: + self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0") + else: + self.temp_O = self.temp_QA[1][:,:,:self.hidden_size] + pass + self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0") self.scalar = 1.0 / math_sqrt(self.head_dim) self.half_head_dim = head_dim // 2 @@ -239,7 +247,7 @@ def LlamaAttention_fast_forward_inference( pass A = A.transpose(1, 2) A = A.reshape(bsz, 1, attention_size) - A = fast_linear_forward(self.o_proj, A, out = self.temp_QA[1][:,:,:self.hidden_size]) + A = fast_linear_forward(self.o_proj, A, out = self.temp_O) return A, (Kn, Vn) pass @@ -335,6 +343,9 @@ def LlamaAttention_fast_forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] + # Extend RoPE dynamically to fit in VRAM + self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) + if position_ids is None: cos = self.rotary_emb.cos_cached sin = self.rotary_emb.sin_cached 
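# For reference: the hunk above re-introduces the dynamic RoPE extension from PATCH 002.
# The cos/sin cache starts small and extend_rope_embedding() grows it just before attention
# runs. Below is a minimal standalone sketch of that growth policy; the class name
# DynamicRoPECache is illustrative only (not part of unsloth), and the logic mirrors the
# LlamaRotaryEmbedding changes in this series rather than reproducing them exactly.
import torch

class DynamicRoPECache:
    def __init__(self, dim, max_position_embeddings, base = 10000.0):
        self.dim  = dim
        self.base = base
        # Start at min(4 * 8192, max_position_embeddings) positions instead of allocating
        # the full context (e.g. 128K for Mistral Nemo) in VRAM up front.
        self.current_rope_size = min(4 * 8192, max_position_embeddings)
        self._set_cos_sin_cache(self.current_rope_size)

    def _set_cos_sin_cache(self, seq_len):
        self.current_rope_size = seq_len
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
        freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)
        emb = torch.cat((freqs, freqs), dim = -1)
        self.cos_cached, self.sin_cached = emb.cos(), emb.sin()

    def extend_rope_embedding(self, seq_len):
        # Called before attention: grow in 8192-position steps, as in the patch.
        if seq_len <= self.current_rope_size: return
        self._set_cos_sin_cache(int(round(seq_len / 8192)) * 8192)

    def get(self, seq_len):
        # Mirrors forward(): rebuild if the request still exceeds the cache, then slice.
        if seq_len > self.current_rope_size:
            self._set_cos_sin_cache(seq_len)
        return self.cos_cached[:seq_len], self.sin_cached[:seq_len]

# Usage sketch: rope = DynamicRoPECache(128, 131072); rope.extend_rope_embedding(40000)
# grows the cache from 32768 to 40960 positions instead of building all 131072 at load time.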
@@ -662,6 +673,12 @@ def LlamaModel_fast_forward( offloaded_gradient_checkpointing = True pass + # Check for Flex Attention + # if IS_GEMMA2 and HAS_FLEX_ATTENTION: + # if not (seq_length % FLEX_ATTENTION_PADDING == 0): + # USE_FLEX_ATTENTION = True + + # Gemma2 has alternating SWA and global attn if IS_GEMMA2 and not hasattr(self, "SWA_mask"): n = self.config.max_position_embeddings @@ -965,19 +982,21 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, self.max_position_embeddings) # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()) + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) pass def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() freqs = torch.outer(t, inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -988,14 +1007,21 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: + if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), ) pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass pass @@ -1010,11 +1036,11 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s pass def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() t = t / self.scaling_factor freqs = torch.outer(t, inv_freq) @@ -1134,6 +1160,15 @@ def from_pretrained( f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. 
FA2 = {HAS_FLASH_ATTENTION}]\n"\ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' print(statistics) + + # Warn about fast transfers + old_hf_transfer = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") + if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": + print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") + pass + # Return old flag + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer + model_patcher.pre_patch() get_statistics() # For debugging - we use a download counter to see if environments are not breaking @@ -1215,6 +1250,8 @@ def from_pretrained( attn_implementation = "eager", **kwargs, ) + # Return old flag + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer # We currently only support NVIDIA GPUs - AMD / Intel is a work in progress! post_check = check_nvidia() @@ -2081,4 +2118,5 @@ def for_training(model, use_gradient_checkpointing = True): internal_model._saved_temp_tokenizer.padding_side = "right" pass pass -pass \ No newline at end of file +pass + diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index b2531056a..e0e034fc5 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -270,6 +270,24 @@ def MistralForCausalLM_fast_forward( pass +# Transformers had to update for Mistral Nemo 12b since Attention is (5120, 4096) now. +def patch_mistral_nemo_attention(function): + function = function.replace( + "(self.head_dim * self.num_heads) != self.hidden_size", + "False", + ) + function = function.replace( + "self.head_dim = self.hidden_size // self.num_heads", + "self.head_dim = config.head_dim", + ) + function = function.replace( + "self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)", + "self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)", + ) + return function +pass + + class FastMistralModel(FastLlamaModel): @staticmethod @@ -280,7 +298,9 @@ def pre_patch(): scaled_rope_module = LlamaLinearScalingRotaryEmbedding, attention_module = MistralAttention, ) - if init_name is not None: + # Just for Mistral Nemo models! 
+ function = patch_mistral_nemo_attention(function) + if True:#init_name is not None: exec(function, globals()) MistralAttention.__init__ = eval(init_name) pass From 10c13545c346990c78717b529af5cdac6d1856d1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 01:03:51 -0700 Subject: [PATCH 013/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index e0c89e451..7b88b0932 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -498,7 +498,8 @@ def load_correct_tokenizer( cache_dir = cache_dir, ) - if tokenizer_name in IGNORED_TOKENIZER_NAMES: pass + if tokenizer_name in IGNORED_TOKENIZER_NAMES: + return fast_tokenizer elif slow_tokenizer is not None: if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"): fast_tokenizer.add_bos_token = slow_tokenizer.add_bos_token From ad3d38ad4dde514b842688d6fa184e085eaf5320 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 01:06:41 -0700 Subject: [PATCH 014/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7b88b0932..07cd87412 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -41,14 +41,17 @@ )) -IGNORED_TOKENIZER_NAMES = frozenset(( +IGNORED_TOKENIZER_NAMES = [ "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", "unsloth/Mistral-Nemo-Instruct-2407", "mistralai/Mistral-Nemo-Instruct-2407", "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", -)) +] +IGNORED_TOKENIZER_NAMES = frozenset( + [x.lower() for x in IGNORED_TOKENIZER_NAMES] +) # Check environments keynames = "\n" + "\n".join(os.environ.keys()) From 8ee997cac28e3bc3ff205252ca543ab46ade3d25 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 01:29:52 -0700 Subject: [PATCH 015/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 07cd87412..3f75d1686 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -682,6 +682,11 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): embedding_matrix = model.get_input_embeddings ().weight lm_head_matrix = model.get_output_embeddings().weight + # Ignore some model checks for now + if model.config._name_or_path in IGNORED_TOKENIZER_NAMES: + return + pass + # Get untrained tokens indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps where_untrained = torch.where(indicator_untrained)[0] From 565a5a389460bc8e4d0f56cc5fb6276bbb658065 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 09:27:18 -0700 Subject: [PATCH 016/147] Fix Gemma --- unsloth/models/gemma.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index bc70b993a..ce89ad3be 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -210,22 +210,24 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, 
self.max_position_embeddings) # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()) + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) pass def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len # The difference is we do division explicity instead of t * (1/x) ie we do t/x. freq_exponents = (2.0 / self.dim) * ( torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float() ) timescale = self.base**freq_exponents - positions = torch.arange(self.max_seq_len_cached, device = "cpu", dtype = torch.int64).float() + positions = torch.arange(self.current_rope_size, device = "cpu", dtype = torch.int64).float() radians_new = positions[..., None] / timescale[None, None, :] radians_new = radians_new.squeeze(0) @@ -239,7 +241,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: + if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( @@ -247,6 +249,13 @@ def forward(self, x, position_ids=None, seq_len=None): self.sin_cached[:seq_len].to(dtype=x.dtype), ) pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass pass @@ -263,14 +272,14 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len # The difference is we do division explicity instead of t * (1/x) ie we do t/x. freq_exponents = (2.0 / self.dim) * ( torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float() ) timescale = self.base**freq_exponents - positions = torch.arange(self.max_seq_len_cached, device = "cpu", dtype = torch.int64).float() + positions = torch.arange(self.current_rope_size, device = "cpu", dtype = torch.int64).float() positions = positions / self.scaling_factor radians_new = positions[..., None] / timescale[None, None, :] radians_new = radians_new.squeeze(0) From 182ab7e0cb28b21c0b3b119668ec3cd9aceb15de Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 09:32:27 -0700 Subject: [PATCH 017/147] Update mistral.py --- unsloth/models/mistral.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index e0e034fc5..ed6207bb0 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -299,8 +299,9 @@ def pre_patch(): attention_module = MistralAttention, ) # Just for Mistral Nemo models! 
- function = patch_mistral_nemo_attention(function) - if True:#init_name is not None: + if function is not None: + function = patch_mistral_nemo_attention(function) + # if True:#init_name is not None: exec(function, globals()) MistralAttention.__init__ = eval(init_name) pass From 72e1b03544c3d23a0c28f883f242fa0f96e8091b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Jul 2024 11:53:32 -0700 Subject: [PATCH 018/147] Update llama.py --- unsloth/models/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ca4e65159..32610bbfd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1068,6 +1068,8 @@ def _fast_generate(*args, **kwargs): # For newer HF kwargs["cache_implementation"] = "dynamic" + print(kwargs) + # Set pad token # old_pad_token_id = getattr(model.config, "pad_token_id", None) # old_eos_token_id = getattr(model.config, "eos_token_id", None) From ba515ec92dbc85c03c65d3f31e10166cc73ef323 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Jul 2024 12:47:36 -0700 Subject: [PATCH 019/147] Update llama.py --- unsloth/models/llama.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 32610bbfd..ff51b90b8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1068,7 +1068,14 @@ def _fast_generate(*args, **kwargs): # For newer HF kwargs["cache_implementation"] = "dynamic" - print(kwargs) + # Remove token_type_ids + kwargs.pop("token_type_ids", None) + + # Check pad_token + kwargs["pad_token_id"] = kwargs.pop( + "pad_token_id", + getattr(model.config, "eos_token_id", None), + ) # Set pad token # old_pad_token_id = getattr(model.config, "pad_token_id", None) From 5f496efdb4db75371aa17d5b1b393f96cd55a2bd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Jul 2024 13:22:36 -0700 Subject: [PATCH 020/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 3f75d1686..0469f4d61 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -688,7 +688,12 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): pass # Get untrained tokens - indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps + indicator_untrained1 = torch.amax(embedding_matrix, axis = 1) <= eps + # Check lm_head as well + indicator_untrained2 = torch.amax(lm_head_matrix, axis = 1) <= eps + # Combine both checks + indicator_untrained = indicator_untrained1 & indicator_untrained2 + where_untrained = torch.where(indicator_untrained)[0] n_untrained = where_untrained.shape[0] n_trained = embedding_matrix.shape[0] - n_untrained From e41cc4093c70095e4aef390c8afae85c38aa4eb3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Jul 2024 13:25:59 -0700 Subject: [PATCH 021/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0469f4d61..8474c2c6b 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -42,12 +42,12 @@ IGNORED_TOKENIZER_NAMES = [ - "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", - "unsloth/Mistral-Nemo-Instruct-2407", - "mistralai/Mistral-Nemo-Instruct-2407", - "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", - "unsloth/Mistral-Nemo-Base-2407", - "mistralai/Mistral-Nemo-Base-2407", + # 
"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", + # "unsloth/Mistral-Nemo-Instruct-2407", + # "mistralai/Mistral-Nemo-Instruct-2407", + # "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", + # "unsloth/Mistral-Nemo-Base-2407", + # "mistralai/Mistral-Nemo-Base-2407", ] IGNORED_TOKENIZER_NAMES = frozenset( [x.lower() for x in IGNORED_TOKENIZER_NAMES] From c553b175d239f023882562ac727a92e6fcc95417 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 22 Jul 2024 22:58:02 -0700 Subject: [PATCH 022/147] Llama 3.1 --- unsloth/models/_utils.py | 93 +++++++++++++++++++++++++++++++++++++++- unsloth/models/llama.py | 73 +++++++++++++++++++++++++++++++ unsloth/models/mapper.py | 14 ++++++ 3 files changed, 179 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 466a5fee7..c7c779a23 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -33,6 +33,7 @@ "unsloth_offloaded_gradient_checkpoint", "torch_compile_options", "patch_linear_scaling", + "patch_llama_rope_scaling", "check_nvidia", "create_boolean_mask", "torch_amp_custom_fwd", @@ -332,7 +333,13 @@ def patch_tokenizer(model, tokenizer): Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! Fixes https://github.com/unslothai/unsloth/issues/5 """ - possible_reserved_tokens = ("<|reserved", "<|placeholder", "[control") + possible_reserved_tokens = ( + "<|reserved", # Llama-3 + "<|placeholder", # Phi-3 + "[control", # Forgot where lol + "", # Mistral Nemo + "<|finetune_right_pad_id|>", # Llama-3.1 + ) if model is not None: model.config.update({"unsloth_version" : __version__}) @@ -779,6 +786,90 @@ def patch_linear_scaling( pass +# Patches for Llama-3 LlamaExtendedRotaryEmbedding +def patch_llama_rope_scaling( + model_name = "llama", + rope_module = None, + scaled_rope_module = None, + extended_rope_module = None, + attention_module = None, +): + assert(\ + rope_module is not None and \ + scaled_rope_module is not None and \ + extended_rope_module is not None + ) + assert(attention_module is not None) + + rope_name = rope_module.__name__ + scaled_rope_name = scaled_rope_module.__name__ + model_filepath = f"transformers.models.{model_name}.modeling_{model_name}" + exec_code = \ + f"import torch.nn as nn\n"\ + f"from typing import Union, Optional, List, Any, Callable, Tuple\n"\ + f"from {model_filepath} import logger, "\ + f"{model_name.title()}Attention, {model_name.title()}Config" + + try: + function = inspect.getsource(attention_module.__init__) + except: + # Most likely already patched! 
+ return None, None + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + init_name = f"{model_name.title()}Attention__init__" + function = function.replace("def __init__", f"def {init_name}") + function = function.replace( + "super().__init__()", + f"super({model_name.title()}Attention, self).__init__()", + ) + fix_rope_function = """ + if getattr(self.config, "rope_scaling", None) is None: + # Hack + if self.config.max_position_embeddings == 131072 + self.rotary_emb = {rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling.get("factor") + if scaling_type == "linear": + self.rotary_emb = {scaled_rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "extended": + self.rotary_emb = {extended_rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") + pass + """ + fix_rope_function = fix_rope_function.format( + rope_function = rope_module.__name__, + scaled_rope_function = scaled_rope_module.__name__, + extended_rope_function = extended_rope_module.__name__, + ) + rotary_emb = re.findall( + "self.rotary_emb = .+?\)", function, + flags = re.DOTALL | re.MULTILINE, + ) + if len(rotary_emb) == 0: return None, function + rotary_emb = rotary_emb[0] + function = function.replace(rotary_emb, fix_rope_function, 1) + function = exec_code + "\n\n" + function + return init_name, function +pass + + def check_nvidia(): # Unsloth doesn't work yet on AMD devices - we're working on it! output = np.array([0,]) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ff51b90b8..2d224b3ca 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1052,6 +1052,68 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass +# See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 +# For Llama 3.1 +class LlamaExtendedRotaryEmbedding(LlamaRotaryEmbedding): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, self.max_position_embeddings) + + # Normal Llama-3 RoPE + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) + ) + inv_freq = self.apply_scaling(inv_freq) + self.register_buffer("inv_freq", inv_freq, persistent = False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) + pass + + def _set_cos_sin_cache(self, seq_len, device, dtype): + # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and + # in FP32. They are applied (multiplied) in FP32 as well. 
+ self.current_rope_size = seq_len + + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False) + pass + + def apply_scaling(self, freqs: torch.Tensor): + scale_factor = 8 + low_freq_factor = 1 + high_freq_factor = 4 + old_context_len = 8192 + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scale_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor) + new_freqs.append((1 - smooth) * freq / scale_factor + + smooth * freq) + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + pass +pass + + def _wrap_fast_inference(generate, device_type, dtype, model): # Wraps inference with bfloat16 / float16 @torch.inference_mode @@ -1108,6 +1170,17 @@ class FastLlamaModel: @staticmethod def pre_patch(): + init_name, function = patch_llama_rope_scaling( + model_name = "llama", + rope_module = LlamaRotaryEmbedding, + scaled_rope_module = LlamaLinearScalingRotaryEmbedding, + extended_rope_module = LlamaExtendedRotaryEmbedding, + attention_module = LlamaAttention, + ) + if init_name is not None: + exec(function, globals()) + LlamaAttention.__init__ = eval(init_name) + pass LlamaAttention .forward = LlamaAttention_fast_forward LlamaSdpaAttention .forward = LlamaAttention_fast_forward LlamaFlashAttention2.forward = LlamaAttention_fast_forward diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 38cbdbe99..462c85f2a 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,6 +218,20 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), + "unsloth/llama-3.1-8b-bnb-4bit" : ( + "unsloth/llama-3.1-8b", + "meta-llama/Meta-Llama-3.1-8B", + ), + "unsloth/llama-3.1-8b-Instruct-bnb-4bit" : ( + "unsloth/llama-3.1-8b-Instruct", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + ), + "unsloth/llama-3.1-70b-bnb-4bit" : ( + "meta-llama/Meta-Llama-3.1-70B", + ), + "unsloth/llama-3.1-70b-Instruct-bnb-4bit" : ( + "meta-llama/Meta-Llama-3.1-70B-Instruct", + ), } INT_TO_FLOAT_MAPPER = {} From 00ad7992f69ea086ee4b8e9229d6c901ace494c5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 22 Jul 2024 23:01:18 -0700 Subject: [PATCH 023/147] Update _utils.py --- unsloth/models/_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index c7c779a23..27eb226f2 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -826,13 +826,19 @@ def patch_llama_rope_scaling( ) fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: - # Hack - if self.config.max_position_embeddings == 131072 - self.rotary_emb = {rope_function}( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) + # Hack to check for Llama-3.1 + 
if 'llama-3.1' in str(self.config.config._name_or_path).lower(): + self.rotary_emb = {extended_rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + self.rotary_emb = {rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) else: scaling_type = self.config.rope_scaling["type"] scaling_factor = self.config.rope_scaling.get("factor") From ae2d1b6cacf8ce46c5aed68ef44921f6a498d8e2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:27:36 -0700 Subject: [PATCH 024/147] Llama 3.1 --- README.md | 5 +++-- unsloth/models/_utils.py | 2 ++ unsloth/models/llama.py | 12 +++++++----- unsloth/models/mapper.py | 14 ++++++++------ 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 05977bad7..c666f2d9c 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| -| **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral Nemo (12B)** | [▶️ Start for free](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) | 2x faster | 60% less | | **Gemma 2 (9B)** | [▶️ Start for free](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) | 2x faster | 63% less | | **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | @@ -32,13 +32,14 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | -- **Kaggle Notebooks** for [Llama 3 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- **Kaggle Notebooks** for [Llama 3.1 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral v0.3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News +- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported - 📣 UPDATE! [Phi-3 mini](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) model updated. [Phi-3 Medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) 2x faster finetuning. 
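# For reference: the _utils.py hunk below routes rope_scaling type "llama3" to the
# LlamaExtendedRotaryEmbedding added in PATCH 022. Its apply_scaling rule is restated here
# as a standalone function for readability; the name llama31_scale_inv_freq is illustrative
# only, while the constants are the grid-searched values hard-coded in that patch.
import math
import torch

def llama31_scale_inv_freq(inv_freq: torch.Tensor) -> torch.Tensor:
    scale_factor, low_freq_factor, high_freq_factor = 8, 1, 4
    old_context_len = 8192  # original Llama-3 context length
    low_freq_wavelen  = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    new_freqs = []
    for freq in inv_freq.tolist():
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            # High-frequency components are kept unchanged
            new_freqs.append(freq)
        elif wavelen > low_freq_wavelen:
            # Low-frequency (long-wavelength) components are divided by the scale factor
            new_freqs.append(freq / scale_factor)
        else:
            # The band in between is blended smoothly between the two regimes
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
    return torch.tensor(new_freqs, dtype = inv_freq.dtype)

# This frequency reshaping is what lets Llama 3.1 stretch the original 8192-token RoPE
# out to the 128K context advertised above while leaving short-range positions intact.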
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 27eb226f2..394213b9f 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -868,9 +868,11 @@ def patch_llama_rope_scaling( "self.rotary_emb = .+?\)", function, flags = re.DOTALL | re.MULTILINE, ) + print(rotary_emb) if len(rotary_emb) == 0: return None, function rotary_emb = rotary_emb[0] function = function.replace(rotary_emb, fix_rope_function, 1) + print(function) function = exec_code + "\n\n" + function return init_name, function pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2d224b3ca..58fcc9276 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1088,11 +1088,13 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False) pass - def apply_scaling(self, freqs: torch.Tensor): + # From https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py#L41 + def apply_scaling(freqs: torch.Tensor): + # Values obtained from grid search scale_factor = 8 low_freq_factor = 1 high_freq_factor = 4 - old_context_len = 8192 + old_context_len = 8192 # original llama3 length low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -1106,9 +1108,9 @@ def apply_scaling(self, freqs: torch.Tensor): else: assert low_freq_wavelen != high_freq_wavelen smooth = (old_context_len / wavelen - low_freq_factor) / ( - high_freq_factor - low_freq_factor) - new_freqs.append((1 - smooth) * freq / scale_factor + - smooth * freq) + high_freq_factor - low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) pass pass diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462c85f2a..fc13c94e8 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,18 +218,20 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), - "unsloth/llama-3.1-8b-bnb-4bit" : ( - "unsloth/llama-3.1-8b", + "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B", "meta-llama/Meta-Llama-3.1-8B", ), - "unsloth/llama-3.1-8b-Instruct-bnb-4bit" : ( - "unsloth/llama-3.1-8b-Instruct", + "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct", ), - "unsloth/llama-3.1-70b-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B", "meta-llama/Meta-Llama-3.1-70B", ), - "unsloth/llama-3.1-70b-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B-Instruct", "meta-llama/Meta-Llama-3.1-70B-Instruct", ), } From 77c502cc0c97a84cf9230308919ad56aed9ef4f9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:33:07 -0700 Subject: [PATCH 025/147] Update _utils.py --- unsloth/models/_utils.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 394213b9f..3ea38eab1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -826,22 +826,17 @@ def patch_llama_rope_scaling( ) fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: - # Hack to check for Llama-3.1 - if 'llama-3.1' in str(self.config.config._name_or_path).lower(): - self.rotary_emb = {extended_rope_function}( - self.head_dim, - 
max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - self.rotary_emb = {rope_function}( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) + self.rotary_emb = {rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) else: - scaling_type = self.config.rope_scaling["type"] + scaling_type1 = self.config.rope_scaling.get("type", None) + scaling_type2 = self.config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 scaling_factor = self.config.rope_scaling.get("factor") + if scaling_type == "linear": self.rotary_emb = {scaled_rope_function}( self.head_dim, @@ -849,7 +844,7 @@ def patch_llama_rope_scaling( scaling_factor=scaling_factor, base=self.rope_theta, ) - elif scaling_type == "extended": + elif scaling_type == "llama3": self.rotary_emb = {extended_rope_function}( self.head_dim, max_position_embeddings=self.max_position_embeddings, From 41ee26ce655e05769172c209ea6dd3f8174baefc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:35:03 -0700 Subject: [PATCH 026/147] Update llama.py --- unsloth/models/llama.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 58fcc9276..403a7130b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1054,7 +1054,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 # For Llama 3.1 -class LlamaExtendedRotaryEmbedding(LlamaRotaryEmbedding): +class LlamaExtendedRotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() self.dim = dim @@ -1113,6 +1113,24 @@ def apply_scaling(freqs: torch.Tensor): new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) pass + + def forward(self, x, position_ids=None, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.current_rope_size: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), + ) + pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass pass From 3dabf84ab67164c5a4c42c1ea598bdfbee320c6f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:36:06 -0700 Subject: [PATCH 027/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 403a7130b..830b345d7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1089,7 +1089,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass # From https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py#L41 - def apply_scaling(freqs: torch.Tensor): + def apply_scaling(self, freqs: torch.Tensor): # Values obtained from grid search scale_factor = 8 low_freq_factor = 1 From 07634b920399ffd0546be7a460bc27c48cc60b34 Mon 
Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:43:36 -0700 Subject: [PATCH 028/147] hack for rotary --- unsloth/models/gemma.py | 8 ++++++-- unsloth/models/llama.py | 12 +++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index ce89ad3be..6c9a57abf 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -205,7 +205,9 @@ class GemmaFixedRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings @@ -264,7 +266,9 @@ class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + config = None, # [TODO] Hack to pass in config - need to remove later + ): self.scaling_factor = scaling_factor super().__init__(dim, max_position_embeddings, base, device) pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 830b345d7..929c32496 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -977,7 +977,9 @@ class LlamaRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings @@ -1030,7 +1032,9 @@ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + config = None, # [TODO] Hack to pass in config - need to remove later + ): self.scaling_factor = scaling_factor super().__init__(dim, max_position_embeddings, base, device) pass @@ -1055,7 +1059,9 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 # For Llama 3.1 class LlamaExtendedRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings From 4a46220131efa70892b48468406dc3bcaaf569bc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:48:45 -0700 Subject: [PATCH 029/147] patch RoPE --- unsloth/models/_utils.py | 12 ++++++------ unsloth/models/gemma.py | 6 ++++-- unsloth/models/llama.py | 9 ++++++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3ea38eab1..2b8410032 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -752,7 +752,7 @@ def patch_linear_scaling( fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: self.rotary_emb = {rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) @@ -761,7 +761,7 @@ def patch_linear_scaling( scaling_factor = self.config.rope_scaling["factor"] if scaling_type == "linear": self.rotary_emb = {scaled_rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor, base=self.rope_theta, @@ -827,7 +827,7 @@ def patch_llama_rope_scaling( fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: self.rotary_emb = {rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) @@ -836,17 +836,17 @@ def patch_llama_rope_scaling( scaling_type2 = self.config.rope_scaling.get("rope_type", None) scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 scaling_factor = self.config.rope_scaling.get("factor") - + if scaling_type == "linear": self.rotary_emb = {scaled_rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor, base=self.rope_theta, ) elif scaling_type == "llama3": self.rotary_emb = {extended_rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 6c9a57abf..3dccf63ae 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -205,9 +205,10 @@ class GemmaFixedRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings @@ -266,9 +267,10 @@ class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later self.scaling_factor = scaling_factor super().__init__(dim, max_position_embeddings, base, device) pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 929c32496..d043f03d1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -977,9 +977,10 @@ class LlamaRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings @@ -1032,9 +1033,10 @@ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later self.scaling_factor = scaling_factor super().__init__(dim, max_position_embeddings, base, device) pass @@ -1059,9 +1061,10 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 # For Llama 3.1 class LlamaExtendedRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings From 2d9f189cbe977c4d5bafc9629e9aa0558e373e96 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:53:31 -0700 Subject: [PATCH 030/147] refix rope --- unsloth/models/gemma.py | 5 ++--- unsloth/models/llama.py | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 3dccf63ae..e3f1e615d 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -208,8 +208,8 @@ class GemmaFixedRotaryEmbedding(torch.nn.Module): def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() + if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -270,9 +270,8 @@ class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding): def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) + super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config) pass def _set_cos_sin_cache(self, seq_len, device, dtype): diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d043f03d1..a4a6527ff 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -980,8 +980,9 @@ class LlamaRotaryEmbedding(torch.nn.Module): def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() + if config is not None: return # [TODO] Hack to pass in config - need to remove later + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -1036,9 +1037,8 @@ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): def __init__(self, dim = None, max_position_embeddings=2048, 
base=10000, device=None, scaling_factor=1.0, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) + super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config) pass def _set_cos_sin_cache(self, seq_len, device, dtype): @@ -1064,8 +1064,9 @@ class LlamaExtendedRotaryEmbedding(torch.nn.Module): def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() + if config is not None: return # [TODO] Hack to pass in config - need to remove later + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base From 80d62c3fa6ae248623c974b2926b61c3dba62da3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:54:54 -0700 Subject: [PATCH 031/147] Update _utils.py --- unsloth/models/_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 2b8410032..b021e89e9 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -863,11 +863,9 @@ def patch_llama_rope_scaling( "self.rotary_emb = .+?\)", function, flags = re.DOTALL | re.MULTILINE, ) - print(rotary_emb) if len(rotary_emb) == 0: return None, function rotary_emb = rotary_emb[0] function = function.replace(rotary_emb, fix_rope_function, 1) - print(function) function = exec_code + "\n\n" + function return init_name, function pass From 7d7a5f77655b373c0c50b8df7a2a43ee950dc852 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:58:31 -0700 Subject: [PATCH 032/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index a4a6527ff..a4b655216 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1065,7 +1065,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - if config is not None: return # [TODO] Hack to pass in config - need to remove later + # if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim self.max_position_embeddings = max_position_embeddings From 2f9bd5bcb61f1530a48ee08bbdd5adbd4ec39a33 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:09:24 -0700 Subject: [PATCH 033/147] Llama 3.1 check --- pyproject.toml | 4 ++-- unsloth/models/loader.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 29b35577e..829b35ad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ exclude = ["images*"] huggingface = [ "packaging", "tyro", - "transformers>=4.42.3", + "transformers>=4.43.1", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -188,7 +188,7 @@ colab-ampere-torch220 = [ colab-new = [ "packaging", "tyro", - "transformers>=4.42.3", + "transformers>=4.43.1", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0f170597b..ece8af282 100644 --- a/unsloth/models/loader.py +++ 
b/unsloth/models/loader.py @@ -27,6 +27,7 @@ SUPPORTS_FOURBIT = transformers_version >= Version("4.37") SUPPORTS_GEMMA = transformers_version >= Version("4.38") SUPPORTS_GEMMA2 = transformers_version >= Version("4.42") +SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1") if SUPPORTS_GEMMA: from .gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -130,7 +131,19 @@ def from_pretrained( model_type = model_config.model_type - if model_type == "llama": dispatch_model = FastLlamaModel + if model_type == "llama": + scaling_type1 = model_config.rope_scaling.get("type", None) + scaling_type2 = model_config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + + if scaling_type == "llama3" and not SUPPORTS_LLAMA31: + raise ImportError( + f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\ + f"The minimum required version is 4.43.1\n"\ + f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\ + f"to obtain the latest transformers build, then restart this session."\ + ) + dispatch_model = FastLlamaModel elif model_type == "mistral": dispatch_model = FastMistralModel elif model_type == "gemma": if not SUPPORTS_GEMMA: From 740979b1b9af32d39af7904973a71aaadf009984 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:12:58 -0700 Subject: [PATCH 034/147] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index a4b655216..295d92f62 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1010,6 +1010,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass def forward(self, x, position_ids=None, seq_len=None): + print(x, position_ids, seq_len) # x: [bs, num_attention_heads, seq_len, head_size] if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) From 47d230b3cd043306463e2b76bd8023f867427ea2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:13:15 -0700 Subject: [PATCH 035/147] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 295d92f62..ff4d19c54 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1010,7 +1010,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass def forward(self, x, position_ids=None, seq_len=None): - print(x, position_ids, seq_len) + print(__LINE__, x, position_ids, seq_len) # x: [bs, num_attention_heads, seq_len, head_size] if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) @@ -1127,6 +1127,7 @@ def apply_scaling(self, freqs: torch.Tensor): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] + print(__LINE__, x, position_ids, seq_len) if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) From f849b8b61f387b672d74de4a4372d03fdebcf809 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:15:35 -0700 Subject: [PATCH 036/147] Update llama.py --- unsloth/models/llama.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ff4d19c54..d2bbb5a3a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1010,7 +1010,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): 
pass def forward(self, x, position_ids=None, seq_len=None): - print(__LINE__, x, position_ids, seq_len) # x: [bs, num_attention_heads, seq_len, head_size] if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) @@ -1066,6 +1065,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() + print(__FILE__, __LINE__) # if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim @@ -1080,6 +1080,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= ) inv_freq = self.apply_scaling(inv_freq) self.register_buffer("inv_freq", inv_freq, persistent = False) + print(__FILE__, __LINE__) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) @@ -1089,6 +1090,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. self.current_rope_size = seq_len + print(__FILE__, __LINE__) t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() @@ -1127,7 +1129,6 @@ def apply_scaling(self, freqs: torch.Tensor): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - print(__LINE__, x, position_ids, seq_len) if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) From 6157cef3d1a37bb432389686ba35038d751b6ba6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:16:31 -0700 Subject: [PATCH 037/147] Update llama.py --- unsloth/models/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d2bbb5a3a..de9eb80da 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1080,7 +1080,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= ) inv_freq = self.apply_scaling(inv_freq) self.register_buffer("inv_freq", inv_freq, persistent = False) - print(__FILE__, __LINE__) + print(__LINE__) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) @@ -1090,7 +1090,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
self.current_rope_size = seq_len - print(__FILE__, __LINE__) + print(__LINE__) t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() From 5da00a946e2af9ebfd1aaf1f3885e94b628745a3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:16:40 -0700 Subject: [PATCH 038/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index de9eb80da..3f358fe67 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1065,7 +1065,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - print(__FILE__, __LINE__) + print(__LINE__) # if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim From 2ff7d8368c44c78db1e8cd10326b3c88055d8832 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:18:12 -0700 Subject: [PATCH 039/147] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3f358fe67..3085ccd0b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1065,7 +1065,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - print(__LINE__) + print(1068) # if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim @@ -1080,7 +1080,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= ) inv_freq = self.apply_scaling(inv_freq) self.register_buffer("inv_freq", inv_freq, persistent = False) - print(__LINE__) + print(1083) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) @@ -1090,7 +1090,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
self.current_rope_size = seq_len - print(__LINE__) + print(1093) t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() From 7c441f3480b217e1909d7b7eb53eb77a6481c7fc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:21:29 -0700 Subject: [PATCH 040/147] Update llama.py --- unsloth/models/llama.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3085ccd0b..f9981f56e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1065,9 +1065,15 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - print(1068) - # if config is not None: return # [TODO] Hack to pass in config - need to remove later - + if config is not None: + # [TODO] Hack to pass in config - need to remove later + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -1080,7 +1086,6 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= ) inv_freq = self.apply_scaling(inv_freq) self.register_buffer("inv_freq", inv_freq, persistent = False) - print(1083) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) @@ -1090,7 +1095,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
self.current_rope_size = seq_len - print(1093) t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() From 5d9245660fd3a739d992f4a0e717ee8c85bdb635 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:22:27 -0700 Subject: [PATCH 041/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index f9981f56e..8fc480d14 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1073,7 +1073,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= device = "cuda" max_position_embeddings = config.max_position_embeddings pass - + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base From 4a3fddd055333f2eeb4ba58cdbf374e449ce3c3a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:23:00 -0700 Subject: [PATCH 042/147] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8fc480d14..474dad329 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1067,13 +1067,14 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= super().__init__() if config is not None: # [TODO] Hack to pass in config - need to remove later + print(1) base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) device = "cuda" max_position_embeddings = config.max_position_embeddings pass - + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base From ca3a1b7315c54ccffafe60b5c7abe6869cd7be6a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:23:18 -0700 Subject: [PATCH 043/147] Update llama.py --- unsloth/models/llama.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 474dad329..aef04d604 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -981,8 +981,16 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - if config is not None: return # [TODO] Hack to pass in config - need to remove later - + if config is not None: + # [TODO] Hack to pass in config - need to remove later + print(2) + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base From b93a75778e2ef0b7c9b83b1cc329c6e2f7649b73 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:24:58 -0700 Subject: [PATCH 044/147] Update llama.py --- unsloth/models/llama.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index aef04d604..338ae0a7c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -983,7 +983,6 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= super().__init__() if config is not None: # [TODO] Hack to pass in config - need to remove later - print(2) 
base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) @@ -1075,7 +1074,6 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= super().__init__() if config is not None: # [TODO] Hack to pass in config - need to remove later - print(1) base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) From c86b13d46512c5e7a8b2221e885a3a00eb0ad59a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:40:49 -0700 Subject: [PATCH 045/147] Llama 3.1 (#797) * Llama 3.1 * Update _utils.py * Llama 3.1 * Update _utils.py * Update llama.py * Update llama.py * hack for rotary * patch RoPE * refix rope * Update _utils.py * Update llama.py * Llama 3.1 check * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py --- README.md | 5 +- pyproject.toml | 4 +- unsloth/models/_utils.py | 98 ++++++++++++++++++++++++++++++- unsloth/models/gemma.py | 11 +++- unsloth/models/llama.py | 123 ++++++++++++++++++++++++++++++++++++++- unsloth/models/loader.py | 15 ++++- unsloth/models/mapper.py | 16 +++++ 7 files changed, 258 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 05977bad7..c666f2d9c 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| -| **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral Nemo (12B)** | [▶️ Start for free](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) | 2x faster | 60% less | | **Gemma 2 (9B)** | [▶️ Start for free](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) | 2x faster | 63% less | | **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | @@ -32,13 +32,14 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | -- **Kaggle Notebooks** for [Llama 3 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- **Kaggle Notebooks** for [Llama 3.1 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral v0.3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News +- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported - 📣 UPDATE! [Phi-3 mini](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) model updated. [Phi-3 Medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) 2x faster finetuning. diff --git a/pyproject.toml b/pyproject.toml index 29b35577e..829b35ad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ exclude = ["images*"] huggingface = [ "packaging", "tyro", - "transformers>=4.42.3", + "transformers>=4.43.1", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -188,7 +188,7 @@ colab-ampere-torch220 = [ colab-new = [ "packaging", "tyro", - "transformers>=4.42.3", + "transformers>=4.43.1", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 466a5fee7..b021e89e9 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -33,6 +33,7 @@ "unsloth_offloaded_gradient_checkpoint", "torch_compile_options", "patch_linear_scaling", + "patch_llama_rope_scaling", "check_nvidia", "create_boolean_mask", "torch_amp_custom_fwd", @@ -332,7 +333,13 @@ def patch_tokenizer(model, tokenizer): Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! 
Fixes https://github.com/unslothai/unsloth/issues/5 """ - possible_reserved_tokens = ("<|reserved", "<|placeholder", "[control") + possible_reserved_tokens = ( + "<|reserved", # Llama-3 + "<|placeholder", # Phi-3 + "[control", # Forgot where lol + "", # Mistral Nemo + "<|finetune_right_pad_id|>", # Llama-3.1 + ) if model is not None: model.config.update({"unsloth_version" : __version__}) @@ -745,7 +752,7 @@ def patch_linear_scaling( fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: self.rotary_emb = {rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) @@ -754,7 +761,7 @@ def patch_linear_scaling( scaling_factor = self.config.rope_scaling["factor"] if scaling_type == "linear": self.rotary_emb = {scaled_rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor, base=self.rope_theta, @@ -779,6 +786,91 @@ def patch_linear_scaling( pass +# Patches for Llama-3 LlamaExtendedRotaryEmbedding +def patch_llama_rope_scaling( + model_name = "llama", + rope_module = None, + scaled_rope_module = None, + extended_rope_module = None, + attention_module = None, +): + assert(\ + rope_module is not None and \ + scaled_rope_module is not None and \ + extended_rope_module is not None + ) + assert(attention_module is not None) + + rope_name = rope_module.__name__ + scaled_rope_name = scaled_rope_module.__name__ + model_filepath = f"transformers.models.{model_name}.modeling_{model_name}" + exec_code = \ + f"import torch.nn as nn\n"\ + f"from typing import Union, Optional, List, Any, Callable, Tuple\n"\ + f"from {model_filepath} import logger, "\ + f"{model_name.title()}Attention, {model_name.title()}Config" + + try: + function = inspect.getsource(attention_module.__init__) + except: + # Most likely already patched! 
+ return None, None + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + init_name = f"{model_name.title()}Attention__init__" + function = function.replace("def __init__", f"def {init_name}") + function = function.replace( + "super().__init__()", + f"super({model_name.title()}Attention, self).__init__()", + ) + fix_rope_function = """ + if getattr(self.config, "rope_scaling", None) is None: + self.rotary_emb = {rope_function}( + dim = self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type1 = self.config.rope_scaling.get("type", None) + scaling_type2 = self.config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + scaling_factor = self.config.rope_scaling.get("factor") + + if scaling_type == "linear": + self.rotary_emb = {scaled_rope_function}( + dim = self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "llama3": + self.rotary_emb = {extended_rope_function}( + dim = self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") + pass + """ + fix_rope_function = fix_rope_function.format( + rope_function = rope_module.__name__, + scaled_rope_function = scaled_rope_module.__name__, + extended_rope_function = extended_rope_module.__name__, + ) + rotary_emb = re.findall( + "self.rotary_emb = .+?\)", function, + flags = re.DOTALL | re.MULTILINE, + ) + if len(rotary_emb) == 0: return None, function + rotary_emb = rotary_emb[0] + function = function.replace(rotary_emb, fix_rope_function, 1) + function = exec_code + "\n\n" + function + return init_name, function +pass + + def check_nvidia(): # Unsloth doesn't work yet on AMD devices - we're working on it! output = np.array([0,]) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index ce89ad3be..e3f1e615d 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -205,8 +205,11 @@ class GemmaFixedRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() + if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -264,9 +267,11 @@ class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + config = None, # [TODO] Hack to pass in config - need to remove later + ): self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) + super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config) pass def _set_cos_sin_cache(self, seq_len, device, dtype): diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ff51b90b8..338ae0a7c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -977,8 +977,19 @@ class LlamaRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() + if config is not None: + # [TODO] Hack to pass in config - need to remove later + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -1030,9 +1041,11 @@ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + config = None, # [TODO] Hack to pass in config - need to remove later + ): self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) + super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config) pass def _set_cos_sin_cache(self, seq_len, device, dtype): @@ -1052,6 +1065,99 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass +# See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 +# For Llama 3.1 +class LlamaExtendedRotaryEmbedding(torch.nn.Module): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): + super().__init__() + if config is not None: + # [TODO] Hack to pass in config - need to remove later + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, self.max_position_embeddings) + + # Normal Llama-3 RoPE + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) + ) + inv_freq = self.apply_scaling(inv_freq) + self.register_buffer("inv_freq", inv_freq, persistent = False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) + pass + + def _set_cos_sin_cache(self, seq_len, device, dtype): + # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and + # in FP32. They are applied (multiplied) in FP32 as well. 
+ self.current_rope_size = seq_len + + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False) + pass + + # From https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py#L41 + def apply_scaling(self, freqs: torch.Tensor): + # Values obtained from grid search + scale_factor = 8 + low_freq_factor = 1 + high_freq_factor = 4 + old_context_len = 8192 # original llama3 length + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scale_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + pass + + def forward(self, x, position_ids=None, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.current_rope_size: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), + ) + pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass +pass + + def _wrap_fast_inference(generate, device_type, dtype, model): # Wraps inference with bfloat16 / float16 @torch.inference_mode @@ -1108,6 +1214,17 @@ class FastLlamaModel: @staticmethod def pre_patch(): + init_name, function = patch_llama_rope_scaling( + model_name = "llama", + rope_module = LlamaRotaryEmbedding, + scaled_rope_module = LlamaLinearScalingRotaryEmbedding, + extended_rope_module = LlamaExtendedRotaryEmbedding, + attention_module = LlamaAttention, + ) + if init_name is not None: + exec(function, globals()) + LlamaAttention.__init__ = eval(init_name) + pass LlamaAttention .forward = LlamaAttention_fast_forward LlamaSdpaAttention .forward = LlamaAttention_fast_forward LlamaFlashAttention2.forward = LlamaAttention_fast_forward diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0f170597b..ece8af282 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -27,6 +27,7 @@ SUPPORTS_FOURBIT = transformers_version >= Version("4.37") SUPPORTS_GEMMA = transformers_version >= Version("4.38") SUPPORTS_GEMMA2 = transformers_version >= Version("4.42") +SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1") if SUPPORTS_GEMMA: from .gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -130,7 +131,19 @@ def from_pretrained( model_type = model_config.model_type - if model_type == "llama": dispatch_model = FastLlamaModel + if model_type == "llama": + scaling_type1 = 
model_config.rope_scaling.get("type", None) + scaling_type2 = model_config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + + if scaling_type == "llama3" and not SUPPORTS_LLAMA31: + raise ImportError( + f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\ + f"The minimum required version is 4.43.1\n"\ + f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\ + f"to obtain the latest transformers build, then restart this session."\ + ) + dispatch_model = FastLlamaModel elif model_type == "mistral": dispatch_model = FastMistralModel elif model_type == "gemma": if not SUPPORTS_GEMMA: diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 38cbdbe99..fc13c94e8 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,6 +218,22 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), + "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B", + "meta-llama/Meta-Llama-3.1-8B", + ), + "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B-Instruct", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + ), + "unsloth/Meta-Llama-3.1-70B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B", + "meta-llama/Meta-Llama-3.1-70B", + ), + "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B-Instruct", + "meta-llama/Meta-Llama-3.1-70B-Instruct", + ), } INT_TO_FLOAT_MAPPER = {} From 22968a2134f3fb265a6158610a8ef173ba9547aa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:51:08 -0700 Subject: [PATCH 046/147] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c666f2d9c..e7ef854cf 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ unsloth logo - + @@ -22,7 +22,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| -| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) | 2x faster | 60% less | | **Mistral Nemo (12B)** | [▶️ Start for free](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) | 2x faster | 60% less | | **Gemma 2 (9B)** | [▶️ Start for free](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) | 2x faster | 63% less | | **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | @@ -39,7 +39,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News -- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) both Base and Instruct now supported +- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! 
[Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported - 📣 UPDATE! [Phi-3 mini](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) model updated. [Phi-3 Medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) 2x faster finetuning. From 824511e265ff9c45b2448d4c89c93d0306c42741 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:07:27 -0700 Subject: [PATCH 047/147] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e7ef854cf..1c98c43f1 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | -- **Kaggle Notebooks** for [Llama 3.1 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- **Kaggle Notebooks** for [Llama 3.1 (8B)](https://www.kaggle.com/danielhanchen/kaggle-llama-3-1-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral v0.3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language From 777453967fc8476a846983e9c5eeab3382b88543 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:12:29 -0700 Subject: [PATCH 048/147] Update loader.py --- unsloth/models/loader.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ece8af282..85416b81b 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -132,9 +132,12 @@ def from_pretrained( model_type = model_config.model_type if model_type == "llama": - scaling_type1 = model_config.rope_scaling.get("type", None) - scaling_type2 = model_config.rope_scaling.get("rope_type", None) - scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + scaling_type = None + if getattr(model_config, "rope_scaling", None) is not None: + scaling_type1 = model_config.rope_scaling.get("type", None) + scaling_type2 = model_config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + pass if scaling_type == "llama3" and not SUPPORTS_LLAMA31: raise ImportError( From caa402828715d428b5426955df8fecc8e3fe1c80 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:25:24 -0700 Subject: [PATCH 049/147] 
Update _utils.py --- unsloth/models/_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index b021e89e9..5a2e85997 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -393,7 +393,10 @@ def patch_tokenizer(model, tokenizer): tokenizer.pad_token = possible_pad_token if model is not None: config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) - pass + else: + if model is not None: + if model.config.pad_token_id is None: + config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) return model, tokenizer pass From 4dd4ad2104ae9865a029f0408df89c7121f353e9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:27:46 -0700 Subject: [PATCH 050/147] Update llama.py --- unsloth/models/llama.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 338ae0a7c..719aee537 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1178,10 +1178,11 @@ def _fast_generate(*args, **kwargs): kwargs.pop("token_type_ids", None) # Check pad_token - kwargs["pad_token_id"] = kwargs.pop( - "pad_token_id", - getattr(model.config, "eos_token_id", None), - ) + model_eos_token_id = getattr(model.config, "eos_token_id", None) + if hasattr(model_eos_token_id, "__iter__"): + model_eos_token_id = model_eos_token_id[0] + + kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) # Set pad token # old_pad_token_id = getattr(model.config, "pad_token_id", None) From cc11b7886138e45690a459019f57c53675a70623 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:28:12 -0700 Subject: [PATCH 051/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 719aee537..ba4362b3c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1179,7 +1179,7 @@ def _fast_generate(*args, **kwargs): # Check pad_token model_eos_token_id = getattr(model.config, "eos_token_id", None) - if hasattr(model_eos_token_id, "__iter__"): + if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"): model_eos_token_id = model_eos_token_id[0] kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) From d1f3b6c1c4f69cd09ebdcab014bd72ac1217ee71 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 13:14:21 -0700 Subject: [PATCH 052/147] Create Run.png --- images/Run.png | Bin 0 -> 11471 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 images/Run.png diff --git a/images/Run.png b/images/Run.png new file mode 100644 index 0000000000000000000000000000000000000000..fd737aa4d6e3684a0ae3405565f95d52e521b785 GIT binary patch literal 11471 zcmXw9bwHHQ)29*X1}Om%NkJr}5hNv)?hffZjxGrS2`Q-qM7lw`r6o?fkE0uoZg`*b z`+NU5j@#$ishypj&+L3rQ<1}e{NynT3JSi0yv#=w6f_9%fBz#a;IEYUvKsJ(<1DZ1 zhJu1iiu|CWq-9W{pl}8$$Vh2`GxisKd^OH8AC4$ZpK8lL#X&VA;a)a-L>UUtBksW+ zdu&$gVLDnoIT>u{X=hlfH?OG`FnP9lU%ZQNr>`+Fr}&|h-6;DzI~vpHK$71qGTe+f zaX!J!xA#JLZ{NSNwi-!wvpV({r$0gYmg*+{=Dii{L~QBg3Y0G4NNyG--jTgpPiyit zZf=7D_Zd2GAokX}WF`|6(`Y$80!=PRFPpsZliNIQB6@So62Tia>xmN~LrF4u>|WaT zuj2RC+_KqZ#Kgp#nWB{mw#U4+lGJK=_QKmx^r_^j+81D~Ui4_p5zGTOOQcm5JW8CMpq^)BO|j=N{MJ)732xtTnc3ASPH7hWE;$E+FoJmXlU3J z>5xE|KpD&6k^|oSA<2)RgtNdIs3-+JJ{kIT^O0WiIy3hsZ2}T8X(^!lyWVOMh+^N; zn?r0;$X4A^3APA^P+;9)l7-l}1rZSuGaT5=Zi(g=anw|)Kb5u>JNbwk@L%+N_vHP` 
z7HXg+OdM*K;q7Vhy9CKTv9Pya11fCTKd?X{iV8#T&E@!KkKif-W~J6mcSeUsUNG7o z)(ASpt%(TGFaFUnt!&ubzB}W-iyZ>P9Bogz!$UH&s}?DAgD#z{RTB=Sfy@N4Ahv4O zW)L3_Mb*FEpyTTSSk9rNU#9(eFxEN~8~C~u{V|XCM}DBI4k^44KhT|M&A9qObD2QtOU(g>ZfhDS!&V&EV_?a{`h`7m{h_tsd!-Eb6NL{A>((XlC+^ zhfrwG9(fosB@5v#e2I6V5Ydn1{{hIuySdyRltU%`7yrNBM3O|TXwC#&(f4GA<0Q(R z^Am5owqYfkTEs6?&Hfx62>v~Q?BTq80=~QC!fl{$z*!wfw$((?fzY9-0VwRW z2>acCF#%HJ6o^q-%RdCUT>CN;H#s(u0_dLVC1U$%ezmVX znIQU&KaJll)Zppt7c{_YyGd%Pa#MOb@)Pc(=jhXyv*FZD?C`tQ`h+u7EYs5L#1NOx zkGARhoN=zV_f?X?`;$O-kh_?&BgZTYLnn1mudYjQ&|m literal 0 HcmV?d00001 From a96d16e44ba2a07f5b6cdc10919286c23a984fe1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 15:08:09 -0700 Subject: [PATCH 053/147] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c98c43f1..4c1271396 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ -### Finetune Llama 3, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory! +### Finetune Llama 3.1, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory! ![](https://i.ibb.co/sJ7RhGG/image-41.png) From bd180c13579f199516ac285ad724f99d11c562c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Jul 2024 14:05:31 -0700 Subject: [PATCH 054/147] Mistral --- unsloth/models/_utils.py | 4 ++-- unsloth/models/mapper.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5a2e85997..213cb5b0a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2024.7" +__version__ = "2024.8" __all__ = [ "prepare_model_for_kbit_training", @@ -336,7 +336,7 @@ def patch_tokenizer(model, tokenizer): possible_reserved_tokens = ( "<|reserved", # Llama-3 "<|placeholder", # Phi-3 - "[control", # Forgot where lol + "[control", # Mistral type models "", # Mistral Nemo "<|finetune_right_pad_id|>", # Llama-3.1 ) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index fc13c94e8..462555f31 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -227,13 +227,20 @@ "meta-llama/Meta-Llama-3.1-8B-Instruct", ), "unsloth/Meta-Llama-3.1-70B-bnb-4bit" : ( - "unsloth/Meta-Llama-3.1-70B", "meta-llama/Meta-Llama-3.1-70B", ), + "unsloth/Meta-Llama-3.1-405B-bnb-4bit" : ( + "meta-llama/Meta-Llama-3.1-405B", + ), + "unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit" : ( + "meta-llama/Meta-Llama-3.1-405B-Instruct", + ), "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit" : ( - "unsloth/Meta-Llama-3.1-70B-Instruct", "meta-llama/Meta-Llama-3.1-70B-Instruct", ), + "unsloth/Mistral-Large-Instruct-2407-bnb-4bit" : ( + "mistralai/Mistral-Large-Instruct-2407", + ), } INT_TO_FLOAT_MAPPER = {} From 6e30a7a006d51dc5692f4687a5b38a19c7e48596 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Jul 2024 23:45:39 -0700 Subject: [PATCH 055/147] Patch PEFT --- unsloth/models/llama.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ba4362b3c..96eb5035e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2063,9 +2063,9 @@ def patch_peft_model( (getattr(gate_proj, "base_layer", gate_proj).bias is None) and \ (getattr( up_proj, "base_layer", up_proj).bias is None) and \ (getattr(down_proj, "base_layer", down_proj).bias is None) and \ - (getattr(gate_proj, "lora_magnitude_vector", None) is 
None) and \ - (getattr( up_proj, "lora_magnitude_vector", None) is None) and \ - (getattr(down_proj, "lora_magnitude_vector", None) is None): + (len(getattr(gate_proj, "lora_magnitude_vector", [])) == 0) and \ + (len(getattr( up_proj, "lora_magnitude_vector", [])) == 0) and \ + (len(getattr(down_proj, "lora_magnitude_vector", [])) == 0): # https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp) @@ -2085,11 +2085,11 @@ def patch_peft_model( hasattr(k_proj, "lora_A") and \ hasattr(v_proj, "lora_A") and \ (getattr(q_proj, "base_layer", q_proj).bias is None) and \ - (getattr(q_proj, "base_layer", k_proj).bias is None) and \ - (getattr(q_proj, "base_layer", v_proj).bias is None) and \ - (getattr(q_proj, "lora_magnitude_vector", None) is None) and \ - (getattr(k_proj, "lora_magnitude_vector", None) is None) and \ - (getattr(v_proj, "lora_magnitude_vector", None) is None): + (getattr(k_proj, "base_layer", k_proj).bias is None) and \ + (getattr(v_proj, "base_layer", v_proj).bias is None) and \ + (len(getattr(q_proj, "lora_magnitude_vector", [])) == 0) and \ + (len(getattr(k_proj, "lora_magnitude_vector", [])) == 0) and \ + (len(getattr(v_proj, "lora_magnitude_vector", [])) == 0): layer.self_attn.apply_qkv = apply_lora_qkv n_qkv += 1 @@ -2106,7 +2106,7 @@ def patch_peft_model( o_proj = layer.self_attn.o_proj if hasattr(o_proj, "lora_A") and \ (getattr(o_proj, "base_layer", o_proj).bias is None) and \ - (getattr(o_proj, "lora_magnitude_vector", None) is None): + (len(getattr(o_proj, "lora_magnitude_vector", [])) == 0): layer.self_attn.apply_o = apply_lora_o n_o += 1 From 08d3ef4bb3a1da4de67c9e4135e4ea4838895164 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:17:19 -0700 Subject: [PATCH 056/147] Fix PEFT --- unsloth/models/_utils.py | 54 +++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 213cb5b0a..23297cc4a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -404,32 +404,36 @@ def patch_tokenizer(model, tokenizer): # ============================================= # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16?? # For mixed precision, we need it to be in float32 not float16. -from peft.tuners.lora.layer import LoraLayer -import inspect, re -try: - source = inspect.getsource(LoraLayer.update_layer) - text = "if weight is not None:\n" - start = source.find(text) + len(text) - end = source.find("self.to(weight.device)", start) - spaces = re.findall(r"^([ ]{1,})break", source, flags = re.MULTILINE)[0] - source = source.replace(source[start : end], spaces) - spaces = len(re.match(r"[\s]{1,}", source).group(0)) - lines = source.split("\n") - source = "\n".join(x[spaces:] for x in lines) - source = re.sub("([^\.])nn\.", r"\1torch.nn.", source) - source = source.replace("def update_layer", "def LoraLayer_update_layer") - exec(source, globals()) - - # Fix up incorrect downcasting of LoRA weights +from packaging import Version +from peft import __version__ +if Version(__version__) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer - LoraLayer.update_layer = LoraLayer_update_layer - from peft.tuners.lora import LoraLayer - LoraLayer.update_layer = LoraLayer_update_layer -except: - logger.warning_once( - "Unsloth unsuccessfully patched LoraLayer.update_layer. 
Please file a bug report.\n"\ - "Luckily, your training run will still work in the meantime!" - ) + import inspect, re + try: + source = inspect.getsource(LoraLayer.update_layer) + text = "if weight is not None:\n" + start = source.find(text) + len(text) + end = source.find("self.to(weight.device)", start) + spaces = re.findall(r"^([ ]{1,})break", source, flags = re.MULTILINE)[0] + source = source.replace(source[start : end], spaces) + spaces = len(re.match(r"[\s]{1,}", source).group(0)) + lines = source.split("\n") + source = "\n".join(x[spaces:] for x in lines) + source = re.sub("([^\.])nn\.", r"\1torch.nn.", source) + source = source.replace("def update_layer", "def LoraLayer_update_layer") + exec(source, globals()) + + # Fix up incorrect downcasting of LoRA weights + from peft.tuners.lora.layer import LoraLayer + LoraLayer.update_layer = LoraLayer_update_layer + from peft.tuners.lora import LoraLayer + LoraLayer.update_layer = LoraLayer_update_layer + except: + logger.warning_once( + "Unsloth unsuccessfully patched LoraLayer.update_layer. Please file a bug report.\n"\ + "Luckily, your training run will still work in the meantime!" + ) + pass pass # ============================================= From 66e0453ea85a33132c2e9b6c616726cc4bc0b0f1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:19:32 -0700 Subject: [PATCH 057/147] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 96eb5035e..5bc2983a2 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -96,7 +96,7 @@ def fix_prepare_inputs_for_generation(module): pass pass - +torch_matmul = torch.matmul def LlamaAttention_fast_forward_inference( self, hidden_states: torch.Tensor, @@ -238,10 +238,10 @@ def LlamaAttention_fast_forward_inference( if bsz == 1: Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows - A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) + A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype) - A = torch.matmul(A, Vnn, out = Qn) + A = torch_matmul(A, Vnn, out = Qn) else: A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False) pass From 7fccd21d9f1388ca51063455c4ffae8e7c06720c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:28:20 -0700 Subject: [PATCH 058/147] Update loader.py --- unsloth/models/loader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 85416b81b..6b83b8e73 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -95,6 +95,9 @@ def from_pretrained( model_name = _get_model_name(model_name, load_in_4bit) # First check if it's a normal model via AutoConfig + from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled + was_disabled = are_progress_bars_disabled() + disable_progress_bars() try: model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_model = True @@ -129,6 +132,8 @@ def from_pretrained( model_config = AutoConfig.from_pretrained(model_name, token = token, 
revision = revision) pass + if not was_disabled: enable_progress_bars() + model_type = model_config.model_type if model_type == "llama": From 9e1ad7c319e4b6d7412d5f9a104abceef29a7247 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:29:12 -0700 Subject: [PATCH 059/147] Update _utils.py --- unsloth/models/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 23297cc4a..9dc82f1e5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -404,7 +404,6 @@ def patch_tokenizer(model, tokenizer): # ============================================= # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16?? # For mixed precision, we need it to be in float32 not float16. -from packaging import Version from peft import __version__ if Version(__version__) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer From 8e5054bbea23cb91628cfe8923696806ca4a6274 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:33:38 -0700 Subject: [PATCH 060/147] Update _utils.py --- unsloth/models/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 9dc82f1e5..5a267a459 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -404,8 +404,8 @@ def patch_tokenizer(model, tokenizer): # ============================================= # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16?? # For mixed precision, we need it to be in float32 not float16. -from peft import __version__ -if Version(__version__) < Version("0.12.0"): +from peft import __version__ as peft_version +if Version(peft_version) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer import inspect, re try: From fd753fed99ed5f10ef8a9b7139588d9de9ddecfb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 08:53:21 -0700 Subject: [PATCH 061/147] Update llama.py --- unsloth/models/llama.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 5bc2983a2..bc434ecf1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2063,9 +2063,9 @@ def patch_peft_model( (getattr(gate_proj, "base_layer", gate_proj).bias is None) and \ (getattr( up_proj, "base_layer", up_proj).bias is None) and \ (getattr(down_proj, "base_layer", down_proj).bias is None) and \ - (len(getattr(gate_proj, "lora_magnitude_vector", [])) == 0) and \ - (len(getattr( up_proj, "lora_magnitude_vector", [])) == 0) and \ - (len(getattr(down_proj, "lora_magnitude_vector", [])) == 0): + (len(getattr(gate_proj, "lora_magnitude_vector", []) or []) == 0) and \ + (len(getattr( up_proj, "lora_magnitude_vector", []) or []) == 0) and \ + (len(getattr(down_proj, "lora_magnitude_vector", []) or []) == 0): # https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp) @@ -2087,9 +2087,9 @@ def patch_peft_model( (getattr(q_proj, "base_layer", q_proj).bias is None) and \ (getattr(k_proj, "base_layer", k_proj).bias is None) and \ (getattr(v_proj, "base_layer", v_proj).bias is None) and \ - (len(getattr(q_proj, "lora_magnitude_vector", [])) == 0) and \ - (len(getattr(k_proj, "lora_magnitude_vector", [])) == 0) and \ - (len(getattr(v_proj, "lora_magnitude_vector", [])) == 0): + (len(getattr(q_proj, "lora_magnitude_vector", []) or []) == 0) and \ + (len(getattr(k_proj, 
"lora_magnitude_vector", []) or []) == 0) and \ + (len(getattr(v_proj, "lora_magnitude_vector", []) or []) == 0): layer.self_attn.apply_qkv = apply_lora_qkv n_qkv += 1 @@ -2106,7 +2106,7 @@ def patch_peft_model( o_proj = layer.self_attn.o_proj if hasattr(o_proj, "lora_A") and \ (getattr(o_proj, "base_layer", o_proj).bias is None) and \ - (len(getattr(o_proj, "lora_magnitude_vector", [])) == 0): + (len(getattr(o_proj, "lora_magnitude_vector", []) or []) == 0): layer.self_attn.apply_o = apply_lora_o n_o += 1 From 01c35f9e17cf455e97f7ce6cf55ecd653363433f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Jul 2024 16:31:40 -0700 Subject: [PATCH 062/147] Update __init__.py --- unsloth/__init__.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 464068154..265d08c90 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -60,6 +60,37 @@ "We have some installation instructions on our Github page.") pass +# ============================================= +# Check if Unsloth's model list has been updated +import os, requests, inspect, re +import numpy as np +import subprocess + +try: + file_location = inspect.getfile(torch) + package, _ = os.path.split(file_location) + dist_packages, package = os.path.split(package) + old_mapper = os.path.join(dist_packages, "unsloth", "models", "mapper.py") + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with open(old_mapper, "r") as old_mapper: old_mapper = old_mapper.read() + with requests.get(new_mapper) as new_mapper: new_mapper = new_mapper.text + old_mapper = re.findall(r'\"unsloth\/([^\"]{1,})\-bnb\-4bit\" \: \(', old_mapper) + new_mapper = re.findall(r'\"unsloth\/([^\"]{1,})\-bnb\-4bit\" \: \(', new_mapper) + new_models = list(frozenset(new_mapper) - frozenset(old_mapper)) + + print(1) + if len(new_models) != 0: + warnings.warn( + f"Unsloth: Some new models including {new_models} have dropped!\n"\ + "If you want to try them out, please update Unsloth via:\n\n" + 'pip install --upgrade --force-reinstall --no-cache-dir \\\n "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' + ) + pass + del new_models, old_mapper, dist_packages, package, file_location +except: + pass +# ============================================= + # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: @@ -103,11 +134,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 except: pass else: from triton.common.build import libcuda_dirs -import os -import re -import numpy as np -import subprocess - try: cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32 libcuda_dirs() From 08379f8a9cc0448b13897ffbf0897ad01f7549dc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 20:30:32 -0700 Subject: [PATCH 063/147] Edits --- pyproject.toml | 4 ++-- unsloth/__init__.py | 29 +---------------------------- unsloth/models/llama.py | 3 ++- unsloth/models/loader.py | 6 +++--- 4 files changed, 8 insertions(+), 34 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 829b35ad3..3335a7593 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ exclude = ["images*"] huggingface = [ "packaging", "tyro", - "transformers>=4.43.1", + "transformers>=4.43.2", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -188,7 +188,7 @@ colab-ampere-torch220 = [ 
colab-new = [ "packaging", "tyro", - "transformers>=4.43.1", + "transformers>=4.43.2", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 265d08c90..db54c9a16 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -60,37 +60,10 @@ "We have some installation instructions on our Github page.") pass -# ============================================= -# Check if Unsloth's model list has been updated -import os, requests, inspect, re +import os, re import numpy as np import subprocess -try: - file_location = inspect.getfile(torch) - package, _ = os.path.split(file_location) - dist_packages, package = os.path.split(package) - old_mapper = os.path.join(dist_packages, "unsloth", "models", "mapper.py") - new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" - with open(old_mapper, "r") as old_mapper: old_mapper = old_mapper.read() - with requests.get(new_mapper) as new_mapper: new_mapper = new_mapper.text - old_mapper = re.findall(r'\"unsloth\/([^\"]{1,})\-bnb\-4bit\" \: \(', old_mapper) - new_mapper = re.findall(r'\"unsloth\/([^\"]{1,})\-bnb\-4bit\" \: \(', new_mapper) - new_models = list(frozenset(new_mapper) - frozenset(old_mapper)) - - print(1) - if len(new_models) != 0: - warnings.warn( - f"Unsloth: Some new models including {new_models} have dropped!\n"\ - "If you want to try them out, please update Unsloth via:\n\n" - 'pip install --upgrade --force-reinstall --no-cache-dir \\\n "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' - ) - pass - del new_models, old_mapper, dist_packages, package, file_location -except: - pass -# ============================================= - # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index bc434ecf1..d36353999 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -18,6 +18,7 @@ from ._utils import * from ._utils import __version__ from torch.nn.functional import scaled_dot_product_attention +from transformers import __version__ as transformers_version from transformers.models.llama.modeling_llama import ( logger, BaseModelOutputWithPast, @@ -1281,7 +1282,7 @@ def from_pretrained( max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) statistics = \ - f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ + f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}\n"\ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. 
FA2 = {HAS_FLASH_ATTENTION}]\n"\ diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6b83b8e73..ecf871d5d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -27,7 +27,7 @@ SUPPORTS_FOURBIT = transformers_version >= Version("4.37") SUPPORTS_GEMMA = transformers_version >= Version("4.38") SUPPORTS_GEMMA2 = transformers_version >= Version("4.42") -SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1") +SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.2") if SUPPORTS_GEMMA: from .gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -147,8 +147,8 @@ def from_pretrained( if scaling_type == "llama3" and not SUPPORTS_LLAMA31: raise ImportError( f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\ - f"The minimum required version is 4.43.1\n"\ - f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\ + f"The minimum required version is 4.43.2\n"\ + f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\ f"to obtain the latest transformers build, then restart this session."\ ) dispatch_model = FastLlamaModel From f6c2b4aa7d99b16e43bc165fd75b10970e766af1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:16:33 -0700 Subject: [PATCH 064/147] Checks --- pyproject.toml | 6 ++- unsloth/models/_utils.py | 22 ++++++++++- unsloth/models/llama.py | 2 +- unsloth/models/loader.py | 81 +++++++++++++++++++++++++++++++++------- 4 files changed, 92 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3335a7593..6777f7c26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,8 @@ huggingface = [ "trl>=0.7.9,<0.9.0", "peft>=0.7.1,!=0.11.0", "protobuf<4.0.0", - "huggingface_hub[hf_transfer]", + "huggingface_hub", + "hf-transfer", ] cu118only = [ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", @@ -196,7 +197,8 @@ colab-new = [ "wheel>=0.42.0", "numpy", "protobuf<4.0.0", - "huggingface_hub[hf_transfer]", + "huggingface_hub", + "hf-transfer", ] colab-no-deps = [ "accelerate>=0.26.1", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5a267a459..a3263f85a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -122,7 +122,8 @@ def patch_mistral_nemo_config(config): # ============================================= # torch.cuda.amp.custom_fwd is deprecated >= 2.4 import torch -if Version(torch.__version__) < Version("2.4.0"): +torch_version = torch.__version__ +if Version(torch_version) < Version("2.4.0"): torch_amp_custom_fwd = torch.cuda.amp.custom_fwd torch_amp_custom_bwd = torch.cuda.amp.custom_bwd else: @@ -184,7 +185,7 @@ def patch_mistral_nemo_config(config): # Check TRL version from trl import __version__ as trl_version -if Version(xformers_version) >= Version("0.9.0"): +if Version(trl_version) >= Version("0.9.0"): raise ImportError( "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ "then press Disconnect Runtime and then Restart it.\n"\ @@ -199,7 +200,24 @@ def patch_mistral_nemo_config(config): ) pass +# Confirm versions # ============================================= +if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.24 for torch = {torch_version}." 
+ ) +elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.26 for torch = {torch_version}." + ) +elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.27 for torch = {torch_version}." + ) +pass # ============================================= # Torch compile settings diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d36353999..6b16a4cc6 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1282,7 +1282,7 @@ def from_pretrained( max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) statistics = \ - f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}\n"\ + f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}.\n"\ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\ diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ecf871d5d..2a7fa75fe 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -35,9 +35,14 @@ pass -def _get_model_name(model_name, load_in_4bit = True): +def __get_model_name( + model_name, + load_in_4bit = True, + INT_TO_FLOAT_MAPPER = None, + FLOAT_TO_INT_MAPPER = None, +): - if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER: + if not SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ @@ -46,25 +51,71 @@ def _get_model_name(model_name, load_in_4bit = True): f"to obtain the latest transformers build, then restart this session.\n"\ f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)." ) + return model_name - elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER: + elif not load_in_4bit and model_name.lower() in INT_TO_FLOAT_MAPPER: new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." # ) - model_name = new_model_name + return new_model_name - elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER: + elif load_in_4bit and SUPPORTS_FOURBIT and model_name.lower() in FLOAT_TO_INT_MAPPER: new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ # f"We shall load `{new_model_name}` for 4x faster loading." 
# ) - model_name = new_model_name + return new_model_name pass - return model_name + return None +pass + + +def _get_new_mapper(): + try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + exec(new_mapper, locals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + except: + return {}, {} + pass +pass + + +def _get_model_name(model_name, load_in_4bit = True): + new_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + ) + if new_model_name is None and \ + model_name.count("/") == 1 and \ + model_name[0].isalnum(): + # Try checking if a new Unsloth version allows it! + NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() + upgraded_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + ) + if upgraded_model_name is not None: + raise NotImplementedError( + f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ + 'pip install --upgrade --force-reinstall --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' + ) + pass + pass + return new_model_name if new_model_name is not None else model_name pass @@ -98,16 +149,22 @@ def from_pretrained( from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled was_disabled = are_progress_bars_disabled() disable_progress_bars() + + autoconfig_error = None + peft_error = None try: model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_model = True - except: + except Exception as autoconfig_error: + autoconfig_error = str(autoconfig_error) is_model = False try: peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision) is_peft = True - except: + except Exception as peft_error: + peft_error = str(peft_error) is_peft = False + pass # Cannot be both! if is_model and is_peft: @@ -118,11 +175,7 @@ def from_pretrained( "Please separate the LoRA and base models to 2 repos." ) elif not is_model and not is_peft: - raise RuntimeError( - f"Unsloth: `{model_name}` is not a base model or a PEFT model.\n"\ - "We could not locate a `config.json` or `adapter_config.json` file.\n"\ - "Are you certain the model name is correct? Does it actually exist?" 
- ) + raise RuntimeError(autoconfig_error or peft_error) pass # Get base model for PEFT: From 78fa9d058db4f039183da7a56e81a5cc4dfe289f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:30:48 -0700 Subject: [PATCH 065/147] Update _utils.py --- unsloth/models/_utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a3263f85a..04ffd2062 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -135,35 +135,36 @@ def patch_mistral_nemo_config(config): # ============================================= # Get Flash Attention v2 if Ampere (RTX 30xx, A100) import bitsandbytes as bnb -from transformers.models.llama.modeling_llama import logger from transformers import AutoTokenizer +from transformers.utils.import_utils import _is_package_available major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = False if major_version >= 8: SUPPORTS_BFLOAT16 = True - try: - from flash_attn import flash_attn_func + if _is_package_available("flash_attn"): # Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl" try: from flash_attn.flash_attn_interface import flash_attn_cuda HAS_FLASH_ATTENTION = True except: - logger.warning_once( + print( "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\ "A possible explanation is you have a new CUDA version which isn't\n"\ "yet compatible with FA2? Please file a ticket to Unsloth or FA2.\n"\ - "We shall now use Xformers instead, which gets a 0.01% performance hit.\n"\ + "We shall now use Xformers instead, which does not have any performance hits!\n"\ "We found this negligible impact by benchmarking on 1x A100." ) HAS_FLASH_ATTENTION = False - except: + else: HAS_FLASH_ATTENTION = False else: # Tri Dao's benchmark shows xformers is faster for now. HAS_FLASH_ATTENTION = False pass + +from transformers.models.llama.modeling_llama import logger import xformers.ops.fmha as xformers xformers_attention = xformers.memory_efficient_attention from xformers import __version__ as xformers_version From 4ea5789db6d601e452a7a701fe5ac53669876856 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:32:39 -0700 Subject: [PATCH 066/147] Update _utils.py --- unsloth/models/_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 04ffd2062..74dfe6987 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -156,7 +156,15 @@ def patch_mistral_nemo_config(config): "We shall now use Xformers instead, which does not have any performance hits!\n"\ "We found this negligible impact by benchmarking on 1x A100." ) + + # Stop Flash Attention from importing! 
+ import transformers.utils.import_utils + transformers.utils.import_utils.is_flash_attn_2_available = lambda *args, **kwargs: False + import transformers.utils + transformers.utils.is_flash_attn_2_available = lambda *args, **kwargs: False + HAS_FLASH_ATTENTION = False + pass else: HAS_FLASH_ATTENTION = False else: From aab503ac80e2032805854c2ff211cdc2a4b23b3f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:35:24 -0700 Subject: [PATCH 067/147] Update loader.py --- unsloth/models/loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 2a7fa75fe..615b8286f 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -111,7 +111,8 @@ def _get_model_name(model_name, load_in_4bit = True): if upgraded_model_name is not None: raise NotImplementedError( f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ - 'pip install --upgrade --force-reinstall --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' + 'pip uninstall unsloth -y\n'\ + 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' ) pass pass From b310f5e06a6b00e18d66324c53a686626d0b9040 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:48:52 -0700 Subject: [PATCH 068/147] Update _utils.py --- unsloth/models/_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 74dfe6987..8435c834f 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -173,6 +173,22 @@ def patch_mistral_nemo_config(config): pass from transformers.models.llama.modeling_llama import logger + +# ============================================= +# Get Xformers +from xformers._cpp_lib import _register_extensions +try: + _register_extensions() # Check if C++ modules are loaded correctly +except Exception as error: + raise ImportError( + "Unsloth: Xformers was not installed correctly. "\ + "Please install xformers separately first. 
"\ + "Then confirm if it's correctly installed by running:\n"\ + "python -m xformers.info\n\n" + "Longer error message:\n" + str(error) + ) +pass + import xformers.ops.fmha as xformers xformers_attention = xformers.memory_efficient_attention from xformers import __version__ as xformers_version From 1f6705673883efbed5373d9bc845d56f40fd6dc9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:50:38 -0700 Subject: [PATCH 069/147] Update mapper.py --- unsloth/models/mapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462555f31..a3191392c 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,10 +218,10 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), - "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( - "unsloth/Meta-Llama-3.1-8B", - "meta-llama/Meta-Llama-3.1-8B", - ), + # "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( + # "unsloth/Meta-Llama-3.1-8B", + # "meta-llama/Meta-Llama-3.1-8B", + # ), "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : ( "unsloth/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct", From a8e7556fc6aaac537f67d418b2c862ac349e66cd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:52:06 -0700 Subject: [PATCH 070/147] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 615b8286f..fc356c3c1 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -97,6 +97,7 @@ def _get_model_name(model_name, load_in_4bit = True): INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, ) + print(new_model_name) if new_model_name is None and \ model_name.count("/") == 1 and \ model_name[0].isalnum(): @@ -108,6 +109,7 @@ def _get_model_name(model_name, load_in_4bit = True): INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, ) + print(upgraded_model_name) if upgraded_model_name is not None: raise NotImplementedError( f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ From 38b5c77635f7a0febb694a45d41489c6c827f564 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:54:49 -0700 Subject: [PATCH 071/147] Update loader.py --- unsloth/models/loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fc356c3c1..f06515e62 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -98,11 +98,11 @@ def _get_model_name(model_name, load_in_4bit = True): FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, ) print(new_model_name) - if new_model_name is None and \ - model_name.count("/") == 1 and \ - model_name[0].isalnum(): + if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): # Try checking if a new Unsloth version allows it! 
NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() + print(NEW_INT_TO_FLOAT_MAPPER) + print(NEW_FLOAT_TO_INT_MAPPER) upgraded_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, From 4a8c9e88249ceabfd955c79c9eacef221e0ef175 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:56:59 -0700 Subject: [PATCH 072/147] Update _utils.py --- unsloth/models/_utils.py | 66 +++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 8435c834f..994f97ab7 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -176,21 +176,6 @@ def patch_mistral_nemo_config(config): # ============================================= # Get Xformers -from xformers._cpp_lib import _register_extensions -try: - _register_extensions() # Check if C++ modules are loaded correctly -except Exception as error: - raise ImportError( - "Unsloth: Xformers was not installed correctly. "\ - "Please install xformers separately first. "\ - "Then confirm if it's correctly installed by running:\n"\ - "python -m xformers.info\n\n" - "Longer error message:\n" + str(error) - ) -pass - -import xformers.ops.fmha as xformers -xformers_attention = xformers.memory_efficient_attention from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues if Version(xformers_version) >= Version("0.0.27"): @@ -208,25 +193,6 @@ def patch_mistral_nemo_config(config): ) pass -# Check TRL version -from trl import __version__ as trl_version -if Version(trl_version) >= Version("0.9.0"): - raise ImportError( - "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ - "then press Disconnect Runtime and then Restart it.\n"\ - "\n"\ - "%%capture\n" - "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" - '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ - '\n'\ - f"Otherwise in local machines, your TRL version of {trl_version} is too new.\n"\ - 'Please downgrade TRL via `pip install --force-reinstall "trl<0.9.0"' - ) -pass - -# Confirm versions -# ============================================= if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): raise ImportError( f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ @@ -244,6 +210,38 @@ def patch_mistral_nemo_config(config): ) pass +from xformers._cpp_lib import _register_extensions +try: + _register_extensions() # Check if C++ modules are loaded correctly +except Exception as error: + raise ImportError( + "Unsloth: Xformers was not installed correctly.\n"\ + "Please install xformers separately first.\n"\ + "Then confirm if it's correctly installed by running:\n"\ + "python -m xformers.info\n\n" + "Longer error message:\n" + str(error) + ) +pass +import xformers.ops.fmha as xformers +xformers_attention = xformers.memory_efficient_attention + +# Check TRL version +from trl import __version__ as trl_version +if Version(trl_version) >= Version("0.9.0"): + raise ImportError( + "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ + "then press Disconnect Runtime and then Restart it.\n"\ + "\n"\ + "%%capture\n" + "# Installs Unsloth, Xformers (Flash Attention) and all other 
packages!\n" + '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' + '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ + '\n'\ + f"Otherwise in local machines, your TRL version of {trl_version} is too new.\n"\ + 'Please downgrade TRL via `pip install --force-reinstall "trl<0.9.0"' + ) +pass + # ============================================= # Torch compile settings From c03fd228a240eb97d6ee386668c30b8352779935 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:58:15 -0700 Subject: [PATCH 073/147] Update loader.py --- unsloth/models/loader.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f06515e62..21e58bb3d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -75,17 +75,17 @@ def __get_model_name( def _get_new_mapper(): - try: - import requests - new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" - with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text - new_mapper = new_mapper\ - .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ - .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") - exec(new_mapper, locals()) - return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER - except: - return {}, {} + # try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + exec(new_mapper, locals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + # except: + # return {}, {} pass pass From 9d5195296f7dcaac51519659fda13ee4027b2b1a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:59:16 -0700 Subject: [PATCH 074/147] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 21e58bb3d..074e83739 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -82,7 +82,7 @@ def _get_new_mapper(): new_mapper = new_mapper\ .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") - exec(new_mapper, locals()) + exec(new_mapper) return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER # except: # return {}, {} From ddc2dde97c2d234e75db032e41c7fafe2b6f384f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:59:47 -0700 Subject: [PATCH 075/147] Update loader.py --- unsloth/models/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 074e83739..9f2f53aa8 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -79,6 +79,7 @@ def _get_new_mapper(): import requests new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] new_mapper = new_mapper\ .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") From b6ef70963011888c679ceaa6a929ce2cb32be1c7 Mon Sep 17 00:00:00 2001 
From: Daniel Han Date: Sat, 27 Jul 2024 23:00:47 -0700 Subject: [PATCH 076/147] Update loader.py --- unsloth/models/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 9f2f53aa8..6bfbf0f28 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -84,6 +84,7 @@ def _get_new_mapper(): .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") exec(new_mapper) + print(new_mapper) return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER # except: # return {}, {} From ed8bc007c6beef621941a4ee5882c6c55c0f1df5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 23:02:10 -0700 Subject: [PATCH 077/147] Update loader.py --- unsloth/models/loader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6bfbf0f28..f321f23ed 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -83,8 +83,7 @@ def _get_new_mapper(): new_mapper = new_mapper\ .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") - exec(new_mapper) - print(new_mapper) + exec(new_mapper, globals()) return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER # except: # return {}, {} From e1d61ce5d9f4c88af7d8c0c5d4ad9ca65b9e9327 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 23:03:13 -0700 Subject: [PATCH 078/147] Update loader.py --- unsloth/models/loader.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f321f23ed..15d3f952b 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -75,18 +75,18 @@ def __get_model_name( def _get_new_mapper(): - # try: - import requests - new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" - with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text - new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] - new_mapper = new_mapper\ - .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ - .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") - exec(new_mapper, globals()) - return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER - # except: - # return {}, {} + try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + exec(new_mapper, globals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + except: + return {}, {} pass pass @@ -102,15 +102,12 @@ def _get_model_name(model_name, load_in_4bit = True): if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): # Try checking if a new Unsloth version allows it! 
NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() - print(NEW_INT_TO_FLOAT_MAPPER) - print(NEW_FLOAT_TO_INT_MAPPER) upgraded_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, ) - print(upgraded_model_name) if upgraded_model_name is not None: raise NotImplementedError( f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ From 858e1a2a8c23dc08e6bbd17c841e5a7b133cee06 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 23:03:29 -0700 Subject: [PATCH 079/147] Update mapper.py --- unsloth/models/mapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index a3191392c..462555f31 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,10 +218,10 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), - # "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( - # "unsloth/Meta-Llama-3.1-8B", - # "meta-llama/Meta-Llama-3.1-8B", - # ), + "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B", + "meta-llama/Meta-Llama-3.1-8B", + ), "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : ( "unsloth/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct", From ea0a49448dca64808734cff797e352a69236343c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 23:04:23 -0700 Subject: [PATCH 080/147] Update loader.py --- unsloth/models/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 15d3f952b..e2bfe1d63 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -98,7 +98,6 @@ def _get_model_name(model_name, load_in_4bit = True): INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, ) - print(new_model_name) if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): # Try checking if a new Unsloth version allows it! 
NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() From a7bfbe7927ea75f959e1d7c84e7bf50945d405ff Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 28 Jul 2024 00:10:02 -0700 Subject: [PATCH 081/147] Better debugging (#826) * Update __init__.py * Edits * Checks * Update _utils.py * Update _utils.py * Update loader.py * Update _utils.py * Update mapper.py * Update loader.py * Update loader.py * Update _utils.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update loader.py --- pyproject.toml | 10 +++-- unsloth/__init__.py | 9 ++--- unsloth/models/_utils.py | 65 ++++++++++++++++++++++++------ unsloth/models/llama.py | 3 +- unsloth/models/loader.py | 87 ++++++++++++++++++++++++++++++++-------- 5 files changed, 135 insertions(+), 39 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 829b35ad3..6777f7c26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ exclude = ["images*"] huggingface = [ "packaging", "tyro", - "transformers>=4.43.1", + "transformers>=4.43.2", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -46,7 +46,8 @@ huggingface = [ "trl>=0.7.9,<0.9.0", "peft>=0.7.1,!=0.11.0", "protobuf<4.0.0", - "huggingface_hub[hf_transfer]", + "huggingface_hub", + "hf-transfer", ] cu118only = [ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", @@ -188,7 +189,7 @@ colab-ampere-torch220 = [ colab-new = [ "packaging", "tyro", - "transformers>=4.43.1", + "transformers>=4.43.2", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -196,7 +197,8 @@ colab-new = [ "wheel>=0.42.0", "numpy", "protobuf<4.0.0", - "huggingface_hub[hf_transfer]", + "huggingface_hub", + "hf-transfer", ] colab-no-deps = [ "accelerate>=0.26.1", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 464068154..db54c9a16 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -60,6 +60,10 @@ "We have some installation instructions on our Github page.") pass +import os, re +import numpy as np +import subprocess + # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: @@ -103,11 +107,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 except: pass else: from triton.common.build import libcuda_dirs -import os -import re -import numpy as np -import subprocess - try: cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32 libcuda_dirs() diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5a267a459..994f97ab7 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -122,7 +122,8 @@ def patch_mistral_nemo_config(config): # ============================================= # torch.cuda.amp.custom_fwd is deprecated >= 2.4 import torch -if Version(torch.__version__) < Version("2.4.0"): +torch_version = torch.__version__ +if Version(torch_version) < Version("2.4.0"): torch_amp_custom_fwd = torch.cuda.amp.custom_fwd torch_amp_custom_bwd = torch.cuda.amp.custom_bwd else: @@ -134,37 +135,47 @@ def patch_mistral_nemo_config(config): # ============================================= # Get Flash Attention v2 if Ampere (RTX 30xx, A100) import bitsandbytes as bnb -from transformers.models.llama.modeling_llama import logger from transformers import AutoTokenizer +from transformers.utils.import_utils import _is_package_available 
major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = False if major_version >= 8: SUPPORTS_BFLOAT16 = True - try: - from flash_attn import flash_attn_func + if _is_package_available("flash_attn"): # Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl" try: from flash_attn.flash_attn_interface import flash_attn_cuda HAS_FLASH_ATTENTION = True except: - logger.warning_once( + print( "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\ "A possible explanation is you have a new CUDA version which isn't\n"\ "yet compatible with FA2? Please file a ticket to Unsloth or FA2.\n"\ - "We shall now use Xformers instead, which gets a 0.01% performance hit.\n"\ + "We shall now use Xformers instead, which does not have any performance hits!\n"\ "We found this negligible impact by benchmarking on 1x A100." ) + + # Stop Flash Attention from importing! + import transformers.utils.import_utils + transformers.utils.import_utils.is_flash_attn_2_available = lambda *args, **kwargs: False + import transformers.utils + transformers.utils.is_flash_attn_2_available = lambda *args, **kwargs: False + HAS_FLASH_ATTENTION = False - except: + pass + else: HAS_FLASH_ATTENTION = False else: # Tri Dao's benchmark shows xformers is faster for now. HAS_FLASH_ATTENTION = False pass -import xformers.ops.fmha as xformers -xformers_attention = xformers.memory_efficient_attention + +from transformers.models.llama.modeling_llama import logger + +# ============================================= +# Get Xformers from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues if Version(xformers_version) >= Version("0.0.27"): @@ -182,9 +193,41 @@ def patch_mistral_nemo_config(config): ) pass +if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.24 for torch = {torch_version}." + ) +elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.26 for torch = {torch_version}." + ) +elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.27 for torch = {torch_version}." 
+ ) +pass + +from xformers._cpp_lib import _register_extensions +try: + _register_extensions() # Check if C++ modules are loaded correctly +except Exception as error: + raise ImportError( + "Unsloth: Xformers was not installed correctly.\n"\ + "Please install xformers separately first.\n"\ + "Then confirm if it's correctly installed by running:\n"\ + "python -m xformers.info\n\n" + "Longer error message:\n" + str(error) + ) +pass +import xformers.ops.fmha as xformers +xformers_attention = xformers.memory_efficient_attention + # Check TRL version from trl import __version__ as trl_version -if Version(xformers_version) >= Version("0.9.0"): +if Version(trl_version) >= Version("0.9.0"): raise ImportError( "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ "then press Disconnect Runtime and then Restart it.\n"\ @@ -199,8 +242,6 @@ def patch_mistral_nemo_config(config): ) pass -# ============================================= - # ============================================= # Torch compile settings diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index bc434ecf1..6b16a4cc6 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -18,6 +18,7 @@ from ._utils import * from ._utils import __version__ from torch.nn.functional import scaled_dot_product_attention +from transformers import __version__ as transformers_version from transformers.models.llama.modeling_llama import ( logger, BaseModelOutputWithPast, @@ -1281,7 +1282,7 @@ def from_pretrained( max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) statistics = \ - f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ + f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}.\n"\ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\ diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6b83b8e73..e2bfe1d63 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -27,7 +27,7 @@ SUPPORTS_FOURBIT = transformers_version >= Version("4.37") SUPPORTS_GEMMA = transformers_version >= Version("4.38") SUPPORTS_GEMMA2 = transformers_version >= Version("4.42") -SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1") +SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.2") if SUPPORTS_GEMMA: from .gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -35,9 +35,14 @@ pass -def _get_model_name(model_name, load_in_4bit = True): +def __get_model_name( + model_name, + load_in_4bit = True, + INT_TO_FLOAT_MAPPER = None, + FLOAT_TO_INT_MAPPER = None, +): - if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER: + if not SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ @@ -46,25 +51,71 @@ def _get_model_name(model_name, load_in_4bit = True): f"to obtain the latest transformers build, then restart this session.\n"\ f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)." 
) + return model_name - elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER: + elif not load_in_4bit and model_name.lower() in INT_TO_FLOAT_MAPPER: new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." # ) - model_name = new_model_name + return new_model_name - elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER: + elif load_in_4bit and SUPPORTS_FOURBIT and model_name.lower() in FLOAT_TO_INT_MAPPER: new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ # f"We shall load `{new_model_name}` for 4x faster loading." # ) - model_name = new_model_name + return new_model_name pass - return model_name + return None +pass + + +def _get_new_mapper(): + try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + exec(new_mapper, globals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + except: + return {}, {} + pass +pass + + +def _get_model_name(model_name, load_in_4bit = True): + new_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + ) + if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): + # Try checking if a new Unsloth version allows it! + NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() + upgraded_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + ) + if upgraded_model_name is not None: + raise NotImplementedError( + f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ + 'pip uninstall unsloth -y\n'\ + 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' + ) + pass + pass + return new_model_name if new_model_name is not None else model_name pass @@ -98,16 +149,22 @@ def from_pretrained( from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled was_disabled = are_progress_bars_disabled() disable_progress_bars() + + autoconfig_error = None + peft_error = None try: model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_model = True - except: + except Exception as autoconfig_error: + autoconfig_error = str(autoconfig_error) is_model = False try: peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision) is_peft = True - except: + except Exception as peft_error: + peft_error = str(peft_error) is_peft = False + pass # Cannot be both! if is_model and is_peft: @@ -118,11 +175,7 @@ def from_pretrained( "Please separate the LoRA and base models to 2 repos." 
) elif not is_model and not is_peft: - raise RuntimeError( - f"Unsloth: `{model_name}` is not a base model or a PEFT model.\n"\ - "We could not locate a `config.json` or `adapter_config.json` file.\n"\ - "Are you certain the model name is correct? Does it actually exist?" - ) + raise RuntimeError(autoconfig_error or peft_error) pass # Get base model for PEFT: @@ -147,8 +200,8 @@ def from_pretrained( if scaling_type == "llama3" and not SUPPORTS_LLAMA31: raise ImportError( f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\ - f"The minimum required version is 4.43.1\n"\ - f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\ + f"The minimum required version is 4.43.2\n"\ + f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\ f"to obtain the latest transformers build, then restart this session."\ ) dispatch_model = FastLlamaModel From 18900721c2a3ac7f95d228d8fb41b2c3bfb6f869 Mon Sep 17 00:00:00 2001 From: XiaoYang Date: Wed, 31 Jul 2024 01:15:09 +0800 Subject: [PATCH 082/147] fix UnboundLocalError (#834) * When an exception has been assigned using as target, it is cleared at the end of the except clause.(https://docs.python.org/3/reference/compound_stmts.html#the-try-statement) * Update loader.py --------- Co-authored-by: xiaoyang Co-authored-by: Daniel Han --- unsloth/models/loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e2bfe1d63..34deb8f9b 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -155,14 +155,14 @@ def from_pretrained( try: model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_model = True - except Exception as autoconfig_error: - autoconfig_error = str(autoconfig_error) + except Exception as error: + autoconfig_error = str(error) is_model = False try: peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision) is_peft = True - except Exception as peft_error: - peft_error = str(peft_error) + except Exception as error: + peft_error = str(error) is_peft = False pass From be0930d1f6d9a742e6971ba8e9206c04e87d16d6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 10:18:51 -0700 Subject: [PATCH 083/147] Update loader.py --- unsloth/models/loader.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 34deb8f9b..f22e81efa 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -175,6 +175,15 @@ def from_pretrained( "Please separate the LoRA and base models to 2 repos." ) elif not is_model and not is_peft: + error = autoconfig_error or peft_error + # Old transformers version + if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31: + raise ImportError( + f"Unsloth: Your transformers version of {transformers_version} does not support new RoPE scaling methods.\n"\ + f"This includes Llama 3.1. 
The minimum required version is 4.43.2\n"\ + f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\ + f"to obtain the latest transformers build, then restart this session."\ + ) raise RuntimeError(autoconfig_error or peft_error) pass From 4285d1b479d665b5f94136353ba2d8c3a73a789f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 10:29:54 -0700 Subject: [PATCH 084/147] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6b16a4cc6..496a37e7a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2012,7 +2012,8 @@ def patch_peft_model( model.peft_config[active_adapter].base_model_name_or_path = name pass # Add revision to enable future fast inference paths - model.peft_config[active_adapter].revision = f"unsloth" + # [TODO] Bugs out!see https://github.com/unslothai/unsloth/issues/492 + # model.peft_config[active_adapter].revision = f"unsloth" pass from transformers.trainer import Trainer From 42e09d192fb3d8a6e7b96563c0047fdd19585219 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 19:56:36 -0700 Subject: [PATCH 085/147] bugs --- unsloth/models/_utils.py | 28 ++++++++++++++-------------- unsloth/models/loader.py | 1 + unsloth/models/mapper.py | 1 + 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 994f97ab7..8677879aa 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -178,20 +178,20 @@ def patch_mistral_nemo_config(config): # Get Xformers from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues -if Version(xformers_version) >= Version("0.0.27"): - raise ImportError( - "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ - "then press Disconnect Runtime and then Restart it.\n"\ - "\n"\ - "%%capture\n" - "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" - '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ - '\n'\ - f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ - 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' - ) -pass +# if Version(xformers_version) >= Version("0.0.27"): +# raise ImportError( +# "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ +# "then press Disconnect Runtime and then Restart it.\n"\ +# "\n"\ +# "%%capture\n" +# "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" +# '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' +# '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ +# '\n'\ +# f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ +# 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' +# ) +# pass if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): raise ImportError( diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f22e81efa..fb6d5c501 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -42,6 +42,7 @@ def __get_model_name( FLOAT_TO_INT_MAPPER = None, ): + model_name = str(model_name) if not 
SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462555f31..254a68a42 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -67,6 +67,7 @@ "codellama/CodeLlama-7b-hf", ), "unsloth/codellama-13b-bnb-4bit" : ( + "unsloth/codellama-13b", "codellama/CodeLlama-13b-hf", ), "unsloth/yi-6b-bnb-4bit" : ( From 79ef745c7b9f369644d9a740a2b8be29e9dad860 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 19:57:53 -0700 Subject: [PATCH 086/147] Update _utils.py --- unsloth/models/_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 8677879aa..f4e4257b2 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -203,12 +203,12 @@ def patch_mistral_nemo_config(config): f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ f"Please install xformers < 0.0.26 for torch = {torch_version}." ) -elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): - raise ImportError( - f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers < 0.0.27 for torch = {torch_version}." - ) -pass +# elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): +# raise ImportError( +# f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ +# f"Please install xformers < 0.0.27 for torch = {torch_version}." +# ) +# pass from xformers._cpp_lib import _register_extensions try: From 9617ecbbb8bef7864961096f925f03e40ffa7f99 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 22:48:53 -0700 Subject: [PATCH 087/147] flash-attn softcapping --- pyproject.toml | 22 ++++++------- unsloth/models/_utils.py | 14 +++++++++ unsloth/models/gemma2.py | 68 +++++++++++++++++++++++++++++----------- unsloth/models/llama.py | 39 +++++++++++++---------- unsloth/models/loader.py | 16 ++++++++++ unsloth/models/mapper.py | 1 - 6 files changed, 113 insertions(+), 47 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6777f7c26..e711325be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,7 +171,7 @@ colab-ampere-torch211 = [ "unsloth[cu121onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] colab-torch220 = [ "unsloth[huggingface]", @@ -184,7 +184,7 @@ colab-ampere-torch220 = [ "unsloth[cu121onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] colab-new = [ "packaging", @@ -215,7 +215,7 @@ colab-ampere = [ "unsloth[colab-ampere-torch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere = [ "unsloth[huggingface]", @@ -223,7 +223,7 @@ cu118-ampere = [ "unsloth[cu118only]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere = [ "unsloth[huggingface]", @@ -231,7 +231,7 @@ cu121-ampere = [ "unsloth[cu121only]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch211 = [ "unsloth[huggingface]", @@ -239,7 +239,7 @@ cu118-ampere-torch211 = [ "unsloth[cu118onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch211 = [ "unsloth[huggingface]", @@ -247,7 +247,7 @@ cu121-ampere-torch211 = [ "unsloth[cu121onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] 
cu118-ampere-torch220 = [ "unsloth[huggingface]", @@ -255,7 +255,7 @@ cu118-ampere-torch220 = [ "unsloth[cu118onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch220 = [ "unsloth[huggingface]", @@ -263,7 +263,7 @@ cu121-ampere-torch220 = [ "unsloth[cu121onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch230 = [ "unsloth[huggingface]", @@ -271,7 +271,7 @@ cu118-ampere-torch230 = [ "unsloth[cu118onlytorch230]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch230 = [ "unsloth[huggingface]", @@ -279,7 +279,7 @@ cu121-ampere-torch230 = [ "unsloth[cu121onlytorch230]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] [project.urls] diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index f4e4257b2..c9bc6065f 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -21,6 +21,7 @@ "xformers_version", "__version__", "HAS_FLASH_ATTENTION", + "HAS_FLASH_ATTENTION_SOFTCAPPING", "PRE_CHECK", "platform_system", "patch_tokenizer", @@ -140,6 +141,8 @@ def patch_mistral_nemo_config(config): major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = False +HAS_FLASH_ATTENTION = False +HAS_FLASH_ATTENTION_SOFTCAPPING = False if major_version >= 8: SUPPORTS_BFLOAT16 = True @@ -148,6 +151,17 @@ def patch_mistral_nemo_config(config): try: from flash_attn.flash_attn_interface import flash_attn_cuda HAS_FLASH_ATTENTION = True + + # Also check for softcapping + from flash_attn import __version__ as flash_attn_version + HAS_FLASH_ATTENTION_SOFTCAPPING = Version(flash_attn_version) >= Version("2.6.3") + if not HAS_FLASH_ATTENTION_SOFTCAPPING: + print( + "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\ + "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\ + "To update flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) except: print( "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\ diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 0d21c47b0..ecd45fbce 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -56,6 +56,8 @@ Gemma2FlashAttention2 = Gemma2Attention pass +if HAS_FLASH_ATTENTION_SOFTCAPPING: + from flash_attn import flash_attn_func # [TODO] We must randomnly use torch.compile? # I checked the gradients and formulas and I'm sure it's correct. 
@@ -126,8 +128,31 @@ def Gemma2Attention_fast_forward( V = torch.cat([past_key_value[1], V], dim = 2) pass past_key_value = (K, V) if use_cache else None - - A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) + + # Only enable if the attention_mask is True + has_sliding_window = type(attention_mask) is bool and attention_mask is True + if HAS_FLASH_ATTENTION_SOFTCAPPING and type(attention_mask) is bool: + window = (-1, -1) + if has_sliding_window: + sw = getattr(self.config, "sliding_window", None) + sw = kv_seq_len if (sw is None or sw == "null") else sw + window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw) + pass + + Q = Q.transpose(1, 2) + K = K.transpose(1, 2) + V = V.transpose(1, 2) + A = flash_attn_func( + Q, K, V, + causal = True, + softcap = self.config.attn_logit_softcapping, + softmax_scale = self.config.query_pre_attn_scalar, + window_size = window, + ) + A = A.reshape(bsz, q_len, n_heads*head_dim) + else: + A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) + pass A = self.apply_o(self, A) return A, None, past_key_value pass @@ -205,6 +230,8 @@ def Gemma2DecoderLayer_fast_forward( from math import sqrt as math_sqrt KV_CACHE_INCREMENT = 256 # KV Cache update size torch_nn_functional_softmax = torch.nn.functional.softmax +torch_matmul = torch.matmul +torch_tanh = torch.tanh def Gemma2Attention_fast_forward_inference( self, @@ -322,13 +349,13 @@ def Gemma2Attention_fast_forward_inference( # if bsz == 1: Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows - A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) + A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched - A *= self.reciprocal_t; torch.tanh(A, out = A); A *= self.t; # Logit softcapping + A *= self.reciprocal_t; torch_tanh(A, out = A); A *= self.t; # Logit softcapping A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype) - A = torch.matmul(A, Vnn, out = Qn) + A = torch_matmul(A, Vnn, out = Qn) # else: # A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False) # pass @@ -359,19 +386,24 @@ def Gemma2Model_fast_forward_inference( bsz, q_len, hd = hidden_states.shape seq_len = past_key_values[0][0].shape[-2] if bsz != 1: - SWA = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (bsz, q_len), - hidden_states, - seq_len, - sliding_window = self.config.sliding_window, - ) - GA = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (bsz, q_len), - hidden_states, - seq_len, - ) + if HAS_FLASH_ATTENTION_SOFTCAPPING: + SWA = True + GA = False + else: + SWA = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (bsz, q_len), + hidden_states, + seq_len, + sliding_window = self.config.sliding_window, + ) + GA = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (bsz, q_len), + hidden_states, + seq_len, + ) + pass else: SWA = attention_mask GA = attention_mask diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 496a37e7a..b5244ed4e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -682,23 +682,28 @@ def LlamaModel_fast_forward( # Gemma2 has alternating SWA and global attn if IS_GEMMA2 and not hasattr(self, "SWA_mask"): - n = 
self.config.max_position_embeddings - # masked_fill is making stuff slower! - # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0) - # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) - from transformers.modeling_attn_mask_utils import AttentionMaskConverter - self.SWA_mask = AttentionMaskConverter( - is_causal = True, - sliding_window = self.config.sliding_window, - )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ - .squeeze(0).squeeze(0) - - self.GA_mask = AttentionMaskConverter( - is_causal = True, - )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ - .squeeze(0).squeeze(0) + if HAS_FLASH_ATTENTION_SOFTCAPPING: + self.SWA_mask = True + self.GA_mask = False + else: + n = self.config.max_position_embeddings + # masked_fill is making stuff slower! + # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0) + # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + self.SWA_mask = AttentionMaskConverter( + is_causal = True, + sliding_window = self.config.sliding_window, + )\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ + .squeeze(0).squeeze(0) + + self.GA_mask = AttentionMaskConverter( + is_causal = True, + )\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ + .squeeze(0).squeeze(0) + pass pass # Go through every layer! diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fb6d5c501..47152d676 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from ._utils import is_bfloat16_supported, HAS_FLASH_ATTENTION, HAS_FLASH_ATTENTION_SOFTCAPPING from .llama import FastLlamaModel, logger from .mistral import FastMistralModel from .qwen2 import FastQwen2Model @@ -233,6 +234,21 @@ def from_pretrained( f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\ f"to obtain the latest transformers build, then restart this session."\ ) + # Also check for softcapping support in flash-attn which is faster! 
+ if is_bfloat16_supported() and not HAS_FLASH_ATTENTION: + print( + "Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!\n"\ + "To install flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) + elif HAS_FLASH_ATTENTION and not HAS_FLASH_ATTENTION_SOFTCAPPING: + print( + "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\ + "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\ + "To update flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) + dispatch_model = FastGemma2Model elif model_type == "qwen2": dispatch_model = FastQwen2Model diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 254a68a42..462555f31 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -67,7 +67,6 @@ "codellama/CodeLlama-7b-hf", ), "unsloth/codellama-13b-bnb-4bit" : ( - "unsloth/codellama-13b", "codellama/CodeLlama-13b-hf", ), "unsloth/yi-6b-bnb-4bit" : ( From d326c988585d3c764bacefec0f92432c8a50e85a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:04:00 -0700 Subject: [PATCH 088/147] Update gemma2.py --- unsloth/models/gemma2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index ecd45fbce..a0880daef 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -131,6 +131,9 @@ def Gemma2Attention_fast_forward( # Only enable if the attention_mask is True has_sliding_window = type(attention_mask) is bool and attention_mask is True + print(HAS_FLASH_ATTENTION_SOFTCAPPING) + print(has_sliding_window) + print(attention_mask) if HAS_FLASH_ATTENTION_SOFTCAPPING and type(attention_mask) is bool: window = (-1, -1) if has_sliding_window: From 86b71c4ef5f90379b075d2ab97827b3c2537d501 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:07:47 -0700 Subject: [PATCH 089/147] Update gemma2.py --- unsloth/models/gemma2.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index a0880daef..2191a99c0 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -130,11 +130,8 @@ def Gemma2Attention_fast_forward( past_key_value = (K, V) if use_cache else None # Only enable if the attention_mask is True - has_sliding_window = type(attention_mask) is bool and attention_mask is True - print(HAS_FLASH_ATTENTION_SOFTCAPPING) - print(has_sliding_window) - print(attention_mask) - if HAS_FLASH_ATTENTION_SOFTCAPPING and type(attention_mask) is bool: + has_sliding_window = type(causal_mask) is bool and causal_mask is True + if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None: window = (-1, -1) if has_sliding_window: sw = getattr(self.config, "sliding_window", None) From cf1054c9bcc74bd659739f34444f46d8c79837cf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:11:23 -0700 Subject: [PATCH 090/147] Update gemma2.py --- unsloth/models/gemma2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 2191a99c0..d2bfb7899 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -139,6 +139,11 @@ def Gemma2Attention_fast_forward( window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw) pass + # FA uses 1 / sqrt for softmax_scale! 
+ if not hasattr(self, "_flash_attention_softmax_scale"): + self._flash_attention_softmax_scale = 1.0 / self.config.query_pre_attn_scalar**0.5 + pass + Q = Q.transpose(1, 2) K = K.transpose(1, 2) V = V.transpose(1, 2) @@ -146,7 +151,7 @@ def Gemma2Attention_fast_forward( Q, K, V, causal = True, softcap = self.config.attn_logit_softcapping, - softmax_scale = self.config.query_pre_attn_scalar, + softmax_scale = self._flash_attention_softmax_scale, window_size = window, ) A = A.reshape(bsz, q_len, n_heads*head_dim) From 8db7e809d0dd60fb0262b3d0c4db70d43100cce0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:11:35 -0700 Subject: [PATCH 091/147] Update gemma2.py --- unsloth/models/gemma2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index d2bfb7899..1cbaf5b16 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -141,7 +141,7 @@ def Gemma2Attention_fast_forward( # FA uses 1 / sqrt for softmax_scale! if not hasattr(self, "_flash_attention_softmax_scale"): - self._flash_attention_softmax_scale = 1.0 / self.config.query_pre_attn_scalar**0.5 + self._flash_attention_softmax_scale = 1.0 / (self.config.query_pre_attn_scalar**0.5) pass Q = Q.transpose(1, 2) From 0c932bc0bb79b405af6e4b623088c86bdc51e48e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:51:47 -0700 Subject: [PATCH 092/147] Update mapper.py --- unsloth/models/mapper.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462555f31..57ba67658 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -241,6 +241,14 @@ "unsloth/Mistral-Large-Instruct-2407-bnb-4bit" : ( "mistralai/Mistral-Large-Instruct-2407", ), + "unsloth/gemma-2-2b-bnb-4bit" : ( + "unsloth/gemma-2-2b", + "google/gemma-2-2b", + ), + "unsloth/gemma-2-2b-it-bnb-4bit" : ( + "unsloth/gemma-2-2b-it", + "google/gemma-2-2b-it", + ), } INT_TO_FLOAT_MAPPER = {} From 7af632075c201075e8469917169862d002bc8dc5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 08:53:20 -0700 Subject: [PATCH 093/147] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4c1271396..d843158d2 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News +- 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! - 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! 
[Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported From fdfe1f59f56935f1945269e5beda50969810158a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 08:54:22 -0700 Subject: [PATCH 094/147] Update _utils.py --- unsloth/models/_utils.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index c9bc6065f..fe3aa9040 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -192,20 +192,20 @@ def patch_mistral_nemo_config(config): # Get Xformers from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues -# if Version(xformers_version) >= Version("0.0.27"): -# raise ImportError( -# "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ -# "then press Disconnect Runtime and then Restart it.\n"\ -# "\n"\ -# "%%capture\n" -# "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" -# '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' -# '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ -# '\n'\ -# f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ -# 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' -# ) -# pass +if Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ + "then press Disconnect Runtime and then Restart it.\n"\ + "\n"\ + "%%capture\n" + "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" + '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' + '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ + '\n'\ + f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ + 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' + ) +pass if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): raise ImportError( @@ -217,12 +217,12 @@ def patch_mistral_nemo_config(config): f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ f"Please install xformers < 0.0.26 for torch = {torch_version}." ) -# elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): -# raise ImportError( -# f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ -# f"Please install xformers < 0.0.27 for torch = {torch_version}." -# ) -# pass +elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.27 for torch = {torch_version}." 
+ ) +pass from xformers._cpp_lib import _register_extensions try: From b85670de83fd8eb10a9ca61045361918ea35686b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 08:54:58 -0700 Subject: [PATCH 095/147] Gemma (#843) * bugs * Update _utils.py * flash-attn softcapping * Update gemma2.py * Update gemma2.py * Update gemma2.py * Update gemma2.py * Update mapper.py * Update README.md * Update _utils.py --- README.md | 1 + pyproject.toml | 22 ++++++------ unsloth/models/_utils.py | 14 ++++++++ unsloth/models/gemma2.py | 73 ++++++++++++++++++++++++++++++---------- unsloth/models/llama.py | 39 +++++++++++---------- unsloth/models/loader.py | 17 ++++++++++ unsloth/models/mapper.py | 8 +++++ 7 files changed, 128 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 4c1271396..d843158d2 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News +- 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! - 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported diff --git a/pyproject.toml b/pyproject.toml index 6777f7c26..e711325be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,7 +171,7 @@ colab-ampere-torch211 = [ "unsloth[cu121onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] colab-torch220 = [ "unsloth[huggingface]", @@ -184,7 +184,7 @@ colab-ampere-torch220 = [ "unsloth[cu121onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] colab-new = [ "packaging", @@ -215,7 +215,7 @@ colab-ampere = [ "unsloth[colab-ampere-torch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere = [ "unsloth[huggingface]", @@ -223,7 +223,7 @@ cu118-ampere = [ "unsloth[cu118only]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere = [ "unsloth[huggingface]", @@ -231,7 +231,7 @@ cu121-ampere = [ "unsloth[cu121only]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch211 = [ "unsloth[huggingface]", @@ -239,7 +239,7 @@ cu118-ampere-torch211 = [ "unsloth[cu118onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch211 = [ "unsloth[huggingface]", @@ -247,7 +247,7 @@ cu121-ampere-torch211 = [ "unsloth[cu121onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch220 = [ "unsloth[huggingface]", @@ -255,7 +255,7 @@ cu118-ampere-torch220 = [ "unsloth[cu118onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch220 = [ "unsloth[huggingface]", @@ -263,7 +263,7 @@ cu121-ampere-torch220 = [ "unsloth[cu121onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch230 = [ "unsloth[huggingface]", @@ -271,7 +271,7 @@ cu118-ampere-torch230 = [ "unsloth[cu118onlytorch230]", "packaging", "ninja", - "flash-attn", + 
"flash-attn>=2.6.3", ] cu121-ampere-torch230 = [ "unsloth[huggingface]", @@ -279,7 +279,7 @@ cu121-ampere-torch230 = [ "unsloth[cu121onlytorch230]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] [project.urls] diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 994f97ab7..fe3aa9040 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -21,6 +21,7 @@ "xformers_version", "__version__", "HAS_FLASH_ATTENTION", + "HAS_FLASH_ATTENTION_SOFTCAPPING", "PRE_CHECK", "platform_system", "patch_tokenizer", @@ -140,6 +141,8 @@ def patch_mistral_nemo_config(config): major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = False +HAS_FLASH_ATTENTION = False +HAS_FLASH_ATTENTION_SOFTCAPPING = False if major_version >= 8: SUPPORTS_BFLOAT16 = True @@ -148,6 +151,17 @@ def patch_mistral_nemo_config(config): try: from flash_attn.flash_attn_interface import flash_attn_cuda HAS_FLASH_ATTENTION = True + + # Also check for softcapping + from flash_attn import __version__ as flash_attn_version + HAS_FLASH_ATTENTION_SOFTCAPPING = Version(flash_attn_version) >= Version("2.6.3") + if not HAS_FLASH_ATTENTION_SOFTCAPPING: + print( + "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\ + "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\ + "To update flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) except: print( "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\ diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 0d21c47b0..1cbaf5b16 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -56,6 +56,8 @@ Gemma2FlashAttention2 = Gemma2Attention pass +if HAS_FLASH_ATTENTION_SOFTCAPPING: + from flash_attn import flash_attn_func # [TODO] We must randomnly use torch.compile? # I checked the gradients and formulas and I'm sure it's correct. @@ -126,8 +128,36 @@ def Gemma2Attention_fast_forward( V = torch.cat([past_key_value[1], V], dim = 2) pass past_key_value = (K, V) if use_cache else None - - A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) + + # Only enable if the attention_mask is True + has_sliding_window = type(causal_mask) is bool and causal_mask is True + if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None: + window = (-1, -1) + if has_sliding_window: + sw = getattr(self.config, "sliding_window", None) + sw = kv_seq_len if (sw is None or sw == "null") else sw + window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw) + pass + + # FA uses 1 / sqrt for softmax_scale! 
+ if not hasattr(self, "_flash_attention_softmax_scale"): + self._flash_attention_softmax_scale = 1.0 / (self.config.query_pre_attn_scalar**0.5) + pass + + Q = Q.transpose(1, 2) + K = K.transpose(1, 2) + V = V.transpose(1, 2) + A = flash_attn_func( + Q, K, V, + causal = True, + softcap = self.config.attn_logit_softcapping, + softmax_scale = self._flash_attention_softmax_scale, + window_size = window, + ) + A = A.reshape(bsz, q_len, n_heads*head_dim) + else: + A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) + pass A = self.apply_o(self, A) return A, None, past_key_value pass @@ -205,6 +235,8 @@ def Gemma2DecoderLayer_fast_forward( from math import sqrt as math_sqrt KV_CACHE_INCREMENT = 256 # KV Cache update size torch_nn_functional_softmax = torch.nn.functional.softmax +torch_matmul = torch.matmul +torch_tanh = torch.tanh def Gemma2Attention_fast_forward_inference( self, @@ -322,13 +354,13 @@ def Gemma2Attention_fast_forward_inference( # if bsz == 1: Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows - A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) + A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched - A *= self.reciprocal_t; torch.tanh(A, out = A); A *= self.t; # Logit softcapping + A *= self.reciprocal_t; torch_tanh(A, out = A); A *= self.t; # Logit softcapping A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype) - A = torch.matmul(A, Vnn, out = Qn) + A = torch_matmul(A, Vnn, out = Qn) # else: # A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False) # pass @@ -359,19 +391,24 @@ def Gemma2Model_fast_forward_inference( bsz, q_len, hd = hidden_states.shape seq_len = past_key_values[0][0].shape[-2] if bsz != 1: - SWA = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (bsz, q_len), - hidden_states, - seq_len, - sliding_window = self.config.sliding_window, - ) - GA = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (bsz, q_len), - hidden_states, - seq_len, - ) + if HAS_FLASH_ATTENTION_SOFTCAPPING: + SWA = True + GA = False + else: + SWA = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (bsz, q_len), + hidden_states, + seq_len, + sliding_window = self.config.sliding_window, + ) + GA = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (bsz, q_len), + hidden_states, + seq_len, + ) + pass else: SWA = attention_mask GA = attention_mask diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 496a37e7a..b5244ed4e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -682,23 +682,28 @@ def LlamaModel_fast_forward( # Gemma2 has alternating SWA and global attn if IS_GEMMA2 and not hasattr(self, "SWA_mask"): - n = self.config.max_position_embeddings - # masked_fill is making stuff slower! - # self. 
GA_mask = create_boolean_mask(n = n, sliding_window = 0) - # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) - from transformers.modeling_attn_mask_utils import AttentionMaskConverter - self.SWA_mask = AttentionMaskConverter( - is_causal = True, - sliding_window = self.config.sliding_window, - )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ - .squeeze(0).squeeze(0) - - self.GA_mask = AttentionMaskConverter( - is_causal = True, - )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ - .squeeze(0).squeeze(0) + if HAS_FLASH_ATTENTION_SOFTCAPPING: + self.SWA_mask = True + self.GA_mask = False + else: + n = self.config.max_position_embeddings + # masked_fill is making stuff slower! + # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0) + # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + self.SWA_mask = AttentionMaskConverter( + is_causal = True, + sliding_window = self.config.sliding_window, + )\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ + .squeeze(0).squeeze(0) + + self.GA_mask = AttentionMaskConverter( + is_causal = True, + )\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ + .squeeze(0).squeeze(0) + pass pass # Go through every layer! diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f22e81efa..47152d676 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from ._utils import is_bfloat16_supported, HAS_FLASH_ATTENTION, HAS_FLASH_ATTENTION_SOFTCAPPING from .llama import FastLlamaModel, logger from .mistral import FastMistralModel from .qwen2 import FastQwen2Model @@ -42,6 +43,7 @@ def __get_model_name( FLOAT_TO_INT_MAPPER = None, ): + model_name = str(model_name) if not SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( @@ -232,6 +234,21 @@ def from_pretrained( f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\ f"to obtain the latest transformers build, then restart this session."\ ) + # Also check for softcapping support in flash-attn which is faster! 
+ if is_bfloat16_supported() and not HAS_FLASH_ATTENTION: + print( + "Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!\n"\ + "To install flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) + elif HAS_FLASH_ATTENTION and not HAS_FLASH_ATTENTION_SOFTCAPPING: + print( + "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\ + "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\ + "To update flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) + dispatch_model = FastGemma2Model elif model_type == "qwen2": dispatch_model = FastQwen2Model diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462555f31..57ba67658 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -241,6 +241,14 @@ "unsloth/Mistral-Large-Instruct-2407-bnb-4bit" : ( "mistralai/Mistral-Large-Instruct-2407", ), + "unsloth/gemma-2-2b-bnb-4bit" : ( + "unsloth/gemma-2-2b", + "google/gemma-2-2b", + ), + "unsloth/gemma-2-2b-it-bnb-4bit" : ( + "unsloth/gemma-2-2b-it", + "google/gemma-2-2b-it", + ), } INT_TO_FLOAT_MAPPER = {} From dfca5516e74e60d52915d4287121d9ff8b80b314 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 09:50:11 -0700 Subject: [PATCH 096/147] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d843158d2..9407c452a 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News -- 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! +- 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! And uploaded [GGUF quants](https://huggingface.co/unsloth/gemma-2-it-GGUF) Try out [Chat interface](https://colab.research.google.com/drive/1i-8ESvtLRGNkkUQQr_-z_rcSAIo9c3lM?usp=sharing) for Gemma-2-2b Instruct! - 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! 
[Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported From 2de142712d2dc8892d216dfca365dc3ba2707c43 Mon Sep 17 00:00:00 2001 From: XiaoYang Date: Thu, 1 Aug 2024 03:05:08 +0800 Subject: [PATCH 097/147] Fix ROPE extension issue and device mismatch (#840) * When an exception has been assigned using as target, it is cleared at the end of the except clause.(https://docs.python.org/3/reference/compound_stmts.html#the-try-statement) * Update loader.py * round up to extend rope size * inv_freq.device changed, make sure they are on the same device --------- Co-authored-by: xiaoyang Co-authored-by: Daniel Han --- unsloth/models/llama.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index b5244ed4e..e6c9280bc 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -14,6 +14,7 @@ import torch import gc +import math from typing import Optional, Tuple, List, Union from ._utils import * from ._utils import __version__ @@ -1036,7 +1037,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass @@ -1109,7 +1110,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # in FP32. They are applied (multiplied) in FP32 as well. self.current_rope_size = seq_len - t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device=self.inv_freq.device, dtype=torch.int64).float() freqs = torch.outer(t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -1158,7 +1159,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass From d0a7dcec1dd2b9f67c9be97d3b9ac05341b5fc9b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 12:09:33 -0700 Subject: [PATCH 098/147] Update gemma.py --- unsloth/models/gemma.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index e3f1e615d..a0894ec7a 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -14,6 +14,7 @@ from .llama import * from ._utils import __version__ +import math try: from transformers.models.gemma.modeling_gemma import ( @@ -256,7 +257,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass From 4e570be9ae4ced8cdc64e498125708e34942befc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 12:10:33 -0700 Subject: [PATCH 099/147] Fix RoPE 
extension (#846) * bugs * Update _utils.py * flash-attn softcapping * Update gemma2.py * Update gemma2.py * Update gemma2.py * Update gemma2.py * Update mapper.py * Update README.md * Update _utils.py * Fix ROPE extension issue and device mismatch (#840) * When an exception has been assigned using as target, it is cleared at the end of the except clause.(https://docs.python.org/3/reference/compound_stmts.html#the-try-statement) * Update loader.py * round up to extend rope size * inv_freq.device changed, make sure they are on the same device --------- Co-authored-by: xiaoyang Co-authored-by: Daniel Han * Update gemma.py --------- Co-authored-by: XiaoYang Co-authored-by: xiaoyang --- unsloth/models/gemma.py | 3 ++- unsloth/models/llama.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index e3f1e615d..a0894ec7a 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -14,6 +14,7 @@ from .llama import * from ._utils import __version__ +import math try: from transformers.models.gemma.modeling_gemma import ( @@ -256,7 +257,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index b5244ed4e..e6c9280bc 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -14,6 +14,7 @@ import torch import gc +import math from typing import Optional, Tuple, List, Union from ._utils import * from ._utils import __version__ @@ -1036,7 +1037,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass @@ -1109,7 +1110,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # in FP32. They are applied (multiplied) in FP32 as well. 
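The switch from `int(round(...))` to `math.ceil(...)` above is the heart of this fix: round-to-nearest can round the requested length down, leaving the cos/sin cache shorter than the sequence it must cover. A standalone sketch (toy sequence lengths, same 8192-token growth step as the patch) shows the difference:

```python
import math

STEP = 8192  # growth increment used by extend_rope_embedding

def old_target(seq_len):
    # Previous behaviour: round to the nearest multiple, which can round DOWN.
    return int(round(seq_len / STEP)) * STEP

def new_target(seq_len):
    # Fixed behaviour: always round UP, so the cache covers seq_len.
    return math.ceil(seq_len / STEP) * STEP

for seq_len in (8192, 9000, 12289, 16384):
    print(seq_len, old_target(seq_len), new_target(seq_len))
# 9000  -> old: 8192 (shorter than the sequence!), new: 16384
# 12289 -> old: 16384, new: 16384
```

Any length in the lower half of a fresh 8192 block hit the old path's undersized cache, which is what the "round up to extend rope size" bullet in the commit message addresses.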
self.current_rope_size = seq_len - t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device=self.inv_freq.device, dtype=torch.int64).float() freqs = torch.outer(t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -1158,7 +1159,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass From f65cc9877c9ee42b9c6719a4fe168b00abceb095 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 4 Aug 2024 11:28:21 -0700 Subject: [PATCH 100/147] Update pyproject.toml --- pyproject.toml | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e711325be..fdc098854 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,54 +50,34 @@ huggingface = [ "hf-transfer", ] cu118only = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.22.post7", ] cu121only = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.22.post7", ] cu118onlytorch211 = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.23", ] cu121onlytorch211 = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.23", ] cu118onlytorch212 = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + 
"xformers==0.0.23.post1", ] cu121onlytorch212 = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.23.post1", ] cu118onlytorch220 = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.24", ] cu121onlytorch220 = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.24", ] cu118onlytorch230 = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.26.post1", ] cu121onlytorch230 = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.26.post1", ] cu118 = [ From 16b6932c43baaf0097943ab14321a8f3c1bc6415 Mon Sep 17 00:00:00 2001 From: moontidef <53668275+relic-yuexi@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:45:34 +0800 Subject: [PATCH 101/147] fix: fix config.torch_dtype bug (#874) fix the bug #404 and the bug https://github.com/hiyouga/LLaMA-Factory/issues/4698#issue-2393500878 --- unsloth/models/llama.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index e6c9280bc..445e5026f 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -570,7 +570,14 @@ def LlamaModel_fast_forward( # Embed positions if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - + + if self.config.torch_dtype == "float32": + self.config.torch_dtype = torch.float32 + elif self.config.torch_dtype == "bfloat16": + self.config.torch_dtype = torch.bfloat16 + elif self.config.torch_dtype == "float16": + self.config.torch_dtype = torch.float16 + inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma From 46b434869847e202e1cf594ab8466819cb398e7a Mon Sep 17 00:00:00 2001 From: emuchogu Date: Mon, 5 Aug 2024 09:45:51 
+0300 Subject: [PATCH 102/147] pascal support (#870) Co-authored-by: Edward Muchogu --- README.md | 389 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 389 insertions(+) diff --git a/README.md b/README.md index 9407c452a..4e29a43ec 100644 --- a/README.md +++ b/README.md @@ -436,6 +436,395 @@ Two Tesla T4s on Kaggle ![](https://i.ibb.co/sJ7RhGG/image-41.png)
+## NVIDIA Pascal Support + +Support for NVIDIA Pascal family of cards, specifically the P40 and P100. + +### Setup Guide + +1. Create three files (`Dockerfile`, `unsloth_env_file.yml`, and `docker-compose.yml`) with the contents provided below. +2. Ensure Docker and Docker Compose are installed on your system. +3. Install the NVIDIA Container Toolkit for GPU support if not already done. +4. Place all three files in the same directory. +5. Open a terminal and navigate to the directory containing these files. +6. Run the following command to build and start the container: + + ``` + docker-compose up --build + ``` + +7. Once the container is running, access Jupyter Lab by opening a web browser and navigating to `http://localhost:8888`. + +### Configuration Files + +#### 1. Dockerfile + +```dockerfile +# Stage 1: Base image with system dependencies +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as base + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + vim \ + curl \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Install Miniconda only if it's not already installed +RUN if [ ! -d "/opt/conda" ]; then \ + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ + bash miniconda.sh -b -p /opt/conda && \ + rm miniconda.sh; \ + fi + +# Set path to conda +ENV PATH /opt/conda/bin:$PATH + +# Set path to conda +ENV PATH /opt/conda/bin:$PATH + +# Stage 2: Python environment setup +FROM base as python-env + +COPY unsloth_env_file.yml unsloth_env_file.yml + +RUN conda env create -f unsloth_env_file.yml + +SHELL ["conda", "run", "-n", "unsloth_env", "/bin/bash", "-c"] + +# Stage 3: Final image +FROM python-env as final + +# Install Unsloth (This step is separate because it's likely to change more frequently) +RUN pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" + +ENV PATH /usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Set the working directory +WORKDIR /workspace + +# Set the default command to run Jupyter Lab +CMD ["conda", "run", "--no-capture-output", "-n", "unsloth_env", "jupyter", "lab", "--ip=0.0.0.0", "--no-browser", "--allow-root", "--NotebookApp.token=''", "--NotebookApp.password=''"] +``` + +#### 2. 
unsloth_env_file.yml + +```yaml +name: unsloth_env +channels: + - xformers + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - aiohttp=3.9.5=py310h5eee18b_0 + - aiosignal=1.2.0=pyhd3eb1b0_0 + - anyio=4.2.0=py310h06a4308_0 + - argon2-cffi=21.3.0=pyhd3eb1b0_0 + - argon2-cffi-bindings=21.2.0=py310h7f8727e_0 + - arrow-cpp=16.1.0=hc1eb8f0_0 + - async-lru=2.0.4=pyhd8ed1ab_0 + - async-timeout=4.0.3=py310h06a4308_0 + - attrs=23.1.0=py310h06a4308_0 + - aws-c-auth=0.6.19=h5eee18b_0 + - aws-c-cal=0.5.20=hdbd6064_0 + - aws-c-common=0.8.5=h5eee18b_0 + - aws-c-compression=0.2.16=h5eee18b_0 + - aws-c-event-stream=0.2.15=h6a678d5_0 + - aws-c-http=0.6.25=h5eee18b_0 + - aws-c-io=0.13.10=h5eee18b_0 + - aws-c-mqtt=0.7.13=h5eee18b_0 + - aws-c-s3=0.1.51=hdbd6064_0 + - aws-c-sdkutils=0.1.6=h5eee18b_0 + - aws-checksums=0.1.13=h5eee18b_0 + - aws-crt-cpp=0.18.16=h6a678d5_0 + - aws-sdk-cpp=1.10.55=h721c034_0 + - babel=2.14.0=pyhd8ed1ab_0 + - beautifulsoup4=4.12.3=py310h06a4308_0 + - blas=1.0=mkl + - bleach=4.1.0=pyhd3eb1b0_0 + - boost-cpp=1.82.0=hdb19cb5_2 + - bottleneck=1.3.7=py310ha9d4c09_0 + - brotli-python=1.0.9=py310h6a678d5_8 + - bzip2=1.0.8=h5eee18b_6 + - c-ares=1.19.1=h5eee18b_0 + - ca-certificates=2024.7.4=hbcca054_0 + - certifi=2024.7.4=pyhd8ed1ab_0 + - cffi=1.16.0=py310h5eee18b_1 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - cuda-cudart=11.8.89=0 + - cuda-cupti=11.8.87=0 + - cuda-libraries=11.8.0=0 + - cuda-nvrtc=11.8.89=0 + - cuda-nvtx=11.8.86=0 + - cuda-runtime=11.8.0=0 + - cuda-version=11.8=hcce14f8_3 + - cudatoolkit=11.8.0=h6a678d5_0 + - datasets=2.19.1=py310h06a4308_0 + - debugpy=1.6.7=py310h6a678d5_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - defusedxml=0.7.1=pyhd3eb1b0_0 + - dill=0.3.8=py310h06a4308_0 + - entrypoints=0.4=py310h06a4308_0 + - ffmpeg=4.3=hf484d3e_0 + - filelock=3.13.1=py310h06a4308_0 + - freetype=2.12.1=h4a9f257_0 + - frozenlist=1.4.0=py310h5eee18b_0 + - fsspec=2024.3.1=py310h06a4308_0 + - gflags=2.2.2=h6a678d5_1 + - glog=0.5.0=h6a678d5_1 + - gmp=6.2.1=h295c915_3 + - gmpy2=2.1.2=py310heeb90bb_0 + - gnutls=3.6.15=he1e5248_0 + - h11=0.14.0=pyhd8ed1ab_0 + - h2=4.1.0=pyhd8ed1ab_0 + - hpack=4.0.0=pyh9f0ad1d_0 + - httpcore=1.0.5=pyhd8ed1ab_0 + - httpx=0.27.0=pyhd8ed1ab_0 + - hyperframe=6.0.1=pyhd8ed1ab_0 + - icu=73.1=h6a678d5_0 + - idna=3.7=py310h06a4308_0 + - importlib-metadata=7.0.1=py310h06a4308_0 + - importlib_metadata=7.0.1=hd8ed1ab_0 + - importlib_resources=6.4.0=pyhd8ed1ab_0 + - intel-openmp=2023.1.0=hdb19cb5_46306 + - ipykernel=6.28.0=py310h06a4308_0 + - ipython_genutils=0.2.0=pyhd3eb1b0_1 + - jedi=0.19.1=py310h06a4308_0 + - jinja2=3.1.4=py310h06a4308_0 + - jpeg=9e=h5eee18b_2 + - json5=0.9.25=pyhd8ed1ab_0 + - jsonschema=4.19.2=py310h06a4308_0 + - jsonschema-specifications=2023.7.1=py310h06a4308_0 + - jupyter-lsp=2.2.5=pyhd8ed1ab_0 + - jupyter_client=7.4.9=py310h06a4308_0 + - jupyter_core=5.7.2=py310h06a4308_0 + - jupyter_events=0.10.0=py310h06a4308_0 + - jupyter_server=2.14.1=py310h06a4308_0 + - jupyter_server_terminals=0.4.4=py310h06a4308_1 + - jupyterlab=4.2.4=pyhd8ed1ab_0 + - jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 + - jupyterlab_server=2.27.3=pyhd8ed1ab_0 + - krb5=1.20.1=h143b758_1 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libabseil=20240116.2=cxx17_h6a678d5_0 + - libboost=1.82.0=h109eef0_2 + - libbrotlicommon=1.0.9=h5eee18b_8 + - libbrotlidec=1.0.9=h5eee18b_8 + - libbrotlienc=1.0.9=h5eee18b_8 + - libcublas=11.11.3.6=0 + - 
libcufft=10.9.0.58=0 + - libcufile=1.9.1.3=0 + - libcurand=10.3.5.147=0 + - libcurl=8.7.1=h251f7ec_0 + - libcusolver=11.4.1.48=0 + - libcusparse=11.7.5.86=0 + - libdeflate=1.17=h5eee18b_1 + - libedit=3.1.20230828=h5eee18b_0 + - libev=4.33=h7f8727e_1 + - libevent=2.1.12=hdbd6064_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=14.1.0=h77fa898_0 + - libgomp=14.1.0=h77fa898_0 + - libgrpc=1.62.2=h2d74bed_0 + - libiconv=1.16=h5eee18b_3 + - libidn2=2.3.4=h5eee18b_0 + - libjpeg-turbo=2.0.0=h9bf148f_0 + - libnghttp2=1.57.0=h2d74bed_0 + - libnpp=11.8.0.86=0 + - libnvjpeg=11.9.0.86=0 + - libpng=1.6.39=h5eee18b_0 + - libprotobuf=4.25.3=he621ea3_0 + - libsodium=1.0.18=h7b6447c_0 + - libssh2=1.11.0=h251f7ec_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libthrift=0.15.0=h1795dd8_2 + - libtiff=4.5.1=h6a678d5_0 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp-base=1.3.2=h5eee18b_0 + - llvm-openmp=14.0.6=h9e868ea_0 + - lz4-c=1.9.4=h6a678d5_1 + - markupsafe=2.1.3=py310h5eee18b_0 + - mistune=2.0.4=py310h06a4308_0 + - mkl=2023.1.0=h213fc3f_46344 + - mkl-service=2.4.0=py310h5eee18b_1 + - mkl_fft=1.3.8=py310h5eee18b_0 + - mkl_random=1.2.4=py310hdb19cb5_0 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpmath=1.3.0=py310h06a4308_0 + - multidict=6.0.4=py310h5eee18b_0 + - multiprocess=0.70.15=py310h06a4308_0 + - nb_conda_kernels=2.3.1=py310h06a4308_0 + - nbclassic=1.1.0=py310h06a4308_0 + - nbclient=0.8.0=py310h06a4308_0 + - nbconvert=7.10.0=py310h06a4308_0 + - nbformat=5.9.2=py310h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - nest-asyncio=1.6.0=py310h06a4308_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.3=py310h06a4308_0 + - notebook=6.5.7=py310h06a4308_0 + - notebook-shim=0.2.3=py310h06a4308_0 + - numexpr=2.8.7=py310h85018f9_0 + - numpy=1.26.4=py310h5f9d8c6_0 + - numpy-base=1.26.4=py310hb5e798b_0 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.4.0=h9ca470c_2 + - openssl=3.3.1=h4bc722e_2 + - orc=2.0.1=h2d29ad5_0 + - overrides=7.4.0=py310h06a4308_0 + - packaging=24.1=py310h06a4308_0 + - pandas=2.2.2=py310h6a678d5_0 + - pandocfilters=1.5.0=pyhd3eb1b0_0 + - pillow=10.4.0=py310h5eee18b_0 + - pip=24.0=py310h06a4308_0 + - platformdirs=3.10.0=py310h06a4308_0 + - prometheus_client=0.14.1=py310h06a4308_0 + - prompt_toolkit=3.0.43=hd3eb1b0_0 + - psutil=5.9.0=py310h5eee18b_0 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pyarrow=16.1.0=py310h1128e8f_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pysocks=1.7.1=py310h06a4308_0 + - python=3.10.14=h955ad1f_1 + - python-dateutil=2.9.0post0=py310h06a4308_2 + - python-fastjsonschema=2.16.2=py310h06a4308_0 + - python-json-logger=2.0.7=py310h06a4308_0 + - python-tzdata=2023.3=pyhd3eb1b0_0 + - python-xxhash=2.0.2=py310h5eee18b_1 + - pytorch=2.1.0=py3.10_cuda11.8_cudnn8.7.0_0 + - pytorch-cuda=11.8=h7e8668a_5 + - pytorch-mutex=1.0=cuda + - pytz=2024.1=py310h06a4308_0 + - pyyaml=6.0.1=py310h5eee18b_0 + - pyzmq=24.0.1=py310h5eee18b_0 + - re2=2022.04.01=h295c915_0 + - readline=8.2=h5eee18b_0 + - referencing=0.30.2=py310h06a4308_0 + - regex=2023.10.3=py310h5eee18b_0 + - requests=2.32.3=py310h06a4308_0 + - rfc3339-validator=0.1.4=py310h06a4308_0 + - rfc3986-validator=0.1.1=py310h06a4308_0 + - rpds-py=0.10.6=py310hb02cf49_0 + - s2n=1.3.27=hdbd6064_0 + - safetensors=0.4.2=py310ha89cbab_1 + - send2trash=1.8.2=py310h06a4308_0 + - setuptools=69.5.1=py310h06a4308_0 + - six=1.16.0=pyhd3eb1b0_1 + - snappy=1.1.10=h6a678d5_1 + - sniffio=1.3.0=py310h06a4308_0 + - soupsieve=2.5=py310h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - 
stack_data=0.2.0=pyhd3eb1b0_0 + - sympy=1.12=py310h06a4308_0 + - tbb=2021.8.0=hdb19cb5_0 + - terminado=0.17.1=py310h06a4308_0 + - tinycss2=1.2.1=py310h06a4308_0 + - tk=8.6.14=h39e8969_0 + - tokenizers=0.19.1=py310hff361bb_0 + - tomli=2.0.1=pyhd8ed1ab_0 + - torchaudio=2.1.0=py310_cu118 + - torchtriton=2.1.0=py310 + - torchvision=0.16.0=py310_cu118 + - tornado=6.4.1=py310h5eee18b_0 + - tqdm=4.66.4=py310h2f386ee_0 + - traitlets=5.14.3=py310h06a4308_0 + - typing-extensions=4.11.0=py310h06a4308_0 + - typing_extensions=4.11.0=py310h06a4308_0 + - tzdata=2024a=h04d1e81_0 + - urllib3=2.2.2=py310h06a4308_0 + - utf8proc=2.6.1=h5eee18b_1 + - webencodings=0.5.1=py310h06a4308_1 + - websocket-client=1.8.0=py310h06a4308_0 + - wheel=0.43.0=py310h06a4308_0 + - xformers=0.0.22.post7=py310_cu11.8.0_pyt2.1.0 + - xxhash=0.8.0=h7f8727e_3 + - xz=5.4.6=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - yarl=1.9.3=py310h5eee18b_0 + - zeromq=4.3.5=h6a678d5_0 + - zipp=3.17.0=py310h06a4308_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.5=hc292b87_2 + - pip: + - accelerate==0.33.0 + - asttokens==2.4.1 + - bitsandbytes==0.43.2 + - comm==0.2.2 + - docstring-parser==0.16 + - exceptiongroup==1.2.2 + - executing==2.0.1 + - gguf==0.9.1 + - hf-transfer==0.1.8 + - huggingface-hub==0.24.2 + - iprogress==0.4 + - ipython==8.26.0 + - ipywidgets==8.1.3 + - jupyterlab-widgets==3.0.11 + - markdown-it-py==3.0.0 + - matplotlib-inline==0.1.7 + - mdurl==0.1.2 + - parso==0.8.4 + - peft==0.12.0 + - pexpect==4.9.0 + - prompt-toolkit==3.0.47 + - protobuf==3.20.3 + - pure-eval==0.2.3 + - pygments==2.18.0 + - rich==13.7.1 + - sentencepiece==0.2.0 + - shtab==1.7.1 + - stack-data==0.6.3 + - transformers==4.43.3 + - trl==0.8.6 + - tyro==0.8.5 + - wcwidth==0.2.13 + - widgetsnbextension==4.0.11 + ``` + +#### 3. docker-compose.yml + +```yaml +version: '3.8' + +services: + unsloth-env: + environment: + - NVIDIA_VISIBLE_DEVICES=all + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + volumes: + - ./cache:/root/.cache + - ./workspace:/workspace + working_dir: /workspace + ports: + - "8888:8888" # For Jupyter Lab + tty: true + stdin_open: true + build: + context: . 
+ dockerfile: Dockerfile +``` + + ### Thank You to - [HuyNguyen-hust](https://github.com/HuyNguyen-hust) for making [RoPE Embeddings 28% faster](https://github.com/unslothai/unsloth/pull/238) - [RandomInternetPreson](https://github.com/RandomInternetPreson) for confirming WSL support From 28dea9ac9550b136d8493d3b3ea57c859f20aab1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 4 Aug 2024 23:49:35 -0700 Subject: [PATCH 103/147] Update llama.py --- unsloth/models/llama.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 445e5026f..cec743e59 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -570,14 +570,7 @@ def LlamaModel_fast_forward( # Embed positions if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - - if self.config.torch_dtype == "float32": - self.config.torch_dtype = torch.float32 - elif self.config.torch_dtype == "bfloat16": - self.config.torch_dtype = torch.bfloat16 - elif self.config.torch_dtype == "float16": - self.config.torch_dtype = torch.float16 - + inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma @@ -1580,6 +1573,30 @@ def from_pretrained( internal_model = internal_model.model pass internal_model._saved_temp_tokenizer = tokenizer + + # Also fix torch_dtype + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "config"): + if internal_model.config.torch_dtype == "float32": + internal_model.config.torch_dtype = torch.float32 + elif internal_model.config.torch_dtype == "bfloat16": + internal_model.config.torch_dtype = torch.bfloat16 + elif internal_model.config.torch_dtype == "float16": + internal_model.config.torch_dtype = torch.float16 + pass + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "config"): + if internal_model.config.torch_dtype == "float32": + internal_model.config.torch_dtype = torch.float32 + elif internal_model.config.torch_dtype == "bfloat16": + internal_model.config.torch_dtype = torch.bfloat16 + elif internal_model.config.torch_dtype == "float16": + internal_model.config.torch_dtype = torch.float16 + pass + pass return model, tokenizer pass From 291bc6e25495070a9118bb0618ba6172abb11970 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 4 Aug 2024 23:50:40 -0700 Subject: [PATCH 104/147] Update README.md --- README.md | 389 ------------------------------------------------------ 1 file changed, 389 deletions(-) diff --git a/README.md b/README.md index 4e29a43ec..9407c452a 100644 --- a/README.md +++ b/README.md @@ -436,395 +436,6 @@ Two Tesla T4s on Kaggle ![](https://i.ibb.co/sJ7RhGG/image-41.png)
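The `torch_dtype` normalisation that the `Update llama.py` patch above moves into `from_pretrained` boils down to mapping the string form Hugging Face configs sometimes carry back to a real `torch.dtype`. A compact, illustrative equivalent (the helper name here is made up, not part of the patch):

```python
import torch

_STR_TO_DTYPE = {
    "float32":  torch.float32,
    "bfloat16": torch.bfloat16,
    "float16":  torch.float16,
}

def normalize_torch_dtype(config):
    # config.torch_dtype can be a plain string after (de)serialisation;
    # rewrite it in place so later dtype casts receive a torch.dtype, not a str.
    if isinstance(config.torch_dtype, str):
        config.torch_dtype = _STR_TO_DTYPE.get(config.torch_dtype, config.torch_dtype)
    return config
```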
-## NVIDIA Pascal Support - -Support for NVIDIA Pascal family of cards, specifically the P40 and P100. - -### Setup Guide - -1. Create three files (`Dockerfile`, `unsloth_env_file.yml`, and `docker-compose.yml`) with the contents provided below. -2. Ensure Docker and Docker Compose are installed on your system. -3. Install the NVIDIA Container Toolkit for GPU support if not already done. -4. Place all three files in the same directory. -5. Open a terminal and navigate to the directory containing these files. -6. Run the following command to build and start the container: - - ``` - docker-compose up --build - ``` - -7. Once the container is running, access Jupyter Lab by opening a web browser and navigating to `http://localhost:8888`. - -### Configuration Files - -#### 1. Dockerfile - -```dockerfile -# Stage 1: Base image with system dependencies -FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as base - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - build-essential \ - git \ - vim \ - curl \ - wget \ - && rm -rf /var/lib/apt/lists/* - -# Install Miniconda only if it's not already installed -RUN if [ ! -d "/opt/conda" ]; then \ - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/conda && \ - rm miniconda.sh; \ - fi - -# Set path to conda -ENV PATH /opt/conda/bin:$PATH - -# Set path to conda -ENV PATH /opt/conda/bin:$PATH - -# Stage 2: Python environment setup -FROM base as python-env - -COPY unsloth_env_file.yml unsloth_env_file.yml - -RUN conda env create -f unsloth_env_file.yml - -SHELL ["conda", "run", "-n", "unsloth_env", "/bin/bash", "-c"] - -# Stage 3: Final image -FROM python-env as final - -# Install Unsloth (This step is separate because it's likely to change more frequently) -RUN pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" - -ENV PATH /usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH - -# Set the working directory -WORKDIR /workspace - -# Set the default command to run Jupyter Lab -CMD ["conda", "run", "--no-capture-output", "-n", "unsloth_env", "jupyter", "lab", "--ip=0.0.0.0", "--no-browser", "--allow-root", "--NotebookApp.token=''", "--NotebookApp.password=''"] -``` - -#### 2. 
unsloth_env_file.yml - -```yaml -name: unsloth_env -channels: - - xformers - - pytorch - - nvidia - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - aiohttp=3.9.5=py310h5eee18b_0 - - aiosignal=1.2.0=pyhd3eb1b0_0 - - anyio=4.2.0=py310h06a4308_0 - - argon2-cffi=21.3.0=pyhd3eb1b0_0 - - argon2-cffi-bindings=21.2.0=py310h7f8727e_0 - - arrow-cpp=16.1.0=hc1eb8f0_0 - - async-lru=2.0.4=pyhd8ed1ab_0 - - async-timeout=4.0.3=py310h06a4308_0 - - attrs=23.1.0=py310h06a4308_0 - - aws-c-auth=0.6.19=h5eee18b_0 - - aws-c-cal=0.5.20=hdbd6064_0 - - aws-c-common=0.8.5=h5eee18b_0 - - aws-c-compression=0.2.16=h5eee18b_0 - - aws-c-event-stream=0.2.15=h6a678d5_0 - - aws-c-http=0.6.25=h5eee18b_0 - - aws-c-io=0.13.10=h5eee18b_0 - - aws-c-mqtt=0.7.13=h5eee18b_0 - - aws-c-s3=0.1.51=hdbd6064_0 - - aws-c-sdkutils=0.1.6=h5eee18b_0 - - aws-checksums=0.1.13=h5eee18b_0 - - aws-crt-cpp=0.18.16=h6a678d5_0 - - aws-sdk-cpp=1.10.55=h721c034_0 - - babel=2.14.0=pyhd8ed1ab_0 - - beautifulsoup4=4.12.3=py310h06a4308_0 - - blas=1.0=mkl - - bleach=4.1.0=pyhd3eb1b0_0 - - boost-cpp=1.82.0=hdb19cb5_2 - - bottleneck=1.3.7=py310ha9d4c09_0 - - brotli-python=1.0.9=py310h6a678d5_8 - - bzip2=1.0.8=h5eee18b_6 - - c-ares=1.19.1=h5eee18b_0 - - ca-certificates=2024.7.4=hbcca054_0 - - certifi=2024.7.4=pyhd8ed1ab_0 - - cffi=1.16.0=py310h5eee18b_1 - - charset-normalizer=3.3.2=pyhd3eb1b0_0 - - cuda-cudart=11.8.89=0 - - cuda-cupti=11.8.87=0 - - cuda-libraries=11.8.0=0 - - cuda-nvrtc=11.8.89=0 - - cuda-nvtx=11.8.86=0 - - cuda-runtime=11.8.0=0 - - cuda-version=11.8=hcce14f8_3 - - cudatoolkit=11.8.0=h6a678d5_0 - - datasets=2.19.1=py310h06a4308_0 - - debugpy=1.6.7=py310h6a678d5_0 - - decorator=5.1.1=pyhd3eb1b0_0 - - defusedxml=0.7.1=pyhd3eb1b0_0 - - dill=0.3.8=py310h06a4308_0 - - entrypoints=0.4=py310h06a4308_0 - - ffmpeg=4.3=hf484d3e_0 - - filelock=3.13.1=py310h06a4308_0 - - freetype=2.12.1=h4a9f257_0 - - frozenlist=1.4.0=py310h5eee18b_0 - - fsspec=2024.3.1=py310h06a4308_0 - - gflags=2.2.2=h6a678d5_1 - - glog=0.5.0=h6a678d5_1 - - gmp=6.2.1=h295c915_3 - - gmpy2=2.1.2=py310heeb90bb_0 - - gnutls=3.6.15=he1e5248_0 - - h11=0.14.0=pyhd8ed1ab_0 - - h2=4.1.0=pyhd8ed1ab_0 - - hpack=4.0.0=pyh9f0ad1d_0 - - httpcore=1.0.5=pyhd8ed1ab_0 - - httpx=0.27.0=pyhd8ed1ab_0 - - hyperframe=6.0.1=pyhd8ed1ab_0 - - icu=73.1=h6a678d5_0 - - idna=3.7=py310h06a4308_0 - - importlib-metadata=7.0.1=py310h06a4308_0 - - importlib_metadata=7.0.1=hd8ed1ab_0 - - importlib_resources=6.4.0=pyhd8ed1ab_0 - - intel-openmp=2023.1.0=hdb19cb5_46306 - - ipykernel=6.28.0=py310h06a4308_0 - - ipython_genutils=0.2.0=pyhd3eb1b0_1 - - jedi=0.19.1=py310h06a4308_0 - - jinja2=3.1.4=py310h06a4308_0 - - jpeg=9e=h5eee18b_2 - - json5=0.9.25=pyhd8ed1ab_0 - - jsonschema=4.19.2=py310h06a4308_0 - - jsonschema-specifications=2023.7.1=py310h06a4308_0 - - jupyter-lsp=2.2.5=pyhd8ed1ab_0 - - jupyter_client=7.4.9=py310h06a4308_0 - - jupyter_core=5.7.2=py310h06a4308_0 - - jupyter_events=0.10.0=py310h06a4308_0 - - jupyter_server=2.14.1=py310h06a4308_0 - - jupyter_server_terminals=0.4.4=py310h06a4308_1 - - jupyterlab=4.2.4=pyhd8ed1ab_0 - - jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 - - jupyterlab_server=2.27.3=pyhd8ed1ab_0 - - krb5=1.20.1=h143b758_1 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.38=h1181459_1 - - lerc=3.0=h295c915_0 - - libabseil=20240116.2=cxx17_h6a678d5_0 - - libboost=1.82.0=h109eef0_2 - - libbrotlicommon=1.0.9=h5eee18b_8 - - libbrotlidec=1.0.9=h5eee18b_8 - - libbrotlienc=1.0.9=h5eee18b_8 - - libcublas=11.11.3.6=0 - - 
libcufft=10.9.0.58=0 - - libcufile=1.9.1.3=0 - - libcurand=10.3.5.147=0 - - libcurl=8.7.1=h251f7ec_0 - - libcusolver=11.4.1.48=0 - - libcusparse=11.7.5.86=0 - - libdeflate=1.17=h5eee18b_1 - - libedit=3.1.20230828=h5eee18b_0 - - libev=4.33=h7f8727e_1 - - libevent=2.1.12=hdbd6064_1 - - libffi=3.4.4=h6a678d5_1 - - libgcc-ng=14.1.0=h77fa898_0 - - libgomp=14.1.0=h77fa898_0 - - libgrpc=1.62.2=h2d74bed_0 - - libiconv=1.16=h5eee18b_3 - - libidn2=2.3.4=h5eee18b_0 - - libjpeg-turbo=2.0.0=h9bf148f_0 - - libnghttp2=1.57.0=h2d74bed_0 - - libnpp=11.8.0.86=0 - - libnvjpeg=11.9.0.86=0 - - libpng=1.6.39=h5eee18b_0 - - libprotobuf=4.25.3=he621ea3_0 - - libsodium=1.0.18=h7b6447c_0 - - libssh2=1.11.0=h251f7ec_0 - - libstdcxx-ng=11.2.0=h1234567_1 - - libtasn1=4.19.0=h5eee18b_0 - - libthrift=0.15.0=h1795dd8_2 - - libtiff=4.5.1=h6a678d5_0 - - libunistring=0.9.10=h27cfd23_0 - - libuuid=1.41.5=h5eee18b_0 - - libwebp-base=1.3.2=h5eee18b_0 - - llvm-openmp=14.0.6=h9e868ea_0 - - lz4-c=1.9.4=h6a678d5_1 - - markupsafe=2.1.3=py310h5eee18b_0 - - mistune=2.0.4=py310h06a4308_0 - - mkl=2023.1.0=h213fc3f_46344 - - mkl-service=2.4.0=py310h5eee18b_1 - - mkl_fft=1.3.8=py310h5eee18b_0 - - mkl_random=1.2.4=py310hdb19cb5_0 - - mpc=1.1.0=h10f8cd9_1 - - mpfr=4.0.2=hb69a4c5_1 - - mpmath=1.3.0=py310h06a4308_0 - - multidict=6.0.4=py310h5eee18b_0 - - multiprocess=0.70.15=py310h06a4308_0 - - nb_conda_kernels=2.3.1=py310h06a4308_0 - - nbclassic=1.1.0=py310h06a4308_0 - - nbclient=0.8.0=py310h06a4308_0 - - nbconvert=7.10.0=py310h06a4308_0 - - nbformat=5.9.2=py310h06a4308_0 - - ncurses=6.4=h6a678d5_0 - - nest-asyncio=1.6.0=py310h06a4308_0 - - nettle=3.7.3=hbbd107a_1 - - networkx=3.3=py310h06a4308_0 - - notebook=6.5.7=py310h06a4308_0 - - notebook-shim=0.2.3=py310h06a4308_0 - - numexpr=2.8.7=py310h85018f9_0 - - numpy=1.26.4=py310h5f9d8c6_0 - - numpy-base=1.26.4=py310hb5e798b_0 - - openh264=2.1.1=h4ff587b_0 - - openjpeg=2.4.0=h9ca470c_2 - - openssl=3.3.1=h4bc722e_2 - - orc=2.0.1=h2d29ad5_0 - - overrides=7.4.0=py310h06a4308_0 - - packaging=24.1=py310h06a4308_0 - - pandas=2.2.2=py310h6a678d5_0 - - pandocfilters=1.5.0=pyhd3eb1b0_0 - - pillow=10.4.0=py310h5eee18b_0 - - pip=24.0=py310h06a4308_0 - - platformdirs=3.10.0=py310h06a4308_0 - - prometheus_client=0.14.1=py310h06a4308_0 - - prompt_toolkit=3.0.43=hd3eb1b0_0 - - psutil=5.9.0=py310h5eee18b_0 - - ptyprocess=0.7.0=pyhd3eb1b0_2 - - pure_eval=0.2.2=pyhd3eb1b0_0 - - pyarrow=16.1.0=py310h1128e8f_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pysocks=1.7.1=py310h06a4308_0 - - python=3.10.14=h955ad1f_1 - - python-dateutil=2.9.0post0=py310h06a4308_2 - - python-fastjsonschema=2.16.2=py310h06a4308_0 - - python-json-logger=2.0.7=py310h06a4308_0 - - python-tzdata=2023.3=pyhd3eb1b0_0 - - python-xxhash=2.0.2=py310h5eee18b_1 - - pytorch=2.1.0=py3.10_cuda11.8_cudnn8.7.0_0 - - pytorch-cuda=11.8=h7e8668a_5 - - pytorch-mutex=1.0=cuda - - pytz=2024.1=py310h06a4308_0 - - pyyaml=6.0.1=py310h5eee18b_0 - - pyzmq=24.0.1=py310h5eee18b_0 - - re2=2022.04.01=h295c915_0 - - readline=8.2=h5eee18b_0 - - referencing=0.30.2=py310h06a4308_0 - - regex=2023.10.3=py310h5eee18b_0 - - requests=2.32.3=py310h06a4308_0 - - rfc3339-validator=0.1.4=py310h06a4308_0 - - rfc3986-validator=0.1.1=py310h06a4308_0 - - rpds-py=0.10.6=py310hb02cf49_0 - - s2n=1.3.27=hdbd6064_0 - - safetensors=0.4.2=py310ha89cbab_1 - - send2trash=1.8.2=py310h06a4308_0 - - setuptools=69.5.1=py310h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - snappy=1.1.10=h6a678d5_1 - - sniffio=1.3.0=py310h06a4308_0 - - soupsieve=2.5=py310h06a4308_0 - - sqlite=3.45.3=h5eee18b_0 - - 
stack_data=0.2.0=pyhd3eb1b0_0 - - sympy=1.12=py310h06a4308_0 - - tbb=2021.8.0=hdb19cb5_0 - - terminado=0.17.1=py310h06a4308_0 - - tinycss2=1.2.1=py310h06a4308_0 - - tk=8.6.14=h39e8969_0 - - tokenizers=0.19.1=py310hff361bb_0 - - tomli=2.0.1=pyhd8ed1ab_0 - - torchaudio=2.1.0=py310_cu118 - - torchtriton=2.1.0=py310 - - torchvision=0.16.0=py310_cu118 - - tornado=6.4.1=py310h5eee18b_0 - - tqdm=4.66.4=py310h2f386ee_0 - - traitlets=5.14.3=py310h06a4308_0 - - typing-extensions=4.11.0=py310h06a4308_0 - - typing_extensions=4.11.0=py310h06a4308_0 - - tzdata=2024a=h04d1e81_0 - - urllib3=2.2.2=py310h06a4308_0 - - utf8proc=2.6.1=h5eee18b_1 - - webencodings=0.5.1=py310h06a4308_1 - - websocket-client=1.8.0=py310h06a4308_0 - - wheel=0.43.0=py310h06a4308_0 - - xformers=0.0.22.post7=py310_cu11.8.0_pyt2.1.0 - - xxhash=0.8.0=h7f8727e_3 - - xz=5.4.6=h5eee18b_1 - - yaml=0.2.5=h7b6447c_0 - - yarl=1.9.3=py310h5eee18b_0 - - zeromq=4.3.5=h6a678d5_0 - - zipp=3.17.0=py310h06a4308_0 - - zlib=1.2.13=h5eee18b_1 - - zstd=1.5.5=hc292b87_2 - - pip: - - accelerate==0.33.0 - - asttokens==2.4.1 - - bitsandbytes==0.43.2 - - comm==0.2.2 - - docstring-parser==0.16 - - exceptiongroup==1.2.2 - - executing==2.0.1 - - gguf==0.9.1 - - hf-transfer==0.1.8 - - huggingface-hub==0.24.2 - - iprogress==0.4 - - ipython==8.26.0 - - ipywidgets==8.1.3 - - jupyterlab-widgets==3.0.11 - - markdown-it-py==3.0.0 - - matplotlib-inline==0.1.7 - - mdurl==0.1.2 - - parso==0.8.4 - - peft==0.12.0 - - pexpect==4.9.0 - - prompt-toolkit==3.0.47 - - protobuf==3.20.3 - - pure-eval==0.2.3 - - pygments==2.18.0 - - rich==13.7.1 - - sentencepiece==0.2.0 - - shtab==1.7.1 - - stack-data==0.6.3 - - transformers==4.43.3 - - trl==0.8.6 - - tyro==0.8.5 - - wcwidth==0.2.13 - - widgetsnbextension==4.0.11 - ``` - -#### 3. docker-compose.yml - -```yaml -version: '3.8' - -services: - unsloth-env: - environment: - - NVIDIA_VISIBLE_DEVICES=all - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - volumes: - - ./cache:/root/.cache - - ./workspace:/workspace - working_dir: /workspace - ports: - - "8888:8888" # For Jupyter Lab - tty: true - stdin_open: true - build: - context: . - dockerfile: Dockerfile -``` - - ### Thank You to - [HuyNguyen-hust](https://github.com/HuyNguyen-hust) for making [RoPE Embeddings 28% faster](https://github.com/unslothai/unsloth/pull/238) - [RandomInternetPreson](https://github.com/RandomInternetPreson) for confirming WSL support From b43855fb3635ce06860b27f7c8f9987a16b47ad7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 4 Aug 2024 23:59:57 -0700 Subject: [PATCH 105/147] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9407c452a..35cbbe697 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,10 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. +- Install Unsloth with `pip install unsloth[colab-new]` then `pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes` ## 🦥 Unsloth.ai News +- 📣 NEW! `pip install unsloth` now works! 
Head over to [pypi](https://pypi.org/project/unsloth/) to check it out! This allows non git pull installs. Use `pip install unsloth[colab-new]` for non dependency installs. - 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! And uploaded [GGUF quants](https://huggingface.co/unsloth/gemma-2-it-GGUF) Try out [Chat interface](https://colab.research.google.com/drive/1i-8ESvtLRGNkkUQQr_-z_rcSAIo9c3lM?usp=sharing) for Gemma-2-2b Instruct! - 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported From bfe38e6ea8d3d7cf8ce9e37962de03c71c90cbe2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 5 Aug 2024 00:00:53 -0700 Subject: [PATCH 106/147] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 35cbbe697..86c3fbd86 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,6 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. -- Install Unsloth with `pip install unsloth[colab-new]` then `pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes` ## 🦥 Unsloth.ai News - 📣 NEW! `pip install unsloth` now works! Head over to [pypi](https://pypi.org/project/unsloth/) to check it out! This allows non git pull installs. Use `pip install unsloth[colab-new]` for non dependency installs. @@ -94,6 +93,9 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and ![](https://i.ibb.co/sJ7RhGG/image-41.png) ## 💾 Installation Instructions + +If you have Pytorch 2.3 and CUDA 12.1, install Unsloth with `pip install unsloth[colab-new]` then `pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes` + ### Conda Installation Select either `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. See this [Github issue](https://github.com/unslothai/unsloth/issues/73) for help on debugging Conda installs. 
```bash From 8001d30a8f7c179ff7036eaa2a7552ce620176b6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 6 Aug 2024 20:24:44 -0700 Subject: [PATCH 107/147] Fix tokenizers (#887) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py --- pyproject.toml | 4 +- unsloth/models/_utils.py | 83 ++++++++++++++++++++------ unsloth/models/llama.py | 1 + unsloth/tokenizer_utils.py | 115 ++++++++++++++++++++++++++++++++++++- 4 files changed, 180 insertions(+), 23 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fdc098854..2cbe68f4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ huggingface = [ "peft>=0.7.1,!=0.11.0", "protobuf<4.0.0", "huggingface_hub", - "hf-transfer", + "hf_transfer", ] cu118only = [ "xformers==0.0.22.post7", @@ -178,7 +178,7 @@ colab-new = [ "numpy", "protobuf<4.0.0", "huggingface_hub", - "hf-transfer", + "hf_transfer", ] colab-no-deps = [ "accelerate>=0.26.1", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index fe3aa9040..d5be8d97e 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -332,7 +332,6 @@ def prepare_model_for_kbit_training( """ # Freeze all parameters except LoRA - import re with torch.no_grad(): for name, param in model.named_parameters(): if ".lora_A." in name or ".lora_B." in name or ".lora_magnitude_vector" in name: @@ -389,12 +388,14 @@ def patch_tokenizer(model, tokenizer): Fixes https://github.com/unslothai/unsloth/issues/5 """ possible_reserved_tokens = ( + "<|finetune_right_pad_id|>", # Llama-3.1 + "", # Mistral Nemo "<|reserved", # Llama-3 "<|placeholder", # Phi-3 "[control", # Mistral type models - "", # Mistral Nemo - "<|finetune_right_pad_id|>", # Llama-3.1 ) + joiner = "\1\0=+=\0\1" + number_repetitions = 3 - 1 # Number of reserved tokens needed if model is not None: model.config.update({"unsloth_version" : __version__}) @@ -412,28 +413,69 @@ def patch_tokenizer(model, tokenizer): if bad_pad_token: # Find a better pad token added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()] - possible_pad_token = None - n_possible_pad_tokens = 0 - for added_token in added_tokens[::-1]: - if added_token.startswith(possible_reserved_tokens): - if possible_pad_token is None: possible_pad_token = added_token - n_possible_pad_tokens += 1 - # We must see at least 3 of the reserved tokens - if n_possible_pad_tokens >= 3: break + all_added_tokens = joiner.join(added_tokens[::-1]) + all_added_tokens += joiner + + final_pad_token = None + final_good_match = False + + for possible_reserved_token in possible_reserved_tokens: + possible_reserved_token = re.escape(possible_reserved_token) + found = re.finditer(f"{possible_reserved_token}", all_added_tokens) + first_match = None + good_match = False + for j, x in enumerate(found): + if j == 0: first_match = x + if j >= number_repetitions: + good_match = True + break + pass + pass + + if first_match is None: continue + + # If it ends with |> or > etc, then set it as a good pad token! 
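The reserved-token scan above is easier to follow on a toy example: the added tokens are reversed, glued together with an improbable sentinel, a reserved prefix only qualifies once it appears at least three times, and the full token is then recovered and checked for a sensible ending. A sketch with made-up token names (not taken from any real tokenizer):

```python
import re

JOINER = "\1\0=+=\0\1"   # sentinel very unlikely to occur inside a token
NEEDED = 3               # a reserved prefix must appear at least this often

added_tokens = ["<|eot_id|>", "<|reserved_0|>", "<|reserved_1|>", "<|reserved_2|>"]
haystack = JOINER.join(added_tokens[::-1]) + JOINER

matches = list(re.finditer(re.escape("<|reserved"), haystack))
if len(matches) >= NEEDED:
    start = matches[0].span(0)[0]
    end = haystack.find(JOINER, start)
    candidate = haystack[start:end]              # e.g. "<|reserved_2|>"
    is_good = candidate.endswith((">", "|>", "]", ")"))
    print(candidate, is_good)                    # <|reserved_2|> True
```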
+ start = first_match.span(0)[0] + possible_pad_token = first_match.group(0) + end = all_added_tokens.find(joiner, start) + first_match = all_added_tokens[start:end] + + if first_match is not None: + good_match = possible_pad_token.endswith((">", "|>", "]", ")")) + pass + possible_pad_token = first_match + + # Replace current pad token if another exact match is found + if not final_good_match and good_match: + final_good_match = True + final_pad_token = possible_pad_token + break + else: + final_good_match = False + final_pad_token = possible_pad_token pass pass - if n_possible_pad_tokens < 3: possible_pad_token = None + possible_pad_token = final_pad_token - if possible_pad_token is None: - # Try unk_token + # Try unk_token + if possible_pad_token is None and hasattr(tokenizer, "unk_token"): possible_pad_token = tokenizer.unk_token pass + # Check pad token's id must be less than vocab size + if possible_pad_token is not None: + check_pad_token = tokenizer(possible_pad_token, add_special_tokens = False).input_ids + if len(check_pad_token) != 1: + possible_pad_token = None + if check_pad_token[0] >= config.vocab_size: + possible_pad_token = None + pass + if possible_pad_token is None: # Failure to find a good replacement!! We shall manually add one! new_pad_token = "<|PAD_TOKEN|>" while new_pad_token in tokenizer.get_vocab(): - new_pad_token += "#" + new_pad_token = f"<{new_pad_token}>" pass possible_pad_token = new_pad_token pass @@ -447,11 +489,16 @@ def patch_tokenizer(model, tokenizer): tokenizer.add_special_tokens({"pad_token" : possible_pad_token}) tokenizer.pad_token = possible_pad_token if model is not None: - config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) + model.config.update({"pad_token_id" : tokenizer.pad_token_id}) + model.generation_config.update(pad_token_id = tokenizer.pad_token_id) else: if model is not None: if model.config.pad_token_id is None: - config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) + model.config.update({"pad_token_id" : tokenizer.pad_token_id}) + model.generation_config.update(pad_token_id = tokenizer.pad_token_id) + pass + pass + model.generation_config.update(max_length = model.config.max_position_embeddings) return model, tokenizer pass @@ -462,7 +509,6 @@ def patch_tokenizer(model, tokenizer): from peft import __version__ as peft_version if Version(peft_version) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer - import inspect, re try: source = inspect.getsource(LoraLayer.update_layer) text = "if weight is not None:\n" @@ -688,7 +734,6 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, from transformers.utils.quantization_config import BitsAndBytesConfig, QuantizationMethod from inspect import getsource from accelerate.utils.dataclasses import DistributedType -import re BitsAndBytesConfig__init__ = getsource(BitsAndBytesConfig.__init__) BitsAndBytesConfig__init__ = re.sub( r"if[\s]{1,}kwargs\:[\s]{1,}.+?\n", diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index cec743e59..e300e07e0 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1397,6 +1397,7 @@ def from_pretrained( padding_side = "right", token = token, trust_remote_code = trust_remote_code, + fix_tokenizer = fix_tokenizer, ) model, tokenizer = patch_tokenizer(model, tokenizer) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 8474c2c6b..c67f82c2c 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -454,13 +454,14 @@ 
class SentencePieceTokenTypes(IntEnum): pass -def load_correct_tokenizer( +def _load_correct_tokenizer( tokenizer_name, model_max_length = None, padding_side = "right", token = None, trust_remote_code = False, cache_dir = "huggingface_tokenizers_cache", + fix_tokenizer = True, ): if IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT: cache_dir = cache_dir @@ -501,7 +502,10 @@ def load_correct_tokenizer( cache_dir = cache_dir, ) - if tokenizer_name in IGNORED_TOKENIZER_NAMES: + if not fix_tokenizer or tokenizer_name in IGNORED_TOKENIZER_NAMES: + return fast_tokenizer + # Ignore Mistral ones - they're a bit weird to handle! + elif "mistral" in tokenizer_name.lower(): return fast_tokenizer elif slow_tokenizer is not None: if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"): @@ -522,6 +526,113 @@ def load_correct_tokenizer( pass +def load_correct_tokenizer( + tokenizer_name, + model_max_length = None, + padding_side = "right", + token = None, + trust_remote_code = False, + cache_dir = "huggingface_tokenizers_cache", + fix_tokenizer = True, +): + tokenizer = _load_correct_tokenizer( + tokenizer_name = tokenizer_name, + model_max_length = model_max_length, + padding_side = padding_side, + token = token, + trust_remote_code = trust_remote_code, + cache_dir = cache_dir, + fix_tokenizer = fix_tokenizer, + ) + + ### 1. Fixup tokenizer's chat_template + old_chat_template = getattr(tokenizer, "chat_template", None) + + # Ignore mistral type models since they don't have a add_generation_prompt + if "mistral" in str(getattr(tokenizer, "name_or_path", "")).lower(): + chat_template = old_chat_template + + # Also check Llama-2 old style models + elif old_chat_template is not None and \ + "[/INST]" in old_chat_template and "[INST]" in old_chat_template and \ + "bos_token" in old_chat_template and "eos_token" in old_chat_template: + + chat_template = old_chat_template + + else: + chat_template = fix_chat_template(tokenizer) + if old_chat_template is not None and chat_template is None: + raise RuntimeError( + "Unsloth: Fixing chat template failed - please file a report immediately!" + ) + pass + pass + + tokenizer.chat_template = chat_template + return tokenizer +pass + + +def _fix_chat_template(chat_template): + endfor = "{% endfor %}" + where = chat_template.find(endfor) + if where == -1: return chat_template + + after_endfor = chat_template[where + len(endfor):] + + if "{% if" not in after_endfor and "{% set " not in after_endfor and \ + after_endfor.startswith("{{") and after_endfor.endswith("}}") and \ + after_endfor.count("{{") == 1 and after_endfor.count("}}") == 1: + + after_endfor = "{% if add_generation_prompt %}" + after_endfor + "{% endif %}" + + chat_template = chat_template[:where + len(endfor)] + after_endfor + pass + return chat_template +pass + + +def fix_chat_template(tokenizer): + chat_template = getattr(tokenizer, "chat_template", None) + if chat_template is None: return None + + ### 1. Check if add_generation_prompt works + messages = [ + {"role": "user", "content": "Who are you?"}, + ] + no = tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False) + yes = tokenizer.apply_chat_template(messages, add_generation_prompt = True, tokenize = False) + + if no == yes: + # SAME?! That's not good! 
We check for add_generation_prompt + if "{% if add_generation_prompt %}" not in chat_template: + # Try fixing it by adding it + new_chat_template = _fix_chat_template(chat_template) + if "{% if add_generation_prompt %}" not in new_chat_template: + raise RuntimeError( + f"Unsloth: The tokenizer `{tokenizer.name_or_path}`\n"\ + "does not have a {% if add_generation_prompt %} for generation purposes.\n"\ + "Please file a bug report immediately - thanks!" + ) + else: + logger.warning_once( + "Unsloth: We successfully patched the tokenizer to add a {% if add_generation_prompt %} to the chat_template.\n"\ + "This is not a bug, but please notify the Unsloth maintainers - thanks!" + ) + chat_template = new_chat_template + pass + else: + raise RuntimeError( + f"Unsloth: The tokenizer `{tokenizer.name_or_path}`\n"\ + "has a {% if add_generation_prompt %} for generation purposes, but wasn't provided correctly.\n"\ + "Please file a bug report immediately - thanks!" + ) + pass + pass + return chat_template +pass + + def check_tokenizer( model, tokenizer, From 637ed8c6bd252f981e89e30e1085efc03a06a880 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 7 Aug 2024 01:11:06 -0700 Subject: [PATCH 108/147] Update _utils.py --- unsloth/models/_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d5be8d97e..db27eb8a8 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -490,15 +490,21 @@ def patch_tokenizer(model, tokenizer): tokenizer.pad_token = possible_pad_token if model is not None: model.config.update({"pad_token_id" : tokenizer.pad_token_id}) - model.generation_config.update(pad_token_id = tokenizer.pad_token_id) + if getattr(model, "generation_config") is not None: + model.generation_config.update(pad_token_id = tokenizer.pad_token_id) else: if model is not None: if model.config.pad_token_id is None: model.config.update({"pad_token_id" : tokenizer.pad_token_id}) - model.generation_config.update(pad_token_id = tokenizer.pad_token_id) + if getattr(model, "generation_config") is not None: + model.generation_config.update(pad_token_id = tokenizer.pad_token_id) pass pass - model.generation_config.update(max_length = model.config.max_position_embeddings) + + if model is not None: + if getattr(model, "generation_config") is not None: + model.generation_config.update(max_length = model.config.max_position_embeddings) + return model, tokenizer pass From cad1146ff7c60f4afc10b9ab243304befdad7a0f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 7 Aug 2024 10:47:11 -0700 Subject: [PATCH 109/147] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index db27eb8a8..3686717b2 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -467,7 +467,7 @@ def patch_tokenizer(model, tokenizer): check_pad_token = tokenizer(possible_pad_token, add_special_tokens = False).input_ids if len(check_pad_token) != 1: possible_pad_token = None - if check_pad_token[0] >= config.vocab_size: + if check_pad_token[0] >= model.config.vocab_size: possible_pad_token = None pass From e4c8ceacb3fca634f78e662873a01c37678fcb3e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 7 Aug 2024 10:48:39 -0700 Subject: [PATCH 110/147] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 
3686717b2..195fd5bb6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -467,7 +467,7 @@ def patch_tokenizer(model, tokenizer): check_pad_token = tokenizer(possible_pad_token, add_special_tokens = False).input_ids if len(check_pad_token) != 1: possible_pad_token = None - if check_pad_token[0] >= model.config.vocab_size: + if model is not None and check_pad_token[0] >= model.config.vocab_size: possible_pad_token = None pass From 3bc804a9f9d603287f0a42a7169ed8cd40420f6b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 10 Aug 2024 19:59:40 -0700 Subject: [PATCH 111/147] Torch 2.4, Xformers>0.0.27, TRL>0.9, Python 3.12 + bug fixes (#902) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py --- pyproject.toml | 133 ++++++++++++++++++++++++++++----------- unsloth/models/_utils.py | 60 +++++++++++++++--- unsloth/models/gemma2.py | 2 +- unsloth/models/llama.py | 31 ++++++--- unsloth/models/loader.py | 6 +- unsloth/save.py | 82 +++++++++++++++++++----- 6 files changed, 240 insertions(+), 74 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2cbe68f4a..b61908a69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,111 +43,154 @@ huggingface = [ "wheel>=0.42.0", "numpy", "accelerate>=0.26.1", - "trl>=0.7.9,<0.9.0", + "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3", "peft>=0.7.1,!=0.11.0", "protobuf<4.0.0", "huggingface_hub", "hf_transfer", ] cu118only = [ - "xformers==0.0.22.post7", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu121only = [ - "xformers==0.0.22.post7", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu118onlytorch211 = [ - "xformers==0.0.23", + "xformers @ 
https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu121onlytorch211 = [ - "xformers==0.0.23", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu118onlytorch212 = [ - "xformers==0.0.23.post1", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu121onlytorch212 = [ - "xformers==0.0.23.post1", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu118onlytorch220 = [ - "xformers==0.0.24", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu121onlytorch220 = [ - "xformers==0.0.24", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu118onlytorch230 = [ - "xformers==0.0.26.post1", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'", ] cu121onlytorch230 = [ - "xformers==0.0.26.post1", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ 
https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'", +] +cu118onlytorch240 = [ + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'", +] +cu121onlytorch240 = [ + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'", ] - cu118 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118only]", ] cu121 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121only]", ] cu118-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch211]", ] cu121-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch211]", ] cu118-torch212 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch212]", ] cu121-torch212 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch212]", ] cu118-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch220]", ] cu121-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch220]", ] cu118-torch230 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch230]", ] cu121-torch230 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch230]", ] +cu118-torch240 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.43.3", + "unsloth[cu118onlytorch240]", +] +cu121-torch240 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.43.3", + "unsloth[cu121onlytorch240]", +] kaggle = [ "unsloth[huggingface]", ] kaggle-new = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", ] conda = [ "unsloth[huggingface]", ] colab-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch211]", ] colab-ampere-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch211]", "packaging", "ninja", @@ -155,12 +198,12 @@ colab-ampere-torch211 = [ ] colab-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch220]", ] 
colab-ampere-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch220]", "packaging", "ninja", @@ -182,10 +225,10 @@ colab-new = [ ] colab-no-deps = [ "accelerate>=0.26.1", - "trl>=0.7.9", + "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3", "peft>=0.7.1", "xformers<0.0.27", - "bitsandbytes", + "bitsandbytes>=0.43.3", "protobuf<4.0.0", ] colab = [ @@ -199,7 +242,7 @@ colab-ampere = [ ] cu118-ampere = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118only]", "packaging", "ninja", @@ -207,7 +250,7 @@ cu118-ampere = [ ] cu121-ampere = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121only]", "packaging", "ninja", @@ -215,7 +258,7 @@ cu121-ampere = [ ] cu118-ampere-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch211]", "packaging", "ninja", @@ -223,7 +266,7 @@ cu118-ampere-torch211 = [ ] cu121-ampere-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch211]", "packaging", "ninja", @@ -231,7 +274,7 @@ cu121-ampere-torch211 = [ ] cu118-ampere-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch220]", "packaging", "ninja", @@ -239,7 +282,7 @@ cu118-ampere-torch220 = [ ] cu121-ampere-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch220]", "packaging", "ninja", @@ -247,7 +290,7 @@ cu121-ampere-torch220 = [ ] cu118-ampere-torch230 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch230]", "packaging", "ninja", @@ -255,12 +298,28 @@ cu118-ampere-torch230 = [ ] cu121-ampere-torch230 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch230]", "packaging", "ninja", "flash-attn>=2.6.3", ] +cu118-ampere-torch240 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.43.3", + "unsloth[cu118onlytorch240]", + "packaging", + "ninja", + "flash-attn>=2.6.3", +] +cu121-ampere-torch240 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.43.3", + "unsloth[cu121onlytorch240]", + "packaging", + "ninja", + "flash-attn>=2.6.3", +] [project.urls] homepage = "http://www.unsloth.ai" diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 195fd5bb6..0c0057496 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -53,7 +53,9 @@ # Disable some warnings which can get annoying warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "huggingface_hub") +warnings.filterwarnings(action = "ignore", category = UserWarning, module = "trl") warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "huggingface_hub") +warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "xformers") warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "subprocess") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "transformers") warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "accelerate") @@ -133,6 +135,28 @@ def patch_mistral_nemo_config(config): pass # ============================================= +# ============================================= +# Fix KeyError: 'Cache only has 0 layers, attempted to access layer with index 0' +import transformers.cache_utils +if 
hasattr(transformers.cache_utils, "DynamicCache") and \ + transformers.cache_utils.DynamicCache.__getitem__.__name__ != "__cache_utils_getitem__": + + source = inspect.getsource(transformers.cache_utils.DynamicCache.__getitem__) + start = source.find("def") + spaces = start*" " + source = source.split("\n") + source = "\n".join(x[start:] for x in source) + where = source.find("raise KeyError") + source = source[:where] + \ + f"if len(self) == 0:\n{spaces}{spaces}"\ + " raise RuntimeError('Unsloth: You must call `FastLanguageModel.for_inference(model)` before doing inference for Unsloth models.')\n" + \ + f"{spaces}{spaces}else:\n{spaces}{spaces}{spaces}" + source[where:] + source = source.replace("__getitem__", "__cache_utils_getitem__", 1) + exec(source) + transformers.cache_utils.DynamicCache.__getitem__ = __cache_utils_getitem__ +pass +# ============================================= + # ============================================= # Get Flash Attention v2 if Ampere (RTX 30xx, A100) import bitsandbytes as bnb @@ -192,7 +216,7 @@ def patch_mistral_nemo_config(config): # Get Xformers from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues -if Version(xformers_version) >= Version("0.0.27"): +if False: #Version(xformers_version) >= Version("0.0.27"): raise ImportError( "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ "then press Disconnect Runtime and then Restart it.\n"\ @@ -200,10 +224,10 @@ def patch_mistral_nemo_config(config): "%%capture\n" "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ + '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\ '\n'\ f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ - 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' + 'Please downgrade xformers via `pip install --force-reinstall "xformers<=0.0.27"' ) pass @@ -217,10 +241,10 @@ def patch_mistral_nemo_config(config): f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ f"Please install xformers < 0.0.26 for torch = {torch_version}." ) -elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): +elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) > Version("0.0.27"): raise ImportError( f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers < 0.0.27 for torch = {torch_version}." + f"Please install xformers <= 0.0.27 for torch = {torch_version}." ) pass @@ -241,7 +265,8 @@ def patch_mistral_nemo_config(config): # Check TRL version from trl import __version__ as trl_version -if Version(trl_version) >= Version("0.9.0"): +# Unsloth now supports all TRL versions! 
+if False:#Version(trl_version) >= Version("0.9.0"): raise ImportError( "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ "then press Disconnect Runtime and then Restart it.\n"\ @@ -249,13 +274,32 @@ def patch_mistral_nemo_config(config): "%%capture\n" "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ + '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\ '\n'\ f"Otherwise in local machines, your TRL version of {trl_version} is too new.\n"\ - 'Please downgrade TRL via `pip install --force-reinstall "trl<0.9.0"' + 'Please downgrade TRL via `pip install --force-reinstall trl' ) pass +# ============================================= +# Fix new Xformers versions TypeError: Multiple dispatch failed for 'torch._ops.aten.to.dtype_layout' +if Version(xformers_version) >= Version("0.0.27"): + import accelerate.utils.operations + if hasattr(accelerate.utils.operations, "send_to_device") and \ + accelerate.utils.operations.send_to_device.__name__ != "_fixed_send_to_device": + from accelerate.utils.operations import * + send_to_device = inspect.getsource(accelerate.utils.operations.send_to_device) + send_to_device = re.sub( + r"([ ]{4,})return tensor\.to\(device\)", + r"\1try: return tensor.to(device)\n\1except: return tensor", + send_to_device, + ).replace("def send_to_device", "def _fixed_send_to_device") + exec(send_to_device) + accelerate.utils.operations.send_to_device = _fixed_send_to_device + pass +pass +# ============================================= + # ============================================= # Torch compile settings diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 1cbaf5b16..ea9f53e7d 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -156,6 +156,7 @@ def Gemma2Attention_fast_forward( ) A = A.reshape(bsz, q_len, n_heads*head_dim) else: + mask = causal_mask if attention_mask is None else attention_mask A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) pass A = self.apply_o(self, A) @@ -413,7 +414,6 @@ def Gemma2Model_fast_forward_inference( SWA = attention_mask GA = attention_mask pass - next_decoder_cache = [] for idx, decoder_layer in enumerate(self.model.layers): diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index e300e07e0..2a07da6ce 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -419,7 +419,7 @@ def LlamaAttention_fast_forward( def LlamaDecoderLayer_fast_forward( self, hidden_states: torch.Tensor, - causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None, + causal_mask = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, @@ -505,7 +505,7 @@ def LlamaModel_fast_forward( return_dict: Optional[bool] = None, *args, **kwargs, ) -> Union[Tuple, BaseModelOutputWithPast]: - + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions assert(output_attentions is False) output_hidden_states = ( @@ -682,12 +682,27 @@ def LlamaModel_fast_forward( # Gemma2 has alternating SWA and global attn - if IS_GEMMA2 and not hasattr(self, "SWA_mask"): - if HAS_FLASH_ATTENTION_SOFTCAPPING: + if IS_GEMMA2: + if HAS_FLASH_ATTENTION_SOFTCAPPING and 
attention_mask is None: self.SWA_mask = True self.GA_mask = False - else: - n = self.config.max_position_embeddings + elif attention_mask is not None: + self.SWA_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window = self.config.sliding_window, + ) + self.GA_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window = None, + ) + elif not hasattr(self, "SWA_mask"): + n = self.max_seq_length # self.config.max_position_embeddings # masked_fill is making stuff slower! # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0) # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) @@ -870,7 +885,7 @@ def _CausalLM_fast_forward( ) else: causal_mask = xformers.attn_bias.LowerTriangularMask() - + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -879,7 +894,6 @@ def _CausalLM_fast_forward( # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) self.model._has_no_labels = labels is None - outputs = self.model( input_ids=input_ids, causal_mask=causal_mask, @@ -893,7 +907,6 @@ def _CausalLM_fast_forward( return_dict=return_dict, ) pass - hidden_states = outputs[0] bsz, q_len, hd = hidden_states.shape lm_head = self.lm_head.weight diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 47152d676..cce22aebf 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -93,7 +93,7 @@ def _get_new_mapper(): pass -def _get_model_name(model_name, load_in_4bit = True): +def get_model_name(model_name, load_in_4bit = True): new_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, @@ -145,7 +145,7 @@ def from_pretrained( token = os.environ["HUGGINGFACE_TOKEN"] old_model_name = model_name - model_name = _get_model_name(model_name, load_in_4bit) + model_name = get_model_name(model_name, load_in_4bit) # First check if it's a normal model via AutoConfig from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled @@ -192,7 +192,7 @@ def from_pretrained( # Get base model for PEFT: if is_peft: # Check base model again for PEFT - model_name = _get_model_name(peft_config.base_model_name_or_path, load_in_4bit) + model_name = get_model_name(peft_config.base_model_name_or_path, load_in_4bit) model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) pass diff --git a/unsloth/save.py b/unsloth/save.py index a5904efc1..f45d8062a 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -28,12 +28,14 @@ import re from transformers.models.llama.modeling_llama import logger from .tokenizer_utils import fix_sentencepiece_gguf +from huggingface_hub import HfApi __all__ = [ "print_quantization_methods", "unsloth_save_model", "save_to_gguf", "patch_saving_functions", + "create_huggingface_repo", ] # Check environments @@ -207,8 +209,9 @@ def unsloth_save_model( ): if token is None and "HF_TOKEN" in os.environ: token = os.environ["HF_TOKEN"] - - if token is None and "HUGGINGFACE_TOKEN" in os.environ: + elif token is None and "hf_token" in os.environ: + token = os.environ["hf_token"] + elif token is None and "HUGGINGFACE_TOKEN" in os.environ: token = 
os.environ["HUGGINGFACE_TOKEN"] if commit_message is None: commit_message = "" @@ -555,7 +558,8 @@ def unsloth_save_model( logger.warning_once(f"We will save to Disk and not RAM now.") filename = os.path.join(temporary_location, f"{name}.pt") torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,) - state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True) + # weights_only = True weirdly fails? + state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True, weights_only = False) pass for item in LLAMA_LAYERNORMS: try: @@ -675,7 +679,6 @@ def unsloth_save_model( # Now manually go through each file and upload them manually! filenames = os.listdir(new_save_directory) - from huggingface_hub import HfApi hf_api = HfApi(token = save_pretrained_settings["token"]) print("Unsloth: Uploading all files... Please wait...") @@ -1312,6 +1315,49 @@ def _determine_username(save_directory, old_username, token): pass +def create_huggingface_repo( + model, + save_directory, + token = None, + private = False, +): + if token is None and "HF_TOKEN" in os.environ: + token = os.environ["HF_TOKEN"] + elif token is None and "hf_token" in os.environ: + token = os.environ["hf_token"] + elif token is None and "HUGGINGFACE_TOKEN" in os.environ: + token = os.environ["HUGGINGFACE_TOKEN"] + pass + save_directory, username = _determine_username(save_directory, "", token) + + from huggingface_hub import create_repo + try: + create_repo( + repo_id = save_directory, + token = token, + repo_type = "model", + exist_ok = False, + private = private, + ) + + # Create model card + from huggingface_hub import ModelCard + content = MODEL_CARD.format( + username = username, + base_model = model.config._name_or_path, + model_type = model.config.model_type, + method = "", + extra = "unsloth", + ) + card = ModelCard(content) + card.push_to_hub(save_directory, token = token) + except: + pass + hf_api = HfApi(token = token) + return save_directory, hf_api +pass + + def upload_to_huggingface( model, save_directory, @@ -1321,6 +1367,7 @@ def upload_to_huggingface( file_location = None, old_username = None, private = None, + create_config = True, ): save_directory, username = _determine_username(save_directory, old_username, token) @@ -1350,7 +1397,6 @@ def upload_to_huggingface( if file_location is not None: # Now upload file - from huggingface_hub import HfApi hf_api = HfApi(token = token) if "/" in file_location: @@ -1372,6 +1418,8 @@ def upload_to_huggingface( repo_type = "model", commit_message = "(Trained with Unsloth)", ) + pass + pass hf_api.upload_file( path_or_fileobj = file_location, @@ -1382,18 +1430,20 @@ def upload_to_huggingface( ) # We also upload a config.json file - import json - with open("_temporary_unsloth_config.json", "w") as file: - json.dump({"model_type" : model.config.model_type}, file, indent = 4) + if create_config: + import json + with open("_temporary_unsloth_config.json", "w") as file: + json.dump({"model_type" : model.config.model_type}, file, indent = 4) + pass + hf_api.upload_file( + path_or_fileobj = "_temporary_unsloth_config.json", + path_in_repo = "config.json", + repo_id = save_directory, + repo_type = "model", + commit_message = "(Trained with Unsloth)", + ) + os.remove("_temporary_unsloth_config.json") pass - hf_api.upload_file( - path_or_fileobj = "_temporary_unsloth_config.json", - path_in_repo = "config.json", - repo_id = save_directory, - repo_type = "model", - commit_message = "(Trained with Unsloth)", - ) - 
os.remove("_temporary_unsloth_config.json") pass return username pass From 3781a03903c6a24c929737f49a1f73b25a517ac6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 11 Aug 2024 18:26:20 -0700 Subject: [PATCH 112/147] Fix DPO stats (#906) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py --- unsloth/kernels/cross_entropy_loss.py | 1 + unsloth/models/dpo.py | 16 +++++++++++++--- unsloth/models/llama.py | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/unsloth/kernels/cross_entropy_loss.py b/unsloth/kernels/cross_entropy_loss.py index 6074a5153..b8473e60c 100644 --- a/unsloth/kernels/cross_entropy_loss.py +++ b/unsloth/kernels/cross_entropy_loss.py @@ -303,6 +303,7 @@ def backward(ctx, dlosses): pass +@torch._disable_dynamo def fast_cross_entropy_loss(logits, labels, logit_softcapping = 0): """ Arguments: diff --git a/unsloth/models/dpo.py b/unsloth/models/dpo.py index b7c7305bb..e7074350c 100644 --- a/unsloth/models/dpo.py +++ b/unsloth/models/dpo.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+__all__ = [ + "PatchDPOTrainer", +] + try: from transformers.utils.notebook import ( IntervalStrategy, @@ -22,6 +26,12 @@ except: HAS_NOTEBOOK = False pass +import torch +from ._utils import torch_compile_options +import inspect +import torch.nn as nn +from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union + DPOTrainer_metrics = [ "rewards/chosen", @@ -37,11 +47,11 @@ def NotebookProgressCallback_on_train_begin(self, args, state, control, **kwargs): - self.first_column = "Epoch" if args.evaluation_strategy == IntervalStrategy.EPOCH else "Step" + self.first_column = "Epoch" if args.eval_strategy == IntervalStrategy.EPOCH else "Step" self.training_loss = 0 self.last_log = 0 column_names = [self.first_column] + ["Training Loss"] - if args.evaluation_strategy != IntervalStrategy.NO: + if args.eval_strategy != IntervalStrategy.NO: column_names.append("Validation Loss") column_names += [x.replace("/", " / ") for x in DPOTrainer_metrics] self.training_tracker = NotebookTrainingTracker(state.max_steps, column_names) @@ -50,7 +60,7 @@ def NotebookProgressCallback_on_train_begin(self, args, state, control, **kwargs def NotebookProgressCallback_on_log(self, args, state, control, logs=None, **kwargs): # Only for when there is no evaluation - if args.evaluation_strategy == IntervalStrategy.NO and "loss" in logs: + if args.eval_strategy == IntervalStrategy.NO and "loss" in logs: values = {"Training Loss": logs["loss"]} for metric in DPOTrainer_metrics: values[metric.replace("/", " / ")] = logs[metric] diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2a07da6ce..6f1bb62c1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -961,6 +961,7 @@ def _CausalLM_fast_forward( pass +@torch._disable_dynamo def PeftModelForCausalLM_fast_forward( self, input_ids=None, From a64b8f648ad067f9745253161e73a0367bf0ca5a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 13 Aug 2024 17:54:02 -0700 Subject: [PATCH 113/147] Fix Chat Templates (#916) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update 
dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Providing more flexibility for users to customize their llama when using LoRA (#910) * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update chat_templates.py * return model * Update tokenizer_utils.py * Update chat_templates.py * Update tokenizer_utils.py --------- Co-authored-by: Po-Lung Wang --- unsloth/chat_templates.py | 222 +++++++++++++++++++++++++++++++++++-- unsloth/models/llama.py | 17 ++- unsloth/tokenizer_utils.py | 28 ++++- 3 files changed, 256 insertions(+), 11 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 5bd66bae0..07e79b180 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -508,6 +508,200 @@ CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False, phi3_ollama,) pass +# =========================================== Llama-3.1 +""" +No trimming in Llama 3.1 Instruct! +Also an extra newline for Cutting Knowledge Date +See https://colab.research.google.com/drive/1Xpqq5xpIgO-B00MQ-UccYMwN2J8QFgBM?usp=sharing + +Also should be + +import datetime +tokenizer.apply_chat_template( + messages, + add_generation_prompt = True, + tokenize = False, + date_string = datetime.today().strftime("%d %B %Y")), +) +""" + +llama31_template = \ +"""{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 July 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content'] %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content'] %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} +""" +pass + +# Ollama from https://ollama.com/library/llama3.1 (needs updating!) +llama31_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .Messages }} +{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|> +{{- if .System }} + +{{ .System }} +{{- end }} +{{- if .Tools }} + +You are a helpful assistant with tool calling capabilities. When you receive a tool call response, use the output to format an answer to the orginal use question. 
+{{- end }} +{{- end }}<|eot_id|> +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 }} +{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|> +{{- if and $.Tools $last }} + +Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. + +Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables. + +{{ $.Tools }} +{{- end }} + +{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|> + +{{ end }} +{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|> +{{- if .ToolCalls }} + +{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }} +{{- else }} + +{{ .Content }}{{ if not $last }}<|eot_id|>{{ end }} +{{- end }} +{{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|> + +{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|> + +{{ end }} +{{- end }} +{{- end }} +{{- else }} +{{- if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> + +{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> + +{{ end }}{{ .Response }}{{ if .Response }}<|eot_id|>{{ end }}""" +PARAMETER stop "<|start_header_id|>" +PARAMETER stop "<|end_header_id|>" +PARAMETER stop "<|eot_id|>" +PARAMETER stop "<|eom_id|>" +''' + +llama31_template_eos_token = "eos_token" +CHAT_TEMPLATES["llama-3.1"] = (llama31_template, llama31_template_eos_token, False, llama31_ollama,) +CHAT_TEMPLATES["llama-31"] = (llama31_template, llama31_template_eos_token, False, llama31_ollama,) +pass + def get_chat_template( tokenizer, @@ -680,21 +874,33 @@ def get_chat_template( ) pass - # For ShareGPT role -> from and content -> value - chat_template = chat_template\ - .replace("'role'", "'" + mapping["role"] + "'")\ - .replace("'content'", "'" + mapping["content"] + "'")\ - .replace("'user'", "'" + mapping["user"] + "'")\ - .replace("'assistant'", "'" + mapping["assistant"] + "'") - # Careful on Gemma # bos_token is a must or else losses become too high if IS_GEMMA and not chat_template.startswith("{{ bos_token }}"): chat_template = "{{ bos_token }}" + chat_template pass + # For ShareGPT role -> from and content -> value + new_chat_template = chat_template\ + .replace("'role'", "'" + mapping["role"] + "'")\ + .replace("'content'", "'" + mapping["content"] + "'")\ + .replace("'user'", "'" + mapping["user"] + "'")\ + .replace("'assistant'", "'" + mapping["assistant"] + "'") + _, tokenizer = patch_tokenizer(model = None, tokenizer = tokenizer) - tokenizer.padding_side = old_padding_side + tokenizer.padding_side = old_padding_side + + # If not normal HF, we add a check to make old templates work + if mapping != {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}: + chat_template = \ + "{% if 'role' in messages[0] %}" + \ + chat_template + \ + "{% else %}" + \ + new_chat_template + \ + "{% endif %}" + else: + chat_template = new_chat_template + pass tokenizer.chat_template = chat_template # Also fix up other tokens diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6f1bb62c1..6a111c934 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1873,8 +1873,17 @@ def get_peft_model( else: modules_to_save.append("embed_tokens") else: - assert(module 
in accepted_modules) - final_modules.append(module) + try: + assert(module in accepted_modules) + final_modules.append(module) + except AssertionError as e: + final_modules.append(module) + print( + "Unsloth: You added custom modules, but Unsloth hasn't optimized for this.\n"\ + "Beware - your finetuning might be noticeably slower!" + ) + pass + pass pass # Check if we added new tokens! @@ -2253,6 +2262,8 @@ def for_inference(model): if hasattr(internal_model, "_saved_temp_tokenizer"): internal_model._saved_temp_tokenizer.padding_side = "left" pass + + return model pass @@ -2291,6 +2302,8 @@ def for_training(model, use_gradient_checkpointing = True): if hasattr(internal_model, "_saved_temp_tokenizer"): internal_model._saved_temp_tokenizer.padding_side = "right" pass + + return model pass pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index c67f82c2c..9c0bc1c51 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -597,8 +597,34 @@ def fix_chat_template(tokenizer): if chat_template is None: return None ### 1. Check if add_generation_prompt works + # Check for ShareGPT style first + is_sharegpt = None + try: + messages = [ + {"role": "user", "content": "Who are you?"}, + ] + tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False) + is_sharegpt = False + except: + try: + messages = [ + {"from": "human", "value": "Who are you?"}, + ] + tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False) + is_sharegpt = True + except: + is_sharegpt = None + pass + pass + + # Not ShareGPT or HF style - just return + if is_sharegpt is None: return chat_template + + # Tokenize messages = [ - {"role": "user", "content": "Who are you?"}, + {"role": "user", "content": "Who are you?"} \ + if not is_sharegpt else \ + {"from": "human", "value": "Who are you?"} ] no = tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False) yes = tokenizer.apply_chat_template(messages, add_generation_prompt = True, tokenize = False) From a4ab920de9282602d587a40df828674bfa9d650e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 14 Aug 2024 00:58:02 -0700 Subject: [PATCH 114/147] Fix chat templates (#917) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update 
dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Providing more flexibility for users to customize their llama when using LoRA (#910) * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update chat_templates.py * return model * Update tokenizer_utils.py * Update chat_templates.py * Update tokenizer_utils.py * Train on completions --------- Co-authored-by: Po-Lung Wang --- unsloth/chat_templates.py | 165 +++++++++++++++++++++++++++++++------- 1 file changed, 138 insertions(+), 27 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 07e79b180..7070524e0 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -1458,9 +1458,10 @@ def construct_chat_template( \ ollama_eos = '\n'.join(f'PARAMETER stop "{eos}"' for eos in ollama_eos) # Ollama modelfile + part = '"""' modelfile = 'FROM {__FILE_LOCATION__}\n\n'\ - 'TEMPLATE """' + system_modelfile + input_modelfile + output_modelfile + \ - '"""\n\n' + ollama_eos + 'TEMPLATE ' + part + system_modelfile + input_modelfile + output_modelfile + \ + part + '\n\n' + ollama_eos # HF Jinja Chat template def process(part, which, content = "message['content']"): @@ -1659,6 +1660,70 @@ def formatting_prompts_func(examples): pass +# From https://www.geeksforgeeks.org/longest-common-substring-array-strings/ +# Longest Common Substring in an Array of Strings +def _longest_common_substring(arr): + n = len(arr) + s = arr[0] + l = len(s) + res = "" + for i in range(l): + for j in range(i + 1, l + 1): + stem = s[i:j] + k = 1 + for k in range(1, n): + if stem not in arr[k]: + break + if (k + 1 == n and len(res) < len(stem)): + res = stem + return res +pass + + +def _find_common_token_ids(component, tokenizer): + """ + \n### User:\n\n + \n\n### User:\n\n + etc + we need to find the middle most repeatted part. + Tokenizers can tokenize newlines or spaces as 1 token! 
+ """ + right_text = "" + if component.endswith (" "): right_text = " " + elif component.endswith("\n"): right_text = "\n" + left_text = "" + if component.startswith (" "): left_text = " " + elif component.startswith("\n"): left_text = "\n" + stripped = component.strip() + + # Add current pieces and also newlines + all_input_ids = [] + for left in range(3): + for right in range(3): + x = left*left_text + stripped + right*right_text + x = tokenizer(x, add_special_tokens = False).input_ids + all_input_ids.append(x) + + x = left*"\n" + stripped + right*"\n" + x = tokenizer(x, add_special_tokens = False).input_ids + all_input_ids.append(x) + pass + pass + substring = _longest_common_substring([str(x + [0]) for x in all_input_ids]) + substring = substring.split(", ")[:-1] + substring = [int(x) for x in substring] + + # Also get rest of tokenized string + original = tokenizer(component, add_special_tokens = False).input_ids + # Get optional left and right + for j in range(len(original)): + if original[j : j + len(substring)] == substring: break + optional_left = original[:j] + optional_right = original[j+len(substring):] + return substring, optional_left, optional_right +pass + + def train_on_responses_only( trainer, instruction_part = None, @@ -1685,41 +1750,87 @@ def train_on_responses_only( response_part = tokenizer._unsloth_output_part pass - instruction_ids = tokenizer(instruction_part, add_special_tokens = False).input_ids - response_ids = tokenizer(response_part, add_special_tokens = False).input_ids + # Get most common tokens since tokenizers can tokenize stuff differently! + Q_must, Q_left, Q_right = _find_common_token_ids(instruction_part, tokenizer) + A_must, A_left, A_right = _find_common_token_ids(response_part, tokenizer) - instruction_length = len(instruction_ids) - response_length = len(response_ids) - max_length = max(instruction_length, response_length) + # Store some temporary stuff + A_first = A_must[0] + len_A_must = len(A_must) + A_left_reversed = A_left[::-1] + A_right_forward = A_right + + Q_first = Q_must[0] + len_Q_must = len(Q_must) + Q_left_reversed = Q_left[::-1] + Q_right_forward = Q_right def _train_on_responses_only(examples): input_ids_ = examples["input_ids"] all_labels = [] for input_ids in input_ids_: - - labels = [-100] * len(input_ids) - m = len(input_ids) - max_length - first_response = response_ids[0] - first_instruction = instruction_ids[0] + n = len(input_ids) + labels = [-100] * n + n_minus_1 = n - 1 j = 0 - while j < m: - if input_ids[j] == first_response: - if input_ids[j : j+response_length] == response_ids: - j = j + response_length - start = j - while j < m: - if input_ids[j] == first_instruction and input_ids[j : j+instruction_length] == instruction_ids: - j = j + instruction_length - labels[start : j] = input_ids[start : j] - break - elif j == (m-1): - j = m - labels[start:] = input_ids[start:] - break + while j < n: + # Find + if (input_ids[j] == A_first) and \ + (input_ids[j : (k := j + len_A_must)] == A_must): + + # Now backtrack to get previous optional tokens + for optional_left in A_left_reversed: + if j < 1: break + if optional_left == input_ids[j-1]: j -= 1 + else: break + pass + # And forwards look as well + for optional_right in A_right_forward: + if k >= n_minus_1: break + if optional_right == input_ids[k+1]: k += 1 + else: break + pass + # assistant_j = j + assistant_k = k + + j = assistant_k + # Given , now find next user + while j < n: + # Find + # Also accept last final item if assistant is the last turn + if (j == n_minus_1) or \ 
+ ((input_ids[j] == Q_first) and \ + (input_ids[j : (k := j + len_Q_must)] == Q_must)): + + # Now backtrack to get previous optional tokens + for optional_left in Q_left_reversed: + if j < 1: break + if optional_left == input_ids[j-1]: j -= 1 + else: break + pass + # And forwards look as well + for optional_right in Q_right_forward: + if k >= n_minus_1: break + if optional_right == input_ids[k+1]: k += 1 + else: break + pass + user_j = j + # Account for last item + if user_j != n_minus_1: + # user_k = k + # j = user_k + j = k + else: + user_j = n + k = n pass - j += 1 + # Now copy input_ids to labels + labels[assistant_k : user_j] = input_ids[assistant_k : user_j] + # print(assistant_j, assistant_k, user_j, user_k) + break pass + j += 1 pass pass j += 1 From 5393e9e00a1e2019144698d90035ae21e03325c7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 15 Aug 2024 00:31:30 -0700 Subject: [PATCH 115/147] Bug Fixes (#920) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Providing more flexibility for users to customize their llama when using LoRA (#910) * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update chat_templates.py * return model * Update tokenizer_utils.py * Update chat_templates.py * Update tokenizer_utils.py * Train on completions * load_in_4bit=False broken --------- Co-authored-by: Po-Lung Wang --- unsloth/models/llama.py | 6 +++++- unsloth/models/loader.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6a111c934..6139115f6 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1387,11 +1387,15 @@ def from_pretrained( # RoPE Scaling's max_position_embeddings must be updated max_position_embeddings = max(max_seq_length, model_max_seq_length) kwargs.pop("attn_implementation", None); # No need since we auto call it + + # Cannot be None, since 
HF now checks for the config + if load_in_4bit: kwargs["quantization_config"] = bnb_config + model = AutoModelForCausalLM.from_pretrained( model_name, device_map = device_map, torch_dtype = dtype, - quantization_config = bnb_config, + # quantization_config = bnb_config, token = token, max_position_embeddings = max_position_embeddings, trust_remote_code = trust_remote_code, diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index cce22aebf..ad1098eda 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -42,10 +42,11 @@ def __get_model_name( INT_TO_FLOAT_MAPPER = None, FLOAT_TO_INT_MAPPER = None, ): - model_name = str(model_name) - if not SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: - model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] + lower_model_name = model_name.lower() + + if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER: + model_name = INT_TO_FLOAT_MAPPER[lower_model_name] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ f"4bit loading.\nThe minimum required version is 4.37.\n"\ @@ -55,16 +56,18 @@ def __get_model_name( ) return model_name - elif not load_in_4bit and model_name.lower() in INT_TO_FLOAT_MAPPER: - new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] + elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER: + new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." # ) return new_model_name - - elif load_in_4bit and SUPPORTS_FOURBIT and model_name.lower() in FLOAT_TO_INT_MAPPER: - new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()] + elif not load_in_4bit and lower_model_name in FLOAT_TO_INT_MAPPER: + new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] + return new_model_name + elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER: + new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ # f"We shall load `{new_model_name}` for 4x faster loading." 
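Not part of either patch, but a minimal sketch of the conditional `quantization_config` pattern that the llama.py hunk in [PATCH 115/147] switches to (its comment notes that HF now validates any config it receives). The NF4 settings and `device_map` below are assumptions for illustration; the real call also forwards `max_position_embeddings`, `token`, and other arguments.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_base_model(model_name, load_in_4bit = True, dtype = torch.bfloat16):
    kwargs = {}
    if load_in_4bit:
        # Assumed NF4 settings; Unsloth builds its own bnb_config elsewhere.
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit              = True,
            bnb_4bit_quant_type       = "nf4",
            bnb_4bit_compute_dtype    = dtype,
            bnb_4bit_use_double_quant = True,
        )
    # When load_in_4bit is False, no quantization_config is passed at all, which is the
    # point of moving from `quantization_config = bnb_config` to a conditional kwarg.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype = dtype,
        device_map  = "auto",  # placeholder device placement
        **kwargs,
    )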
From 53cd1e778133efa9721731834fb06589dc95b719 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 15 Aug 2024 01:15:35 -0700 Subject: [PATCH 116/147] Fix mapping (#921) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Providing more flexibility for users to customize their llama when using LoRA (#910) * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update chat_templates.py * return model * Update tokenizer_utils.py * Update chat_templates.py * Update tokenizer_utils.py * Train on completions * load_in_4bit=False broken * Update llama.py * MAP_TO_UNSLOTH_16bit * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update mapper.py * works! --------- Co-authored-by: Po-Lung Wang --- unsloth/models/llama.py | 2 +- unsloth/models/loader.py | 39 +++++++++++++++++++++++++-------------- unsloth/models/mapper.py | 13 +++++++++++-- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6139115f6..6a23335c8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1390,7 +1390,7 @@ def from_pretrained( # Cannot be None, since HF now checks for the config if load_in_4bit: kwargs["quantization_config"] = bnb_config - + model = AutoModelForCausalLM.from_pretrained( model_name, device_map = device_map, diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ad1098eda..e260017fb 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -19,7 +19,7 @@ from transformers import AutoConfig from transformers import __version__ as transformers_version from peft import PeftConfig, PeftModel -from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER +from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit import os # https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading! 
@@ -39,13 +39,15 @@ def __get_model_name( model_name, load_in_4bit = True, - INT_TO_FLOAT_MAPPER = None, - FLOAT_TO_INT_MAPPER = None, + INT_TO_FLOAT_MAPPER = None, + FLOAT_TO_INT_MAPPER = None, + MAP_TO_UNSLOTH_16bit = None, ): model_name = str(model_name) lower_model_name = model_name.lower() if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER: + model_name = INT_TO_FLOAT_MAPPER[lower_model_name] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ @@ -57,16 +59,21 @@ def __get_model_name( return model_name elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER: + new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." # ) return new_model_name - elif not load_in_4bit and lower_model_name in FLOAT_TO_INT_MAPPER: - new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] + + elif not load_in_4bit and lower_model_name in MAP_TO_UNSLOTH_16bit: + + new_model_name = MAP_TO_UNSLOTH_16bit[lower_model_name] return new_model_name + elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER: + new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ @@ -86,12 +93,14 @@ def _get_new_mapper(): with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] new_mapper = new_mapper\ - .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ - .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER")\ + .replace("MAP_TO_UNSLOTH_16bit", "NEW_MAP_TO_UNSLOTH_16bit") + exec(new_mapper, globals()) - return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit except: - return {}, {} + return {}, {}, {} pass pass @@ -100,17 +109,19 @@ def get_model_name(model_name, load_in_4bit = True): new_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, - INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, - FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + MAP_TO_UNSLOTH_16bit = MAP_TO_UNSLOTH_16bit, ) if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): # Try checking if a new Unsloth version allows it! 
- NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() + NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit = _get_new_mapper() upgraded_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, - INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, - FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + MAP_TO_UNSLOTH_16bit = NEW_MAP_TO_UNSLOTH_16bit, ) if upgraded_model_name is not None: raise NotImplementedError( diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 57ba67658..b8259a073 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -251,8 +251,9 @@ ), } -INT_TO_FLOAT_MAPPER = {} -FLOAT_TO_INT_MAPPER = {} +INT_TO_FLOAT_MAPPER = {} +FLOAT_TO_INT_MAPPER = {} +MAP_TO_UNSLOTH_16bit = {} for key, values in __INT_TO_FLOAT_MAPPER.items(): INT_TO_FLOAT_MAPPER[key] = values[0] @@ -261,6 +262,14 @@ FLOAT_TO_INT_MAPPER[value] = key pass + # Map to Unsloth version for 16bit versions + if len(values) == 2: + if values[0].startswith("unsloth"): + MAP_TO_UNSLOTH_16bit[values[1]] = values[0] + MAP_TO_UNSLOTH_16bit[values[1].lower()] = values[0] + pass + pass + # Get lowercased lowered_key = key.lower() INT_TO_FLOAT_MAPPER[lowered_key] = values[0].lower() From 8be73b10860fee8ac3ab84c88548de2392948492 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 15 Aug 2024 15:04:46 -0700 Subject: [PATCH 117/147] Bug fixes --- unsloth/__init__.py | 15 ++++++--------- unsloth/models/loader.py | 24 ++++++++++++++++++++---- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index db54c9a16..dd526dc3c 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import warnings -import importlib -import sys + +import warnings, importlib, sys from packaging.version import Version # # Define a list of modules to check @@ -60,9 +58,8 @@ "We have some installation instructions on our Github page.") pass -import os, re +import os, re, subprocess, inspect import numpy as np -import subprocess # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) @@ -83,12 +80,12 @@ del os.environ["PYTORCH_CUDA_ALLOC_CONF"] pass -# Torch 2.5 has including_emulation +# Torch 2.4 has including_emulation major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = (major_version >= 8) -if (major_torch == 2) and (minor_torch >= 5): - old_is_bf16_supported = torch.cuda.is_bf16_supported +old_is_bf16_supported = torch.cuda.is_bf16_supported +if "including_emulation" in str(inspect.signature(old_is_bf16_supported)): def is_bf16_supported(including_emulation = False): return old_is_bf16_supported(including_emulation) torch.cuda.is_bf16_supported = is_bf16_supported diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e260017fb..02ed00f5c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -169,13 +169,23 @@ def from_pretrained( autoconfig_error = None peft_error = None try: - model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) is_model = True except Exception as error: autoconfig_error = str(error) is_model = False try: - peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision) + peft_config = PeftConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) is_peft = True except Exception as error: peft_error = str(error) @@ -207,7 +217,12 @@ def from_pretrained( if is_peft: # Check base model again for PEFT model_name = get_model_name(peft_config.base_model_name_or_path, load_in_4bit) - model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) pass if not was_disabled: enable_progress_bars() @@ -340,10 +355,11 @@ def from_pretrained( token = token, revision = revision, is_trainable = True, + trust_remote_code = trust_remote_code, ) # Patch it as well! 
model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing) pass return model, tokenizer pass -pass +pass \ No newline at end of file From 8b80820b8b9f13ab4ecca089ec6ff92c58530bea Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 15 Aug 2024 15:07:42 -0700 Subject: [PATCH 118/147] Update __init__.py --- unsloth/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index dd526dc3c..f6ed99953 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -14,6 +14,8 @@ import warnings, importlib, sys from packaging.version import Version +import os, re, subprocess, inspect +import numpy as np # # Define a list of modules to check # MODULES_TO_CHECK = ["bitsandbytes"] @@ -58,9 +60,6 @@ "We have some installation instructions on our Github page.") pass -import os, re, subprocess, inspect -import numpy as np - # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: From 5e2cf1c51cab723000d0ba33a863cad8c4642a7d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 19:28:43 -0700 Subject: [PATCH 119/147] untrained tokens llama 3.1 base --- unsloth/chat_templates.py | 4 ++-- unsloth/tokenizer_utils.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 7070524e0..82f6aba14 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -876,7 +876,7 @@ def get_chat_template( # Careful on Gemma # bos_token is a must or else losses become too high - if IS_GEMMA and not chat_template.startswith("{{ bos_token }}"): + if IS_GEMMA and not chat_template.startswith(("{{ bos_token }}", "{{- bos_token }}")): chat_template = "{{ bos_token }}" + chat_template pass @@ -1553,7 +1553,7 @@ def process(part, which, content = "message['content']"): # Check jinja tempate for bos if always_bos_token: - if not jinja_template.startswith("{{ bos_token }}"): + if not jinja_template.startswith(("{{ bos_token }}", "{{- bos_token }}")): jinja_template = "{{ bos_token }}" + jinja_template pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 9c0bc1c51..a4f0b33be 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -827,7 +827,27 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # Get untrained tokens indicator_untrained1 = torch.amax(embedding_matrix, axis = 1) <= eps # Check lm_head as well + + # Does NOT work for Llama 3.1!! 
indicator_untrained2 = torch.amax(lm_head_matrix, axis = 1) <= eps + + # We instead check for repeated vectors + lm_head_where = torch.where(indicator_untrained1)[0] + lm_head_bad = lm_head_matrix[lm_head_where] + lm_head_bad = lm_head_bad.cpu().numpy().round(3) + from collections import Counter + counter = Counter() + for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 + counter = Counter({k: c for k, c in counter.items() if c >= 2}) + + lm_head_where = lm_head_where.cpu().numpy() + final_bad_lm_head = [] + for j, row in enumerate(lm_head_bad): + if hash(row.data.tobytes()) in counter: + final_bad_lm_head.append(lm_head_where[j]) + indicator_untrained2 = indicator_untrained2 | torch.zeros_like(indicator_untrained2) + indicator_untrained2[final_bad_lm_head] = True + # Combine both checks indicator_untrained = indicator_untrained1 & indicator_untrained2 From c22162b402a0e8cc8a5580f232e39a005fad02f1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 19:57:19 -0700 Subject: [PATCH 120/147] untrained tokens llama 3.1 base (#929) --- unsloth/chat_templates.py | 4 ++-- unsloth/tokenizer_utils.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 7070524e0..82f6aba14 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -876,7 +876,7 @@ def get_chat_template( # Careful on Gemma # bos_token is a must or else losses become too high - if IS_GEMMA and not chat_template.startswith("{{ bos_token }}"): + if IS_GEMMA and not chat_template.startswith(("{{ bos_token }}", "{{- bos_token }}")): chat_template = "{{ bos_token }}" + chat_template pass @@ -1553,7 +1553,7 @@ def process(part, which, content = "message['content']"): # Check jinja tempate for bos if always_bos_token: - if not jinja_template.startswith("{{ bos_token }}"): + if not jinja_template.startswith(("{{ bos_token }}", "{{- bos_token }}")): jinja_template = "{{ bos_token }}" + jinja_template pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 9c0bc1c51..a4f0b33be 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -827,7 +827,27 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # Get untrained tokens indicator_untrained1 = torch.amax(embedding_matrix, axis = 1) <= eps # Check lm_head as well + + # Does NOT work for Llama 3.1!! 
indicator_untrained2 = torch.amax(lm_head_matrix, axis = 1) <= eps + + # We instead check for repeated vectors + lm_head_where = torch.where(indicator_untrained1)[0] + lm_head_bad = lm_head_matrix[lm_head_where] + lm_head_bad = lm_head_bad.cpu().numpy().round(3) + from collections import Counter + counter = Counter() + for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 + counter = Counter({k: c for k, c in counter.items() if c >= 2}) + + lm_head_where = lm_head_where.cpu().numpy() + final_bad_lm_head = [] + for j, row in enumerate(lm_head_bad): + if hash(row.data.tobytes()) in counter: + final_bad_lm_head.append(lm_head_where[j]) + indicator_untrained2 = indicator_untrained2 | torch.zeros_like(indicator_untrained2) + indicator_untrained2[final_bad_lm_head] = True + # Combine both checks indicator_untrained = indicator_untrained1 & indicator_untrained2 From 9cb5c2eca4c7b5ea8f2a3fb3048d0b376589296e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 23:38:02 -0700 Subject: [PATCH 121/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index a4f0b33be..38d5949f4 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # We instead check for repeated vectors lm_head_where = torch.where(indicator_untrained1)[0] lm_head_bad = lm_head_matrix[lm_head_where] - lm_head_bad = lm_head_bad.cpu().numpy().round(3) + lm_head_bad = lm_head_bad.cpu().to(torch.float32).numpy().round(3) from collections import Counter counter = Counter() for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 From 487637db7bfd0d162a1932379f9dab176323689d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 23:38:43 -0700 Subject: [PATCH 122/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 38d5949f4..7316656b2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # We instead check for repeated vectors lm_head_where = torch.where(indicator_untrained1)[0] lm_head_bad = lm_head_matrix[lm_head_where] - lm_head_bad = lm_head_bad.cpu().to(torch.float32).numpy().round(3) + lm_head_bad = lm_head_bad.cpu().float().numpy().round(3) from collections import Counter counter = Counter() for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 From 52bc19d1fa4cd3557b785127fd68b5f4d1c34347 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 23:39:44 -0700 Subject: [PATCH 123/147] Bug #930 (#931) * untrained tokens llama 3.1 base * Update tokenizer_utils.py * Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index a4f0b33be..7316656b2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # We instead check for repeated vectors lm_head_where = torch.where(indicator_untrained1)[0] lm_head_bad = lm_head_matrix[lm_head_where] - lm_head_bad = lm_head_bad.cpu().numpy().round(3) + lm_head_bad = lm_head_bad.cpu().float().numpy().round(3) from collections import Counter 
counter = Counter() for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 From 9335fa0960c40fd36e2702456415cbdbbcd847dd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 15:04:25 -0700 Subject: [PATCH 124/147] Bug fixes --- unsloth/models/_utils.py | 28 ++++++++++++++++++++++++---- unsloth/models/llama.py | 28 +++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0c0057496..d8904aa12 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -595,7 +595,6 @@ def _get_statistics(statistics = None, force_download = True): # You can disable this by commenting the below out try: n_cpus = psutil.cpu_count(logical = False) - keynames = "\n" + "\n".join(os.environ.keys()) if statistics is not None: pass elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" @@ -604,10 +603,31 @@ def _get_statistics(statistics = None, force_download = True): elif "\nRUNPOD_" in keynames: statistics = "runpod" elif "\nAWS_" in keynames: statistics = "aws" elif "\nAZURE_" in keynames: statistics = "azure" - elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" + # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" elif "\nINVOCATION_ID" in keynames: statistics = "lambda" - else: statistics = "other" - + # else: statistics = "other" + else: + def try_vllm_check(): + vendor_files = ( + "/sys/class/dmi/id/product_version", + "/sys/class/dmi/id/bios_vendor", + "/sys/class/dmi/id/product_name", + "/sys/class/dmi/id/chassis_asset_tag", + "/sys/class/dmi/id/sys_vendor", + ) + from pathlib import Path + for vendor_file in vendor_files: + path = Path(vendor_file) + if path.is_file(): + file_content = path.read_text().lower() + if "amazon" in file_content: return "aws" + elif "microsoft corporation" in file_content: return "azure" + elif "google" in file_content: return "gcp" + return "other" + pass + try: statistics = try_vllm_check() + except: statistics = "other" + pass if statistics is not None: from transformers import AutoModelForCausalLM stats_model = AutoModelForCausalLM.from_pretrained( diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6a23335c8..d18dd4ce9 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1628,7 +1628,7 @@ def post_patch(model): # Torch.compile fails on embedding matrix?? # Workaround randomnly fixes it for torch versions < 2. 
- model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) + # model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) model.config.update({"unsloth_version" : __version__}) # We also do this for the lm_head @@ -2234,6 +2234,9 @@ def for_inference(model): internal_model.gradient_checkpointing = False internal_model.training = False pass + if hasattr(internal_model, "training"): + internal_model.training = False + pass # Also check if lm_head / embeddings are trained internal_model = model @@ -2267,6 +2270,16 @@ def for_inference(model): internal_model._saved_temp_tokenizer.padding_side = "left" pass + # Also disable training for embeddings for NEFTune + if hasattr(model, "get_input_embeddings"): + embeddings = model.get_input_embeddings() + if hasattr(embeddings, "training"): embeddings.training = False + pass + if hasattr(model, "get_output_embeddings"): + embeddings = model.get_output_embeddings() + if hasattr(embeddings, "training"): embeddings.training = False + pass + return model pass @@ -2288,6 +2301,9 @@ def for_training(model, use_gradient_checkpointing = True): internal_model.gradient_checkpointing = use_gradient_checkpointing internal_model.training = True pass + if hasattr(internal_model, "training"): + internal_model.training = True + pass # Also revert model.generate if hasattr(model, "_unwrapped_old_generate"): @@ -2307,6 +2323,16 @@ def for_training(model, use_gradient_checkpointing = True): internal_model._saved_temp_tokenizer.padding_side = "right" pass + # Also re-enable training for embeddings for NEFTune + if hasattr(model, "get_input_embeddings"): + embeddings = model.get_input_embeddings() + if hasattr(embeddings, "training"): embeddings.training = True + pass + if hasattr(model, "get_output_embeddings"): + embeddings = model.get_output_embeddings() + if hasattr(embeddings, "training"): embeddings.training = True + pass + return model pass pass From 1bed78c99279f3667379e0798440ee3a94d536b4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 15:08:53 -0700 Subject: [PATCH 125/147] Update llama.py --- unsloth/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d18dd4ce9..3f42dee9c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -571,6 +571,9 @@ def LlamaModel_fast_forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + print(getattr(self.embed_tokens, "neftune_noise_alpha")) + print(getattr(self.embed_tokens, "_forward_hooks")) + print(getattr(self.embed_tokens, "_forward_pre_hooks")) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma From 2c4772b666e93404a780301a9166736fe4734c25 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 15:50:16 -0700 Subject: [PATCH 126/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7316656b2..873544007 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1109,6 +1109,7 @@ def check_nvidia(): import trl.trainer.sft_trainer from trl.trainer.sft_trainer import * from transformers.trainer import * +from trl.trainer.sft_trainer import neftune_post_forward_hook def patch_sft_trainer_tokenizer(): """ @@ -1173,6 +1174,17 @@ def patch_sft_trainer_tokenizer(): "\n"\ "fix_untrained_tokens(self.model, self.tokenizer, 
self.train_dataset, eps = 1e-16)\n\n" + # Add NEFTune since it doesn't seem to work?? We need to manually inject it + check_text += \ + "\n\n"\ + "if getattr(self.model.get_input_embeddings(), 'neftune_noise_alpha', None) is not None:\n"\ + " if hasattr(self, 'neftune_hook_handle'):\n"\ + " self.neftune_hook_handle.remove()\n"\ + " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ + "\n"\ + " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n\n"\ + "\n" + check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 7fd058fc71f7433bf55cd978feccc580fa26dab8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 15:52:13 -0700 Subject: [PATCH 127/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 873544007..a73887061 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1177,14 +1177,16 @@ def patch_sft_trainer_tokenizer(): # Add NEFTune since it doesn't seem to work?? We need to manually inject it check_text += \ "\n\n"\ + "print(1)\n"\ "if getattr(self.model.get_input_embeddings(), 'neftune_noise_alpha', None) is not None:\n"\ + " print(2)\n"\ " if hasattr(self, 'neftune_hook_handle'):\n"\ " self.neftune_hook_handle.remove()\n"\ " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ "\n"\ " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n\n"\ "\n" - + check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 6e5ad15cd73388ba694bd532b8bf4d05316b1d9a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:03:24 -0700 Subject: [PATCH 128/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index a73887061..b677f864a 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1176,15 +1176,15 @@ def patch_sft_trainer_tokenizer(): # Add NEFTune since it doesn't seem to work?? 
We need to manually inject it check_text += \ - "\n\n"\ - "print(1)\n"\ - "if getattr(self.model.get_input_embeddings(), 'neftune_noise_alpha', None) is not None:\n"\ - " print(2)\n"\ - " if hasattr(self, 'neftune_hook_handle'):\n"\ - " self.neftune_hook_handle.remove()\n"\ - " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ "\n"\ - " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n\n"\ + "if hasattr(self, 'neftune_hook_handle'):\n"\ + " self.neftune_hook_handle.remove()\n"\ + " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ + "\n"\ + "if getattr(self, 'neftune_noise_alpha', None) is not None:\n"\ + " self.model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha\n"\ + " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n"\ + "pass\n"\ "\n" check_text = check_text.split("\n") From 7139e57b729253c0ce1d70892dbac4f7f87d28ef Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:08:14 -0700 Subject: [PATCH 129/147] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3f42dee9c..461feb3c7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -571,9 +571,9 @@ def LlamaModel_fast_forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - print(getattr(self.embed_tokens, "neftune_noise_alpha")) - print(getattr(self.embed_tokens, "_forward_hooks")) - print(getattr(self.embed_tokens, "_forward_pre_hooks")) + # print(getattr(self.embed_tokens, "neftune_noise_alpha")) + # print(getattr(self.embed_tokens, "_forward_hooks")) + # print(getattr(self.embed_tokens, "_forward_pre_hooks")) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma From 9caaa5af78292f29aaaad2ed05d5a55564020a3e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:11:09 -0700 Subject: [PATCH 130/147] Update llama.py --- unsloth/models/llama.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 461feb3c7..d18dd4ce9 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -571,9 +571,6 @@ def LlamaModel_fast_forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - # print(getattr(self.embed_tokens, "neftune_noise_alpha")) - # print(getattr(self.embed_tokens, "_forward_hooks")) - # print(getattr(self.embed_tokens, "_forward_pre_hooks")) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma From be7ed9a1e60224c99fb91f01479b8b654264d8eb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:14:01 -0700 Subject: [PATCH 131/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d18dd4ce9..048ba6919 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1628,7 +1628,7 @@ def post_patch(model): # Torch.compile fails on embedding matrix?? # Workaround randomnly fixes it for torch versions < 2. 
- # model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) + model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) model.config.update({"unsloth_version" : __version__}) # We also do this for the lm_head From 75013ff022523729f13479f7738ec5a0e1d237b0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:17:52 -0700 Subject: [PATCH 132/147] Fix NEFTune (#937) * untrained tokens llama 3.1 base * Update tokenizer_utils.py * Update tokenizer_utils.py * Bug fixes * Update llama.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update llama.py * Update llama.py * Update llama.py --- unsloth/models/_utils.py | 28 ++++++++++++++++++++++++---- unsloth/models/llama.py | 26 ++++++++++++++++++++++++++ unsloth/tokenizer_utils.py | 14 ++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0c0057496..d8904aa12 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -595,7 +595,6 @@ def _get_statistics(statistics = None, force_download = True): # You can disable this by commenting the below out try: n_cpus = psutil.cpu_count(logical = False) - keynames = "\n" + "\n".join(os.environ.keys()) if statistics is not None: pass elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" @@ -604,10 +603,31 @@ def _get_statistics(statistics = None, force_download = True): elif "\nRUNPOD_" in keynames: statistics = "runpod" elif "\nAWS_" in keynames: statistics = "aws" elif "\nAZURE_" in keynames: statistics = "azure" - elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" + # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" elif "\nINVOCATION_ID" in keynames: statistics = "lambda" - else: statistics = "other" - + # else: statistics = "other" + else: + def try_vllm_check(): + vendor_files = ( + "/sys/class/dmi/id/product_version", + "/sys/class/dmi/id/bios_vendor", + "/sys/class/dmi/id/product_name", + "/sys/class/dmi/id/chassis_asset_tag", + "/sys/class/dmi/id/sys_vendor", + ) + from pathlib import Path + for vendor_file in vendor_files: + path = Path(vendor_file) + if path.is_file(): + file_content = path.read_text().lower() + if "amazon" in file_content: return "aws" + elif "microsoft corporation" in file_content: return "azure" + elif "google" in file_content: return "gcp" + return "other" + pass + try: statistics = try_vllm_check() + except: statistics = "other" + pass if statistics is not None: from transformers import AutoModelForCausalLM stats_model = AutoModelForCausalLM.from_pretrained( diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6a23335c8..048ba6919 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2234,6 +2234,9 @@ def for_inference(model): internal_model.gradient_checkpointing = False internal_model.training = False pass + if hasattr(internal_model, "training"): + internal_model.training = False + pass # Also check if lm_head / embeddings are trained internal_model = model @@ -2267,6 +2270,16 @@ def for_inference(model): internal_model._saved_temp_tokenizer.padding_side = "left" pass + # Also disable training for embeddings for NEFTune + if hasattr(model, "get_input_embeddings"): + embeddings = model.get_input_embeddings() + if hasattr(embeddings, "training"): embeddings.training = False + pass + if hasattr(model, "get_output_embeddings"): + embeddings = 
model.get_output_embeddings() + if hasattr(embeddings, "training"): embeddings.training = False + pass + return model pass @@ -2288,6 +2301,9 @@ def for_training(model, use_gradient_checkpointing = True): internal_model.gradient_checkpointing = use_gradient_checkpointing internal_model.training = True pass + if hasattr(internal_model, "training"): + internal_model.training = True + pass # Also revert model.generate if hasattr(model, "_unwrapped_old_generate"): @@ -2307,6 +2323,16 @@ def for_training(model, use_gradient_checkpointing = True): internal_model._saved_temp_tokenizer.padding_side = "right" pass + # Also re-enable training for embeddings for NEFTune + if hasattr(model, "get_input_embeddings"): + embeddings = model.get_input_embeddings() + if hasattr(embeddings, "training"): embeddings.training = True + pass + if hasattr(model, "get_output_embeddings"): + embeddings = model.get_output_embeddings() + if hasattr(embeddings, "training"): embeddings.training = True + pass + return model pass pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7316656b2..b677f864a 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1109,6 +1109,7 @@ def check_nvidia(): import trl.trainer.sft_trainer from trl.trainer.sft_trainer import * from transformers.trainer import * +from trl.trainer.sft_trainer import neftune_post_forward_hook def patch_sft_trainer_tokenizer(): """ @@ -1173,6 +1174,19 @@ def patch_sft_trainer_tokenizer(): "\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" + # Add NEFTune since it doesn't seem to work?? We need to manually inject it + check_text += \ + "\n"\ + "if hasattr(self, 'neftune_hook_handle'):\n"\ + " self.neftune_hook_handle.remove()\n"\ + " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ + "\n"\ + "if getattr(self, 'neftune_noise_alpha', None) is not None:\n"\ + " self.model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha\n"\ + " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n"\ + "pass\n"\ + "\n" + check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 4cc20f4720ad482a2da04ce79cf5cd622c14e54e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 17:12:32 -0700 Subject: [PATCH 133/147] Create _auto_install.py --- unsloth/_auto_install.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 unsloth/_auto_install.py diff --git a/unsloth/_auto_install.py b/unsloth/_auto_install.py new file mode 100644 index 000000000..2f5b62d4c --- /dev/null +++ b/unsloth/_auto_install.py @@ -0,0 +1,16 @@ +try: import torch +except: raise ImportError('Install torch via `pip install torch`') +from packaging.version import Version as V +v = V(torch.__version__) +cuda = str(torch.version.cuda) +is_ampere = torch.cuda.get_device_capability()[0] >= 8 +if cuda != "12.1" and cuda != "11.8": raise RuntimeError(f"CUDA = {cuda} not supported!") +if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!") +elif v <= V('2.1.1'): x = 'cu{}{}-torch211' +elif v <= V('2.1.2'): x = 'cu{}{}-torch212' +elif v < V('2.3.0'): x = 'cu{}{}-torch220' +elif v < V('2.4.0'): x = 'cu{}{}-torch230' +elif v < V('2.5.0'): x = 'cu{}{}-torch240' +else: raise RuntimeError(f"Torch = {v} too new!") +x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "") +print(f'pip install "unsloth[{x}] @ 
git+https://github.com/unslothai/unsloth.git"') \ No newline at end of file From fbf50a42602bf299da2a0a99fea2f9b18550332d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 17:12:46 -0700 Subject: [PATCH 134/147] Update _auto_install.py --- unsloth/_auto_install.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/unsloth/_auto_install.py b/unsloth/_auto_install.py index 2f5b62d4c..2e6351b8d 100644 --- a/unsloth/_auto_install.py +++ b/unsloth/_auto_install.py @@ -1,3 +1,17 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + try: import torch except: raise ImportError('Install torch via `pip install torch`') from packaging.version import Version as V From d45ade257b8578d63236b389f25e73b5c22bb862 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 17:18:30 -0700 Subject: [PATCH 135/147] Update README.md (#938) --- README.md | 105 +++++++++++++++++++++++------------------------------- 1 file changed, 44 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 86c3fbd86..b23acffcb 100644 --- a/README.md +++ b/README.md @@ -94,85 +94,68 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and ## 💾 Installation Instructions -If you have Pytorch 2.3 and CUDA 12.1, install Unsloth with `pip install unsloth[colab-new]` then `pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes` - ### Conda Installation -Select either `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. See this [Github issue](https://github.com/unslothai/unsloth/issues/73) for help on debugging Conda installs. +`⚠️Only use Conda if you have it. If not, use Pip`. Select either `pytorch-cuda=11.8,12.1` for CUDA 11.8 or CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. We support `python=3.10,3.11,3.12`. ```bash conda create --name unsloth_env \ - python=3.10 \ - pytorch-cuda=<11.8/12.1> \ + python=3.11 \ + pytorch-cuda=12.1 \ pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \ -y conda activate unsloth_env pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" - -pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes +pip install --no-deps trl peft accelerate bitsandbytes ``` +
+ If you're looking to install Conda in a Linux environment, read here, or run the below 🔽 + + ```bash + mkdir -p ~/miniconda3 + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh + bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 + rm -rf ~/miniconda3/miniconda.sh + ~/miniconda3/bin/conda init bash + ~/miniconda3/bin/conda init zsh + ``` +
+ ### Pip Installation -Do **NOT** use this if you have Anaconda. You must use the Conda install method, or else stuff will BREAK. +`⚠️Do **NOT** use this if you have Conda.` Pip is a bit more complex since there are dependency issues. The pip command is different for `torch 2.2,2.3,2.4` and CUDA versions. -1. Find your CUDA version via -```python -import torch; torch.version.cuda -``` -2. For Pytorch 2.1.0: You can update Pytorch via Pip (interchange `cu121` / `cu118`). Go to https://pytorch.org/ to learn more. Select either `cu118` for CUDA 11.8 or `cu121` for CUDA 12.1. If you have a RTX 3060 or higher (A100, H100 etc), use the `"ampere"` path. For Pytorch 2.1.1: go to step 3. For Pytorch 2.2.0: go to step 4. -```bash -pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.0 triton \ - --index-url https://download.pytorch.org/whl/cu121 -``` -```bash -pip install "unsloth[cu118] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu118-ampere] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-ampere] @ git+https://github.com/unslothai/unsloth.git" -``` -3. For Pytorch 2.1.1: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. -```bash -pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.1 triton \ - --index-url https://download.pytorch.org/whl/cu121 -``` -```bash -pip install "unsloth[cu118-torch211] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-torch211] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu118-ampere-torch211] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-ampere-torch211] @ git+https://github.com/unslothai/unsloth.git" -``` -4. For Pytorch 2.2.0: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. -```bash -pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ - --index-url https://download.pytorch.org/whl/cu121 -``` -```bash -pip install "unsloth[cu118-torch220] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-torch220] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu118-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git" -``` -5. If you get errors, try the below first, then go back to step 1: +In general, if you have `torch 2.4` and `CUDA 12.1`, use: ```bash pip install --upgrade pip +pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git" ``` -6. For Pytorch 2.2.1: -```bash -# RTX 3090, 4090 Ampere GPUs: -pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes -# Pre Ampere RTX 2080, T4, GTX 1080 GPUs: -pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes -``` -7. For Pytorch 2.3.0: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. 
+Or, run the below in a terminal to get the optional pip installation command: ```bash -pip install "unsloth[cu118-torch230] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu118-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" +wget -qO- https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/_auto_install.py | python - ``` -8. To troubleshoot installs try the below (all must succeed). Xformers should mostly all be available. + +Or, run the below manually in a Python REPL: +```python +try: import torch +except: raise ImportError("Install torch via `pip install torch`") +from packaging.version import Version as V +v = V(torch.__version__) +cuda = str(torch.version.cuda) +is_ampere = torch.cuda.get_device_capability()[0] >= 8 +if cuda != "12.1" and cuda != "11.8": raise RuntimeError(f"CUDA = {cuda} not supported!") +if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!") +elif v <= V('2.1.1'): x = 'cu{}{}-torch211' +elif v <= V('2.1.2'): x = 'cu{}{}-torch212' +elif v < V('2.3.0'): x = 'cu{}{}-torch220' +elif v < V('2.4.0'): x = 'cu{}{}-torch230' +elif v < V('2.5.0'): x = 'cu{}{}-torch240' +else: raise RuntimeError(f"Torch = {v} too new!") +x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "") +print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"') +``` + +Afterwards, confirm if `nvcc` `xformers` and `bitsandbytes` have successfully installed - if not, install them individually first until they work, then install Unsloth. ```bash nvcc python -m xformers.info From cb7fd3a09dc3ecf4066d718ab6abc1156946050f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 14:38:20 -0700 Subject: [PATCH 136/147] LongRoPE --- unsloth/models/_utils.py | 13 ++++ unsloth/models/gemma.py | 4 ++ unsloth/models/llama.py | 127 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 140 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d8904aa12..434554f67 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -969,6 +969,7 @@ def patch_llama_rope_scaling( scaled_rope_module = None, extended_rope_module = None, attention_module = None, + longrope_module = None, ): assert(\ rope_module is not None and \ @@ -1026,14 +1027,26 @@ def patch_llama_rope_scaling( max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) + elif scaling_type == "longrope": + self.rotary_emb = {longrope_rope_function}( + dim = self.head_dim, + max_position_embeddings = self.max_position_embeddings, + original_max_position_embeddings = self.original_max_position_embeddings, + base = self.rope_theta, + short_factor = self.rope_scaling['short_factor'], + long_factor = self.rope_scaling['long_factor' ], + ) else: raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") pass """ + fix_rope_function = fix_rope_function.format( rope_function = rope_module.__name__, scaled_rope_function = scaled_rope_module.__name__, extended_rope_function = extended_rope_module.__name__, + longrope_rope_function = \ + (longrope_module if longrope_module is not None else rope_module).__name__ ) rotary_emb = re.findall( "self.rotary_emb = .+?\)", function, diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index a0894ec7a..45f14c113 100644 --- 
a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -254,6 +254,10 @@ def forward(self, x, position_ids=None, seq_len=None): ) pass + def get_cached(self, seq_len = None): + return self.cos_cached, self.sin_cached + pass + def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 048ba6919..eef4f49e0 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -346,14 +346,17 @@ def LlamaAttention_fast_forward( kv_seq_len += past_key_value[0].shape[-2] # Extend RoPE dynamically to fit in VRAM - self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) + rotary_emb = self.rotary_emb + rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) if position_ids is None: - cos = self.rotary_emb.cos_cached - sin = self.rotary_emb.sin_cached + # Useful for LongRoPE + cos, sin = rotary_emb.get_cached(kv_seq_len) + # cos = self.rotary_emb.cos_cached + # sin = self.rotary_emb.sin_cached Q, K = fast_rope_embedding(Q, K, cos, sin) else: - cos, sin = self.rotary_emb(V, seq_len = kv_seq_len) + cos, sin = rotary_emb(V, seq_len = kv_seq_len) Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids) pass @@ -1048,6 +1051,10 @@ def forward(self, x, position_ids=None, seq_len=None): ) pass + def get_cached(self, seq_len = None): + return self.cos_cached, self.sin_cached + pass + def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 @@ -1170,6 +1177,117 @@ def forward(self, x, position_ids=None, seq_len=None): ) pass + def get_cached(self, seq_len = None): + return self.cos_cached, self.sin_cached + pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass +pass + + +class LongRopeRotaryEmbedding(torch.nn.Module): + # For Phi 3.5 128K https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/modeling_phi3.py + def __init__(self, + dim = None, + max_position_embeddings = 131072, + original_max_position_embeddings = 4096, + base = 10000, + short_factor = None, + long_factor = None, + device = None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): + super().__init__() + assert(short_factor is not None) + assert(long_factor is not None) + assert(type(original_max_position_embeddings) is int) + + if config is not None: + # [TODO] Hack to pass in config - need to remove later + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(original_max_position_embeddings, self.max_position_embeddings) + + # Long RoPE similar to RoPE except short sequences have 1 cos / sin + # and long sequences have another cos / sin + inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / 
self.dim + short_factor = torch.tensor(short_factor, device = "cpu", dtype = torch.float32) + long_factor = torch.tensor(long_factor, device = "cpu", dtype = torch.float32) + short_inv_freq = 1.0 / (short_factor * self.base**inv_freq_shape) + long_inv_freq = 1.0 / (long_factor * self.base**inv_freq_shape) + + # Phi-3 Scale factor + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + pass + self.scaling_factor = scaling_factor + + # Short and long inv_freq + self.register_buffer("short_inv_freq", short_inv_freq, persistent = False) + self.register_buffer("long_inv_freq", long_inv_freq, persistent = False) + # Build here to make `torch.jit.trace` work. + # self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) + + # Short sequences + t = torch.arange(original_max_position_embeddings, device=self.short_inv_freq.device, dtype=torch.int64).float() + freqs = torch.outer(t, self.short_inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + cos_cached = (emb.cos() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True) + sin_cached = (emb.sin() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True) + self.register_buffer("short_cos_cached", cos_cached, persistent=False) + self.register_buffer("short_sin_cached", sin_cached, persistent=False) + pass + + def _set_cos_sin_cache(self, seq_len, device, dtype): + # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and + # in FP32. They are applied (multiplied) in FP32 as well. + self.current_rope_size = seq_len + + t = torch.arange(self.current_rope_size, device=self.inv_freq.device, dtype=torch.int64).float() + # Long sequences + freqs = torch.outer(t, self.long_inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + cos_cached = (emb.cos() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True) + sin_cached = (emb.sin() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True) + self.register_buffer("long_cos_cached", cos_cached, persistent=False) + self.register_buffer("long_sin_cached", sin_cached, persistent=False) + pass + + def forward(self, x, position_ids=None, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.current_rope_size: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), + ) + pass + + def get_cached(self, seq_len = None): + if seq_len < original_max_position_embeddings: + return self.short_cos_cached, self.short_sin_cached + return self.long_cos_cached, self.long_sin_cached + pass + def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 @@ -1242,6 +1360,7 @@ def pre_patch(): scaled_rope_module = LlamaLinearScalingRotaryEmbedding, extended_rope_module = LlamaExtendedRotaryEmbedding, attention_module = LlamaAttention, + longrope_module = LongRopeRotaryEmbedding, ) if init_name is not None: exec(function, globals()) From ad9418f94a444daf98022c750c47ce4fd7d16a2c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 14:41:49 -0700 Subject: [PATCH 137/147] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 434554f67..90ceb25b5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1028,6 +1028,7 @@ def patch_llama_rope_scaling( base=self.rope_theta, ) elif scaling_type == "longrope": + print('## Long RoPE') self.rotary_emb = {longrope_rope_function}( dim = self.head_dim, max_position_embeddings = self.max_position_embeddings, From 592e7a483608897ef3e5e9f85be46e06a5508eb0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 14:45:36 -0700 Subject: [PATCH 138/147] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 90ceb25b5..8cc056b03 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1037,6 +1037,7 @@ def patch_llama_rope_scaling( short_factor = self.rope_scaling['short_factor'], long_factor = self.rope_scaling['long_factor' ], ) + print('## Long RoPE') else: raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") pass From 0e0b71fe5447f7b007401e4d308670523a34973b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 15:03:43 -0700 Subject: [PATCH 139/147] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 8cc056b03..e87ffe02d 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1032,7 +1032,7 @@ def patch_llama_rope_scaling( self.rotary_emb = {longrope_rope_function}( dim = self.head_dim, max_position_embeddings = self.max_position_embeddings, - original_max_position_embeddings = self.original_max_position_embeddings, + original_max_position_embeddings = self.rope_scaling['original_max_position_embeddings'], base = self.rope_theta, short_factor = self.rope_scaling['short_factor'], long_factor = self.rope_scaling['long_factor' ], From 1d724f8b8d38ebb8c125c34e471ad06e1b9874b4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 15:05:12 -0700 Subject: [PATCH 140/147] Update _utils.py --- unsloth/models/_utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index e87ffe02d..1c48e8e58 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1028,16 +1028,14 @@ def patch_llama_rope_scaling( base=self.rope_theta, ) elif scaling_type == "longrope": - print('## Long RoPE') self.rotary_emb = {longrope_rope_function}( dim = self.head_dim, max_position_embeddings = self.max_position_embeddings, - original_max_position_embeddings = self.rope_scaling['original_max_position_embeddings'], + original_max_position_embeddings = self.config.original_max_position_embeddings, base = self.rope_theta, - short_factor = self.rope_scaling['short_factor'], - long_factor = self.rope_scaling['long_factor' ], + short_factor = self.config.rope_scaling['short_factor'], + long_factor = self.config.rope_scaling['long_factor' ], ) - print('## Long RoPE') else: raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") pass From 9ea3579f219ea3878e7e9b506b54a8889d43fd2e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 15:08:31 -0700 Subject: [PATCH 141/147] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index eef4f49e0..c1dae2eda 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1247,6 +1247,7 @@ def __init__(self, # 
From 9ea3579f219ea3878e7e9b506b54a8889d43fd2e Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:08:31 -0700
Subject: [PATCH 141/147] Update llama.py

---
 unsloth/models/llama.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index eef4f49e0..c1dae2eda 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1247,6 +1247,7 @@ def __init__(self,
         # self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
 
         # Short sequences
+        dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
         t = torch.arange(original_max_position_embeddings, device=self.short_inv_freq.device, dtype=torch.int64).float()
         freqs = torch.outer(t, self.short_inv_freq)
         emb = torch.cat((freqs, freqs), dim=-1)

From 40cbfc5cdd1504a7680748d6e5fa13c6fc9efba8 Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:11:10 -0700
Subject: [PATCH 142/147] Update llama.py

---
 unsloth/models/llama.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index c1dae2eda..bf033a56a 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1277,10 +1277,17 @@ def forward(self, x, position_ids=None, seq_len=None):
         if seq_len > self.current_rope_size:
             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
 
-        return (
-            self.cos_cached[:seq_len].to(dtype = x.dtype),
-            self.sin_cached[:seq_len].to(dtype = x.dtype),
-        )
+        if seq_len < original_max_position_embeddings:
+            return (
+                self.short_cos_cached[:seq_len].to(dtype = x.dtype),
+                self.short_sin_cached[:seq_len].to(dtype = x.dtype),
+            )
+        else:
+            return (
+                self.long_cos_cached[:seq_len].to(dtype = x.dtype),
+                self.long_sin_cached[:seq_len].to(dtype = x.dtype),
+            )
+        pass
     pass
 
     def get_cached(self, seq_len = None):

From 200b2350930861f2fe6b69514ffffa7d20102910 Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:13:23 -0700
Subject: [PATCH 143/147] Update llama.py

---
 unsloth/models/llama.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index bf033a56a..867b724ea 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1784,11 +1784,19 @@ def post_patch(model):
             pass
         pass
         # Downcast RoPE embedding to correct data type
-        if (name.endswith("rotary_emb") or hasattr(module, "cos_cached")) \
-            and (module.cos_cached.dtype != correct_dtype):
-
-            module.cos_cached = module.cos_cached.to(correct_dtype)
-            module.sin_cached = module.sin_cached.to(correct_dtype)
+        if (name.endswith("rotary_emb") or hasattr(module, "cos_cached")):
+
+            if hasattr(module, "cos_cached") and \
+                (module.cos_cached.dtype != correct_dtype):
+
+                module.cos_cached = module.cos_cached.to(correct_dtype)
+                module.sin_cached = module.sin_cached.to(correct_dtype)
+
+            elif hasattr(module, "short_cos_cached") and \
+                (module.short_cos_cached.dtype != correct_dtype):
+
+                module.short_cos_cached = module.short_cos_cached.to(correct_dtype)
+                module.short_sin_cached = module.short_sin_cached.to(correct_dtype)
         pass
     pass
 pass
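The post_patch change in patch 143 boils down to the sketch below: modules that only expose short/long caches (the LongRope case) need their own downcasting branch, because they have no plain cos_cached attribute. `FakeRotary`, the cache shapes, and `correct_dtype` here are stand-ins invented for illustration, not unsloth internals.

import torch

class FakeRotary(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # LongRope-style module: only short_* / long_* caches, no cos_cached.
        self.register_buffer("short_cos_cached", torch.randn(4096, 96, dtype=torch.float32), persistent=False)
        self.register_buffer("short_sin_cached", torch.randn(4096, 96, dtype=torch.float32), persistent=False)

correct_dtype = torch.float16
module = FakeRotary()

# Plain cos_cached/sin_cached modules are handled first; LongRope-style modules
# fall through to the elif, mirroring the structure of the patched post_patch.
if hasattr(module, "cos_cached") and module.cos_cached.dtype != correct_dtype:
    module.cos_cached = module.cos_cached.to(correct_dtype)
    module.sin_cached = module.sin_cached.to(correct_dtype)
elif hasattr(module, "short_cos_cached") and module.short_cos_cached.dtype != correct_dtype:
    module.short_cos_cached = module.short_cos_cached.to(correct_dtype)
    module.short_sin_cached = module.short_sin_cached.to(correct_dtype)

print(module.short_cos_cached.dtype)  # torch.float16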
From f35f9b03560e9631286e67590fa3c45cc3c2282d Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:15:09 -0700
Subject: [PATCH 144/147] Update llama.py

---
 unsloth/models/llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 867b724ea..42f01b7eb 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1277,7 +1277,7 @@ def forward(self, x, position_ids=None, seq_len=None):
         if seq_len > self.current_rope_size:
             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
 
-        if seq_len < original_max_position_embeddings:
+        if seq_len < self.original_max_position_embeddings:
             return (
                 self.short_cos_cached[:seq_len].to(dtype = x.dtype),
                 self.short_sin_cached[:seq_len].to(dtype = x.dtype),
@@ -1291,7 +1291,7 @@ def forward(self, x, position_ids=None, seq_len=None):
     pass
 
     def get_cached(self, seq_len = None):
-        if seq_len < original_max_position_embeddings:
+        if seq_len < self.original_max_position_embeddings:
             return self.short_cos_cached, self.short_sin_cached
         return self.long_cos_cached, self.long_sin_cached
     pass

From 9266c1c5cbac3f18baba807a627c3c122ad7dc26 Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:21:18 -0700
Subject: [PATCH 145/147] Update llama.py

---
 unsloth/models/llama.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 42f01b7eb..376b4b4eb 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -187,8 +187,9 @@ def LlamaAttention_fast_forward_inference(
 
         # cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len)
         # Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids)
-        cos = self.rotary_emb.cos_cached[position_ids].unsqueeze(1)
-        sin = self.rotary_emb.sin_cached[position_ids].unsqueeze(1)
+        cos, sin = self.rotary_emb.get_cached(kv_seq_len)
+        cos = cos[position_ids].unsqueeze(1)
+        sin = sin[position_ids].unsqueeze(1)
         h = self.half_head_dim
 
         RH_Q = self.RH_Q

From 647bbdbb2934a71240dd63845d4a715eb2d06caf Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 16:29:11 -0700
Subject: [PATCH 146/147] Update mapper.py

---
 unsloth/models/mapper.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py
index b8259a073..3f49c9655 100644
--- a/unsloth/models/mapper.py
+++ b/unsloth/models/mapper.py
@@ -249,6 +249,10 @@
         "unsloth/gemma-2-2b-it",
         "google/gemma-2-2b-it",
     ),
+    "unsloth/Phi-3.5-mini-instruct-bnb-4bit" : (
+        "unsloth/Phi-3.5-mini-instruct",
+        "microsoft/Phi-3.5-mini-instruct",
+    ),
 }
 
 INT_TO_FLOAT_MAPPER = {}

From be8b3d8528c6e2cb34f8909e65e5ab420b597edc Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 16:46:26 -0700
Subject: [PATCH 147/147] Phi 3.5

---
 README.md                 | 2 +-
 unsloth/chat_templates.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index b23acffcb..0590415f4 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 
-### Finetune Llama 3.1, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory!
+### Finetune Llama 3.1, Mistral, Phi-3.5 & Gemma 2-5x faster with 80% less memory!
 
 ![](https://i.ibb.co/sJ7RhGG/image-41.png)
diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py
index 82f6aba14..f83df579b 100644
--- a/unsloth/chat_templates.py
+++ b/unsloth/chat_templates.py
@@ -473,7 +473,7 @@
 # =========================================== Phi-3
 phi3_template = \
-    "{{ bos_token }}"\
+    # "{{ bos_token }}"\ # Phi-3.5 removes BOS?
     "{% for message in messages %}"\
     "{% if message['role'] == 'user' %}"\
     "{{'<|user|>\n' + message['content'] + '<|end|>\n'}}"\
@@ -505,7 +505,9 @@
 '''
 
 phi3_template_eos_token = "<|end|>"
-CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False, phi3_ollama,)
+CHAT_TEMPLATES["phi-3"]   = (phi3_template, phi3_template_eos_token, False, phi3_ollama,)
+CHAT_TEMPLATES["phi-35"]  = CHAT_TEMPLATES["phi-3"]
+CHAT_TEMPLATES["phi-3.5"] = CHAT_TEMPLATES["phi-3"]
 pass
 
 # =========================================== Llama-3.1
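As a rough usage illustration of the Phi-3 / Phi-3.5 turn structure that the chat template above encodes, the hand-rolled formatter below produces the same <|user|> / <|assistant|> / <|system|> / <|end|> layout. It is not the Jinja template unsloth actually registers under "phi-3", "phi-35", and "phi-3.5"; it is only a sketch of the rendered output.

def render_phi3(messages, add_generation_prompt=True):
    # Mirrors the template: each turn is "<|role|>\n" + content + "<|end|>\n",
    # with unknown roles falling back to <|system|>, and an optional trailing
    # "<|assistant|>\n" when a generation prompt is requested.
    role_tags = {"user": "<|user|>", "assistant": "<|assistant|>", "system": "<|system|>"}
    out = []
    for m in messages:
        tag = role_tags.get(m["role"], "<|system|>")
        out.append(f"{tag}\n{m['content']}<|end|>\n")
    if add_generation_prompt:
        out.append("<|assistant|>\n")
    return "".join(out)

print(render_phi3([
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
    {"role": "user", "content": "What is 2+2?"},
]))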