From 7eb34655a5d335db22a67aeb340d0a522aeef7e6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 11:07:32 -0700 Subject: [PATCH 001/147] Update __init__.py --- unsloth/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 6a2d999b4..ea2fe7685 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -61,10 +61,10 @@ pass # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) -keynames = "\n" + "\n".join(os.environ.keys()) -if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: - os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" -pass +# keynames = "\n" + "\n".join(os.environ.keys()) +# if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: +# os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" +# pass # We support Pytorch 2 # Fixes https://github.com/unslothai/unsloth/issues/38 From 54dfb1a9e163dfb2e11c7c46ac182bb22849e6a3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 12:06:38 -0700 Subject: [PATCH 002/147] dynamic RoPE --- unsloth/__init__.py | 8 ++++---- unsloth/models/llama.py | 28 ++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index ea2fe7685..6a2d999b4 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -61,10 +61,10 @@ pass # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) -# keynames = "\n" + "\n".join(os.environ.keys()) -# if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: -# os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" -# pass +keynames = "\n" + "\n".join(os.environ.keys()) +if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" +pass # We support Pytorch 2 # Fixes https://github.com/unslothai/unsloth/issues/38 diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 9bea364ca..0fcfe2a27 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -335,6 +335,9 @@ def LlamaAttention_fast_forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] + # Extend RoPE dynamically to fit in VRAM + self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) + if position_ids is None: cos = self.rotary_emb.cos_cached sin = self.rotary_emb.sin_cached @@ -971,19 +974,21 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, self.max_position_embeddings) # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()) + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) pass def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
- self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() freqs = torch.outer(t, inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -994,14 +999,21 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: + if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), ) pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass pass @@ -1016,11 +1028,11 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s pass def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() t = t / self.scaling_factor freqs = torch.outer(t, inv_freq) From 6c8618c75443a08c4ec0304cc54acfae74b2ddcd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 12:08:49 -0700 Subject: [PATCH 003/147] Update mistral.py --- unsloth/models/mistral.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 6eb3fccfa..d7376d952 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -78,6 +78,9 @@ def MistralAttention_fast_forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] + # Extend RoPE dynamically to fit in VRAM + self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) + if position_ids is None: cos = self.rotary_emb.cos_cached sin = self.rotary_emb.sin_cached From a56b2d45c9fcaba1aefb3dd09dacf904362bff59 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 12:33:22 -0700 Subject: [PATCH 004/147] Update llama.py --- unsloth/models/llama.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0fcfe2a27..ee261b0f7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1152,6 +1152,12 @@ def from_pretrained( f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. 
FA2 = {HAS_FLASH_ATTENTION}]\n"\ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' print(statistics) + + # Warn about fast transfers + if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": + logger.warning_once("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") + pass + model_patcher.pre_patch() get_statistics() # For debugging - we use a download counter to see if environments are not breaking From 40aeb2629e90615a359c2b8870112a826ef5baa5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 13:25:30 -0700 Subject: [PATCH 005/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index dc0c7da85..060c1ccae 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -38,6 +38,17 @@ IGNORED_TOKENIZER_CHECKING = frozenset(( "CodeLlamaTokenizerFast", "CodeLlamaTokenizer", + "" +)) + + +IGNORED_TOKENIZER_NAMES = frozenset(( + "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", + "unsloth/Mistral-Nemo-Instruct-2407", + "mistralai/Mistral-Nemo-Instruct-2407", + "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", + "unsloth/Mistral-Nemo-Base-2407", + "mistralai/Mistral-Nemo-Base-2407", )) # Check environments @@ -488,7 +499,7 @@ def load_correct_tokenizer( cache_dir = cache_dir, ) - if slow_tokenizer is not None: + if tokenizer_name not in IGNORED_TOKENIZER_NAMES and slow_tokenizer is not None: if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"): fast_tokenizer.add_bos_token = slow_tokenizer.add_bos_token if hasattr(fast_tokenizer, "add_eos_token") and hasattr(slow_tokenizer, "add_eos_token"): From fbf6cc747ef52812f5593a2daf96e55b5c200514 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 13:33:13 -0700 Subject: [PATCH 006/147] Update mistral.py --- unsloth/models/mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index d7376d952..b2531056a 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -161,7 +161,7 @@ def MistralAttention_fast_forward( A = A.transpose(1, 2).contiguous() pass - attn_output = A.reshape(bsz, q_len, self.hidden_size) + attn_output = A.reshape(bsz, q_len, n_heads*head_dim) attn_output = self.apply_o(self, attn_output) attn_weights = None return attn_output, attn_weights, past_key_value From 983c2b601aa3418cac25011317e08b454bde2c31 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 14:31:35 -0700 Subject: [PATCH 007/147] Update llama.py --- unsloth/models/llama.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ee261b0f7..ba45bbbfb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -158,6 +158,14 @@ def LlamaAttention_fast_forward_inference( self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0") self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0") self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0") + + # Mistral Nemo 12b has weird dimensions + if attention_size != self.hidden_size: + self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0") + else: + self.temp_O = self.temp_QA[1][:,:,:self.hidden_size] + pass + self.attention = torch.empty((bsz, n_heads, 
1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0") self.scalar = 1.0 / math_sqrt(self.head_dim) self.half_head_dim = head_dim // 2 @@ -239,7 +247,7 @@ def LlamaAttention_fast_forward_inference( pass A = A.transpose(1, 2) A = A.reshape(bsz, 1, attention_size) - A = fast_linear_forward(self.o_proj, A, out = self.temp_QA[1][:,:,:self.hidden_size]) + A = fast_linear_forward(self.o_proj, A, out = self.temp_O) return A, (Kn, Vn) pass @@ -1152,7 +1160,7 @@ def from_pretrained( f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' print(statistics) - + # Warn about fast transfers if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": logger.warning_once("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") From ed56977a8c4b850d984517bc2da29319d20cc4c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 14:43:03 -0700 Subject: [PATCH 008/147] Update __init__.py --- unsloth/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 6a2d999b4..464068154 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -89,7 +89,7 @@ def is_bf16_supported(including_emulation = False): return old_is_bf16_supported(including_emulation) torch.cuda.is_bf16_supported = is_bf16_supported else: - def is_bf16_supported(): SUPPORTS_BFLOAT16 + def is_bf16_supported(): return SUPPORTS_BFLOAT16 torch.cuda.is_bf16_supported = is_bf16_supported pass From 2a251ec5948ea44d7332ab8b053b83276ae237ca Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 18:18:09 -0700 Subject: [PATCH 009/147] Update flex_attention.py --- unsloth/kernels/flex_attention.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/unsloth/kernels/flex_attention.py b/unsloth/kernels/flex_attention.py index 1eb248699..a992a0238 100644 --- a/unsloth/kernels/flex_attention.py +++ b/unsloth/kernels/flex_attention.py @@ -25,18 +25,23 @@ } # Flex Attention supported from torch 2.5 onwards only -import torch.nn.attention -if hasattr(torch.nn.attention, "flex_attention"): - import torch.nn.attention.flex_attention - from torch.nn.attention.flex_attention import flex_attention - from torch.nn.attention.flex_attention import create_block_mask - FLEX_ATTENTION_PADDING = getattr( - torch.nn.attention.flex_attention, - "_DEFAULT_SPARSE_BLOCK_SIZE", - 1, - ) - flex_attention = torch.compile(flex_attention, dynamic = False) - HAS_FLEX_ATTENTION = True +import torch.nn +if hasattr(torch.nn, "attention"): + import torch.nn.attention + if hasattr(torch.nn.attention, "flex_attention"): + import torch.nn.attention.flex_attention + from torch.nn.attention.flex_attention import flex_attention + from torch.nn.attention.flex_attention import create_block_mask + FLEX_ATTENTION_PADDING = getattr( + torch.nn.attention.flex_attention, + "_DEFAULT_SPARSE_BLOCK_SIZE", + 1, + ) + flex_attention = torch.compile(flex_attention, dynamic = False) + HAS_FLEX_ATTENTION = True + else: + HAS_FLEX_ATTENTION = False + pass else: HAS_FLEX_ATTENTION = False pass From 477793753f6aa4100b785a1a3557f34c3223cbcd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 21:57:33 -0700 Subject: [PATCH 010/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 
ba45bbbfb..212767b39 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1163,7 +1163,7 @@ def from_pretrained( # Warn about fast transfers if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": - logger.warning_once("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") + print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") pass model_patcher.pre_patch() From 152450462475b7621164be944fbe2945a26cddd0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 22:07:23 -0700 Subject: [PATCH 011/147] Update llama.py --- unsloth/models/llama.py | 53 ++++++++--------------------------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 212767b39..1ac96a4f2 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -158,14 +158,6 @@ def LlamaAttention_fast_forward_inference( self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0") self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0") self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0") - - # Mistral Nemo 12b has weird dimensions - if attention_size != self.hidden_size: - self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0") - else: - self.temp_O = self.temp_QA[1][:,:,:self.hidden_size] - pass - self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0") self.scalar = 1.0 / math_sqrt(self.head_dim) self.half_head_dim = head_dim // 2 @@ -247,7 +239,7 @@ def LlamaAttention_fast_forward_inference( pass A = A.transpose(1, 2) A = A.reshape(bsz, 1, attention_size) - A = fast_linear_forward(self.o_proj, A, out = self.temp_O) + A = fast_linear_forward(self.o_proj, A, out = self.temp_QA[1][:,:,:self.hidden_size]) return A, (Kn, Vn) pass @@ -343,9 +335,6 @@ def LlamaAttention_fast_forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - # Extend RoPE dynamically to fit in VRAM - self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) - if position_ids is None: cos = self.rotary_emb.cos_cached sin = self.rotary_emb.sin_cached @@ -673,12 +662,6 @@ def LlamaModel_fast_forward( offloaded_gradient_checkpointing = True pass - # Check for Flex Attention - # if IS_GEMMA2 and HAS_FLEX_ATTENTION: - # if not (seq_length % FLEX_ATTENTION_PADDING == 0): - # USE_FLEX_ATTENTION = True - - # Gemma2 has alternating SWA and global attn if IS_GEMMA2 and not hasattr(self, "SWA_mask"): n = self.config.max_position_embeddings @@ -982,21 +965,19 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this - self.current_rope_size = min(4 * 8192, self.max_position_embeddings) # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) + self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()) pass def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
- self.current_rope_size = seq_len + self.max_seq_len_cached = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() freqs = torch.outer(t, inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -1007,21 +988,14 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.current_rope_size: + if seq_len > self.max_seq_len_cached: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( - self.cos_cached[:seq_len].to(dtype = x.dtype), - self.sin_cached[:seq_len].to(dtype = x.dtype), + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), ) pass - - def extend_rope_embedding(self, x, seq_len): - if seq_len <= self.current_rope_size: return - # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 - self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) - pass pass @@ -1036,11 +1010,11 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s pass def _set_cos_sin_cache(self, seq_len, device, dtype): - self.current_rope_size = seq_len + self.max_seq_len_cached = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() t = t / self.scaling_factor freqs = torch.outer(t, inv_freq) @@ -1160,12 +1134,6 @@ def from_pretrained( f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' print(statistics) - - # Warn about fast transfers - if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": - print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") - pass - model_patcher.pre_patch() get_statistics() # For debugging - we use a download counter to see if environments are not breaking @@ -2113,5 +2081,4 @@ def for_training(model, use_gradient_checkpointing = True): internal_model._saved_temp_tokenizer.padding_side = "right" pass pass -pass - +pass \ No newline at end of file From c1d349370411f2c7861d5967c0c4a2ca59935670 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Jul 2024 22:53:23 -0700 Subject: [PATCH 012/147] Mistral Nemo --- unsloth/models/_utils.py | 31 ++++++++++++++++++--- unsloth/models/llama.py | 58 ++++++++++++++++++++++++++++++++------- unsloth/models/mistral.py | 22 ++++++++++++++- 3 files changed, 96 insertions(+), 15 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 025daec13..466a5fee7 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -65,8 +65,26 @@ # ============================================= # Edits all Config files to enable RoPE Scaling for all models -from transformers import PretrainedConfig +# Transformers had to update for Mistral Nemo 12b since Attention is (5120, 4096) now. 
+def patch_mistral_nemo_config(config): + if "head_dim (" not in config: + add_head_dim = "If it is not specified, will default to `8`.\n"\ + " head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):\n"\ + " The attention head dimension." + config = config.replace("If it is not specified, will default to `8`.", add_head_dim) + + add_head_dim = "num_key_value_heads=8,\n head_dim=None," + config = config.replace("num_key_value_heads=8,", add_head_dim) + + add_head_dim = "self.sliding_window = sliding_window\n self.head_dim = head_dim or hidden_size // num_attention_heads\n" + config = config.replace("self.sliding_window = sliding_window", add_head_dim) + pass + return config +pass + +from transformers import __version__ as transformers_version +from transformers import PretrainedConfig model_architectures = ["llama", "mistral", "gemma", "gemma2", "qwen2",] for model_name in model_architectures: @@ -87,8 +105,14 @@ r"\n self.rope_scaling = rope_scaling\n", config, ) - exec(config, globals()) + # Just for Mistral Nemo + if model_name == "mistral": + if Version(transformers_version) <= Version("4.42.4"): + config = patch_mistral_nemo_config(config) + pass + + exec(config, globals()) exec(f"import {config_filepath}", globals()) exec(f"{config_filepath}.{config_filename} = {config_filename}", globals()) pass @@ -97,7 +121,6 @@ # ============================================= # torch.cuda.amp.custom_fwd is deprecated >= 2.4 import torch -from packaging.version import Version if Version(torch.__version__) < Version("2.4.0"): torch_amp_custom_fwd = torch.cuda.amp.custom_fwd torch_amp_custom_bwd = torch.cuda.amp.custom_bwd @@ -748,7 +771,7 @@ def patch_linear_scaling( "self.rotary_emb = .+?\)", function, flags = re.DOTALL | re.MULTILINE, ) - if len(rotary_emb) == 0: return + if len(rotary_emb) == 0: return None, function rotary_emb = rotary_emb[0] function = function.replace(rotary_emb, fix_rope_function, 1) function = exec_code + "\n\n" + function diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 1ac96a4f2..ca4e65159 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -158,6 +158,14 @@ def LlamaAttention_fast_forward_inference( self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0") self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0") self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0") + + # Mistral Nemo 12b has weird dimensions + if attention_size != self.hidden_size: + self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0") + else: + self.temp_O = self.temp_QA[1][:,:,:self.hidden_size] + pass + self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0") self.scalar = 1.0 / math_sqrt(self.head_dim) self.half_head_dim = head_dim // 2 @@ -239,7 +247,7 @@ def LlamaAttention_fast_forward_inference( pass A = A.transpose(1, 2) A = A.reshape(bsz, 1, attention_size) - A = fast_linear_forward(self.o_proj, A, out = self.temp_QA[1][:,:,:self.hidden_size]) + A = fast_linear_forward(self.o_proj, A, out = self.temp_O) return A, (Kn, Vn) pass @@ -335,6 +343,9 @@ def LlamaAttention_fast_forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] + # Extend RoPE dynamically to fit in VRAM + self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) + if position_ids is None: cos = self.rotary_emb.cos_cached sin = self.rotary_emb.sin_cached 
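# For reference: the hunk above re-introduces the dynamic RoPE extension from PATCH 002.
# The cos/sin cache starts small and extend_rope_embedding() grows it just before attention
# runs. Below is a minimal standalone sketch of that growth policy; the class name
# DynamicRoPECache is illustrative only (not part of unsloth), and the logic mirrors the
# LlamaRotaryEmbedding changes in this series rather than reproducing them exactly.
import torch

class DynamicRoPECache:
    def __init__(self, dim, max_position_embeddings, base = 10000.0):
        self.dim  = dim
        self.base = base
        # Start at min(4 * 8192, max_position_embeddings) positions instead of allocating
        # the full context (e.g. 128K for Mistral Nemo) in VRAM up front.
        self.current_rope_size = min(4 * 8192, max_position_embeddings)
        self._set_cos_sin_cache(self.current_rope_size)

    def _set_cos_sin_cache(self, seq_len):
        self.current_rope_size = seq_len
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
        freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)
        emb = torch.cat((freqs, freqs), dim = -1)
        self.cos_cached, self.sin_cached = emb.cos(), emb.sin()

    def extend_rope_embedding(self, seq_len):
        # Called before attention: grow in 8192-position steps, as in the patch.
        if seq_len <= self.current_rope_size: return
        self._set_cos_sin_cache(int(round(seq_len / 8192)) * 8192)

    def get(self, seq_len):
        # Mirrors forward(): rebuild if the request still exceeds the cache, then slice.
        if seq_len > self.current_rope_size:
            self._set_cos_sin_cache(seq_len)
        return self.cos_cached[:seq_len], self.sin_cached[:seq_len]

# Usage sketch: rope = DynamicRoPECache(128, 131072); rope.extend_rope_embedding(40000)
# grows the cache from 32768 to 40960 positions instead of building all 131072 at load time.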
@@ -662,6 +673,12 @@ def LlamaModel_fast_forward( offloaded_gradient_checkpointing = True pass + # Check for Flex Attention + # if IS_GEMMA2 and HAS_FLEX_ATTENTION: + # if not (seq_length % FLEX_ATTENTION_PADDING == 0): + # USE_FLEX_ATTENTION = True + + # Gemma2 has alternating SWA and global attn if IS_GEMMA2 and not hasattr(self, "SWA_mask"): n = self.config.max_position_embeddings @@ -965,19 +982,21 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, self.max_position_embeddings) # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()) + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) pass def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() freqs = torch.outer(t, inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -988,14 +1007,21 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: + if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), ) pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass pass @@ -1010,11 +1036,11 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s pass def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) ) - t = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() t = t / self.scaling_factor freqs = torch.outer(t, inv_freq) @@ -1134,6 +1160,15 @@ def from_pretrained( f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. 
FA2 = {HAS_FLASH_ATTENTION}]\n"\ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' print(statistics) + + # Warn about fast transfers + old_hf_transfer = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") + if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1": + print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!") + pass + # Return old flag + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer + model_patcher.pre_patch() get_statistics() # For debugging - we use a download counter to see if environments are not breaking @@ -1215,6 +1250,8 @@ def from_pretrained( attn_implementation = "eager", **kwargs, ) + # Return old flag + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer # We currently only support NVIDIA GPUs - AMD / Intel is a work in progress! post_check = check_nvidia() @@ -2081,4 +2118,5 @@ def for_training(model, use_gradient_checkpointing = True): internal_model._saved_temp_tokenizer.padding_side = "right" pass pass -pass \ No newline at end of file +pass + diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index b2531056a..e0e034fc5 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -270,6 +270,24 @@ def MistralForCausalLM_fast_forward( pass +# Transformers had to update for Mistral Nemo 12b since Attention is (5120, 4096) now. +def patch_mistral_nemo_attention(function): + function = function.replace( + "(self.head_dim * self.num_heads) != self.hidden_size", + "False", + ) + function = function.replace( + "self.head_dim = self.hidden_size // self.num_heads", + "self.head_dim = config.head_dim", + ) + function = function.replace( + "self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)", + "self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)", + ) + return function +pass + + class FastMistralModel(FastLlamaModel): @staticmethod @@ -280,7 +298,9 @@ def pre_patch(): scaled_rope_module = LlamaLinearScalingRotaryEmbedding, attention_module = MistralAttention, ) - if init_name is not None: + # Just for Mistral Nemo models! 
+ function = patch_mistral_nemo_attention(function) + if True:#init_name is not None: exec(function, globals()) MistralAttention.__init__ = eval(init_name) pass From 10c13545c346990c78717b529af5cdac6d1856d1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 01:03:51 -0700 Subject: [PATCH 013/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index e0c89e451..7b88b0932 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -498,7 +498,8 @@ def load_correct_tokenizer( cache_dir = cache_dir, ) - if tokenizer_name in IGNORED_TOKENIZER_NAMES: pass + if tokenizer_name in IGNORED_TOKENIZER_NAMES: + return fast_tokenizer elif slow_tokenizer is not None: if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"): fast_tokenizer.add_bos_token = slow_tokenizer.add_bos_token From ad3d38ad4dde514b842688d6fa184e085eaf5320 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 01:06:41 -0700 Subject: [PATCH 014/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7b88b0932..07cd87412 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -41,14 +41,17 @@ )) -IGNORED_TOKENIZER_NAMES = frozenset(( +IGNORED_TOKENIZER_NAMES = [ "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", "unsloth/Mistral-Nemo-Instruct-2407", "mistralai/Mistral-Nemo-Instruct-2407", "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", -)) +] +IGNORED_TOKENIZER_NAMES = frozenset( + [x.lower() for x in IGNORED_TOKENIZER_NAMES] +) # Check environments keynames = "\n" + "\n".join(os.environ.keys()) From 8ee997cac28e3bc3ff205252ca543ab46ade3d25 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 01:29:52 -0700 Subject: [PATCH 015/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 07cd87412..3f75d1686 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -682,6 +682,11 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): embedding_matrix = model.get_input_embeddings ().weight lm_head_matrix = model.get_output_embeddings().weight + # Ignore some model checks for now + if model.config._name_or_path in IGNORED_TOKENIZER_NAMES: + return + pass + # Get untrained tokens indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps where_untrained = torch.where(indicator_untrained)[0] From 565a5a389460bc8e4d0f56cc5fb6276bbb658065 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 09:27:18 -0700 Subject: [PATCH 016/147] Fix Gemma --- unsloth/models/gemma.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index bc70b993a..ce89ad3be 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -210,22 +210,24 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, 
self.max_position_embeddings) # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()) + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) pass def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len # The difference is we do division explicity instead of t * (1/x) ie we do t/x. freq_exponents = (2.0 / self.dim) * ( torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float() ) timescale = self.base**freq_exponents - positions = torch.arange(self.max_seq_len_cached, device = "cpu", dtype = torch.int64).float() + positions = torch.arange(self.current_rope_size, device = "cpu", dtype = torch.int64).float() radians_new = positions[..., None] / timescale[None, None, :] radians_new = radians_new.squeeze(0) @@ -239,7 +241,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: + if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( @@ -247,6 +249,13 @@ def forward(self, x, position_ids=None, seq_len=None): self.sin_cached[:seq_len].to(dtype=x.dtype), ) pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass pass @@ -263,14 +272,14 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. - self.max_seq_len_cached = seq_len + self.current_rope_size = seq_len # The difference is we do division explicity instead of t * (1/x) ie we do t/x. freq_exponents = (2.0 / self.dim) * ( torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float() ) timescale = self.base**freq_exponents - positions = torch.arange(self.max_seq_len_cached, device = "cpu", dtype = torch.int64).float() + positions = torch.arange(self.current_rope_size, device = "cpu", dtype = torch.int64).float() positions = positions / self.scaling_factor radians_new = positions[..., None] / timescale[None, None, :] radians_new = radians_new.squeeze(0) From 182ab7e0cb28b21c0b3b119668ec3cd9aceb15de Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 19 Jul 2024 09:32:27 -0700 Subject: [PATCH 017/147] Update mistral.py --- unsloth/models/mistral.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index e0e034fc5..ed6207bb0 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -299,8 +299,9 @@ def pre_patch(): attention_module = MistralAttention, ) # Just for Mistral Nemo models! 
- function = patch_mistral_nemo_attention(function) - if True:#init_name is not None: + if function is not None: + function = patch_mistral_nemo_attention(function) + # if True:#init_name is not None: exec(function, globals()) MistralAttention.__init__ = eval(init_name) pass From 72e1b03544c3d23a0c28f883f242fa0f96e8091b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Jul 2024 11:53:32 -0700 Subject: [PATCH 018/147] Update llama.py --- unsloth/models/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ca4e65159..32610bbfd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1068,6 +1068,8 @@ def _fast_generate(*args, **kwargs): # For newer HF kwargs["cache_implementation"] = "dynamic" + print(kwargs) + # Set pad token # old_pad_token_id = getattr(model.config, "pad_token_id", None) # old_eos_token_id = getattr(model.config, "eos_token_id", None) From ba515ec92dbc85c03c65d3f31e10166cc73ef323 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Jul 2024 12:47:36 -0700 Subject: [PATCH 019/147] Update llama.py --- unsloth/models/llama.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 32610bbfd..ff51b90b8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1068,7 +1068,14 @@ def _fast_generate(*args, **kwargs): # For newer HF kwargs["cache_implementation"] = "dynamic" - print(kwargs) + # Remove token_type_ids + kwargs.pop("token_type_ids", None) + + # Check pad_token + kwargs["pad_token_id"] = kwargs.pop( + "pad_token_id", + getattr(model.config, "eos_token_id", None), + ) # Set pad token # old_pad_token_id = getattr(model.config, "pad_token_id", None) From 5f496efdb4db75371aa17d5b1b393f96cd55a2bd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Jul 2024 13:22:36 -0700 Subject: [PATCH 020/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 3f75d1686..0469f4d61 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -688,7 +688,12 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): pass # Get untrained tokens - indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps + indicator_untrained1 = torch.amax(embedding_matrix, axis = 1) <= eps + # Check lm_head as well + indicator_untrained2 = torch.amax(lm_head_matrix, axis = 1) <= eps + # Combine both checks + indicator_untrained = indicator_untrained1 & indicator_untrained2 + where_untrained = torch.where(indicator_untrained)[0] n_untrained = where_untrained.shape[0] n_trained = embedding_matrix.shape[0] - n_untrained From e41cc4093c70095e4aef390c8afae85c38aa4eb3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Jul 2024 13:25:59 -0700 Subject: [PATCH 021/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0469f4d61..8474c2c6b 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -42,12 +42,12 @@ IGNORED_TOKENIZER_NAMES = [ - "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", - "unsloth/Mistral-Nemo-Instruct-2407", - "mistralai/Mistral-Nemo-Instruct-2407", - "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", - "unsloth/Mistral-Nemo-Base-2407", - "mistralai/Mistral-Nemo-Base-2407", + # 
"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", + # "unsloth/Mistral-Nemo-Instruct-2407", + # "mistralai/Mistral-Nemo-Instruct-2407", + # "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", + # "unsloth/Mistral-Nemo-Base-2407", + # "mistralai/Mistral-Nemo-Base-2407", ] IGNORED_TOKENIZER_NAMES = frozenset( [x.lower() for x in IGNORED_TOKENIZER_NAMES] From c553b175d239f023882562ac727a92e6fcc95417 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 22 Jul 2024 22:58:02 -0700 Subject: [PATCH 022/147] Llama 3.1 --- unsloth/models/_utils.py | 93 +++++++++++++++++++++++++++++++++++++++- unsloth/models/llama.py | 73 +++++++++++++++++++++++++++++++ unsloth/models/mapper.py | 14 ++++++ 3 files changed, 179 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 466a5fee7..c7c779a23 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -33,6 +33,7 @@ "unsloth_offloaded_gradient_checkpoint", "torch_compile_options", "patch_linear_scaling", + "patch_llama_rope_scaling", "check_nvidia", "create_boolean_mask", "torch_amp_custom_fwd", @@ -332,7 +333,13 @@ def patch_tokenizer(model, tokenizer): Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! Fixes https://github.com/unslothai/unsloth/issues/5 """ - possible_reserved_tokens = ("<|reserved", "<|placeholder", "[control") + possible_reserved_tokens = ( + "<|reserved", # Llama-3 + "<|placeholder", # Phi-3 + "[control", # Forgot where lol + "", # Mistral Nemo + "<|finetune_right_pad_id|>", # Llama-3.1 + ) if model is not None: model.config.update({"unsloth_version" : __version__}) @@ -779,6 +786,90 @@ def patch_linear_scaling( pass +# Patches for Llama-3 LlamaExtendedRotaryEmbedding +def patch_llama_rope_scaling( + model_name = "llama", + rope_module = None, + scaled_rope_module = None, + extended_rope_module = None, + attention_module = None, +): + assert(\ + rope_module is not None and \ + scaled_rope_module is not None and \ + extended_rope_module is not None + ) + assert(attention_module is not None) + + rope_name = rope_module.__name__ + scaled_rope_name = scaled_rope_module.__name__ + model_filepath = f"transformers.models.{model_name}.modeling_{model_name}" + exec_code = \ + f"import torch.nn as nn\n"\ + f"from typing import Union, Optional, List, Any, Callable, Tuple\n"\ + f"from {model_filepath} import logger, "\ + f"{model_name.title()}Attention, {model_name.title()}Config" + + try: + function = inspect.getsource(attention_module.__init__) + except: + # Most likely already patched! 
+ return None, None + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + init_name = f"{model_name.title()}Attention__init__" + function = function.replace("def __init__", f"def {init_name}") + function = function.replace( + "super().__init__()", + f"super({model_name.title()}Attention, self).__init__()", + ) + fix_rope_function = """ + if getattr(self.config, "rope_scaling", None) is None: + # Hack + if self.config.max_position_embeddings == 131072 + self.rotary_emb = {rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling.get("factor") + if scaling_type == "linear": + self.rotary_emb = {scaled_rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "extended": + self.rotary_emb = {extended_rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") + pass + """ + fix_rope_function = fix_rope_function.format( + rope_function = rope_module.__name__, + scaled_rope_function = scaled_rope_module.__name__, + extended_rope_function = extended_rope_module.__name__, + ) + rotary_emb = re.findall( + "self.rotary_emb = .+?\)", function, + flags = re.DOTALL | re.MULTILINE, + ) + if len(rotary_emb) == 0: return None, function + rotary_emb = rotary_emb[0] + function = function.replace(rotary_emb, fix_rope_function, 1) + function = exec_code + "\n\n" + function + return init_name, function +pass + + def check_nvidia(): # Unsloth doesn't work yet on AMD devices - we're working on it! output = np.array([0,]) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ff51b90b8..2d224b3ca 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1052,6 +1052,68 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass +# See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 +# For Llama 3.1 +class LlamaExtendedRotaryEmbedding(LlamaRotaryEmbedding): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, self.max_position_embeddings) + + # Normal Llama-3 RoPE + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) + ) + inv_freq = self.apply_scaling(inv_freq) + self.register_buffer("inv_freq", inv_freq, persistent = False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) + pass + + def _set_cos_sin_cache(self, seq_len, device, dtype): + # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and + # in FP32. They are applied (multiplied) in FP32 as well. 
+ self.current_rope_size = seq_len + + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False) + pass + + def apply_scaling(self, freqs: torch.Tensor): + scale_factor = 8 + low_freq_factor = 1 + high_freq_factor = 4 + old_context_len = 8192 + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scale_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor) + new_freqs.append((1 - smooth) * freq / scale_factor + + smooth * freq) + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + pass +pass + + def _wrap_fast_inference(generate, device_type, dtype, model): # Wraps inference with bfloat16 / float16 @torch.inference_mode @@ -1108,6 +1170,17 @@ class FastLlamaModel: @staticmethod def pre_patch(): + init_name, function = patch_llama_rope_scaling( + model_name = "llama", + rope_module = LlamaRotaryEmbedding, + scaled_rope_module = LlamaLinearScalingRotaryEmbedding, + extended_rope_module = LlamaExtendedRotaryEmbedding, + attention_module = LlamaAttention, + ) + if init_name is not None: + exec(function, globals()) + LlamaAttention.__init__ = eval(init_name) + pass LlamaAttention .forward = LlamaAttention_fast_forward LlamaSdpaAttention .forward = LlamaAttention_fast_forward LlamaFlashAttention2.forward = LlamaAttention_fast_forward diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 38cbdbe99..462c85f2a 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,6 +218,20 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), + "unsloth/llama-3.1-8b-bnb-4bit" : ( + "unsloth/llama-3.1-8b", + "meta-llama/Meta-Llama-3.1-8B", + ), + "unsloth/llama-3.1-8b-Instruct-bnb-4bit" : ( + "unsloth/llama-3.1-8b-Instruct", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + ), + "unsloth/llama-3.1-70b-bnb-4bit" : ( + "meta-llama/Meta-Llama-3.1-70B", + ), + "unsloth/llama-3.1-70b-Instruct-bnb-4bit" : ( + "meta-llama/Meta-Llama-3.1-70B-Instruct", + ), } INT_TO_FLOAT_MAPPER = {} From 00ad7992f69ea086ee4b8e9229d6c901ace494c5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 22 Jul 2024 23:01:18 -0700 Subject: [PATCH 023/147] Update _utils.py --- unsloth/models/_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index c7c779a23..27eb226f2 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -826,13 +826,19 @@ def patch_llama_rope_scaling( ) fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: - # Hack - if self.config.max_position_embeddings == 131072 - self.rotary_emb = {rope_function}( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) + # Hack to check for Llama-3.1 + 
if 'llama-3.1' in str(self.config.config._name_or_path).lower(): + self.rotary_emb = {extended_rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + self.rotary_emb = {rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) else: scaling_type = self.config.rope_scaling["type"] scaling_factor = self.config.rope_scaling.get("factor") From ae2d1b6cacf8ce46c5aed68ef44921f6a498d8e2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:27:36 -0700 Subject: [PATCH 024/147] Llama 3.1 --- README.md | 5 +++-- unsloth/models/_utils.py | 2 ++ unsloth/models/llama.py | 12 +++++++----- unsloth/models/mapper.py | 14 ++++++++------ 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 05977bad7..c666f2d9c 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| -| **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral Nemo (12B)** | [▶️ Start for free](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) | 2x faster | 60% less | | **Gemma 2 (9B)** | [▶️ Start for free](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) | 2x faster | 63% less | | **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | @@ -32,13 +32,14 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | -- **Kaggle Notebooks** for [Llama 3 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- **Kaggle Notebooks** for [Llama 3.1 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral v0.3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News +- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported - 📣 UPDATE! [Phi-3 mini](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) model updated. [Phi-3 Medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) 2x faster finetuning. 
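# For reference: the _utils.py hunk below routes rope_scaling type "llama3" to the
# LlamaExtendedRotaryEmbedding added in PATCH 022. Its apply_scaling rule is restated here
# as a standalone function for readability; the name llama31_scale_inv_freq is illustrative
# only, while the constants are the grid-searched values hard-coded in that patch.
import math
import torch

def llama31_scale_inv_freq(inv_freq: torch.Tensor) -> torch.Tensor:
    scale_factor, low_freq_factor, high_freq_factor = 8, 1, 4
    old_context_len = 8192  # original Llama-3 context length
    low_freq_wavelen  = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    new_freqs = []
    for freq in inv_freq.tolist():
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            # High-frequency components are kept unchanged
            new_freqs.append(freq)
        elif wavelen > low_freq_wavelen:
            # Low-frequency (long-wavelength) components are divided by the scale factor
            new_freqs.append(freq / scale_factor)
        else:
            # The band in between is blended smoothly between the two regimes
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
    return torch.tensor(new_freqs, dtype = inv_freq.dtype)

# This frequency reshaping is what lets Llama 3.1 stretch the original 8192-token RoPE
# out to the 128K context advertised above while leaving short-range positions intact.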
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 27eb226f2..394213b9f 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -868,9 +868,11 @@ def patch_llama_rope_scaling( "self.rotary_emb = .+?\)", function, flags = re.DOTALL | re.MULTILINE, ) + print(rotary_emb) if len(rotary_emb) == 0: return None, function rotary_emb = rotary_emb[0] function = function.replace(rotary_emb, fix_rope_function, 1) + print(function) function = exec_code + "\n\n" + function return init_name, function pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2d224b3ca..58fcc9276 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1088,11 +1088,13 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False) pass - def apply_scaling(self, freqs: torch.Tensor): + # From https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py#L41 + def apply_scaling(freqs: torch.Tensor): + # Values obtained from grid search scale_factor = 8 low_freq_factor = 1 high_freq_factor = 4 - old_context_len = 8192 + old_context_len = 8192 # original llama3 length low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -1106,9 +1108,9 @@ def apply_scaling(self, freqs: torch.Tensor): else: assert low_freq_wavelen != high_freq_wavelen smooth = (old_context_len / wavelen - low_freq_factor) / ( - high_freq_factor - low_freq_factor) - new_freqs.append((1 - smooth) * freq / scale_factor + - smooth * freq) + high_freq_factor - low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) pass pass diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462c85f2a..fc13c94e8 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,18 +218,20 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), - "unsloth/llama-3.1-8b-bnb-4bit" : ( - "unsloth/llama-3.1-8b", + "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B", "meta-llama/Meta-Llama-3.1-8B", ), - "unsloth/llama-3.1-8b-Instruct-bnb-4bit" : ( - "unsloth/llama-3.1-8b-Instruct", + "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct", ), - "unsloth/llama-3.1-70b-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B", "meta-llama/Meta-Llama-3.1-70B", ), - "unsloth/llama-3.1-70b-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B-Instruct", "meta-llama/Meta-Llama-3.1-70B-Instruct", ), } From 77c502cc0c97a84cf9230308919ad56aed9ef4f9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:33:07 -0700 Subject: [PATCH 025/147] Update _utils.py --- unsloth/models/_utils.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 394213b9f..3ea38eab1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -826,22 +826,17 @@ def patch_llama_rope_scaling( ) fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: - # Hack to check for Llama-3.1 - if 'llama-3.1' in str(self.config.config._name_or_path).lower(): - self.rotary_emb = {extended_rope_function}( - self.head_dim, - 
max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - self.rotary_emb = {rope_function}( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) + self.rotary_emb = {rope_function}( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) else: - scaling_type = self.config.rope_scaling["type"] + scaling_type1 = self.config.rope_scaling.get("type", None) + scaling_type2 = self.config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 scaling_factor = self.config.rope_scaling.get("factor") + if scaling_type == "linear": self.rotary_emb = {scaled_rope_function}( self.head_dim, @@ -849,7 +844,7 @@ def patch_llama_rope_scaling( scaling_factor=scaling_factor, base=self.rope_theta, ) - elif scaling_type == "extended": + elif scaling_type == "llama3": self.rotary_emb = {extended_rope_function}( self.head_dim, max_position_embeddings=self.max_position_embeddings, From 41ee26ce655e05769172c209ea6dd3f8174baefc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:35:03 -0700 Subject: [PATCH 026/147] Update llama.py --- unsloth/models/llama.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 58fcc9276..403a7130b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1054,7 +1054,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 # For Llama 3.1 -class LlamaExtendedRotaryEmbedding(LlamaRotaryEmbedding): +class LlamaExtendedRotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() self.dim = dim @@ -1113,6 +1113,24 @@ def apply_scaling(freqs: torch.Tensor): new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) pass + + def forward(self, x, position_ids=None, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.current_rope_size: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), + ) + pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass pass From 3dabf84ab67164c5a4c42c1ea598bdfbee320c6f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:36:06 -0700 Subject: [PATCH 027/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 403a7130b..830b345d7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1089,7 +1089,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass # From https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py#L41 - def apply_scaling(freqs: torch.Tensor): + def apply_scaling(self, freqs: torch.Tensor): # Values obtained from grid search scale_factor = 8 low_freq_factor = 1 From 07634b920399ffd0546be7a460bc27c48cc60b34 Mon 
Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:43:36 -0700 Subject: [PATCH 028/147] hack for rotary --- unsloth/models/gemma.py | 8 ++++++-- unsloth/models/llama.py | 12 +++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index ce89ad3be..6c9a57abf 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -205,7 +205,9 @@ class GemmaFixedRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings @@ -264,7 +266,9 @@ class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + config = None, # [TODO] Hack to pass in config - need to remove later + ): self.scaling_factor = scaling_factor super().__init__(dim, max_position_embeddings, base, device) pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 830b345d7..929c32496 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -977,7 +977,9 @@ class LlamaRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings @@ -1030,7 +1032,9 @@ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + config = None, # [TODO] Hack to pass in config - need to remove later + ): self.scaling_factor = scaling_factor super().__init__(dim, max_position_embeddings, base, device) pass @@ -1055,7 +1059,9 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 # For Llama 3.1 class LlamaExtendedRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings From 4a46220131efa70892b48468406dc3bcaaf569bc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:48:45 -0700 Subject: [PATCH 029/147] patch RoPE --- unsloth/models/_utils.py | 12 ++++++------ unsloth/models/gemma.py | 6 ++++-- unsloth/models/llama.py | 9 ++++++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3ea38eab1..2b8410032 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -752,7 +752,7 @@ def patch_linear_scaling( fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: self.rotary_emb = {rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) @@ -761,7 +761,7 @@ def patch_linear_scaling( scaling_factor = self.config.rope_scaling["factor"] if scaling_type == "linear": self.rotary_emb = {scaled_rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor, base=self.rope_theta, @@ -827,7 +827,7 @@ def patch_llama_rope_scaling( fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: self.rotary_emb = {rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) @@ -836,17 +836,17 @@ def patch_llama_rope_scaling( scaling_type2 = self.config.rope_scaling.get("rope_type", None) scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 scaling_factor = self.config.rope_scaling.get("factor") - + if scaling_type == "linear": self.rotary_emb = {scaled_rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor, base=self.rope_theta, ) elif scaling_type == "llama3": self.rotary_emb = {extended_rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 6c9a57abf..3dccf63ae 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -205,9 +205,10 @@ class GemmaFixedRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings @@ -266,9 +267,10 @@ class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later self.scaling_factor = scaling_factor super().__init__(dim, max_position_embeddings, base, device) pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 929c32496..d043f03d1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -977,9 +977,10 @@ class LlamaRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings @@ -1032,9 +1033,10 @@ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later self.scaling_factor = scaling_factor super().__init__(dim, max_position_embeddings, base, device) pass @@ -1059,9 +1061,10 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 # For Llama 3.1 class LlamaExtendedRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): + if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings From 2d9f189cbe977c4d5bafc9629e9aa0558e373e96 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:53:31 -0700 Subject: [PATCH 030/147] refix rope --- unsloth/models/gemma.py | 5 ++--- unsloth/models/llama.py | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 3dccf63ae..e3f1e615d 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -208,8 +208,8 @@ class GemmaFixedRotaryEmbedding(torch.nn.Module): def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() + if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -270,9 +270,8 @@ class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding): def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) + super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config) pass def _set_cos_sin_cache(self, seq_len, device, dtype): diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d043f03d1..a4a6527ff 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -980,8 +980,9 @@ class LlamaRotaryEmbedding(torch.nn.Module): def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() + if config is not None: return # [TODO] Hack to pass in config - need to remove later + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -1036,9 +1037,8 @@ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): def __init__(self, dim = None, max_position_embeddings=2048, 
base=10000, device=None, scaling_factor=1.0, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) + super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config) pass def _set_cos_sin_cache(self, seq_len, device, dtype): @@ -1064,8 +1064,9 @@ class LlamaExtendedRotaryEmbedding(torch.nn.Module): def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, config = None, # [TODO] Hack to pass in config - need to remove later ): - if config is not None: return # [TODO] Hack to pass in config - need to remove later super().__init__() + if config is not None: return # [TODO] Hack to pass in config - need to remove later + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base From 80d62c3fa6ae248623c974b2926b61c3dba62da3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:54:54 -0700 Subject: [PATCH 031/147] Update _utils.py --- unsloth/models/_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 2b8410032..b021e89e9 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -863,11 +863,9 @@ def patch_llama_rope_scaling( "self.rotary_emb = .+?\)", function, flags = re.DOTALL | re.MULTILINE, ) - print(rotary_emb) if len(rotary_emb) == 0: return None, function rotary_emb = rotary_emb[0] function = function.replace(rotary_emb, fix_rope_function, 1) - print(function) function = exec_code + "\n\n" + function return init_name, function pass From 7d7a5f77655b373c0c50b8df7a2a43ee950dc852 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 10:58:31 -0700 Subject: [PATCH 032/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index a4a6527ff..a4b655216 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1065,7 +1065,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - if config is not None: return # [TODO] Hack to pass in config - need to remove later + # if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim self.max_position_embeddings = max_position_embeddings From 2f9bd5bcb61f1530a48ee08bbdd5adbd4ec39a33 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:09:24 -0700 Subject: [PATCH 033/147] Llama 3.1 check --- pyproject.toml | 4 ++-- unsloth/models/loader.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 29b35577e..829b35ad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ exclude = ["images*"] huggingface = [ "packaging", "tyro", - "transformers>=4.42.3", + "transformers>=4.43.1", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -188,7 +188,7 @@ colab-ampere-torch220 = [ colab-new = [ "packaging", "tyro", - "transformers>=4.42.3", + "transformers>=4.43.1", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0f170597b..ece8af282 100644 --- a/unsloth/models/loader.py +++ 
b/unsloth/models/loader.py @@ -27,6 +27,7 @@ SUPPORTS_FOURBIT = transformers_version >= Version("4.37") SUPPORTS_GEMMA = transformers_version >= Version("4.38") SUPPORTS_GEMMA2 = transformers_version >= Version("4.42") +SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1") if SUPPORTS_GEMMA: from .gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -130,7 +131,19 @@ def from_pretrained( model_type = model_config.model_type - if model_type == "llama": dispatch_model = FastLlamaModel + if model_type == "llama": + scaling_type1 = model_config.rope_scaling.get("type", None) + scaling_type2 = model_config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + + if scaling_type == "llama3" and not SUPPORTS_LLAMA31: + raise ImportError( + f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\ + f"The minimum required version is 4.43.1\n"\ + f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\ + f"to obtain the latest transformers build, then restart this session."\ + ) + dispatch_model = FastLlamaModel elif model_type == "mistral": dispatch_model = FastMistralModel elif model_type == "gemma": if not SUPPORTS_GEMMA: From 740979b1b9af32d39af7904973a71aaadf009984 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:12:58 -0700 Subject: [PATCH 034/147] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index a4b655216..295d92f62 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1010,6 +1010,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass def forward(self, x, position_ids=None, seq_len=None): + print(x, position_ids, seq_len) # x: [bs, num_attention_heads, seq_len, head_size] if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) From 47d230b3cd043306463e2b76bd8023f867427ea2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:13:15 -0700 Subject: [PATCH 035/147] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 295d92f62..ff4d19c54 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1010,7 +1010,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass def forward(self, x, position_ids=None, seq_len=None): - print(x, position_ids, seq_len) + print(__LINE__, x, position_ids, seq_len) # x: [bs, num_attention_heads, seq_len, head_size] if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) @@ -1127,6 +1127,7 @@ def apply_scaling(self, freqs: torch.Tensor): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] + print(__LINE__, x, position_ids, seq_len) if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) From f849b8b61f387b672d74de4a4372d03fdebcf809 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:15:35 -0700 Subject: [PATCH 036/147] Update llama.py --- unsloth/models/llama.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ff4d19c54..d2bbb5a3a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1010,7 +1010,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): 
pass def forward(self, x, position_ids=None, seq_len=None): - print(__LINE__, x, position_ids, seq_len) # x: [bs, num_attention_heads, seq_len, head_size] if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) @@ -1066,6 +1065,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() + print(__FILE__, __LINE__) # if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim @@ -1080,6 +1080,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= ) inv_freq = self.apply_scaling(inv_freq) self.register_buffer("inv_freq", inv_freq, persistent = False) + print(__FILE__, __LINE__) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) @@ -1089,6 +1090,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. self.current_rope_size = seq_len + print(__FILE__, __LINE__) t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() @@ -1127,7 +1129,6 @@ def apply_scaling(self, freqs: torch.Tensor): def forward(self, x, position_ids=None, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - print(__LINE__, x, position_ids, seq_len) if seq_len > self.current_rope_size: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) From 6157cef3d1a37bb432389686ba35038d751b6ba6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:16:31 -0700 Subject: [PATCH 037/147] Update llama.py --- unsloth/models/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d2bbb5a3a..de9eb80da 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1080,7 +1080,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= ) inv_freq = self.apply_scaling(inv_freq) self.register_buffer("inv_freq", inv_freq, persistent = False) - print(__FILE__, __LINE__) + print(__LINE__) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) @@ -1090,7 +1090,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
self.current_rope_size = seq_len - print(__FILE__, __LINE__) + print(__LINE__) t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() From 5da00a946e2af9ebfd1aaf1f3885e94b628745a3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:16:40 -0700 Subject: [PATCH 038/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index de9eb80da..3f358fe67 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1065,7 +1065,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - print(__FILE__, __LINE__) + print(__LINE__) # if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim From 2ff7d8368c44c78db1e8cd10326b3c88055d8832 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:18:12 -0700 Subject: [PATCH 039/147] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3f358fe67..3085ccd0b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1065,7 +1065,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - print(__LINE__) + print(1068) # if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim @@ -1080,7 +1080,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= ) inv_freq = self.apply_scaling(inv_freq) self.register_buffer("inv_freq", inv_freq, persistent = False) - print(__LINE__) + print(1083) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) @@ -1090,7 +1090,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
self.current_rope_size = seq_len - print(__LINE__) + print(1093) t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() From 7c441f3480b217e1909d7b7eb53eb77a6481c7fc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:21:29 -0700 Subject: [PATCH 040/147] Update llama.py --- unsloth/models/llama.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3085ccd0b..f9981f56e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1065,9 +1065,15 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - print(1068) - # if config is not None: return # [TODO] Hack to pass in config - need to remove later - + if config is not None: + # [TODO] Hack to pass in config - need to remove later + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -1080,7 +1086,6 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= ) inv_freq = self.apply_scaling(inv_freq) self.register_buffer("inv_freq", inv_freq, persistent = False) - print(1083) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) @@ -1090,7 +1095,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and # in FP32. They are applied (multiplied) in FP32 as well. 
self.current_rope_size = seq_len - print(1093) t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() From 5d9245660fd3a739d992f4a0e717ee8c85bdb635 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:22:27 -0700 Subject: [PATCH 041/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index f9981f56e..8fc480d14 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1073,7 +1073,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= device = "cuda" max_position_embeddings = config.max_position_embeddings pass - + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base From 4a3fddd055333f2eeb4ba58cdbf374e449ce3c3a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:23:00 -0700 Subject: [PATCH 042/147] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8fc480d14..474dad329 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1067,13 +1067,14 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= super().__init__() if config is not None: # [TODO] Hack to pass in config - need to remove later + print(1) base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) device = "cuda" max_position_embeddings = config.max_position_embeddings pass - + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base From ca3a1b7315c54ccffafe60b5c7abe6869cd7be6a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:23:18 -0700 Subject: [PATCH 043/147] Update llama.py --- unsloth/models/llama.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 474dad329..aef04d604 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -981,8 +981,16 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= config = None, # [TODO] Hack to pass in config - need to remove later ): super().__init__() - if config is not None: return # [TODO] Hack to pass in config - need to remove later - + if config is not None: + # [TODO] Hack to pass in config - need to remove later + print(2) + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base From b93a75778e2ef0b7c9b83b1cc329c6e2f7649b73 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:24:58 -0700 Subject: [PATCH 044/147] Update llama.py --- unsloth/models/llama.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index aef04d604..338ae0a7c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -983,7 +983,6 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= super().__init__() if config is not None: # [TODO] Hack to pass in config - need to remove later - print(2) 
base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) @@ -1075,7 +1074,6 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= super().__init__() if config is not None: # [TODO] Hack to pass in config - need to remove later - print(1) base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) From c86b13d46512c5e7a8b2221e885a3a00eb0ad59a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:40:49 -0700 Subject: [PATCH 045/147] Llama 3.1 (#797) * Llama 3.1 * Update _utils.py * Llama 3.1 * Update _utils.py * Update llama.py * Update llama.py * hack for rotary * patch RoPE * refix rope * Update _utils.py * Update llama.py * Llama 3.1 check * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py --- README.md | 5 +- pyproject.toml | 4 +- unsloth/models/_utils.py | 98 ++++++++++++++++++++++++++++++- unsloth/models/gemma.py | 11 +++- unsloth/models/llama.py | 123 ++++++++++++++++++++++++++++++++++++++- unsloth/models/loader.py | 15 ++++- unsloth/models/mapper.py | 16 +++++ 7 files changed, 258 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 05977bad7..c666f2d9c 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| -| **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral Nemo (12B)** | [▶️ Start for free](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) | 2x faster | 60% less | | **Gemma 2 (9B)** | [▶️ Start for free](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) | 2x faster | 63% less | | **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | @@ -32,13 +32,14 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | -- **Kaggle Notebooks** for [Llama 3 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- **Kaggle Notebooks** for [Llama 3.1 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral v0.3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News +- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported - 📣 UPDATE! [Phi-3 mini](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) model updated. [Phi-3 Medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) 2x faster finetuning. diff --git a/pyproject.toml b/pyproject.toml index 29b35577e..829b35ad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ exclude = ["images*"] huggingface = [ "packaging", "tyro", - "transformers>=4.42.3", + "transformers>=4.43.1", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -188,7 +188,7 @@ colab-ampere-torch220 = [ colab-new = [ "packaging", "tyro", - "transformers>=4.42.3", + "transformers>=4.43.1", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 466a5fee7..b021e89e9 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -33,6 +33,7 @@ "unsloth_offloaded_gradient_checkpoint", "torch_compile_options", "patch_linear_scaling", + "patch_llama_rope_scaling", "check_nvidia", "create_boolean_mask", "torch_amp_custom_fwd", @@ -332,7 +333,13 @@ def patch_tokenizer(model, tokenizer): Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! 
Fixes https://github.com/unslothai/unsloth/issues/5 """ - possible_reserved_tokens = ("<|reserved", "<|placeholder", "[control") + possible_reserved_tokens = ( + "<|reserved", # Llama-3 + "<|placeholder", # Phi-3 + "[control", # Forgot where lol + "", # Mistral Nemo + "<|finetune_right_pad_id|>", # Llama-3.1 + ) if model is not None: model.config.update({"unsloth_version" : __version__}) @@ -745,7 +752,7 @@ def patch_linear_scaling( fix_rope_function = """ if getattr(self.config, "rope_scaling", None) is None: self.rotary_emb = {rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) @@ -754,7 +761,7 @@ def patch_linear_scaling( scaling_factor = self.config.rope_scaling["factor"] if scaling_type == "linear": self.rotary_emb = {scaled_rope_function}( - self.head_dim, + dim = self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor, base=self.rope_theta, @@ -779,6 +786,91 @@ def patch_linear_scaling( pass +# Patches for Llama-3 LlamaExtendedRotaryEmbedding +def patch_llama_rope_scaling( + model_name = "llama", + rope_module = None, + scaled_rope_module = None, + extended_rope_module = None, + attention_module = None, +): + assert(\ + rope_module is not None and \ + scaled_rope_module is not None and \ + extended_rope_module is not None + ) + assert(attention_module is not None) + + rope_name = rope_module.__name__ + scaled_rope_name = scaled_rope_module.__name__ + model_filepath = f"transformers.models.{model_name}.modeling_{model_name}" + exec_code = \ + f"import torch.nn as nn\n"\ + f"from typing import Union, Optional, List, Any, Callable, Tuple\n"\ + f"from {model_filepath} import logger, "\ + f"{model_name.title()}Attention, {model_name.title()}Config" + + try: + function = inspect.getsource(attention_module.__init__) + except: + # Most likely already patched! 
+ return None, None + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + init_name = f"{model_name.title()}Attention__init__" + function = function.replace("def __init__", f"def {init_name}") + function = function.replace( + "super().__init__()", + f"super({model_name.title()}Attention, self).__init__()", + ) + fix_rope_function = """ + if getattr(self.config, "rope_scaling", None) is None: + self.rotary_emb = {rope_function}( + dim = self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type1 = self.config.rope_scaling.get("type", None) + scaling_type2 = self.config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + scaling_factor = self.config.rope_scaling.get("factor") + + if scaling_type == "linear": + self.rotary_emb = {scaled_rope_function}( + dim = self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "llama3": + self.rotary_emb = {extended_rope_function}( + dim = self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") + pass + """ + fix_rope_function = fix_rope_function.format( + rope_function = rope_module.__name__, + scaled_rope_function = scaled_rope_module.__name__, + extended_rope_function = extended_rope_module.__name__, + ) + rotary_emb = re.findall( + "self.rotary_emb = .+?\)", function, + flags = re.DOTALL | re.MULTILINE, + ) + if len(rotary_emb) == 0: return None, function + rotary_emb = rotary_emb[0] + function = function.replace(rotary_emb, fix_rope_function, 1) + function = exec_code + "\n\n" + function + return init_name, function +pass + + def check_nvidia(): # Unsloth doesn't work yet on AMD devices - we're working on it! output = np.array([0,]) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index ce89ad3be..e3f1e615d 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -205,8 +205,11 @@ class GemmaFixedRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() + if config is not None: return # [TODO] Hack to pass in config - need to remove later self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -264,9 +267,11 @@ class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + config = None, # [TODO] Hack to pass in config - need to remove later + ): self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) + super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config) pass def _set_cos_sin_cache(self, seq_len, device, dtype): diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ff51b90b8..338ae0a7c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -977,8 +977,19 @@ class LlamaRotaryEmbedding(torch.nn.Module): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): super().__init__() + if config is not None: + # [TODO] Hack to pass in config - need to remove later + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -1030,9 +1041,11 @@ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): # Fixes https://github.com/huggingface/transformers/pull/28837 # https://github.com/microsoft/DeepSpeed/issues/4932 # The precision of RoPE buffers is not correct, so we cast to int64. 
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, + config = None, # [TODO] Hack to pass in config - need to remove later + ): self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) + super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config) pass def _set_cos_sin_cache(self, seq_len, device, dtype): @@ -1052,6 +1065,99 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): pass +# See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736 +# For Llama 3.1 +class LlamaExtendedRotaryEmbedding(torch.nn.Module): + def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): + super().__init__() + if config is not None: + # [TODO] Hack to pass in config - need to remove later + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(4 * 8192, self.max_position_embeddings) + + # Normal Llama-3 RoPE + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim) + ) + inv_freq = self.apply_scaling(inv_freq) + self.register_buffer("inv_freq", inv_freq, persistent = False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) + pass + + def _set_cos_sin_cache(self, seq_len, device, dtype): + # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and + # in FP32. They are applied (multiplied) in FP32 as well. 
+ self.current_rope_size = seq_len + + t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False) + pass + + # From https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py#L41 + def apply_scaling(self, freqs: torch.Tensor): + # Values obtained from grid search + scale_factor = 8 + low_freq_factor = 1 + high_freq_factor = 4 + old_context_len = 8192 # original llama3 length + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scale_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + pass + + def forward(self, x, position_ids=None, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.current_rope_size: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), + ) + pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass +pass + + def _wrap_fast_inference(generate, device_type, dtype, model): # Wraps inference with bfloat16 / float16 @torch.inference_mode @@ -1108,6 +1214,17 @@ class FastLlamaModel: @staticmethod def pre_patch(): + init_name, function = patch_llama_rope_scaling( + model_name = "llama", + rope_module = LlamaRotaryEmbedding, + scaled_rope_module = LlamaLinearScalingRotaryEmbedding, + extended_rope_module = LlamaExtendedRotaryEmbedding, + attention_module = LlamaAttention, + ) + if init_name is not None: + exec(function, globals()) + LlamaAttention.__init__ = eval(init_name) + pass LlamaAttention .forward = LlamaAttention_fast_forward LlamaSdpaAttention .forward = LlamaAttention_fast_forward LlamaFlashAttention2.forward = LlamaAttention_fast_forward diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0f170597b..ece8af282 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -27,6 +27,7 @@ SUPPORTS_FOURBIT = transformers_version >= Version("4.37") SUPPORTS_GEMMA = transformers_version >= Version("4.38") SUPPORTS_GEMMA2 = transformers_version >= Version("4.42") +SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1") if SUPPORTS_GEMMA: from .gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -130,7 +131,19 @@ def from_pretrained( model_type = model_config.model_type - if model_type == "llama": dispatch_model = FastLlamaModel + if model_type == "llama": + scaling_type1 = 
model_config.rope_scaling.get("type", None) + scaling_type2 = model_config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + + if scaling_type == "llama3" and not SUPPORTS_LLAMA31: + raise ImportError( + f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\ + f"The minimum required version is 4.43.1\n"\ + f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\ + f"to obtain the latest transformers build, then restart this session."\ + ) + dispatch_model = FastLlamaModel elif model_type == "mistral": dispatch_model = FastMistralModel elif model_type == "gemma": if not SUPPORTS_GEMMA: diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 38cbdbe99..fc13c94e8 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,6 +218,22 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), + "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B", + "meta-llama/Meta-Llama-3.1-8B", + ), + "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B-Instruct", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + ), + "unsloth/Meta-Llama-3.1-70B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B", + "meta-llama/Meta-Llama-3.1-70B", + ), + "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-70B-Instruct", + "meta-llama/Meta-Llama-3.1-70B-Instruct", + ), } INT_TO_FLOAT_MAPPER = {} From 22968a2134f3fb265a6158610a8ef173ba9547aa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 11:51:08 -0700 Subject: [PATCH 046/147] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c666f2d9c..e7ef854cf 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ unsloth logo - + @@ -22,7 +22,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| -| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) | 2x faster | 60% less | | **Mistral Nemo (12B)** | [▶️ Start for free](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) | 2x faster | 60% less | | **Gemma 2 (9B)** | [▶️ Start for free](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) | 2x faster | 63% less | | **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | @@ -39,7 +39,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News -- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) both Base and Instruct now supported +- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! 
[Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported - 📣 UPDATE! [Phi-3 mini](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) model updated. [Phi-3 Medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) 2x faster finetuning. From 824511e265ff9c45b2448d4c89c93d0306c42741 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:07:27 -0700 Subject: [PATCH 047/147] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e7ef854cf..1c98c43f1 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | -- **Kaggle Notebooks** for [Llama 3.1 (8B)](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- **Kaggle Notebooks** for [Llama 3.1 (8B)](https://www.kaggle.com/danielhanchen/kaggle-llama-3-1-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral v0.3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language From 777453967fc8476a846983e9c5eeab3382b88543 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:12:29 -0700 Subject: [PATCH 048/147] Update loader.py --- unsloth/models/loader.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ece8af282..85416b81b 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -132,9 +132,12 @@ def from_pretrained( model_type = model_config.model_type if model_type == "llama": - scaling_type1 = model_config.rope_scaling.get("type", None) - scaling_type2 = model_config.rope_scaling.get("rope_type", None) - scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + scaling_type = None + if getattr(model_config, "rope_scaling", None) is not None: + scaling_type1 = model_config.rope_scaling.get("type", None) + scaling_type2 = model_config.rope_scaling.get("rope_type", None) + scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2 + pass if scaling_type == "llama3" and not SUPPORTS_LLAMA31: raise ImportError( From caa402828715d428b5426955df8fecc8e3fe1c80 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:25:24 -0700 Subject: [PATCH 049/147] 
Update _utils.py --- unsloth/models/_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index b021e89e9..5a2e85997 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -393,7 +393,10 @@ def patch_tokenizer(model, tokenizer): tokenizer.pad_token = possible_pad_token if model is not None: config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) - pass + else: + if model is not None: + if model.config.pad_token_id is None: + config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) return model, tokenizer pass From 4dd4ad2104ae9865a029f0408df89c7121f353e9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:27:46 -0700 Subject: [PATCH 050/147] Update llama.py --- unsloth/models/llama.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 338ae0a7c..719aee537 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1178,10 +1178,11 @@ def _fast_generate(*args, **kwargs): kwargs.pop("token_type_ids", None) # Check pad_token - kwargs["pad_token_id"] = kwargs.pop( - "pad_token_id", - getattr(model.config, "eos_token_id", None), - ) + model_eos_token_id = getattr(model.config, "eos_token_id", None) + if hasattr(model_eos_token_id, "__iter__"): + model_eos_token_id = model_eos_token_id[0] + + kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) # Set pad token # old_pad_token_id = getattr(model.config, "pad_token_id", None) From cc11b7886138e45690a459019f57c53675a70623 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 12:28:12 -0700 Subject: [PATCH 051/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 719aee537..ba4362b3c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1179,7 +1179,7 @@ def _fast_generate(*args, **kwargs): # Check pad_token model_eos_token_id = getattr(model.config, "eos_token_id", None) - if hasattr(model_eos_token_id, "__iter__"): + if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"): model_eos_token_id = model_eos_token_id[0] kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) From d1f3b6c1c4f69cd09ebdcab014bd72ac1217ee71 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 13:14:21 -0700 Subject: [PATCH 052/147] Create Run.png --- images/Run.png | Bin 0 -> 11471 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 images/Run.png diff --git a/images/Run.png b/images/Run.png new file mode 100644 index 0000000000000000000000000000000000000000..fd737aa4d6e3684a0ae3405565f95d52e521b785 GIT binary patch literal 11471 zcmXw9bwHHQ)29*X1}Om%NkJr}5hNv)?hffZjxGrS2`Q-qM7lw`r6o?fkE0uoZg`*b z`+NU5j@#$ishypj&+L3rQ<1}e{NynT3JSi0yv#=w6f_9%fBz#a;IEYUvKsJ(<1DZ1 zhJu1iiu|CWq-9W{pl}8$$Vh2`GxisKd^OH8AC4$ZpK8lL#X&VA;a)a-L>UUtBksW+ zdu&$gVLDnoIT>u{X=hlfH?OG`FnP9lU%ZQNr>`+Fr}&|h-6;DzI~vpHK$71qGTe+f zaX!J!xA#JLZ{NSNwi-!wvpV({r$0gYmg*+{=Dii{L~QBg3Y0G4NNyG--jTgpPiyit zZf=7D_Zd2GAokX}WF`|6(`Y$80!=PRFPpsZliNIQB6@So62Tia>xmN~LrF4u>|WaT zuj2RC+_KqZ#Kgp#nWB{mw#U4+lGJK=_QKmx^r_^j+81D~Ui4_p5zGTOOQcm5JW8CMpq^)BO|j=N{MJ)732xtTnc3ASPH7hWE;$E+FoJmXlU3J z>5xE|KpD&6k^|oSA<2)RgtNdIs3-+JJ{kIT^O0WiIy3hsZ2}T8X(^!lyWVOMh+^N; zn?r0;$X4A^3APA^P+;9)l7-l}1rZSuGaT5=Zi(g=anw|)Kb5u>JNbwk@L%+N_vHP` 
z7HXg+OdM*K;q7Vhy9CKTv9Pya11fCTKd?X{iV8#T&E@!KkKif-W~J6mcSeUsUNG7o z)(ASpt%(TGFaFUnt!&ubzB}W-iyZ>P9Bogz!$UH&s}?DAgD#z{RTB=Sfy@N4Ahv4O zW)L3_Mb*FEpyTTSSk9rNU#9(eFxEN~8~C~u{V|XCM}DBI4k^44KhT|M&A9qObD2QtOU(g>ZfhDS!&V&EV_?a{`h`7m{h_tsd!-Eb6NL{A>((XlC+^ zhfrwG9(fosB@5v#e2I6V5Ydn1{{hIuySdyRltU%`7yrNBM3O|TXwC#&(f4GA<0Q(R z^Am5owqYfkTEs6?&Hfx62>v~Q?BTq80=~QC!fl{$z*!wfw$((?fzY9-0VwRW z2>acCF#%HJ6o^q-%RdCUT>CN;H#s(u0_dLVC1U$%ezmVX znIQU&KaJll)Zppt7c{_YyGd%Pa#MOb@)Pc(=jhXyv*FZD?C`tQ`h+u7EYs5L#1NOx zkGARhoN=zV_f?X?`;$O-kh_?&BgZTYLnn1mudYjQ&|m literal 0 HcmV?d00001 From a96d16e44ba2a07f5b6cdc10919286c23a984fe1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Jul 2024 15:08:09 -0700 Subject: [PATCH 053/147] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c98c43f1..4c1271396 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ -### Finetune Llama 3, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory! +### Finetune Llama 3.1, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory! ![](https://i.ibb.co/sJ7RhGG/image-41.png) From bd180c13579f199516ac285ad724f99d11c562c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Jul 2024 14:05:31 -0700 Subject: [PATCH 054/147] Mistral --- unsloth/models/_utils.py | 4 ++-- unsloth/models/mapper.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5a2e85997..213cb5b0a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2024.7" +__version__ = "2024.8" __all__ = [ "prepare_model_for_kbit_training", @@ -336,7 +336,7 @@ def patch_tokenizer(model, tokenizer): possible_reserved_tokens = ( "<|reserved", # Llama-3 "<|placeholder", # Phi-3 - "[control", # Forgot where lol + "[control", # Mistral type models "", # Mistral Nemo "<|finetune_right_pad_id|>", # Llama-3.1 ) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index fc13c94e8..462555f31 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -227,13 +227,20 @@ "meta-llama/Meta-Llama-3.1-8B-Instruct", ), "unsloth/Meta-Llama-3.1-70B-bnb-4bit" : ( - "unsloth/Meta-Llama-3.1-70B", "meta-llama/Meta-Llama-3.1-70B", ), + "unsloth/Meta-Llama-3.1-405B-bnb-4bit" : ( + "meta-llama/Meta-Llama-3.1-405B", + ), + "unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit" : ( + "meta-llama/Meta-Llama-3.1-405B-Instruct", + ), "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit" : ( - "unsloth/Meta-Llama-3.1-70B-Instruct", "meta-llama/Meta-Llama-3.1-70B-Instruct", ), + "unsloth/Mistral-Large-Instruct-2407-bnb-4bit" : ( + "mistralai/Mistral-Large-Instruct-2407", + ), } INT_TO_FLOAT_MAPPER = {} From 6e30a7a006d51dc5692f4687a5b38a19c7e48596 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Jul 2024 23:45:39 -0700 Subject: [PATCH 055/147] Patch PEFT --- unsloth/models/llama.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ba4362b3c..96eb5035e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2063,9 +2063,9 @@ def patch_peft_model( (getattr(gate_proj, "base_layer", gate_proj).bias is None) and \ (getattr( up_proj, "base_layer", up_proj).bias is None) and \ (getattr(down_proj, "base_layer", down_proj).bias is None) and \ - (getattr(gate_proj, "lora_magnitude_vector", None) is 
None) and \ - (getattr( up_proj, "lora_magnitude_vector", None) is None) and \ - (getattr(down_proj, "lora_magnitude_vector", None) is None): + (len(getattr(gate_proj, "lora_magnitude_vector", [])) == 0) and \ + (len(getattr( up_proj, "lora_magnitude_vector", [])) == 0) and \ + (len(getattr(down_proj, "lora_magnitude_vector", [])) == 0): # https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp) @@ -2085,11 +2085,11 @@ def patch_peft_model( hasattr(k_proj, "lora_A") and \ hasattr(v_proj, "lora_A") and \ (getattr(q_proj, "base_layer", q_proj).bias is None) and \ - (getattr(q_proj, "base_layer", k_proj).bias is None) and \ - (getattr(q_proj, "base_layer", v_proj).bias is None) and \ - (getattr(q_proj, "lora_magnitude_vector", None) is None) and \ - (getattr(k_proj, "lora_magnitude_vector", None) is None) and \ - (getattr(v_proj, "lora_magnitude_vector", None) is None): + (getattr(k_proj, "base_layer", k_proj).bias is None) and \ + (getattr(v_proj, "base_layer", v_proj).bias is None) and \ + (len(getattr(q_proj, "lora_magnitude_vector", [])) == 0) and \ + (len(getattr(k_proj, "lora_magnitude_vector", [])) == 0) and \ + (len(getattr(v_proj, "lora_magnitude_vector", [])) == 0): layer.self_attn.apply_qkv = apply_lora_qkv n_qkv += 1 @@ -2106,7 +2106,7 @@ def patch_peft_model( o_proj = layer.self_attn.o_proj if hasattr(o_proj, "lora_A") and \ (getattr(o_proj, "base_layer", o_proj).bias is None) and \ - (getattr(o_proj, "lora_magnitude_vector", None) is None): + (len(getattr(o_proj, "lora_magnitude_vector", [])) == 0): layer.self_attn.apply_o = apply_lora_o n_o += 1 From 08d3ef4bb3a1da4de67c9e4135e4ea4838895164 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:17:19 -0700 Subject: [PATCH 056/147] Fix PEFT --- unsloth/models/_utils.py | 54 +++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 213cb5b0a..23297cc4a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -404,32 +404,36 @@ def patch_tokenizer(model, tokenizer): # ============================================= # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16?? # For mixed precision, we need it to be in float32 not float16. -from peft.tuners.lora.layer import LoraLayer -import inspect, re -try: - source = inspect.getsource(LoraLayer.update_layer) - text = "if weight is not None:\n" - start = source.find(text) + len(text) - end = source.find("self.to(weight.device)", start) - spaces = re.findall(r"^([ ]{1,})break", source, flags = re.MULTILINE)[0] - source = source.replace(source[start : end], spaces) - spaces = len(re.match(r"[\s]{1,}", source).group(0)) - lines = source.split("\n") - source = "\n".join(x[spaces:] for x in lines) - source = re.sub("([^\.])nn\.", r"\1torch.nn.", source) - source = source.replace("def update_layer", "def LoraLayer_update_layer") - exec(source, globals()) - - # Fix up incorrect downcasting of LoRA weights +from packaging import Version +from peft import __version__ +if Version(__version__) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer - LoraLayer.update_layer = LoraLayer_update_layer - from peft.tuners.lora import LoraLayer - LoraLayer.update_layer = LoraLayer_update_layer -except: - logger.warning_once( - "Unsloth unsuccessfully patched LoraLayer.update_layer. 
Please file a bug report.\n"\ - "Luckily, your training run will still work in the meantime!" - ) + import inspect, re + try: + source = inspect.getsource(LoraLayer.update_layer) + text = "if weight is not None:\n" + start = source.find(text) + len(text) + end = source.find("self.to(weight.device)", start) + spaces = re.findall(r"^([ ]{1,})break", source, flags = re.MULTILINE)[0] + source = source.replace(source[start : end], spaces) + spaces = len(re.match(r"[\s]{1,}", source).group(0)) + lines = source.split("\n") + source = "\n".join(x[spaces:] for x in lines) + source = re.sub("([^\.])nn\.", r"\1torch.nn.", source) + source = source.replace("def update_layer", "def LoraLayer_update_layer") + exec(source, globals()) + + # Fix up incorrect downcasting of LoRA weights + from peft.tuners.lora.layer import LoraLayer + LoraLayer.update_layer = LoraLayer_update_layer + from peft.tuners.lora import LoraLayer + LoraLayer.update_layer = LoraLayer_update_layer + except: + logger.warning_once( + "Unsloth unsuccessfully patched LoraLayer.update_layer. Please file a bug report.\n"\ + "Luckily, your training run will still work in the meantime!" + ) + pass pass # ============================================= From 66e0453ea85a33132c2e9b6c616726cc4bc0b0f1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:19:32 -0700 Subject: [PATCH 057/147] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 96eb5035e..5bc2983a2 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -96,7 +96,7 @@ def fix_prepare_inputs_for_generation(module): pass pass - +torch_matmul = torch.matmul def LlamaAttention_fast_forward_inference( self, hidden_states: torch.Tensor, @@ -238,10 +238,10 @@ def LlamaAttention_fast_forward_inference( if bsz == 1: Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows - A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) + A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype) - A = torch.matmul(A, Vnn, out = Qn) + A = torch_matmul(A, Vnn, out = Qn) else: A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False) pass From 7fccd21d9f1388ca51063455c4ffae8e7c06720c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:28:20 -0700 Subject: [PATCH 058/147] Update loader.py --- unsloth/models/loader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 85416b81b..6b83b8e73 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -95,6 +95,9 @@ def from_pretrained( model_name = _get_model_name(model_name, load_in_4bit) # First check if it's a normal model via AutoConfig + from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled + was_disabled = are_progress_bars_disabled() + disable_progress_bars() try: model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_model = True @@ -129,6 +132,8 @@ def from_pretrained( model_config = AutoConfig.from_pretrained(model_name, token = token, 
revision = revision) pass + if not was_disabled: enable_progress_bars() + model_type = model_config.model_type if model_type == "llama": From 9e1ad7c319e4b6d7412d5f9a104abceef29a7247 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:29:12 -0700 Subject: [PATCH 059/147] Update _utils.py --- unsloth/models/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 23297cc4a..9dc82f1e5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -404,7 +404,6 @@ def patch_tokenizer(model, tokenizer): # ============================================= # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16?? # For mixed precision, we need it to be in float32 not float16. -from packaging import Version from peft import __version__ if Version(__version__) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer From 8e5054bbea23cb91628cfe8923696806ca4a6274 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 00:33:38 -0700 Subject: [PATCH 060/147] Update _utils.py --- unsloth/models/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 9dc82f1e5..5a267a459 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -404,8 +404,8 @@ def patch_tokenizer(model, tokenizer): # ============================================= # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16?? # For mixed precision, we need it to be in float32 not float16. -from peft import __version__ -if Version(__version__) < Version("0.12.0"): +from peft import __version__ as peft_version +if Version(peft_version) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer import inspect, re try: From fd753fed99ed5f10ef8a9b7139588d9de9ddecfb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Jul 2024 08:53:21 -0700 Subject: [PATCH 061/147] Update llama.py --- unsloth/models/llama.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 5bc2983a2..bc434ecf1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2063,9 +2063,9 @@ def patch_peft_model( (getattr(gate_proj, "base_layer", gate_proj).bias is None) and \ (getattr( up_proj, "base_layer", up_proj).bias is None) and \ (getattr(down_proj, "base_layer", down_proj).bias is None) and \ - (len(getattr(gate_proj, "lora_magnitude_vector", [])) == 0) and \ - (len(getattr( up_proj, "lora_magnitude_vector", [])) == 0) and \ - (len(getattr(down_proj, "lora_magnitude_vector", [])) == 0): + (len(getattr(gate_proj, "lora_magnitude_vector", []) or []) == 0) and \ + (len(getattr( up_proj, "lora_magnitude_vector", []) or []) == 0) and \ + (len(getattr(down_proj, "lora_magnitude_vector", []) or []) == 0): # https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp) @@ -2087,9 +2087,9 @@ def patch_peft_model( (getattr(q_proj, "base_layer", q_proj).bias is None) and \ (getattr(k_proj, "base_layer", k_proj).bias is None) and \ (getattr(v_proj, "base_layer", v_proj).bias is None) and \ - (len(getattr(q_proj, "lora_magnitude_vector", [])) == 0) and \ - (len(getattr(k_proj, "lora_magnitude_vector", [])) == 0) and \ - (len(getattr(v_proj, "lora_magnitude_vector", [])) == 0): + (len(getattr(q_proj, "lora_magnitude_vector", []) or []) == 0) and \ + (len(getattr(k_proj, 
"lora_magnitude_vector", []) or []) == 0) and \ + (len(getattr(v_proj, "lora_magnitude_vector", []) or []) == 0): layer.self_attn.apply_qkv = apply_lora_qkv n_qkv += 1 @@ -2106,7 +2106,7 @@ def patch_peft_model( o_proj = layer.self_attn.o_proj if hasattr(o_proj, "lora_A") and \ (getattr(o_proj, "base_layer", o_proj).bias is None) and \ - (len(getattr(o_proj, "lora_magnitude_vector", [])) == 0): + (len(getattr(o_proj, "lora_magnitude_vector", []) or []) == 0): layer.self_attn.apply_o = apply_lora_o n_o += 1 From 01c35f9e17cf455e97f7ce6cf55ecd653363433f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Jul 2024 16:31:40 -0700 Subject: [PATCH 062/147] Update __init__.py --- unsloth/__init__.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 464068154..265d08c90 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -60,6 +60,37 @@ "We have some installation instructions on our Github page.") pass +# ============================================= +# Check if Unsloth's model list has been updated +import os, requests, inspect, re +import numpy as np +import subprocess + +try: + file_location = inspect.getfile(torch) + package, _ = os.path.split(file_location) + dist_packages, package = os.path.split(package) + old_mapper = os.path.join(dist_packages, "unsloth", "models", "mapper.py") + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with open(old_mapper, "r") as old_mapper: old_mapper = old_mapper.read() + with requests.get(new_mapper) as new_mapper: new_mapper = new_mapper.text + old_mapper = re.findall(r'\"unsloth\/([^\"]{1,})\-bnb\-4bit\" \: \(', old_mapper) + new_mapper = re.findall(r'\"unsloth\/([^\"]{1,})\-bnb\-4bit\" \: \(', new_mapper) + new_models = list(frozenset(new_mapper) - frozenset(old_mapper)) + + print(1) + if len(new_models) != 0: + warnings.warn( + f"Unsloth: Some new models including {new_models} have dropped!\n"\ + "If you want to try them out, please update Unsloth via:\n\n" + 'pip install --upgrade --force-reinstall --no-cache-dir \\\n "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' + ) + pass + del new_models, old_mapper, dist_packages, package, file_location +except: + pass +# ============================================= + # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: @@ -103,11 +134,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 except: pass else: from triton.common.build import libcuda_dirs -import os -import re -import numpy as np -import subprocess - try: cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32 libcuda_dirs() From 08379f8a9cc0448b13897ffbf0897ad01f7549dc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 20:30:32 -0700 Subject: [PATCH 063/147] Edits --- pyproject.toml | 4 ++-- unsloth/__init__.py | 29 +---------------------------- unsloth/models/llama.py | 3 ++- unsloth/models/loader.py | 6 +++--- 4 files changed, 8 insertions(+), 34 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 829b35ad3..3335a7593 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ exclude = ["images*"] huggingface = [ "packaging", "tyro", - "transformers>=4.43.1", + "transformers>=4.43.2", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -188,7 +188,7 @@ colab-ampere-torch220 = [ 
colab-new = [ "packaging", "tyro", - "transformers>=4.43.1", + "transformers>=4.43.2", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 265d08c90..db54c9a16 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -60,37 +60,10 @@ "We have some installation instructions on our Github page.") pass -# ============================================= -# Check if Unsloth's model list has been updated -import os, requests, inspect, re +import os, re import numpy as np import subprocess -try: - file_location = inspect.getfile(torch) - package, _ = os.path.split(file_location) - dist_packages, package = os.path.split(package) - old_mapper = os.path.join(dist_packages, "unsloth", "models", "mapper.py") - new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" - with open(old_mapper, "r") as old_mapper: old_mapper = old_mapper.read() - with requests.get(new_mapper) as new_mapper: new_mapper = new_mapper.text - old_mapper = re.findall(r'\"unsloth\/([^\"]{1,})\-bnb\-4bit\" \: \(', old_mapper) - new_mapper = re.findall(r'\"unsloth\/([^\"]{1,})\-bnb\-4bit\" \: \(', new_mapper) - new_models = list(frozenset(new_mapper) - frozenset(old_mapper)) - - print(1) - if len(new_models) != 0: - warnings.warn( - f"Unsloth: Some new models including {new_models} have dropped!\n"\ - "If you want to try them out, please update Unsloth via:\n\n" - 'pip install --upgrade --force-reinstall --no-cache-dir \\\n "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' - ) - pass - del new_models, old_mapper, dist_packages, package, file_location -except: - pass -# ============================================= - # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index bc434ecf1..d36353999 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -18,6 +18,7 @@ from ._utils import * from ._utils import __version__ from torch.nn.functional import scaled_dot_product_attention +from transformers import __version__ as transformers_version from transformers.models.llama.modeling_llama import ( logger, BaseModelOutputWithPast, @@ -1281,7 +1282,7 @@ def from_pretrained( max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) statistics = \ - f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ + f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}\n"\ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. 
FA2 = {HAS_FLASH_ATTENTION}]\n"\ diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6b83b8e73..ecf871d5d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -27,7 +27,7 @@ SUPPORTS_FOURBIT = transformers_version >= Version("4.37") SUPPORTS_GEMMA = transformers_version >= Version("4.38") SUPPORTS_GEMMA2 = transformers_version >= Version("4.42") -SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1") +SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.2") if SUPPORTS_GEMMA: from .gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -147,8 +147,8 @@ def from_pretrained( if scaling_type == "llama3" and not SUPPORTS_LLAMA31: raise ImportError( f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\ - f"The minimum required version is 4.43.1\n"\ - f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\ + f"The minimum required version is 4.43.2\n"\ + f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\ f"to obtain the latest transformers build, then restart this session."\ ) dispatch_model = FastLlamaModel From f6c2b4aa7d99b16e43bc165fd75b10970e766af1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:16:33 -0700 Subject: [PATCH 064/147] Checks --- pyproject.toml | 6 ++- unsloth/models/_utils.py | 22 ++++++++++- unsloth/models/llama.py | 2 +- unsloth/models/loader.py | 81 +++++++++++++++++++++++++++++++++------- 4 files changed, 92 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3335a7593..6777f7c26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,8 @@ huggingface = [ "trl>=0.7.9,<0.9.0", "peft>=0.7.1,!=0.11.0", "protobuf<4.0.0", - "huggingface_hub[hf_transfer]", + "huggingface_hub", + "hf-transfer", ] cu118only = [ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", @@ -196,7 +197,8 @@ colab-new = [ "wheel>=0.42.0", "numpy", "protobuf<4.0.0", - "huggingface_hub[hf_transfer]", + "huggingface_hub", + "hf-transfer", ] colab-no-deps = [ "accelerate>=0.26.1", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5a267a459..a3263f85a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -122,7 +122,8 @@ def patch_mistral_nemo_config(config): # ============================================= # torch.cuda.amp.custom_fwd is deprecated >= 2.4 import torch -if Version(torch.__version__) < Version("2.4.0"): +torch_version = torch.__version__ +if Version(torch_version) < Version("2.4.0"): torch_amp_custom_fwd = torch.cuda.amp.custom_fwd torch_amp_custom_bwd = torch.cuda.amp.custom_bwd else: @@ -184,7 +185,7 @@ def patch_mistral_nemo_config(config): # Check TRL version from trl import __version__ as trl_version -if Version(xformers_version) >= Version("0.9.0"): +if Version(trl_version) >= Version("0.9.0"): raise ImportError( "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ "then press Disconnect Runtime and then Restart it.\n"\ @@ -199,7 +200,24 @@ def patch_mistral_nemo_config(config): ) pass +# Confirm versions # ============================================= +if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.24 for torch = {torch_version}." 
+ ) +elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.26 for torch = {torch_version}." + ) +elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.27 for torch = {torch_version}." + ) +pass # ============================================= # Torch compile settings diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d36353999..6b16a4cc6 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1282,7 +1282,7 @@ def from_pretrained( max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) statistics = \ - f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}\n"\ + f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}.\n"\ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\ diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ecf871d5d..2a7fa75fe 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -35,9 +35,14 @@ pass -def _get_model_name(model_name, load_in_4bit = True): +def __get_model_name( + model_name, + load_in_4bit = True, + INT_TO_FLOAT_MAPPER = None, + FLOAT_TO_INT_MAPPER = None, +): - if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER: + if not SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ @@ -46,25 +51,71 @@ def _get_model_name(model_name, load_in_4bit = True): f"to obtain the latest transformers build, then restart this session.\n"\ f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)." ) + return model_name - elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER: + elif not load_in_4bit and model_name.lower() in INT_TO_FLOAT_MAPPER: new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." # ) - model_name = new_model_name + return new_model_name - elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER: + elif load_in_4bit and SUPPORTS_FOURBIT and model_name.lower() in FLOAT_TO_INT_MAPPER: new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ # f"We shall load `{new_model_name}` for 4x faster loading." 
# ) - model_name = new_model_name + return new_model_name pass - return model_name + return None +pass + + +def _get_new_mapper(): + try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + exec(new_mapper, locals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + except: + return {}, {} + pass +pass + + +def _get_model_name(model_name, load_in_4bit = True): + new_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + ) + if new_model_name is None and \ + model_name.count("/") == 1 and \ + model_name[0].isalnum(): + # Try checking if a new Unsloth version allows it! + NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() + upgraded_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + ) + if upgraded_model_name is not None: + raise NotImplementedError( + f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ + 'pip install --upgrade --force-reinstall --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' + ) + pass + pass + return new_model_name if new_model_name is not None else model_name pass @@ -98,16 +149,22 @@ def from_pretrained( from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled was_disabled = are_progress_bars_disabled() disable_progress_bars() + + autoconfig_error = None + peft_error = None try: model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_model = True - except: + except Exception as autoconfig_error: + autoconfig_error = str(autoconfig_error) is_model = False try: peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision) is_peft = True - except: + except Exception as peft_error: + peft_error = str(peft_error) is_peft = False + pass # Cannot be both! if is_model and is_peft: @@ -118,11 +175,7 @@ def from_pretrained( "Please separate the LoRA and base models to 2 repos." ) elif not is_model and not is_peft: - raise RuntimeError( - f"Unsloth: `{model_name}` is not a base model or a PEFT model.\n"\ - "We could not locate a `config.json` or `adapter_config.json` file.\n"\ - "Are you certain the model name is correct? Does it actually exist?" 
- ) + raise RuntimeError(autoconfig_error or peft_error) pass # Get base model for PEFT: From 78fa9d058db4f039183da7a56e81a5cc4dfe289f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:30:48 -0700 Subject: [PATCH 065/147] Update _utils.py --- unsloth/models/_utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a3263f85a..04ffd2062 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -135,35 +135,36 @@ def patch_mistral_nemo_config(config): # ============================================= # Get Flash Attention v2 if Ampere (RTX 30xx, A100) import bitsandbytes as bnb -from transformers.models.llama.modeling_llama import logger from transformers import AutoTokenizer +from transformers.utils.import_utils import _is_package_available major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = False if major_version >= 8: SUPPORTS_BFLOAT16 = True - try: - from flash_attn import flash_attn_func + if _is_package_available("flash_attn"): # Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl" try: from flash_attn.flash_attn_interface import flash_attn_cuda HAS_FLASH_ATTENTION = True except: - logger.warning_once( + print( "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\ "A possible explanation is you have a new CUDA version which isn't\n"\ "yet compatible with FA2? Please file a ticket to Unsloth or FA2.\n"\ - "We shall now use Xformers instead, which gets a 0.01% performance hit.\n"\ + "We shall now use Xformers instead, which does not have any performance hits!\n"\ "We found this negligible impact by benchmarking on 1x A100." ) HAS_FLASH_ATTENTION = False - except: + else: HAS_FLASH_ATTENTION = False else: # Tri Dao's benchmark shows xformers is faster for now. HAS_FLASH_ATTENTION = False pass + +from transformers.models.llama.modeling_llama import logger import xformers.ops.fmha as xformers xformers_attention = xformers.memory_efficient_attention from xformers import __version__ as xformers_version From 4ea5789db6d601e452a7a701fe5ac53669876856 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:32:39 -0700 Subject: [PATCH 066/147] Update _utils.py --- unsloth/models/_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 04ffd2062..74dfe6987 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -156,7 +156,15 @@ def patch_mistral_nemo_config(config): "We shall now use Xformers instead, which does not have any performance hits!\n"\ "We found this negligible impact by benchmarking on 1x A100." ) + + # Stop Flash Attention from importing! 
+ import transformers.utils.import_utils + transformers.utils.import_utils.is_flash_attn_2_available = lambda *args, **kwargs: False + import transformers.utils + transformers.utils.is_flash_attn_2_available = lambda *args, **kwargs: False + HAS_FLASH_ATTENTION = False + pass else: HAS_FLASH_ATTENTION = False else: From aab503ac80e2032805854c2ff211cdc2a4b23b3f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:35:24 -0700 Subject: [PATCH 067/147] Update loader.py --- unsloth/models/loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 2a7fa75fe..615b8286f 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -111,7 +111,8 @@ def _get_model_name(model_name, load_in_4bit = True): if upgraded_model_name is not None: raise NotImplementedError( f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ - 'pip install --upgrade --force-reinstall --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' + 'pip uninstall unsloth -y\n'\ + 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' ) pass pass From b310f5e06a6b00e18d66324c53a686626d0b9040 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:48:52 -0700 Subject: [PATCH 068/147] Update _utils.py --- unsloth/models/_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 74dfe6987..8435c834f 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -173,6 +173,22 @@ def patch_mistral_nemo_config(config): pass from transformers.models.llama.modeling_llama import logger + +# ============================================= +# Get Xformers +from xformers._cpp_lib import _register_extensions +try: + _register_extensions() # Check if C++ modules are loaded correctly +except Exception as error: + raise ImportError( + "Unsloth: Xformers was not installed correctly. "\ + "Please install xformers separately first. 
"\ + "Then confirm if it's correctly installed by running:\n"\ + "python -m xformers.info\n\n" + "Longer error message:\n" + str(error) + ) +pass + import xformers.ops.fmha as xformers xformers_attention = xformers.memory_efficient_attention from xformers import __version__ as xformers_version From 1f6705673883efbed5373d9bc845d56f40fd6dc9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:50:38 -0700 Subject: [PATCH 069/147] Update mapper.py --- unsloth/models/mapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462555f31..a3191392c 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,10 +218,10 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), - "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( - "unsloth/Meta-Llama-3.1-8B", - "meta-llama/Meta-Llama-3.1-8B", - ), + # "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( + # "unsloth/Meta-Llama-3.1-8B", + # "meta-llama/Meta-Llama-3.1-8B", + # ), "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : ( "unsloth/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct", From a8e7556fc6aaac537f67d418b2c862ac349e66cd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:52:06 -0700 Subject: [PATCH 070/147] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 615b8286f..fc356c3c1 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -97,6 +97,7 @@ def _get_model_name(model_name, load_in_4bit = True): INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, ) + print(new_model_name) if new_model_name is None and \ model_name.count("/") == 1 and \ model_name[0].isalnum(): @@ -108,6 +109,7 @@ def _get_model_name(model_name, load_in_4bit = True): INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, ) + print(upgraded_model_name) if upgraded_model_name is not None: raise NotImplementedError( f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ From 38b5c77635f7a0febb694a45d41489c6c827f564 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:54:49 -0700 Subject: [PATCH 071/147] Update loader.py --- unsloth/models/loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fc356c3c1..f06515e62 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -98,11 +98,11 @@ def _get_model_name(model_name, load_in_4bit = True): FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, ) print(new_model_name) - if new_model_name is None and \ - model_name.count("/") == 1 and \ - model_name[0].isalnum(): + if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): # Try checking if a new Unsloth version allows it! 
NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() + print(NEW_INT_TO_FLOAT_MAPPER) + print(NEW_FLOAT_TO_INT_MAPPER) upgraded_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, From 4a8c9e88249ceabfd955c79c9eacef221e0ef175 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:56:59 -0700 Subject: [PATCH 072/147] Update _utils.py --- unsloth/models/_utils.py | 66 +++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 8435c834f..994f97ab7 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -176,21 +176,6 @@ def patch_mistral_nemo_config(config): # ============================================= # Get Xformers -from xformers._cpp_lib import _register_extensions -try: - _register_extensions() # Check if C++ modules are loaded correctly -except Exception as error: - raise ImportError( - "Unsloth: Xformers was not installed correctly. "\ - "Please install xformers separately first. "\ - "Then confirm if it's correctly installed by running:\n"\ - "python -m xformers.info\n\n" - "Longer error message:\n" + str(error) - ) -pass - -import xformers.ops.fmha as xformers -xformers_attention = xformers.memory_efficient_attention from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues if Version(xformers_version) >= Version("0.0.27"): @@ -208,25 +193,6 @@ def patch_mistral_nemo_config(config): ) pass -# Check TRL version -from trl import __version__ as trl_version -if Version(trl_version) >= Version("0.9.0"): - raise ImportError( - "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ - "then press Disconnect Runtime and then Restart it.\n"\ - "\n"\ - "%%capture\n" - "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" - '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ - '\n'\ - f"Otherwise in local machines, your TRL version of {trl_version} is too new.\n"\ - 'Please downgrade TRL via `pip install --force-reinstall "trl<0.9.0"' - ) -pass - -# Confirm versions -# ============================================= if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): raise ImportError( f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ @@ -244,6 +210,38 @@ def patch_mistral_nemo_config(config): ) pass +from xformers._cpp_lib import _register_extensions +try: + _register_extensions() # Check if C++ modules are loaded correctly +except Exception as error: + raise ImportError( + "Unsloth: Xformers was not installed correctly.\n"\ + "Please install xformers separately first.\n"\ + "Then confirm if it's correctly installed by running:\n"\ + "python -m xformers.info\n\n" + "Longer error message:\n" + str(error) + ) +pass +import xformers.ops.fmha as xformers +xformers_attention = xformers.memory_efficient_attention + +# Check TRL version +from trl import __version__ as trl_version +if Version(trl_version) >= Version("0.9.0"): + raise ImportError( + "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ + "then press Disconnect Runtime and then Restart it.\n"\ + "\n"\ + "%%capture\n" + "# Installs Unsloth, Xformers (Flash Attention) and all other 
packages!\n" + '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' + '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ + '\n'\ + f"Otherwise in local machines, your TRL version of {trl_version} is too new.\n"\ + 'Please downgrade TRL via `pip install --force-reinstall "trl<0.9.0"' + ) +pass + # ============================================= # Torch compile settings From c03fd228a240eb97d6ee386668c30b8352779935 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:58:15 -0700 Subject: [PATCH 073/147] Update loader.py --- unsloth/models/loader.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f06515e62..21e58bb3d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -75,17 +75,17 @@ def __get_model_name( def _get_new_mapper(): - try: - import requests - new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" - with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text - new_mapper = new_mapper\ - .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ - .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") - exec(new_mapper, locals()) - return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER - except: - return {}, {} + # try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + exec(new_mapper, locals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + # except: + # return {}, {} pass pass From 9d5195296f7dcaac51519659fda13ee4027b2b1a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:59:16 -0700 Subject: [PATCH 074/147] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 21e58bb3d..074e83739 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -82,7 +82,7 @@ def _get_new_mapper(): new_mapper = new_mapper\ .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") - exec(new_mapper, locals()) + exec(new_mapper) return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER # except: # return {}, {} From ddc2dde97c2d234e75db032e41c7fafe2b6f384f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 22:59:47 -0700 Subject: [PATCH 075/147] Update loader.py --- unsloth/models/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 074e83739..9f2f53aa8 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -79,6 +79,7 @@ def _get_new_mapper(): import requests new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] new_mapper = new_mapper\ .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") From b6ef70963011888c679ceaa6a929ce2cb32be1c7 Mon Sep 17 00:00:00 2001 
From: Daniel Han Date: Sat, 27 Jul 2024 23:00:47 -0700 Subject: [PATCH 076/147] Update loader.py --- unsloth/models/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 9f2f53aa8..6bfbf0f28 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -84,6 +84,7 @@ def _get_new_mapper(): .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") exec(new_mapper) + print(new_mapper) return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER # except: # return {}, {} From ed8bc007c6beef621941a4ee5882c6c55c0f1df5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 23:02:10 -0700 Subject: [PATCH 077/147] Update loader.py --- unsloth/models/loader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6bfbf0f28..f321f23ed 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -83,8 +83,7 @@ def _get_new_mapper(): new_mapper = new_mapper\ .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") - exec(new_mapper) - print(new_mapper) + exec(new_mapper, globals()) return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER # except: # return {}, {} From e1d61ce5d9f4c88af7d8c0c5d4ad9ca65b9e9327 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 23:03:13 -0700 Subject: [PATCH 078/147] Update loader.py --- unsloth/models/loader.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f321f23ed..15d3f952b 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -75,18 +75,18 @@ def __get_model_name( def _get_new_mapper(): - # try: - import requests - new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" - with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text - new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] - new_mapper = new_mapper\ - .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ - .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") - exec(new_mapper, globals()) - return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER - # except: - # return {}, {} + try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + exec(new_mapper, globals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + except: + return {}, {} pass pass @@ -102,15 +102,12 @@ def _get_model_name(model_name, load_in_4bit = True): if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): # Try checking if a new Unsloth version allows it! 
NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() - print(NEW_INT_TO_FLOAT_MAPPER) - print(NEW_FLOAT_TO_INT_MAPPER) upgraded_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, ) - print(upgraded_model_name) if upgraded_model_name is not None: raise NotImplementedError( f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ From 858e1a2a8c23dc08e6bbd17c841e5a7b133cee06 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 23:03:29 -0700 Subject: [PATCH 079/147] Update mapper.py --- unsloth/models/mapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index a3191392c..462555f31 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -218,10 +218,10 @@ "unsloth/Mistral-Nemo-Base-2407", "mistralai/Mistral-Nemo-Base-2407", ), - # "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( - # "unsloth/Meta-Llama-3.1-8B", - # "meta-llama/Meta-Llama-3.1-8B", - # ), + "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : ( + "unsloth/Meta-Llama-3.1-8B", + "meta-llama/Meta-Llama-3.1-8B", + ), "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : ( "unsloth/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct", From ea0a49448dca64808734cff797e352a69236343c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Jul 2024 23:04:23 -0700 Subject: [PATCH 080/147] Update loader.py --- unsloth/models/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 15d3f952b..e2bfe1d63 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -98,7 +98,6 @@ def _get_model_name(model_name, load_in_4bit = True): INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, ) - print(new_model_name) if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): # Try checking if a new Unsloth version allows it! 
NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() From a7bfbe7927ea75f959e1d7c84e7bf50945d405ff Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 28 Jul 2024 00:10:02 -0700 Subject: [PATCH 081/147] Better debugging (#826) * Update __init__.py * Edits * Checks * Update _utils.py * Update _utils.py * Update loader.py * Update _utils.py * Update mapper.py * Update loader.py * Update loader.py * Update _utils.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update loader.py --- pyproject.toml | 10 +++-- unsloth/__init__.py | 9 ++--- unsloth/models/_utils.py | 65 ++++++++++++++++++++++++------ unsloth/models/llama.py | 3 +- unsloth/models/loader.py | 87 ++++++++++++++++++++++++++++++++-------- 5 files changed, 135 insertions(+), 39 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 829b35ad3..6777f7c26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ exclude = ["images*"] huggingface = [ "packaging", "tyro", - "transformers>=4.43.1", + "transformers>=4.43.2", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -46,7 +46,8 @@ huggingface = [ "trl>=0.7.9,<0.9.0", "peft>=0.7.1,!=0.11.0", "protobuf<4.0.0", - "huggingface_hub[hf_transfer]", + "huggingface_hub", + "hf-transfer", ] cu118only = [ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", @@ -188,7 +189,7 @@ colab-ampere-torch220 = [ colab-new = [ "packaging", "tyro", - "transformers>=4.43.1", + "transformers>=4.43.2", "datasets>=2.16.0", "sentencepiece>=0.2.0", "tqdm", @@ -196,7 +197,8 @@ colab-new = [ "wheel>=0.42.0", "numpy", "protobuf<4.0.0", - "huggingface_hub[hf_transfer]", + "huggingface_hub", + "hf-transfer", ] colab-no-deps = [ "accelerate>=0.26.1", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 464068154..db54c9a16 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -60,6 +60,10 @@ "We have some installation instructions on our Github page.") pass +import os, re +import numpy as np +import subprocess + # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: @@ -103,11 +107,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 except: pass else: from triton.common.build import libcuda_dirs -import os -import re -import numpy as np -import subprocess - try: cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32 libcuda_dirs() diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5a267a459..994f97ab7 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -122,7 +122,8 @@ def patch_mistral_nemo_config(config): # ============================================= # torch.cuda.amp.custom_fwd is deprecated >= 2.4 import torch -if Version(torch.__version__) < Version("2.4.0"): +torch_version = torch.__version__ +if Version(torch_version) < Version("2.4.0"): torch_amp_custom_fwd = torch.cuda.amp.custom_fwd torch_amp_custom_bwd = torch.cuda.amp.custom_bwd else: @@ -134,37 +135,47 @@ def patch_mistral_nemo_config(config): # ============================================= # Get Flash Attention v2 if Ampere (RTX 30xx, A100) import bitsandbytes as bnb -from transformers.models.llama.modeling_llama import logger from transformers import AutoTokenizer +from transformers.utils.import_utils import _is_package_available 
major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = False if major_version >= 8: SUPPORTS_BFLOAT16 = True - try: - from flash_attn import flash_attn_func + if _is_package_available("flash_attn"): # Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl" try: from flash_attn.flash_attn_interface import flash_attn_cuda HAS_FLASH_ATTENTION = True except: - logger.warning_once( + print( "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\ "A possible explanation is you have a new CUDA version which isn't\n"\ "yet compatible with FA2? Please file a ticket to Unsloth or FA2.\n"\ - "We shall now use Xformers instead, which gets a 0.01% performance hit.\n"\ + "We shall now use Xformers instead, which does not have any performance hits!\n"\ "We found this negligible impact by benchmarking on 1x A100." ) + + # Stop Flash Attention from importing! + import transformers.utils.import_utils + transformers.utils.import_utils.is_flash_attn_2_available = lambda *args, **kwargs: False + import transformers.utils + transformers.utils.is_flash_attn_2_available = lambda *args, **kwargs: False + HAS_FLASH_ATTENTION = False - except: + pass + else: HAS_FLASH_ATTENTION = False else: # Tri Dao's benchmark shows xformers is faster for now. HAS_FLASH_ATTENTION = False pass -import xformers.ops.fmha as xformers -xformers_attention = xformers.memory_efficient_attention + +from transformers.models.llama.modeling_llama import logger + +# ============================================= +# Get Xformers from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues if Version(xformers_version) >= Version("0.0.27"): @@ -182,9 +193,41 @@ def patch_mistral_nemo_config(config): ) pass +if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.24 for torch = {torch_version}." + ) +elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.26 for torch = {torch_version}." + ) +elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.27 for torch = {torch_version}." 
+ ) +pass + +from xformers._cpp_lib import _register_extensions +try: + _register_extensions() # Check if C++ modules are loaded correctly +except Exception as error: + raise ImportError( + "Unsloth: Xformers was not installed correctly.\n"\ + "Please install xformers separately first.\n"\ + "Then confirm if it's correctly installed by running:\n"\ + "python -m xformers.info\n\n" + "Longer error message:\n" + str(error) + ) +pass +import xformers.ops.fmha as xformers +xformers_attention = xformers.memory_efficient_attention + # Check TRL version from trl import __version__ as trl_version -if Version(xformers_version) >= Version("0.9.0"): +if Version(trl_version) >= Version("0.9.0"): raise ImportError( "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ "then press Disconnect Runtime and then Restart it.\n"\ @@ -199,8 +242,6 @@ def patch_mistral_nemo_config(config): ) pass -# ============================================= - # ============================================= # Torch compile settings diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index bc434ecf1..6b16a4cc6 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -18,6 +18,7 @@ from ._utils import * from ._utils import __version__ from torch.nn.functional import scaled_dot_product_attention +from transformers import __version__ as transformers_version from transformers.models.llama.modeling_llama import ( logger, BaseModelOutputWithPast, @@ -1281,7 +1282,7 @@ def from_pretrained( max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) statistics = \ - f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ + f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}.\n"\ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\ diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6b83b8e73..e2bfe1d63 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -27,7 +27,7 @@ SUPPORTS_FOURBIT = transformers_version >= Version("4.37") SUPPORTS_GEMMA = transformers_version >= Version("4.38") SUPPORTS_GEMMA2 = transformers_version >= Version("4.42") -SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1") +SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.2") if SUPPORTS_GEMMA: from .gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -35,9 +35,14 @@ pass -def _get_model_name(model_name, load_in_4bit = True): +def __get_model_name( + model_name, + load_in_4bit = True, + INT_TO_FLOAT_MAPPER = None, + FLOAT_TO_INT_MAPPER = None, +): - if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER: + if not SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ @@ -46,25 +51,71 @@ def _get_model_name(model_name, load_in_4bit = True): f"to obtain the latest transformers build, then restart this session.\n"\ f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)." 
) + return model_name - elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER: + elif not load_in_4bit and model_name.lower() in INT_TO_FLOAT_MAPPER: new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." # ) - model_name = new_model_name + return new_model_name - elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER: + elif load_in_4bit and SUPPORTS_FOURBIT and model_name.lower() in FLOAT_TO_INT_MAPPER: new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ # f"We shall load `{new_model_name}` for 4x faster loading." # ) - model_name = new_model_name + return new_model_name pass - return model_name + return None +pass + + +def _get_new_mapper(): + try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + exec(new_mapper, globals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + except: + return {}, {} + pass +pass + + +def _get_model_name(model_name, load_in_4bit = True): + new_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + ) + if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): + # Try checking if a new Unsloth version allows it! + NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() + upgraded_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + ) + if upgraded_model_name is not None: + raise NotImplementedError( + f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\ + 'pip uninstall unsloth -y\n'\ + 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' + ) + pass + pass + return new_model_name if new_model_name is not None else model_name pass @@ -98,16 +149,22 @@ def from_pretrained( from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled was_disabled = are_progress_bars_disabled() disable_progress_bars() + + autoconfig_error = None + peft_error = None try: model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_model = True - except: + except Exception as autoconfig_error: + autoconfig_error = str(autoconfig_error) is_model = False try: peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision) is_peft = True - except: + except Exception as peft_error: + peft_error = str(peft_error) is_peft = False + pass # Cannot be both! if is_model and is_peft: @@ -118,11 +175,7 @@ def from_pretrained( "Please separate the LoRA and base models to 2 repos." 
) elif not is_model and not is_peft: - raise RuntimeError( - f"Unsloth: `{model_name}` is not a base model or a PEFT model.\n"\ - "We could not locate a `config.json` or `adapter_config.json` file.\n"\ - "Are you certain the model name is correct? Does it actually exist?" - ) + raise RuntimeError(autoconfig_error or peft_error) pass # Get base model for PEFT: @@ -147,8 +200,8 @@ def from_pretrained( if scaling_type == "llama3" and not SUPPORTS_LLAMA31: raise ImportError( f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\ - f"The minimum required version is 4.43.1\n"\ - f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\ + f"The minimum required version is 4.43.2\n"\ + f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\ f"to obtain the latest transformers build, then restart this session."\ ) dispatch_model = FastLlamaModel From 18900721c2a3ac7f95d228d8fb41b2c3bfb6f869 Mon Sep 17 00:00:00 2001 From: XiaoYang Date: Wed, 31 Jul 2024 01:15:09 +0800 Subject: [PATCH 082/147] fix UnboundLocalError (#834) * When an exception has been assigned using as target, it is cleared at the end of the except clause.(https://docs.python.org/3/reference/compound_stmts.html#the-try-statement) * Update loader.py --------- Co-authored-by: xiaoyang Co-authored-by: Daniel Han --- unsloth/models/loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e2bfe1d63..34deb8f9b 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -155,14 +155,14 @@ def from_pretrained( try: model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_model = True - except Exception as autoconfig_error: - autoconfig_error = str(autoconfig_error) + except Exception as error: + autoconfig_error = str(error) is_model = False try: peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision) is_peft = True - except Exception as peft_error: - peft_error = str(peft_error) + except Exception as error: + peft_error = str(error) is_peft = False pass From be0930d1f6d9a742e6971ba8e9206c04e87d16d6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 10:18:51 -0700 Subject: [PATCH 083/147] Update loader.py --- unsloth/models/loader.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 34deb8f9b..f22e81efa 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -175,6 +175,15 @@ def from_pretrained( "Please separate the LoRA and base models to 2 repos." ) elif not is_model and not is_peft: + error = autoconfig_error or peft_error + # Old transformers version + if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31: + raise ImportError( + f"Unsloth: Your transformers version of {transformers_version} does not support new RoPE scaling methods.\n"\ + f"This includes Llama 3.1. 
The minimum required version is 4.43.2\n"\ + f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\ + f"to obtain the latest transformers build, then restart this session."\ + ) raise RuntimeError(autoconfig_error or peft_error) pass From 4285d1b479d665b5f94136353ba2d8c3a73a789f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 10:29:54 -0700 Subject: [PATCH 084/147] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6b16a4cc6..496a37e7a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2012,7 +2012,8 @@ def patch_peft_model( model.peft_config[active_adapter].base_model_name_or_path = name pass # Add revision to enable future fast inference paths - model.peft_config[active_adapter].revision = f"unsloth" + # [TODO] Bugs out!see https://github.com/unslothai/unsloth/issues/492 + # model.peft_config[active_adapter].revision = f"unsloth" pass from transformers.trainer import Trainer From 42e09d192fb3d8a6e7b96563c0047fdd19585219 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 19:56:36 -0700 Subject: [PATCH 085/147] bugs --- unsloth/models/_utils.py | 28 ++++++++++++++-------------- unsloth/models/loader.py | 1 + unsloth/models/mapper.py | 1 + 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 994f97ab7..8677879aa 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -178,20 +178,20 @@ def patch_mistral_nemo_config(config): # Get Xformers from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues -if Version(xformers_version) >= Version("0.0.27"): - raise ImportError( - "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ - "then press Disconnect Runtime and then Restart it.\n"\ - "\n"\ - "%%capture\n" - "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" - '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ - '\n'\ - f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ - 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' - ) -pass +# if Version(xformers_version) >= Version("0.0.27"): +# raise ImportError( +# "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ +# "then press Disconnect Runtime and then Restart it.\n"\ +# "\n"\ +# "%%capture\n" +# "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" +# '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' +# '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ +# '\n'\ +# f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ +# 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' +# ) +# pass if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): raise ImportError( diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f22e81efa..fb6d5c501 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -42,6 +42,7 @@ def __get_model_name( FLOAT_TO_INT_MAPPER = None, ): + model_name = str(model_name) if not 
SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462555f31..254a68a42 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -67,6 +67,7 @@ "codellama/CodeLlama-7b-hf", ), "unsloth/codellama-13b-bnb-4bit" : ( + "unsloth/codellama-13b", "codellama/CodeLlama-13b-hf", ), "unsloth/yi-6b-bnb-4bit" : ( From 79ef745c7b9f369644d9a740a2b8be29e9dad860 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 19:57:53 -0700 Subject: [PATCH 086/147] Update _utils.py --- unsloth/models/_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 8677879aa..f4e4257b2 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -203,12 +203,12 @@ def patch_mistral_nemo_config(config): f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ f"Please install xformers < 0.0.26 for torch = {torch_version}." ) -elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): - raise ImportError( - f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers < 0.0.27 for torch = {torch_version}." - ) -pass +# elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): +# raise ImportError( +# f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ +# f"Please install xformers < 0.0.27 for torch = {torch_version}." +# ) +# pass from xformers._cpp_lib import _register_extensions try: From 9617ecbbb8bef7864961096f925f03e40ffa7f99 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 22:48:53 -0700 Subject: [PATCH 087/147] flash-attn softcapping --- pyproject.toml | 22 ++++++------- unsloth/models/_utils.py | 14 +++++++++ unsloth/models/gemma2.py | 68 +++++++++++++++++++++++++++++----------- unsloth/models/llama.py | 39 +++++++++++++---------- unsloth/models/loader.py | 16 ++++++++++ unsloth/models/mapper.py | 1 - 6 files changed, 113 insertions(+), 47 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6777f7c26..e711325be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,7 +171,7 @@ colab-ampere-torch211 = [ "unsloth[cu121onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] colab-torch220 = [ "unsloth[huggingface]", @@ -184,7 +184,7 @@ colab-ampere-torch220 = [ "unsloth[cu121onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] colab-new = [ "packaging", @@ -215,7 +215,7 @@ colab-ampere = [ "unsloth[colab-ampere-torch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere = [ "unsloth[huggingface]", @@ -223,7 +223,7 @@ cu118-ampere = [ "unsloth[cu118only]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere = [ "unsloth[huggingface]", @@ -231,7 +231,7 @@ cu121-ampere = [ "unsloth[cu121only]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch211 = [ "unsloth[huggingface]", @@ -239,7 +239,7 @@ cu118-ampere-torch211 = [ "unsloth[cu118onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch211 = [ "unsloth[huggingface]", @@ -247,7 +247,7 @@ cu121-ampere-torch211 = [ "unsloth[cu121onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] 
cu118-ampere-torch220 = [ "unsloth[huggingface]", @@ -255,7 +255,7 @@ cu118-ampere-torch220 = [ "unsloth[cu118onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch220 = [ "unsloth[huggingface]", @@ -263,7 +263,7 @@ cu121-ampere-torch220 = [ "unsloth[cu121onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch230 = [ "unsloth[huggingface]", @@ -271,7 +271,7 @@ cu118-ampere-torch230 = [ "unsloth[cu118onlytorch230]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch230 = [ "unsloth[huggingface]", @@ -279,7 +279,7 @@ cu121-ampere-torch230 = [ "unsloth[cu121onlytorch230]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] [project.urls] diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index f4e4257b2..c9bc6065f 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -21,6 +21,7 @@ "xformers_version", "__version__", "HAS_FLASH_ATTENTION", + "HAS_FLASH_ATTENTION_SOFTCAPPING", "PRE_CHECK", "platform_system", "patch_tokenizer", @@ -140,6 +141,8 @@ def patch_mistral_nemo_config(config): major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = False +HAS_FLASH_ATTENTION = False +HAS_FLASH_ATTENTION_SOFTCAPPING = False if major_version >= 8: SUPPORTS_BFLOAT16 = True @@ -148,6 +151,17 @@ def patch_mistral_nemo_config(config): try: from flash_attn.flash_attn_interface import flash_attn_cuda HAS_FLASH_ATTENTION = True + + # Also check for softcapping + from flash_attn import __version__ as flash_attn_version + HAS_FLASH_ATTENTION_SOFTCAPPING = Version(flash_attn_version) >= Version("2.6.3") + if not HAS_FLASH_ATTENTION_SOFTCAPPING: + print( + "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\ + "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\ + "To update flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) except: print( "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\ diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 0d21c47b0..ecd45fbce 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -56,6 +56,8 @@ Gemma2FlashAttention2 = Gemma2Attention pass +if HAS_FLASH_ATTENTION_SOFTCAPPING: + from flash_attn import flash_attn_func # [TODO] We must randomnly use torch.compile? # I checked the gradients and formulas and I'm sure it's correct. 
@@ -126,8 +128,31 @@ def Gemma2Attention_fast_forward( V = torch.cat([past_key_value[1], V], dim = 2) pass past_key_value = (K, V) if use_cache else None - - A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) + + # Only enable if the attention_mask is True + has_sliding_window = type(attention_mask) is bool and attention_mask is True + if HAS_FLASH_ATTENTION_SOFTCAPPING and type(attention_mask) is bool: + window = (-1, -1) + if has_sliding_window: + sw = getattr(self.config, "sliding_window", None) + sw = kv_seq_len if (sw is None or sw == "null") else sw + window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw) + pass + + Q = Q.transpose(1, 2) + K = K.transpose(1, 2) + V = V.transpose(1, 2) + A = flash_attn_func( + Q, K, V, + causal = True, + softcap = self.config.attn_logit_softcapping, + softmax_scale = self.config.query_pre_attn_scalar, + window_size = window, + ) + A = A.reshape(bsz, q_len, n_heads*head_dim) + else: + A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) + pass A = self.apply_o(self, A) return A, None, past_key_value pass @@ -205,6 +230,8 @@ def Gemma2DecoderLayer_fast_forward( from math import sqrt as math_sqrt KV_CACHE_INCREMENT = 256 # KV Cache update size torch_nn_functional_softmax = torch.nn.functional.softmax +torch_matmul = torch.matmul +torch_tanh = torch.tanh def Gemma2Attention_fast_forward_inference( self, @@ -322,13 +349,13 @@ def Gemma2Attention_fast_forward_inference( # if bsz == 1: Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows - A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) + A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched - A *= self.reciprocal_t; torch.tanh(A, out = A); A *= self.t; # Logit softcapping + A *= self.reciprocal_t; torch_tanh(A, out = A); A *= self.t; # Logit softcapping A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype) - A = torch.matmul(A, Vnn, out = Qn) + A = torch_matmul(A, Vnn, out = Qn) # else: # A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False) # pass @@ -359,19 +386,24 @@ def Gemma2Model_fast_forward_inference( bsz, q_len, hd = hidden_states.shape seq_len = past_key_values[0][0].shape[-2] if bsz != 1: - SWA = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (bsz, q_len), - hidden_states, - seq_len, - sliding_window = self.config.sliding_window, - ) - GA = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (bsz, q_len), - hidden_states, - seq_len, - ) + if HAS_FLASH_ATTENTION_SOFTCAPPING: + SWA = True + GA = False + else: + SWA = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (bsz, q_len), + hidden_states, + seq_len, + sliding_window = self.config.sliding_window, + ) + GA = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (bsz, q_len), + hidden_states, + seq_len, + ) + pass else: SWA = attention_mask GA = attention_mask diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 496a37e7a..b5244ed4e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -682,23 +682,28 @@ def LlamaModel_fast_forward( # Gemma2 has alternating SWA and global attn if IS_GEMMA2 and not hasattr(self, "SWA_mask"): - n = 
self.config.max_position_embeddings - # masked_fill is making stuff slower! - # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0) - # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) - from transformers.modeling_attn_mask_utils import AttentionMaskConverter - self.SWA_mask = AttentionMaskConverter( - is_causal = True, - sliding_window = self.config.sliding_window, - )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ - .squeeze(0).squeeze(0) - - self.GA_mask = AttentionMaskConverter( - is_causal = True, - )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ - .squeeze(0).squeeze(0) + if HAS_FLASH_ATTENTION_SOFTCAPPING: + self.SWA_mask = True + self.GA_mask = False + else: + n = self.config.max_position_embeddings + # masked_fill is making stuff slower! + # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0) + # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + self.SWA_mask = AttentionMaskConverter( + is_causal = True, + sliding_window = self.config.sliding_window, + )\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ + .squeeze(0).squeeze(0) + + self.GA_mask = AttentionMaskConverter( + is_causal = True, + )\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ + .squeeze(0).squeeze(0) + pass pass # Go through every layer! diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fb6d5c501..47152d676 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from ._utils import is_bfloat16_supported, HAS_FLASH_ATTENTION, HAS_FLASH_ATTENTION_SOFTCAPPING from .llama import FastLlamaModel, logger from .mistral import FastMistralModel from .qwen2 import FastQwen2Model @@ -233,6 +234,21 @@ def from_pretrained( f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\ f"to obtain the latest transformers build, then restart this session."\ ) + # Also check for softcapping support in flash-attn which is faster! 
+ if is_bfloat16_supported() and not HAS_FLASH_ATTENTION: + print( + "Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!\n"\ + "To install flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) + elif HAS_FLASH_ATTENTION and not HAS_FLASH_ATTENTION_SOFTCAPPING: + print( + "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\ + "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\ + "To update flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) + dispatch_model = FastGemma2Model elif model_type == "qwen2": dispatch_model = FastQwen2Model diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 254a68a42..462555f31 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -67,7 +67,6 @@ "codellama/CodeLlama-7b-hf", ), "unsloth/codellama-13b-bnb-4bit" : ( - "unsloth/codellama-13b", "codellama/CodeLlama-13b-hf", ), "unsloth/yi-6b-bnb-4bit" : ( From d326c988585d3c764bacefec0f92432c8a50e85a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:04:00 -0700 Subject: [PATCH 088/147] Update gemma2.py --- unsloth/models/gemma2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index ecd45fbce..a0880daef 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -131,6 +131,9 @@ def Gemma2Attention_fast_forward( # Only enable if the attention_mask is True has_sliding_window = type(attention_mask) is bool and attention_mask is True + print(HAS_FLASH_ATTENTION_SOFTCAPPING) + print(has_sliding_window) + print(attention_mask) if HAS_FLASH_ATTENTION_SOFTCAPPING and type(attention_mask) is bool: window = (-1, -1) if has_sliding_window: From 86b71c4ef5f90379b075d2ab97827b3c2537d501 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:07:47 -0700 Subject: [PATCH 089/147] Update gemma2.py --- unsloth/models/gemma2.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index a0880daef..2191a99c0 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -130,11 +130,8 @@ def Gemma2Attention_fast_forward( past_key_value = (K, V) if use_cache else None # Only enable if the attention_mask is True - has_sliding_window = type(attention_mask) is bool and attention_mask is True - print(HAS_FLASH_ATTENTION_SOFTCAPPING) - print(has_sliding_window) - print(attention_mask) - if HAS_FLASH_ATTENTION_SOFTCAPPING and type(attention_mask) is bool: + has_sliding_window = type(causal_mask) is bool and causal_mask is True + if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None: window = (-1, -1) if has_sliding_window: sw = getattr(self.config, "sliding_window", None) From cf1054c9bcc74bd659739f34444f46d8c79837cf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:11:23 -0700 Subject: [PATCH 090/147] Update gemma2.py --- unsloth/models/gemma2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 2191a99c0..d2bfb7899 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -139,6 +139,11 @@ def Gemma2Attention_fast_forward( window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw) pass + # FA uses 1 / sqrt for softmax_scale! 
+ if not hasattr(self, "_flash_attention_softmax_scale"): + self._flash_attention_softmax_scale = 1.0 / self.config.query_pre_attn_scalar**0.5 + pass + Q = Q.transpose(1, 2) K = K.transpose(1, 2) V = V.transpose(1, 2) @@ -146,7 +151,7 @@ def Gemma2Attention_fast_forward( Q, K, V, causal = True, softcap = self.config.attn_logit_softcapping, - softmax_scale = self.config.query_pre_attn_scalar, + softmax_scale = self._flash_attention_softmax_scale, window_size = window, ) A = A.reshape(bsz, q_len, n_heads*head_dim) From 8db7e809d0dd60fb0262b3d0c4db70d43100cce0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:11:35 -0700 Subject: [PATCH 091/147] Update gemma2.py --- unsloth/models/gemma2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index d2bfb7899..1cbaf5b16 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -141,7 +141,7 @@ def Gemma2Attention_fast_forward( # FA uses 1 / sqrt for softmax_scale! if not hasattr(self, "_flash_attention_softmax_scale"): - self._flash_attention_softmax_scale = 1.0 / self.config.query_pre_attn_scalar**0.5 + self._flash_attention_softmax_scale = 1.0 / (self.config.query_pre_attn_scalar**0.5) pass Q = Q.transpose(1, 2) From 0c932bc0bb79b405af6e4b623088c86bdc51e48e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Jul 2024 23:51:47 -0700 Subject: [PATCH 092/147] Update mapper.py --- unsloth/models/mapper.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462555f31..57ba67658 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -241,6 +241,14 @@ "unsloth/Mistral-Large-Instruct-2407-bnb-4bit" : ( "mistralai/Mistral-Large-Instruct-2407", ), + "unsloth/gemma-2-2b-bnb-4bit" : ( + "unsloth/gemma-2-2b", + "google/gemma-2-2b", + ), + "unsloth/gemma-2-2b-it-bnb-4bit" : ( + "unsloth/gemma-2-2b-it", + "google/gemma-2-2b-it", + ), } INT_TO_FLOAT_MAPPER = {} From 7af632075c201075e8469917169862d002bc8dc5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 08:53:20 -0700 Subject: [PATCH 093/147] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4c1271396..d843158d2 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News +- 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! - 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! 
[Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported From fdfe1f59f56935f1945269e5beda50969810158a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 08:54:22 -0700 Subject: [PATCH 094/147] Update _utils.py --- unsloth/models/_utils.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index c9bc6065f..fe3aa9040 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -192,20 +192,20 @@ def patch_mistral_nemo_config(config): # Get Xformers from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues -# if Version(xformers_version) >= Version("0.0.27"): -# raise ImportError( -# "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ -# "then press Disconnect Runtime and then Restart it.\n"\ -# "\n"\ -# "%%capture\n" -# "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" -# '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' -# '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ -# '\n'\ -# f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ -# 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' -# ) -# pass +if Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ + "then press Disconnect Runtime and then Restart it.\n"\ + "\n"\ + "%%capture\n" + "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" + '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' + '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ + '\n'\ + f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ + 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' + ) +pass if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): raise ImportError( @@ -217,12 +217,12 @@ def patch_mistral_nemo_config(config): f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ f"Please install xformers < 0.0.26 for torch = {torch_version}." ) -# elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): -# raise ImportError( -# f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ -# f"Please install xformers < 0.0.27 for torch = {torch_version}." -# ) -# pass +elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.27 for torch = {torch_version}." 
+ ) +pass from xformers._cpp_lib import _register_extensions try: From b85670de83fd8eb10a9ca61045361918ea35686b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 08:54:58 -0700 Subject: [PATCH 095/147] Gemma (#843) * bugs * Update _utils.py * flash-attn softcapping * Update gemma2.py * Update gemma2.py * Update gemma2.py * Update gemma2.py * Update mapper.py * Update README.md * Update _utils.py --- README.md | 1 + pyproject.toml | 22 ++++++------ unsloth/models/_utils.py | 14 ++++++++ unsloth/models/gemma2.py | 73 ++++++++++++++++++++++++++++++---------- unsloth/models/llama.py | 39 +++++++++++---------- unsloth/models/loader.py | 17 ++++++++++ unsloth/models/mapper.py | 8 +++++ 7 files changed, 128 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 4c1271396..d843158d2 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News +- 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! - 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported diff --git a/pyproject.toml b/pyproject.toml index 6777f7c26..e711325be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,7 +171,7 @@ colab-ampere-torch211 = [ "unsloth[cu121onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] colab-torch220 = [ "unsloth[huggingface]", @@ -184,7 +184,7 @@ colab-ampere-torch220 = [ "unsloth[cu121onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] colab-new = [ "packaging", @@ -215,7 +215,7 @@ colab-ampere = [ "unsloth[colab-ampere-torch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere = [ "unsloth[huggingface]", @@ -223,7 +223,7 @@ cu118-ampere = [ "unsloth[cu118only]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere = [ "unsloth[huggingface]", @@ -231,7 +231,7 @@ cu121-ampere = [ "unsloth[cu121only]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch211 = [ "unsloth[huggingface]", @@ -239,7 +239,7 @@ cu118-ampere-torch211 = [ "unsloth[cu118onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch211 = [ "unsloth[huggingface]", @@ -247,7 +247,7 @@ cu121-ampere-torch211 = [ "unsloth[cu121onlytorch211]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch220 = [ "unsloth[huggingface]", @@ -255,7 +255,7 @@ cu118-ampere-torch220 = [ "unsloth[cu118onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu121-ampere-torch220 = [ "unsloth[huggingface]", @@ -263,7 +263,7 @@ cu121-ampere-torch220 = [ "unsloth[cu121onlytorch220]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] cu118-ampere-torch230 = [ "unsloth[huggingface]", @@ -271,7 +271,7 @@ cu118-ampere-torch230 = [ "unsloth[cu118onlytorch230]", "packaging", "ninja", - "flash-attn", + 
"flash-attn>=2.6.3", ] cu121-ampere-torch230 = [ "unsloth[huggingface]", @@ -279,7 +279,7 @@ cu121-ampere-torch230 = [ "unsloth[cu121onlytorch230]", "packaging", "ninja", - "flash-attn", + "flash-attn>=2.6.3", ] [project.urls] diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 994f97ab7..fe3aa9040 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -21,6 +21,7 @@ "xformers_version", "__version__", "HAS_FLASH_ATTENTION", + "HAS_FLASH_ATTENTION_SOFTCAPPING", "PRE_CHECK", "platform_system", "patch_tokenizer", @@ -140,6 +141,8 @@ def patch_mistral_nemo_config(config): major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = False +HAS_FLASH_ATTENTION = False +HAS_FLASH_ATTENTION_SOFTCAPPING = False if major_version >= 8: SUPPORTS_BFLOAT16 = True @@ -148,6 +151,17 @@ def patch_mistral_nemo_config(config): try: from flash_attn.flash_attn_interface import flash_attn_cuda HAS_FLASH_ATTENTION = True + + # Also check for softcapping + from flash_attn import __version__ as flash_attn_version + HAS_FLASH_ATTENTION_SOFTCAPPING = Version(flash_attn_version) >= Version("2.6.3") + if not HAS_FLASH_ATTENTION_SOFTCAPPING: + print( + "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\ + "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\ + "To update flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) except: print( "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\ diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 0d21c47b0..1cbaf5b16 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -56,6 +56,8 @@ Gemma2FlashAttention2 = Gemma2Attention pass +if HAS_FLASH_ATTENTION_SOFTCAPPING: + from flash_attn import flash_attn_func # [TODO] We must randomnly use torch.compile? # I checked the gradients and formulas and I'm sure it's correct. @@ -126,8 +128,36 @@ def Gemma2Attention_fast_forward( V = torch.cat([past_key_value[1], V], dim = 2) pass past_key_value = (K, V) if use_cache else None - - A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) + + # Only enable if the attention_mask is True + has_sliding_window = type(causal_mask) is bool and causal_mask is True + if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None: + window = (-1, -1) + if has_sliding_window: + sw = getattr(self.config, "sliding_window", None) + sw = kv_seq_len if (sw is None or sw == "null") else sw + window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw) + pass + + # FA uses 1 / sqrt for softmax_scale! 
+ if not hasattr(self, "_flash_attention_softmax_scale"): + self._flash_attention_softmax_scale = 1.0 / (self.config.query_pre_attn_scalar**0.5) + pass + + Q = Q.transpose(1, 2) + K = K.transpose(1, 2) + V = V.transpose(1, 2) + A = flash_attn_func( + Q, K, V, + causal = True, + softcap = self.config.attn_logit_softcapping, + softmax_scale = self._flash_attention_softmax_scale, + window_size = window, + ) + A = A.reshape(bsz, q_len, n_heads*head_dim) + else: + A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) + pass A = self.apply_o(self, A) return A, None, past_key_value pass @@ -205,6 +235,8 @@ def Gemma2DecoderLayer_fast_forward( from math import sqrt as math_sqrt KV_CACHE_INCREMENT = 256 # KV Cache update size torch_nn_functional_softmax = torch.nn.functional.softmax +torch_matmul = torch.matmul +torch_tanh = torch.tanh def Gemma2Attention_fast_forward_inference( self, @@ -322,13 +354,13 @@ def Gemma2Attention_fast_forward_inference( # if bsz == 1: Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows - A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) + A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched - A *= self.reciprocal_t; torch.tanh(A, out = A); A *= self.t; # Logit softcapping + A *= self.reciprocal_t; torch_tanh(A, out = A); A *= self.t; # Logit softcapping A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype) - A = torch.matmul(A, Vnn, out = Qn) + A = torch_matmul(A, Vnn, out = Qn) # else: # A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False) # pass @@ -359,19 +391,24 @@ def Gemma2Model_fast_forward_inference( bsz, q_len, hd = hidden_states.shape seq_len = past_key_values[0][0].shape[-2] if bsz != 1: - SWA = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (bsz, q_len), - hidden_states, - seq_len, - sliding_window = self.config.sliding_window, - ) - GA = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (bsz, q_len), - hidden_states, - seq_len, - ) + if HAS_FLASH_ATTENTION_SOFTCAPPING: + SWA = True + GA = False + else: + SWA = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (bsz, q_len), + hidden_states, + seq_len, + sliding_window = self.config.sliding_window, + ) + GA = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (bsz, q_len), + hidden_states, + seq_len, + ) + pass else: SWA = attention_mask GA = attention_mask diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 496a37e7a..b5244ed4e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -682,23 +682,28 @@ def LlamaModel_fast_forward( # Gemma2 has alternating SWA and global attn if IS_GEMMA2 and not hasattr(self, "SWA_mask"): - n = self.config.max_position_embeddings - # masked_fill is making stuff slower! - # self. 
GA_mask = create_boolean_mask(n = n, sliding_window = 0) - # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) - from transformers.modeling_attn_mask_utils import AttentionMaskConverter - self.SWA_mask = AttentionMaskConverter( - is_causal = True, - sliding_window = self.config.sliding_window, - )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ - .squeeze(0).squeeze(0) - - self.GA_mask = AttentionMaskConverter( - is_causal = True, - )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ - .squeeze(0).squeeze(0) + if HAS_FLASH_ATTENTION_SOFTCAPPING: + self.SWA_mask = True + self.GA_mask = False + else: + n = self.config.max_position_embeddings + # masked_fill is making stuff slower! + # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0) + # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + self.SWA_mask = AttentionMaskConverter( + is_causal = True, + sliding_window = self.config.sliding_window, + )\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ + .squeeze(0).squeeze(0) + + self.GA_mask = AttentionMaskConverter( + is_causal = True, + )\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\ + .squeeze(0).squeeze(0) + pass pass # Go through every layer! diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f22e81efa..47152d676 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from ._utils import is_bfloat16_supported, HAS_FLASH_ATTENTION, HAS_FLASH_ATTENTION_SOFTCAPPING from .llama import FastLlamaModel, logger from .mistral import FastMistralModel from .qwen2 import FastQwen2Model @@ -42,6 +43,7 @@ def __get_model_name( FLOAT_TO_INT_MAPPER = None, ): + model_name = str(model_name) if not SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( @@ -232,6 +234,21 @@ def from_pretrained( f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\ f"to obtain the latest transformers build, then restart this session."\ ) + # Also check for softcapping support in flash-attn which is faster! 
+ if is_bfloat16_supported() and not HAS_FLASH_ATTENTION: + print( + "Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!\n"\ + "To install flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) + elif HAS_FLASH_ATTENTION and not HAS_FLASH_ATTENTION_SOFTCAPPING: + print( + "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\ + "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\ + "To update flash-attn, do the below:\n"\ + '\npip install --no-deps --upgrade "flash-attn>=2.6.3"' + ) + dispatch_model = FastGemma2Model elif model_type == "qwen2": dispatch_model = FastQwen2Model diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 462555f31..57ba67658 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -241,6 +241,14 @@ "unsloth/Mistral-Large-Instruct-2407-bnb-4bit" : ( "mistralai/Mistral-Large-Instruct-2407", ), + "unsloth/gemma-2-2b-bnb-4bit" : ( + "unsloth/gemma-2-2b", + "google/gemma-2-2b", + ), + "unsloth/gemma-2-2b-it-bnb-4bit" : ( + "unsloth/gemma-2-2b-it", + "google/gemma-2-2b-it", + ), } INT_TO_FLOAT_MAPPER = {} From dfca5516e74e60d52915d4287121d9ff8b80b314 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 09:50:11 -0700 Subject: [PATCH 096/147] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d843158d2..9407c452a 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News -- 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! +- 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! And uploaded [GGUF quants](https://huggingface.co/unsloth/gemma-2-it-GGUF) Try out [Chat interface](https://colab.research.google.com/drive/1i-8ESvtLRGNkkUQQr_-z_rcSAIo9c3lM?usp=sharing) for Gemma-2-2b Instruct! - 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported - 📣 NEW! 
[Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported From 2de142712d2dc8892d216dfca365dc3ba2707c43 Mon Sep 17 00:00:00 2001 From: XiaoYang Date: Thu, 1 Aug 2024 03:05:08 +0800 Subject: [PATCH 097/147] Fix ROPE extension issue and device mismatch (#840) * When an exception has been assigned using as target, it is cleared at the end of the except clause.(https://docs.python.org/3/reference/compound_stmts.html#the-try-statement) * Update loader.py * round up to extend rope size * inv_freq.device changed, make sure they are on the same device --------- Co-authored-by: xiaoyang Co-authored-by: Daniel Han --- unsloth/models/llama.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index b5244ed4e..e6c9280bc 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -14,6 +14,7 @@ import torch import gc +import math from typing import Optional, Tuple, List, Union from ._utils import * from ._utils import __version__ @@ -1036,7 +1037,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass @@ -1109,7 +1110,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # in FP32. They are applied (multiplied) in FP32 as well. self.current_rope_size = seq_len - t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device=self.inv_freq.device, dtype=torch.int64).float() freqs = torch.outer(t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -1158,7 +1159,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass From d0a7dcec1dd2b9f67c9be97d3b9ac05341b5fc9b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 12:09:33 -0700 Subject: [PATCH 098/147] Update gemma.py --- unsloth/models/gemma.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index e3f1e615d..a0894ec7a 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -14,6 +14,7 @@ from .llama import * from ._utils import __version__ +import math try: from transformers.models.gemma.modeling_gemma import ( @@ -256,7 +257,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass From 4e570be9ae4ced8cdc64e498125708e34942befc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 31 Jul 2024 12:10:33 -0700 Subject: [PATCH 099/147] Fix RoPE 
extension (#846) * bugs * Update _utils.py * flash-attn softcapping * Update gemma2.py * Update gemma2.py * Update gemma2.py * Update gemma2.py * Update mapper.py * Update README.md * Update _utils.py * Fix ROPE extension issue and device mismatch (#840) * When an exception has been assigned using as target, it is cleared at the end of the except clause.(https://docs.python.org/3/reference/compound_stmts.html#the-try-statement) * Update loader.py * round up to extend rope size * inv_freq.device changed, make sure they are on the same device --------- Co-authored-by: xiaoyang Co-authored-by: Daniel Han * Update gemma.py --------- Co-authored-by: XiaoYang Co-authored-by: xiaoyang --- unsloth/models/gemma.py | 3 ++- unsloth/models/llama.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index e3f1e615d..a0894ec7a 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -14,6 +14,7 @@ from .llama import * from ._utils import __version__ +import math try: from transformers.models.gemma.modeling_gemma import ( @@ -256,7 +257,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index b5244ed4e..e6c9280bc 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -14,6 +14,7 @@ import torch import gc +import math from typing import Optional, Tuple, List, Union from ._utils import * from ._utils import __version__ @@ -1036,7 +1037,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass @@ -1109,7 +1110,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): # in FP32. They are applied (multiplied) in FP32 as well. 
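The switch from `int(round(...))` to `math.ceil(...)` above is the heart of this fix: round-to-nearest can round the requested length down, leaving the cos/sin cache shorter than the sequence it must cover. A standalone sketch (toy sequence lengths, same 8192-token growth step as the patch) shows the difference:

```python
import math

STEP = 8192  # growth increment used by extend_rope_embedding

def old_target(seq_len):
    # Previous behaviour: round to the nearest multiple, which can round DOWN.
    return int(round(seq_len / STEP)) * STEP

def new_target(seq_len):
    # Fixed behaviour: always round UP, so the cache covers seq_len.
    return math.ceil(seq_len / STEP) * STEP

for seq_len in (8192, 9000, 12289, 16384):
    print(seq_len, old_target(seq_len), new_target(seq_len))
# 9000  -> old: 8192 (shorter than the sequence!), new: 16384
# 12289 -> old: 16384, new: 16384
```

Any length in the lower half of a fresh 8192 block hit the old path's undersized cache, which is what the "round up to extend rope size" bullet in the commit message addresses.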
self.current_rope_size = seq_len - t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float() + t = torch.arange(self.current_rope_size, device=self.inv_freq.device, dtype=torch.int64).float() freqs = torch.outer(t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation @@ -1158,7 +1159,7 @@ def forward(self, x, position_ids=None, seq_len=None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = int(round(seq_len / 8192)) * 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass From f65cc9877c9ee42b9c6719a4fe168b00abceb095 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 4 Aug 2024 11:28:21 -0700 Subject: [PATCH 100/147] Update pyproject.toml --- pyproject.toml | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e711325be..fdc098854 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,54 +50,34 @@ huggingface = [ "hf-transfer", ] cu118only = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.22.post7", ] cu121only = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.22.post7", ] cu118onlytorch211 = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.23", ] cu121onlytorch211 = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.23", ] cu118onlytorch212 = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + 
"xformers==0.0.23.post1", ] cu121onlytorch212 = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.23.post1", ] cu118onlytorch220 = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.24", ] cu121onlytorch220 = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.24", ] cu118onlytorch230 = [ - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.26.post1", ] cu121onlytorch230 = [ - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", - "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers==0.0.26.post1", ] cu118 = [ From 16b6932c43baaf0097943ab14321a8f3c1bc6415 Mon Sep 17 00:00:00 2001 From: moontidef <53668275+relic-yuexi@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:45:34 +0800 Subject: [PATCH 101/147] fix: fix config.torch_dtype bug (#874) fix the bug #404 and the bug https://github.com/hiyouga/LLaMA-Factory/issues/4698#issue-2393500878 --- unsloth/models/llama.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index e6c9280bc..445e5026f 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -570,7 +570,14 @@ def LlamaModel_fast_forward( # Embed positions if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - + + if self.config.torch_dtype == "float32": + self.config.torch_dtype = torch.float32 + elif self.config.torch_dtype == "bfloat16": + self.config.torch_dtype = torch.bfloat16 + elif self.config.torch_dtype == "float16": + self.config.torch_dtype = torch.float16 + inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma From 46b434869847e202e1cf594ab8466819cb398e7a Mon Sep 17 00:00:00 2001 From: emuchogu Date: Mon, 5 Aug 2024 09:45:51 
+0300 Subject: [PATCH 102/147] pascal support (#870) Co-authored-by: Edward Muchogu --- README.md | 389 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 389 insertions(+) diff --git a/README.md b/README.md index 9407c452a..4e29a43ec 100644 --- a/README.md +++ b/README.md @@ -436,6 +436,395 @@ Two Tesla T4s on Kaggle ![](https://i.ibb.co/sJ7RhGG/image-41.png)
+## NVIDIA Pascal Support + +Support for NVIDIA Pascal family of cards, specifically the P40 and P100. + +### Setup Guide + +1. Create three files (`Dockerfile`, `unsloth_env_file.yml`, and `docker-compose.yml`) with the contents provided below. +2. Ensure Docker and Docker Compose are installed on your system. +3. Install the NVIDIA Container Toolkit for GPU support if not already done. +4. Place all three files in the same directory. +5. Open a terminal and navigate to the directory containing these files. +6. Run the following command to build and start the container: + + ``` + docker-compose up --build + ``` + +7. Once the container is running, access Jupyter Lab by opening a web browser and navigating to `http://localhost:8888`. + +### Configuration Files + +#### 1. Dockerfile + +```dockerfile +# Stage 1: Base image with system dependencies +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as base + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + vim \ + curl \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Install Miniconda only if it's not already installed +RUN if [ ! -d "/opt/conda" ]; then \ + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ + bash miniconda.sh -b -p /opt/conda && \ + rm miniconda.sh; \ + fi + +# Set path to conda +ENV PATH /opt/conda/bin:$PATH + +# Set path to conda +ENV PATH /opt/conda/bin:$PATH + +# Stage 2: Python environment setup +FROM base as python-env + +COPY unsloth_env_file.yml unsloth_env_file.yml + +RUN conda env create -f unsloth_env_file.yml + +SHELL ["conda", "run", "-n", "unsloth_env", "/bin/bash", "-c"] + +# Stage 3: Final image +FROM python-env as final + +# Install Unsloth (This step is separate because it's likely to change more frequently) +RUN pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" + +ENV PATH /usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Set the working directory +WORKDIR /workspace + +# Set the default command to run Jupyter Lab +CMD ["conda", "run", "--no-capture-output", "-n", "unsloth_env", "jupyter", "lab", "--ip=0.0.0.0", "--no-browser", "--allow-root", "--NotebookApp.token=''", "--NotebookApp.password=''"] +``` + +#### 2. 
unsloth_env_file.yml + +```yaml +name: unsloth_env +channels: + - xformers + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - aiohttp=3.9.5=py310h5eee18b_0 + - aiosignal=1.2.0=pyhd3eb1b0_0 + - anyio=4.2.0=py310h06a4308_0 + - argon2-cffi=21.3.0=pyhd3eb1b0_0 + - argon2-cffi-bindings=21.2.0=py310h7f8727e_0 + - arrow-cpp=16.1.0=hc1eb8f0_0 + - async-lru=2.0.4=pyhd8ed1ab_0 + - async-timeout=4.0.3=py310h06a4308_0 + - attrs=23.1.0=py310h06a4308_0 + - aws-c-auth=0.6.19=h5eee18b_0 + - aws-c-cal=0.5.20=hdbd6064_0 + - aws-c-common=0.8.5=h5eee18b_0 + - aws-c-compression=0.2.16=h5eee18b_0 + - aws-c-event-stream=0.2.15=h6a678d5_0 + - aws-c-http=0.6.25=h5eee18b_0 + - aws-c-io=0.13.10=h5eee18b_0 + - aws-c-mqtt=0.7.13=h5eee18b_0 + - aws-c-s3=0.1.51=hdbd6064_0 + - aws-c-sdkutils=0.1.6=h5eee18b_0 + - aws-checksums=0.1.13=h5eee18b_0 + - aws-crt-cpp=0.18.16=h6a678d5_0 + - aws-sdk-cpp=1.10.55=h721c034_0 + - babel=2.14.0=pyhd8ed1ab_0 + - beautifulsoup4=4.12.3=py310h06a4308_0 + - blas=1.0=mkl + - bleach=4.1.0=pyhd3eb1b0_0 + - boost-cpp=1.82.0=hdb19cb5_2 + - bottleneck=1.3.7=py310ha9d4c09_0 + - brotli-python=1.0.9=py310h6a678d5_8 + - bzip2=1.0.8=h5eee18b_6 + - c-ares=1.19.1=h5eee18b_0 + - ca-certificates=2024.7.4=hbcca054_0 + - certifi=2024.7.4=pyhd8ed1ab_0 + - cffi=1.16.0=py310h5eee18b_1 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - cuda-cudart=11.8.89=0 + - cuda-cupti=11.8.87=0 + - cuda-libraries=11.8.0=0 + - cuda-nvrtc=11.8.89=0 + - cuda-nvtx=11.8.86=0 + - cuda-runtime=11.8.0=0 + - cuda-version=11.8=hcce14f8_3 + - cudatoolkit=11.8.0=h6a678d5_0 + - datasets=2.19.1=py310h06a4308_0 + - debugpy=1.6.7=py310h6a678d5_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - defusedxml=0.7.1=pyhd3eb1b0_0 + - dill=0.3.8=py310h06a4308_0 + - entrypoints=0.4=py310h06a4308_0 + - ffmpeg=4.3=hf484d3e_0 + - filelock=3.13.1=py310h06a4308_0 + - freetype=2.12.1=h4a9f257_0 + - frozenlist=1.4.0=py310h5eee18b_0 + - fsspec=2024.3.1=py310h06a4308_0 + - gflags=2.2.2=h6a678d5_1 + - glog=0.5.0=h6a678d5_1 + - gmp=6.2.1=h295c915_3 + - gmpy2=2.1.2=py310heeb90bb_0 + - gnutls=3.6.15=he1e5248_0 + - h11=0.14.0=pyhd8ed1ab_0 + - h2=4.1.0=pyhd8ed1ab_0 + - hpack=4.0.0=pyh9f0ad1d_0 + - httpcore=1.0.5=pyhd8ed1ab_0 + - httpx=0.27.0=pyhd8ed1ab_0 + - hyperframe=6.0.1=pyhd8ed1ab_0 + - icu=73.1=h6a678d5_0 + - idna=3.7=py310h06a4308_0 + - importlib-metadata=7.0.1=py310h06a4308_0 + - importlib_metadata=7.0.1=hd8ed1ab_0 + - importlib_resources=6.4.0=pyhd8ed1ab_0 + - intel-openmp=2023.1.0=hdb19cb5_46306 + - ipykernel=6.28.0=py310h06a4308_0 + - ipython_genutils=0.2.0=pyhd3eb1b0_1 + - jedi=0.19.1=py310h06a4308_0 + - jinja2=3.1.4=py310h06a4308_0 + - jpeg=9e=h5eee18b_2 + - json5=0.9.25=pyhd8ed1ab_0 + - jsonschema=4.19.2=py310h06a4308_0 + - jsonschema-specifications=2023.7.1=py310h06a4308_0 + - jupyter-lsp=2.2.5=pyhd8ed1ab_0 + - jupyter_client=7.4.9=py310h06a4308_0 + - jupyter_core=5.7.2=py310h06a4308_0 + - jupyter_events=0.10.0=py310h06a4308_0 + - jupyter_server=2.14.1=py310h06a4308_0 + - jupyter_server_terminals=0.4.4=py310h06a4308_1 + - jupyterlab=4.2.4=pyhd8ed1ab_0 + - jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 + - jupyterlab_server=2.27.3=pyhd8ed1ab_0 + - krb5=1.20.1=h143b758_1 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libabseil=20240116.2=cxx17_h6a678d5_0 + - libboost=1.82.0=h109eef0_2 + - libbrotlicommon=1.0.9=h5eee18b_8 + - libbrotlidec=1.0.9=h5eee18b_8 + - libbrotlienc=1.0.9=h5eee18b_8 + - libcublas=11.11.3.6=0 + - 
libcufft=10.9.0.58=0 + - libcufile=1.9.1.3=0 + - libcurand=10.3.5.147=0 + - libcurl=8.7.1=h251f7ec_0 + - libcusolver=11.4.1.48=0 + - libcusparse=11.7.5.86=0 + - libdeflate=1.17=h5eee18b_1 + - libedit=3.1.20230828=h5eee18b_0 + - libev=4.33=h7f8727e_1 + - libevent=2.1.12=hdbd6064_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=14.1.0=h77fa898_0 + - libgomp=14.1.0=h77fa898_0 + - libgrpc=1.62.2=h2d74bed_0 + - libiconv=1.16=h5eee18b_3 + - libidn2=2.3.4=h5eee18b_0 + - libjpeg-turbo=2.0.0=h9bf148f_0 + - libnghttp2=1.57.0=h2d74bed_0 + - libnpp=11.8.0.86=0 + - libnvjpeg=11.9.0.86=0 + - libpng=1.6.39=h5eee18b_0 + - libprotobuf=4.25.3=he621ea3_0 + - libsodium=1.0.18=h7b6447c_0 + - libssh2=1.11.0=h251f7ec_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libthrift=0.15.0=h1795dd8_2 + - libtiff=4.5.1=h6a678d5_0 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp-base=1.3.2=h5eee18b_0 + - llvm-openmp=14.0.6=h9e868ea_0 + - lz4-c=1.9.4=h6a678d5_1 + - markupsafe=2.1.3=py310h5eee18b_0 + - mistune=2.0.4=py310h06a4308_0 + - mkl=2023.1.0=h213fc3f_46344 + - mkl-service=2.4.0=py310h5eee18b_1 + - mkl_fft=1.3.8=py310h5eee18b_0 + - mkl_random=1.2.4=py310hdb19cb5_0 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpmath=1.3.0=py310h06a4308_0 + - multidict=6.0.4=py310h5eee18b_0 + - multiprocess=0.70.15=py310h06a4308_0 + - nb_conda_kernels=2.3.1=py310h06a4308_0 + - nbclassic=1.1.0=py310h06a4308_0 + - nbclient=0.8.0=py310h06a4308_0 + - nbconvert=7.10.0=py310h06a4308_0 + - nbformat=5.9.2=py310h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - nest-asyncio=1.6.0=py310h06a4308_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.3=py310h06a4308_0 + - notebook=6.5.7=py310h06a4308_0 + - notebook-shim=0.2.3=py310h06a4308_0 + - numexpr=2.8.7=py310h85018f9_0 + - numpy=1.26.4=py310h5f9d8c6_0 + - numpy-base=1.26.4=py310hb5e798b_0 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.4.0=h9ca470c_2 + - openssl=3.3.1=h4bc722e_2 + - orc=2.0.1=h2d29ad5_0 + - overrides=7.4.0=py310h06a4308_0 + - packaging=24.1=py310h06a4308_0 + - pandas=2.2.2=py310h6a678d5_0 + - pandocfilters=1.5.0=pyhd3eb1b0_0 + - pillow=10.4.0=py310h5eee18b_0 + - pip=24.0=py310h06a4308_0 + - platformdirs=3.10.0=py310h06a4308_0 + - prometheus_client=0.14.1=py310h06a4308_0 + - prompt_toolkit=3.0.43=hd3eb1b0_0 + - psutil=5.9.0=py310h5eee18b_0 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pyarrow=16.1.0=py310h1128e8f_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pysocks=1.7.1=py310h06a4308_0 + - python=3.10.14=h955ad1f_1 + - python-dateutil=2.9.0post0=py310h06a4308_2 + - python-fastjsonschema=2.16.2=py310h06a4308_0 + - python-json-logger=2.0.7=py310h06a4308_0 + - python-tzdata=2023.3=pyhd3eb1b0_0 + - python-xxhash=2.0.2=py310h5eee18b_1 + - pytorch=2.1.0=py3.10_cuda11.8_cudnn8.7.0_0 + - pytorch-cuda=11.8=h7e8668a_5 + - pytorch-mutex=1.0=cuda + - pytz=2024.1=py310h06a4308_0 + - pyyaml=6.0.1=py310h5eee18b_0 + - pyzmq=24.0.1=py310h5eee18b_0 + - re2=2022.04.01=h295c915_0 + - readline=8.2=h5eee18b_0 + - referencing=0.30.2=py310h06a4308_0 + - regex=2023.10.3=py310h5eee18b_0 + - requests=2.32.3=py310h06a4308_0 + - rfc3339-validator=0.1.4=py310h06a4308_0 + - rfc3986-validator=0.1.1=py310h06a4308_0 + - rpds-py=0.10.6=py310hb02cf49_0 + - s2n=1.3.27=hdbd6064_0 + - safetensors=0.4.2=py310ha89cbab_1 + - send2trash=1.8.2=py310h06a4308_0 + - setuptools=69.5.1=py310h06a4308_0 + - six=1.16.0=pyhd3eb1b0_1 + - snappy=1.1.10=h6a678d5_1 + - sniffio=1.3.0=py310h06a4308_0 + - soupsieve=2.5=py310h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - 
stack_data=0.2.0=pyhd3eb1b0_0 + - sympy=1.12=py310h06a4308_0 + - tbb=2021.8.0=hdb19cb5_0 + - terminado=0.17.1=py310h06a4308_0 + - tinycss2=1.2.1=py310h06a4308_0 + - tk=8.6.14=h39e8969_0 + - tokenizers=0.19.1=py310hff361bb_0 + - tomli=2.0.1=pyhd8ed1ab_0 + - torchaudio=2.1.0=py310_cu118 + - torchtriton=2.1.0=py310 + - torchvision=0.16.0=py310_cu118 + - tornado=6.4.1=py310h5eee18b_0 + - tqdm=4.66.4=py310h2f386ee_0 + - traitlets=5.14.3=py310h06a4308_0 + - typing-extensions=4.11.0=py310h06a4308_0 + - typing_extensions=4.11.0=py310h06a4308_0 + - tzdata=2024a=h04d1e81_0 + - urllib3=2.2.2=py310h06a4308_0 + - utf8proc=2.6.1=h5eee18b_1 + - webencodings=0.5.1=py310h06a4308_1 + - websocket-client=1.8.0=py310h06a4308_0 + - wheel=0.43.0=py310h06a4308_0 + - xformers=0.0.22.post7=py310_cu11.8.0_pyt2.1.0 + - xxhash=0.8.0=h7f8727e_3 + - xz=5.4.6=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - yarl=1.9.3=py310h5eee18b_0 + - zeromq=4.3.5=h6a678d5_0 + - zipp=3.17.0=py310h06a4308_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.5=hc292b87_2 + - pip: + - accelerate==0.33.0 + - asttokens==2.4.1 + - bitsandbytes==0.43.2 + - comm==0.2.2 + - docstring-parser==0.16 + - exceptiongroup==1.2.2 + - executing==2.0.1 + - gguf==0.9.1 + - hf-transfer==0.1.8 + - huggingface-hub==0.24.2 + - iprogress==0.4 + - ipython==8.26.0 + - ipywidgets==8.1.3 + - jupyterlab-widgets==3.0.11 + - markdown-it-py==3.0.0 + - matplotlib-inline==0.1.7 + - mdurl==0.1.2 + - parso==0.8.4 + - peft==0.12.0 + - pexpect==4.9.0 + - prompt-toolkit==3.0.47 + - protobuf==3.20.3 + - pure-eval==0.2.3 + - pygments==2.18.0 + - rich==13.7.1 + - sentencepiece==0.2.0 + - shtab==1.7.1 + - stack-data==0.6.3 + - transformers==4.43.3 + - trl==0.8.6 + - tyro==0.8.5 + - wcwidth==0.2.13 + - widgetsnbextension==4.0.11 + ``` + +#### 3. docker-compose.yml + +```yaml +version: '3.8' + +services: + unsloth-env: + environment: + - NVIDIA_VISIBLE_DEVICES=all + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + volumes: + - ./cache:/root/.cache + - ./workspace:/workspace + working_dir: /workspace + ports: + - "8888:8888" # For Jupyter Lab + tty: true + stdin_open: true + build: + context: . 
+ dockerfile: Dockerfile +``` + + ### Thank You to - [HuyNguyen-hust](https://github.com/HuyNguyen-hust) for making [RoPE Embeddings 28% faster](https://github.com/unslothai/unsloth/pull/238) - [RandomInternetPreson](https://github.com/RandomInternetPreson) for confirming WSL support From 28dea9ac9550b136d8493d3b3ea57c859f20aab1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 4 Aug 2024 23:49:35 -0700 Subject: [PATCH 103/147] Update llama.py --- unsloth/models/llama.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 445e5026f..cec743e59 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -570,14 +570,7 @@ def LlamaModel_fast_forward( # Embed positions if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - - if self.config.torch_dtype == "float32": - self.config.torch_dtype = torch.float32 - elif self.config.torch_dtype == "bfloat16": - self.config.torch_dtype = torch.bfloat16 - elif self.config.torch_dtype == "float16": - self.config.torch_dtype = torch.float16 - + inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma @@ -1580,6 +1573,30 @@ def from_pretrained( internal_model = internal_model.model pass internal_model._saved_temp_tokenizer = tokenizer + + # Also fix torch_dtype + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "config"): + if internal_model.config.torch_dtype == "float32": + internal_model.config.torch_dtype = torch.float32 + elif internal_model.config.torch_dtype == "bfloat16": + internal_model.config.torch_dtype = torch.bfloat16 + elif internal_model.config.torch_dtype == "float16": + internal_model.config.torch_dtype = torch.float16 + pass + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "config"): + if internal_model.config.torch_dtype == "float32": + internal_model.config.torch_dtype = torch.float32 + elif internal_model.config.torch_dtype == "bfloat16": + internal_model.config.torch_dtype = torch.bfloat16 + elif internal_model.config.torch_dtype == "float16": + internal_model.config.torch_dtype = torch.float16 + pass + pass return model, tokenizer pass From 291bc6e25495070a9118bb0618ba6172abb11970 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 4 Aug 2024 23:50:40 -0700 Subject: [PATCH 104/147] Update README.md --- README.md | 389 ------------------------------------------------------ 1 file changed, 389 deletions(-) diff --git a/README.md b/README.md index 4e29a43ec..9407c452a 100644 --- a/README.md +++ b/README.md @@ -436,395 +436,6 @@ Two Tesla T4s on Kaggle ![](https://i.ibb.co/sJ7RhGG/image-41.png)
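The `torch_dtype` normalisation that the `Update llama.py` patch above moves into `from_pretrained` boils down to mapping the string form Hugging Face configs sometimes carry back to a real `torch.dtype`. A compact, illustrative equivalent (the helper name here is made up, not part of the patch):

```python
import torch

_STR_TO_DTYPE = {
    "float32":  torch.float32,
    "bfloat16": torch.bfloat16,
    "float16":  torch.float16,
}

def normalize_torch_dtype(config):
    # config.torch_dtype can be a plain string after (de)serialisation;
    # rewrite it in place so later dtype casts receive a torch.dtype, not a str.
    if isinstance(config.torch_dtype, str):
        config.torch_dtype = _STR_TO_DTYPE.get(config.torch_dtype, config.torch_dtype)
    return config
```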
-## NVIDIA Pascal Support - -Support for NVIDIA Pascal family of cards, specifically the P40 and P100. - -### Setup Guide - -1. Create three files (`Dockerfile`, `unsloth_env_file.yml`, and `docker-compose.yml`) with the contents provided below. -2. Ensure Docker and Docker Compose are installed on your system. -3. Install the NVIDIA Container Toolkit for GPU support if not already done. -4. Place all three files in the same directory. -5. Open a terminal and navigate to the directory containing these files. -6. Run the following command to build and start the container: - - ``` - docker-compose up --build - ``` - -7. Once the container is running, access Jupyter Lab by opening a web browser and navigating to `http://localhost:8888`. - -### Configuration Files - -#### 1. Dockerfile - -```dockerfile -# Stage 1: Base image with system dependencies -FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as base - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - build-essential \ - git \ - vim \ - curl \ - wget \ - && rm -rf /var/lib/apt/lists/* - -# Install Miniconda only if it's not already installed -RUN if [ ! -d "/opt/conda" ]; then \ - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/conda && \ - rm miniconda.sh; \ - fi - -# Set path to conda -ENV PATH /opt/conda/bin:$PATH - -# Set path to conda -ENV PATH /opt/conda/bin:$PATH - -# Stage 2: Python environment setup -FROM base as python-env - -COPY unsloth_env_file.yml unsloth_env_file.yml - -RUN conda env create -f unsloth_env_file.yml - -SHELL ["conda", "run", "-n", "unsloth_env", "/bin/bash", "-c"] - -# Stage 3: Final image -FROM python-env as final - -# Install Unsloth (This step is separate because it's likely to change more frequently) -RUN pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" - -ENV PATH /usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH - -# Set the working directory -WORKDIR /workspace - -# Set the default command to run Jupyter Lab -CMD ["conda", "run", "--no-capture-output", "-n", "unsloth_env", "jupyter", "lab", "--ip=0.0.0.0", "--no-browser", "--allow-root", "--NotebookApp.token=''", "--NotebookApp.password=''"] -``` - -#### 2. 
unsloth_env_file.yml - -```yaml -name: unsloth_env -channels: - - xformers - - pytorch - - nvidia - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - aiohttp=3.9.5=py310h5eee18b_0 - - aiosignal=1.2.0=pyhd3eb1b0_0 - - anyio=4.2.0=py310h06a4308_0 - - argon2-cffi=21.3.0=pyhd3eb1b0_0 - - argon2-cffi-bindings=21.2.0=py310h7f8727e_0 - - arrow-cpp=16.1.0=hc1eb8f0_0 - - async-lru=2.0.4=pyhd8ed1ab_0 - - async-timeout=4.0.3=py310h06a4308_0 - - attrs=23.1.0=py310h06a4308_0 - - aws-c-auth=0.6.19=h5eee18b_0 - - aws-c-cal=0.5.20=hdbd6064_0 - - aws-c-common=0.8.5=h5eee18b_0 - - aws-c-compression=0.2.16=h5eee18b_0 - - aws-c-event-stream=0.2.15=h6a678d5_0 - - aws-c-http=0.6.25=h5eee18b_0 - - aws-c-io=0.13.10=h5eee18b_0 - - aws-c-mqtt=0.7.13=h5eee18b_0 - - aws-c-s3=0.1.51=hdbd6064_0 - - aws-c-sdkutils=0.1.6=h5eee18b_0 - - aws-checksums=0.1.13=h5eee18b_0 - - aws-crt-cpp=0.18.16=h6a678d5_0 - - aws-sdk-cpp=1.10.55=h721c034_0 - - babel=2.14.0=pyhd8ed1ab_0 - - beautifulsoup4=4.12.3=py310h06a4308_0 - - blas=1.0=mkl - - bleach=4.1.0=pyhd3eb1b0_0 - - boost-cpp=1.82.0=hdb19cb5_2 - - bottleneck=1.3.7=py310ha9d4c09_0 - - brotli-python=1.0.9=py310h6a678d5_8 - - bzip2=1.0.8=h5eee18b_6 - - c-ares=1.19.1=h5eee18b_0 - - ca-certificates=2024.7.4=hbcca054_0 - - certifi=2024.7.4=pyhd8ed1ab_0 - - cffi=1.16.0=py310h5eee18b_1 - - charset-normalizer=3.3.2=pyhd3eb1b0_0 - - cuda-cudart=11.8.89=0 - - cuda-cupti=11.8.87=0 - - cuda-libraries=11.8.0=0 - - cuda-nvrtc=11.8.89=0 - - cuda-nvtx=11.8.86=0 - - cuda-runtime=11.8.0=0 - - cuda-version=11.8=hcce14f8_3 - - cudatoolkit=11.8.0=h6a678d5_0 - - datasets=2.19.1=py310h06a4308_0 - - debugpy=1.6.7=py310h6a678d5_0 - - decorator=5.1.1=pyhd3eb1b0_0 - - defusedxml=0.7.1=pyhd3eb1b0_0 - - dill=0.3.8=py310h06a4308_0 - - entrypoints=0.4=py310h06a4308_0 - - ffmpeg=4.3=hf484d3e_0 - - filelock=3.13.1=py310h06a4308_0 - - freetype=2.12.1=h4a9f257_0 - - frozenlist=1.4.0=py310h5eee18b_0 - - fsspec=2024.3.1=py310h06a4308_0 - - gflags=2.2.2=h6a678d5_1 - - glog=0.5.0=h6a678d5_1 - - gmp=6.2.1=h295c915_3 - - gmpy2=2.1.2=py310heeb90bb_0 - - gnutls=3.6.15=he1e5248_0 - - h11=0.14.0=pyhd8ed1ab_0 - - h2=4.1.0=pyhd8ed1ab_0 - - hpack=4.0.0=pyh9f0ad1d_0 - - httpcore=1.0.5=pyhd8ed1ab_0 - - httpx=0.27.0=pyhd8ed1ab_0 - - hyperframe=6.0.1=pyhd8ed1ab_0 - - icu=73.1=h6a678d5_0 - - idna=3.7=py310h06a4308_0 - - importlib-metadata=7.0.1=py310h06a4308_0 - - importlib_metadata=7.0.1=hd8ed1ab_0 - - importlib_resources=6.4.0=pyhd8ed1ab_0 - - intel-openmp=2023.1.0=hdb19cb5_46306 - - ipykernel=6.28.0=py310h06a4308_0 - - ipython_genutils=0.2.0=pyhd3eb1b0_1 - - jedi=0.19.1=py310h06a4308_0 - - jinja2=3.1.4=py310h06a4308_0 - - jpeg=9e=h5eee18b_2 - - json5=0.9.25=pyhd8ed1ab_0 - - jsonschema=4.19.2=py310h06a4308_0 - - jsonschema-specifications=2023.7.1=py310h06a4308_0 - - jupyter-lsp=2.2.5=pyhd8ed1ab_0 - - jupyter_client=7.4.9=py310h06a4308_0 - - jupyter_core=5.7.2=py310h06a4308_0 - - jupyter_events=0.10.0=py310h06a4308_0 - - jupyter_server=2.14.1=py310h06a4308_0 - - jupyter_server_terminals=0.4.4=py310h06a4308_1 - - jupyterlab=4.2.4=pyhd8ed1ab_0 - - jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 - - jupyterlab_server=2.27.3=pyhd8ed1ab_0 - - krb5=1.20.1=h143b758_1 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.38=h1181459_1 - - lerc=3.0=h295c915_0 - - libabseil=20240116.2=cxx17_h6a678d5_0 - - libboost=1.82.0=h109eef0_2 - - libbrotlicommon=1.0.9=h5eee18b_8 - - libbrotlidec=1.0.9=h5eee18b_8 - - libbrotlienc=1.0.9=h5eee18b_8 - - libcublas=11.11.3.6=0 - - 
libcufft=10.9.0.58=0 - - libcufile=1.9.1.3=0 - - libcurand=10.3.5.147=0 - - libcurl=8.7.1=h251f7ec_0 - - libcusolver=11.4.1.48=0 - - libcusparse=11.7.5.86=0 - - libdeflate=1.17=h5eee18b_1 - - libedit=3.1.20230828=h5eee18b_0 - - libev=4.33=h7f8727e_1 - - libevent=2.1.12=hdbd6064_1 - - libffi=3.4.4=h6a678d5_1 - - libgcc-ng=14.1.0=h77fa898_0 - - libgomp=14.1.0=h77fa898_0 - - libgrpc=1.62.2=h2d74bed_0 - - libiconv=1.16=h5eee18b_3 - - libidn2=2.3.4=h5eee18b_0 - - libjpeg-turbo=2.0.0=h9bf148f_0 - - libnghttp2=1.57.0=h2d74bed_0 - - libnpp=11.8.0.86=0 - - libnvjpeg=11.9.0.86=0 - - libpng=1.6.39=h5eee18b_0 - - libprotobuf=4.25.3=he621ea3_0 - - libsodium=1.0.18=h7b6447c_0 - - libssh2=1.11.0=h251f7ec_0 - - libstdcxx-ng=11.2.0=h1234567_1 - - libtasn1=4.19.0=h5eee18b_0 - - libthrift=0.15.0=h1795dd8_2 - - libtiff=4.5.1=h6a678d5_0 - - libunistring=0.9.10=h27cfd23_0 - - libuuid=1.41.5=h5eee18b_0 - - libwebp-base=1.3.2=h5eee18b_0 - - llvm-openmp=14.0.6=h9e868ea_0 - - lz4-c=1.9.4=h6a678d5_1 - - markupsafe=2.1.3=py310h5eee18b_0 - - mistune=2.0.4=py310h06a4308_0 - - mkl=2023.1.0=h213fc3f_46344 - - mkl-service=2.4.0=py310h5eee18b_1 - - mkl_fft=1.3.8=py310h5eee18b_0 - - mkl_random=1.2.4=py310hdb19cb5_0 - - mpc=1.1.0=h10f8cd9_1 - - mpfr=4.0.2=hb69a4c5_1 - - mpmath=1.3.0=py310h06a4308_0 - - multidict=6.0.4=py310h5eee18b_0 - - multiprocess=0.70.15=py310h06a4308_0 - - nb_conda_kernels=2.3.1=py310h06a4308_0 - - nbclassic=1.1.0=py310h06a4308_0 - - nbclient=0.8.0=py310h06a4308_0 - - nbconvert=7.10.0=py310h06a4308_0 - - nbformat=5.9.2=py310h06a4308_0 - - ncurses=6.4=h6a678d5_0 - - nest-asyncio=1.6.0=py310h06a4308_0 - - nettle=3.7.3=hbbd107a_1 - - networkx=3.3=py310h06a4308_0 - - notebook=6.5.7=py310h06a4308_0 - - notebook-shim=0.2.3=py310h06a4308_0 - - numexpr=2.8.7=py310h85018f9_0 - - numpy=1.26.4=py310h5f9d8c6_0 - - numpy-base=1.26.4=py310hb5e798b_0 - - openh264=2.1.1=h4ff587b_0 - - openjpeg=2.4.0=h9ca470c_2 - - openssl=3.3.1=h4bc722e_2 - - orc=2.0.1=h2d29ad5_0 - - overrides=7.4.0=py310h06a4308_0 - - packaging=24.1=py310h06a4308_0 - - pandas=2.2.2=py310h6a678d5_0 - - pandocfilters=1.5.0=pyhd3eb1b0_0 - - pillow=10.4.0=py310h5eee18b_0 - - pip=24.0=py310h06a4308_0 - - platformdirs=3.10.0=py310h06a4308_0 - - prometheus_client=0.14.1=py310h06a4308_0 - - prompt_toolkit=3.0.43=hd3eb1b0_0 - - psutil=5.9.0=py310h5eee18b_0 - - ptyprocess=0.7.0=pyhd3eb1b0_2 - - pure_eval=0.2.2=pyhd3eb1b0_0 - - pyarrow=16.1.0=py310h1128e8f_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pysocks=1.7.1=py310h06a4308_0 - - python=3.10.14=h955ad1f_1 - - python-dateutil=2.9.0post0=py310h06a4308_2 - - python-fastjsonschema=2.16.2=py310h06a4308_0 - - python-json-logger=2.0.7=py310h06a4308_0 - - python-tzdata=2023.3=pyhd3eb1b0_0 - - python-xxhash=2.0.2=py310h5eee18b_1 - - pytorch=2.1.0=py3.10_cuda11.8_cudnn8.7.0_0 - - pytorch-cuda=11.8=h7e8668a_5 - - pytorch-mutex=1.0=cuda - - pytz=2024.1=py310h06a4308_0 - - pyyaml=6.0.1=py310h5eee18b_0 - - pyzmq=24.0.1=py310h5eee18b_0 - - re2=2022.04.01=h295c915_0 - - readline=8.2=h5eee18b_0 - - referencing=0.30.2=py310h06a4308_0 - - regex=2023.10.3=py310h5eee18b_0 - - requests=2.32.3=py310h06a4308_0 - - rfc3339-validator=0.1.4=py310h06a4308_0 - - rfc3986-validator=0.1.1=py310h06a4308_0 - - rpds-py=0.10.6=py310hb02cf49_0 - - s2n=1.3.27=hdbd6064_0 - - safetensors=0.4.2=py310ha89cbab_1 - - send2trash=1.8.2=py310h06a4308_0 - - setuptools=69.5.1=py310h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - snappy=1.1.10=h6a678d5_1 - - sniffio=1.3.0=py310h06a4308_0 - - soupsieve=2.5=py310h06a4308_0 - - sqlite=3.45.3=h5eee18b_0 - - 
stack_data=0.2.0=pyhd3eb1b0_0 - - sympy=1.12=py310h06a4308_0 - - tbb=2021.8.0=hdb19cb5_0 - - terminado=0.17.1=py310h06a4308_0 - - tinycss2=1.2.1=py310h06a4308_0 - - tk=8.6.14=h39e8969_0 - - tokenizers=0.19.1=py310hff361bb_0 - - tomli=2.0.1=pyhd8ed1ab_0 - - torchaudio=2.1.0=py310_cu118 - - torchtriton=2.1.0=py310 - - torchvision=0.16.0=py310_cu118 - - tornado=6.4.1=py310h5eee18b_0 - - tqdm=4.66.4=py310h2f386ee_0 - - traitlets=5.14.3=py310h06a4308_0 - - typing-extensions=4.11.0=py310h06a4308_0 - - typing_extensions=4.11.0=py310h06a4308_0 - - tzdata=2024a=h04d1e81_0 - - urllib3=2.2.2=py310h06a4308_0 - - utf8proc=2.6.1=h5eee18b_1 - - webencodings=0.5.1=py310h06a4308_1 - - websocket-client=1.8.0=py310h06a4308_0 - - wheel=0.43.0=py310h06a4308_0 - - xformers=0.0.22.post7=py310_cu11.8.0_pyt2.1.0 - - xxhash=0.8.0=h7f8727e_3 - - xz=5.4.6=h5eee18b_1 - - yaml=0.2.5=h7b6447c_0 - - yarl=1.9.3=py310h5eee18b_0 - - zeromq=4.3.5=h6a678d5_0 - - zipp=3.17.0=py310h06a4308_0 - - zlib=1.2.13=h5eee18b_1 - - zstd=1.5.5=hc292b87_2 - - pip: - - accelerate==0.33.0 - - asttokens==2.4.1 - - bitsandbytes==0.43.2 - - comm==0.2.2 - - docstring-parser==0.16 - - exceptiongroup==1.2.2 - - executing==2.0.1 - - gguf==0.9.1 - - hf-transfer==0.1.8 - - huggingface-hub==0.24.2 - - iprogress==0.4 - - ipython==8.26.0 - - ipywidgets==8.1.3 - - jupyterlab-widgets==3.0.11 - - markdown-it-py==3.0.0 - - matplotlib-inline==0.1.7 - - mdurl==0.1.2 - - parso==0.8.4 - - peft==0.12.0 - - pexpect==4.9.0 - - prompt-toolkit==3.0.47 - - protobuf==3.20.3 - - pure-eval==0.2.3 - - pygments==2.18.0 - - rich==13.7.1 - - sentencepiece==0.2.0 - - shtab==1.7.1 - - stack-data==0.6.3 - - transformers==4.43.3 - - trl==0.8.6 - - tyro==0.8.5 - - wcwidth==0.2.13 - - widgetsnbextension==4.0.11 - ``` - -#### 3. docker-compose.yml - -```yaml -version: '3.8' - -services: - unsloth-env: - environment: - - NVIDIA_VISIBLE_DEVICES=all - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - volumes: - - ./cache:/root/.cache - - ./workspace:/workspace - working_dir: /workspace - ports: - - "8888:8888" # For Jupyter Lab - tty: true - stdin_open: true - build: - context: . - dockerfile: Dockerfile -``` - - ### Thank You to - [HuyNguyen-hust](https://github.com/HuyNguyen-hust) for making [RoPE Embeddings 28% faster](https://github.com/unslothai/unsloth/pull/238) - [RandomInternetPreson](https://github.com/RandomInternetPreson) for confirming WSL support From b43855fb3635ce06860b27f7c8f9987a16b47ad7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 4 Aug 2024 23:59:57 -0700 Subject: [PATCH 105/147] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9407c452a..35cbbe697 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,10 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. +- Install Unsloth with `pip install unsloth[colab-new]` then `pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes` ## 🦥 Unsloth.ai News +- 📣 NEW! `pip install unsloth` now works! 
Head over to [pypi](https://pypi.org/project/unsloth/) to check it out! This allows non git pull installs. Use `pip install unsloth[colab-new]` for non dependency installs. - 📣 NEW! [Gemma-2-2b](https://colab.research.google.com/drive/1weTpKOjBZxZJ5PQ-Ql8i6ptAY2x-FWVA?usp=sharing) now supported! Gemma-2-9b and Gemma-2-27b are alrady supported! And uploaded [GGUF quants](https://huggingface.co/unsloth/gemma-2-it-GGUF) Try out [Chat interface](https://colab.research.google.com/drive/1i-8ESvtLRGNkkUQQr_-z_rcSAIo9c3lM?usp=sharing) for Gemma-2-2b Instruct! - 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported - 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported From bfe38e6ea8d3d7cf8ce9e37962de03c71c90cbe2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 5 Aug 2024 00:00:53 -0700 Subject: [PATCH 106/147] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 35cbbe697..86c3fbd86 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,6 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. -- Install Unsloth with `pip install unsloth[colab-new]` then `pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes` ## 🦥 Unsloth.ai News - 📣 NEW! `pip install unsloth` now works! Head over to [pypi](https://pypi.org/project/unsloth/) to check it out! This allows non git pull installs. Use `pip install unsloth[colab-new]` for non dependency installs. @@ -94,6 +93,9 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and ![](https://i.ibb.co/sJ7RhGG/image-41.png) ## 💾 Installation Instructions + +If you have Pytorch 2.3 and CUDA 12.1, install Unsloth with `pip install unsloth[colab-new]` then `pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes` + ### Conda Installation Select either `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. See this [Github issue](https://github.com/unslothai/unsloth/issues/73) for help on debugging Conda installs. 
```bash From 8001d30a8f7c179ff7036eaa2a7552ce620176b6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 6 Aug 2024 20:24:44 -0700 Subject: [PATCH 107/147] Fix tokenizers (#887) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py --- pyproject.toml | 4 +- unsloth/models/_utils.py | 83 ++++++++++++++++++++------ unsloth/models/llama.py | 1 + unsloth/tokenizer_utils.py | 115 ++++++++++++++++++++++++++++++++++++- 4 files changed, 180 insertions(+), 23 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fdc098854..2cbe68f4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ huggingface = [ "peft>=0.7.1,!=0.11.0", "protobuf<4.0.0", "huggingface_hub", - "hf-transfer", + "hf_transfer", ] cu118only = [ "xformers==0.0.22.post7", @@ -178,7 +178,7 @@ colab-new = [ "numpy", "protobuf<4.0.0", "huggingface_hub", - "hf-transfer", + "hf_transfer", ] colab-no-deps = [ "accelerate>=0.26.1", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index fe3aa9040..d5be8d97e 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -332,7 +332,6 @@ def prepare_model_for_kbit_training( """ # Freeze all parameters except LoRA - import re with torch.no_grad(): for name, param in model.named_parameters(): if ".lora_A." in name or ".lora_B." in name or ".lora_magnitude_vector" in name: @@ -389,12 +388,14 @@ def patch_tokenizer(model, tokenizer): Fixes https://github.com/unslothai/unsloth/issues/5 """ possible_reserved_tokens = ( + "<|finetune_right_pad_id|>", # Llama-3.1 + "", # Mistral Nemo "<|reserved", # Llama-3 "<|placeholder", # Phi-3 "[control", # Mistral type models - "", # Mistral Nemo - "<|finetune_right_pad_id|>", # Llama-3.1 ) + joiner = "\1\0=+=\0\1" + number_repetitions = 3 - 1 # Number of reserved tokens needed if model is not None: model.config.update({"unsloth_version" : __version__}) @@ -412,28 +413,69 @@ def patch_tokenizer(model, tokenizer): if bad_pad_token: # Find a better pad token added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()] - possible_pad_token = None - n_possible_pad_tokens = 0 - for added_token in added_tokens[::-1]: - if added_token.startswith(possible_reserved_tokens): - if possible_pad_token is None: possible_pad_token = added_token - n_possible_pad_tokens += 1 - # We must see at least 3 of the reserved tokens - if n_possible_pad_tokens >= 3: break + all_added_tokens = joiner.join(added_tokens[::-1]) + all_added_tokens += joiner + + final_pad_token = None + final_good_match = False + + for possible_reserved_token in possible_reserved_tokens: + possible_reserved_token = re.escape(possible_reserved_token) + found = re.finditer(f"{possible_reserved_token}", all_added_tokens) + first_match = None + good_match = False + for j, x in enumerate(found): + if j == 0: first_match = x + if j >= number_repetitions: + good_match = True + break + pass + pass + + if first_match is None: continue + + # If it ends with |> or > etc, then set it as a good pad token! 
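The reserved-token scan above is easier to follow on a toy example: the added tokens are reversed, glued together with an improbable sentinel, a reserved prefix only qualifies once it appears at least three times, and the full token is then recovered and checked for a sensible ending. A sketch with made-up token names (not taken from any real tokenizer):

```python
import re

JOINER = "\1\0=+=\0\1"   # sentinel very unlikely to occur inside a token
NEEDED = 3               # a reserved prefix must appear at least this often

added_tokens = ["<|eot_id|>", "<|reserved_0|>", "<|reserved_1|>", "<|reserved_2|>"]
haystack = JOINER.join(added_tokens[::-1]) + JOINER

matches = list(re.finditer(re.escape("<|reserved"), haystack))
if len(matches) >= NEEDED:
    start = matches[0].span(0)[0]
    end = haystack.find(JOINER, start)
    candidate = haystack[start:end]              # e.g. "<|reserved_2|>"
    is_good = candidate.endswith((">", "|>", "]", ")"))
    print(candidate, is_good)                    # <|reserved_2|> True
```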
+ start = first_match.span(0)[0] + possible_pad_token = first_match.group(0) + end = all_added_tokens.find(joiner, start) + first_match = all_added_tokens[start:end] + + if first_match is not None: + good_match = possible_pad_token.endswith((">", "|>", "]", ")")) + pass + possible_pad_token = first_match + + # Replace current pad token if another exact match is found + if not final_good_match and good_match: + final_good_match = True + final_pad_token = possible_pad_token + break + else: + final_good_match = False + final_pad_token = possible_pad_token pass pass - if n_possible_pad_tokens < 3: possible_pad_token = None + possible_pad_token = final_pad_token - if possible_pad_token is None: - # Try unk_token + # Try unk_token + if possible_pad_token is None and hasattr(tokenizer, "unk_token"): possible_pad_token = tokenizer.unk_token pass + # Check pad token's id must be less than vocab size + if possible_pad_token is not None: + check_pad_token = tokenizer(possible_pad_token, add_special_tokens = False).input_ids + if len(check_pad_token) != 1: + possible_pad_token = None + if check_pad_token[0] >= config.vocab_size: + possible_pad_token = None + pass + if possible_pad_token is None: # Failure to find a good replacement!! We shall manually add one! new_pad_token = "<|PAD_TOKEN|>" while new_pad_token in tokenizer.get_vocab(): - new_pad_token += "#" + new_pad_token = f"<{new_pad_token}>" pass possible_pad_token = new_pad_token pass @@ -447,11 +489,16 @@ def patch_tokenizer(model, tokenizer): tokenizer.add_special_tokens({"pad_token" : possible_pad_token}) tokenizer.pad_token = possible_pad_token if model is not None: - config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) + model.config.update({"pad_token_id" : tokenizer.pad_token_id}) + model.generation_config.update(pad_token_id = tokenizer.pad_token_id) else: if model is not None: if model.config.pad_token_id is None: - config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) + model.config.update({"pad_token_id" : tokenizer.pad_token_id}) + model.generation_config.update(pad_token_id = tokenizer.pad_token_id) + pass + pass + model.generation_config.update(max_length = model.config.max_position_embeddings) return model, tokenizer pass @@ -462,7 +509,6 @@ def patch_tokenizer(model, tokenizer): from peft import __version__ as peft_version if Version(peft_version) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer - import inspect, re try: source = inspect.getsource(LoraLayer.update_layer) text = "if weight is not None:\n" @@ -688,7 +734,6 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, from transformers.utils.quantization_config import BitsAndBytesConfig, QuantizationMethod from inspect import getsource from accelerate.utils.dataclasses import DistributedType -import re BitsAndBytesConfig__init__ = getsource(BitsAndBytesConfig.__init__) BitsAndBytesConfig__init__ = re.sub( r"if[\s]{1,}kwargs\:[\s]{1,}.+?\n", diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index cec743e59..e300e07e0 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1397,6 +1397,7 @@ def from_pretrained( padding_side = "right", token = token, trust_remote_code = trust_remote_code, + fix_tokenizer = fix_tokenizer, ) model, tokenizer = patch_tokenizer(model, tokenizer) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 8474c2c6b..c67f82c2c 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -454,13 +454,14 @@ 
class SentencePieceTokenTypes(IntEnum): pass -def load_correct_tokenizer( +def _load_correct_tokenizer( tokenizer_name, model_max_length = None, padding_side = "right", token = None, trust_remote_code = False, cache_dir = "huggingface_tokenizers_cache", + fix_tokenizer = True, ): if IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT: cache_dir = cache_dir @@ -501,7 +502,10 @@ def load_correct_tokenizer( cache_dir = cache_dir, ) - if tokenizer_name in IGNORED_TOKENIZER_NAMES: + if not fix_tokenizer or tokenizer_name in IGNORED_TOKENIZER_NAMES: + return fast_tokenizer + # Ignore Mistral ones - they're a bit weird to handle! + elif "mistral" in tokenizer_name.lower(): return fast_tokenizer elif slow_tokenizer is not None: if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"): @@ -522,6 +526,113 @@ def load_correct_tokenizer( pass +def load_correct_tokenizer( + tokenizer_name, + model_max_length = None, + padding_side = "right", + token = None, + trust_remote_code = False, + cache_dir = "huggingface_tokenizers_cache", + fix_tokenizer = True, +): + tokenizer = _load_correct_tokenizer( + tokenizer_name = tokenizer_name, + model_max_length = model_max_length, + padding_side = padding_side, + token = token, + trust_remote_code = trust_remote_code, + cache_dir = cache_dir, + fix_tokenizer = fix_tokenizer, + ) + + ### 1. Fixup tokenizer's chat_template + old_chat_template = getattr(tokenizer, "chat_template", None) + + # Ignore mistral type models since they don't have a add_generation_prompt + if "mistral" in str(getattr(tokenizer, "name_or_path", "")).lower(): + chat_template = old_chat_template + + # Also check Llama-2 old style models + elif old_chat_template is not None and \ + "[/INST]" in old_chat_template and "[INST]" in old_chat_template and \ + "bos_token" in old_chat_template and "eos_token" in old_chat_template: + + chat_template = old_chat_template + + else: + chat_template = fix_chat_template(tokenizer) + if old_chat_template is not None and chat_template is None: + raise RuntimeError( + "Unsloth: Fixing chat template failed - please file a report immediately!" + ) + pass + pass + + tokenizer.chat_template = chat_template + return tokenizer +pass + + +def _fix_chat_template(chat_template): + endfor = "{% endfor %}" + where = chat_template.find(endfor) + if where == -1: return chat_template + + after_endfor = chat_template[where + len(endfor):] + + if "{% if" not in after_endfor and "{% set " not in after_endfor and \ + after_endfor.startswith("{{") and after_endfor.endswith("}}") and \ + after_endfor.count("{{") == 1 and after_endfor.count("}}") == 1: + + after_endfor = "{% if add_generation_prompt %}" + after_endfor + "{% endif %}" + + chat_template = chat_template[:where + len(endfor)] + after_endfor + pass + return chat_template +pass + + +def fix_chat_template(tokenizer): + chat_template = getattr(tokenizer, "chat_template", None) + if chat_template is None: return None + + ### 1. Check if add_generation_prompt works + messages = [ + {"role": "user", "content": "Who are you?"}, + ] + no = tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False) + yes = tokenizer.apply_chat_template(messages, add_generation_prompt = True, tokenize = False) + + if no == yes: + # SAME?! That's not good! 
We check for add_generation_prompt + if "{% if add_generation_prompt %}" not in chat_template: + # Try fixing it by adding it + new_chat_template = _fix_chat_template(chat_template) + if "{% if add_generation_prompt %}" not in new_chat_template: + raise RuntimeError( + f"Unsloth: The tokenizer `{tokenizer.name_or_path}`\n"\ + "does not have a {% if add_generation_prompt %} for generation purposes.\n"\ + "Please file a bug report immediately - thanks!" + ) + else: + logger.warning_once( + "Unsloth: We successfully patched the tokenizer to add a {% if add_generation_prompt %} to the chat_template.\n"\ + "This is not a bug, but please notify the Unsloth maintainers - thanks!" + ) + chat_template = new_chat_template + pass + else: + raise RuntimeError( + f"Unsloth: The tokenizer `{tokenizer.name_or_path}`\n"\ + "has a {% if add_generation_prompt %} for generation purposes, but wasn't provided correctly.\n"\ + "Please file a bug report immediately - thanks!" + ) + pass + pass + return chat_template +pass + + def check_tokenizer( model, tokenizer, From 637ed8c6bd252f981e89e30e1085efc03a06a880 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 7 Aug 2024 01:11:06 -0700 Subject: [PATCH 108/147] Update _utils.py --- unsloth/models/_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d5be8d97e..db27eb8a8 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -490,15 +490,21 @@ def patch_tokenizer(model, tokenizer): tokenizer.pad_token = possible_pad_token if model is not None: model.config.update({"pad_token_id" : tokenizer.pad_token_id}) - model.generation_config.update(pad_token_id = tokenizer.pad_token_id) + if getattr(model, "generation_config") is not None: + model.generation_config.update(pad_token_id = tokenizer.pad_token_id) else: if model is not None: if model.config.pad_token_id is None: model.config.update({"pad_token_id" : tokenizer.pad_token_id}) - model.generation_config.update(pad_token_id = tokenizer.pad_token_id) + if getattr(model, "generation_config") is not None: + model.generation_config.update(pad_token_id = tokenizer.pad_token_id) pass pass - model.generation_config.update(max_length = model.config.max_position_embeddings) + + if model is not None: + if getattr(model, "generation_config") is not None: + model.generation_config.update(max_length = model.config.max_position_embeddings) + return model, tokenizer pass From cad1146ff7c60f4afc10b9ab243304befdad7a0f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 7 Aug 2024 10:47:11 -0700 Subject: [PATCH 109/147] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index db27eb8a8..3686717b2 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -467,7 +467,7 @@ def patch_tokenizer(model, tokenizer): check_pad_token = tokenizer(possible_pad_token, add_special_tokens = False).input_ids if len(check_pad_token) != 1: possible_pad_token = None - if check_pad_token[0] >= config.vocab_size: + if check_pad_token[0] >= model.config.vocab_size: possible_pad_token = None pass From e4c8ceacb3fca634f78e662873a01c37678fcb3e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 7 Aug 2024 10:48:39 -0700 Subject: [PATCH 110/147] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 
3686717b2..195fd5bb6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -467,7 +467,7 @@ def patch_tokenizer(model, tokenizer): check_pad_token = tokenizer(possible_pad_token, add_special_tokens = False).input_ids if len(check_pad_token) != 1: possible_pad_token = None - if check_pad_token[0] >= model.config.vocab_size: + if model is not None and check_pad_token[0] >= model.config.vocab_size: possible_pad_token = None pass From 3bc804a9f9d603287f0a42a7169ed8cd40420f6b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 10 Aug 2024 19:59:40 -0700 Subject: [PATCH 111/147] Torch 2.4, Xformers>0.0.27, TRL>0.9, Python 3.12 + bug fixes (#902) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py --- pyproject.toml | 133 ++++++++++++++++++++++++++++----------- unsloth/models/_utils.py | 60 +++++++++++++++--- unsloth/models/gemma2.py | 2 +- unsloth/models/llama.py | 31 ++++++--- unsloth/models/loader.py | 6 +- unsloth/save.py | 82 +++++++++++++++++++----- 6 files changed, 240 insertions(+), 74 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2cbe68f4a..b61908a69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,111 +43,154 @@ huggingface = [ "wheel>=0.42.0", "numpy", "accelerate>=0.26.1", - "trl>=0.7.9,<0.9.0", + "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3", "peft>=0.7.1,!=0.11.0", "protobuf<4.0.0", "huggingface_hub", "hf_transfer", ] cu118only = [ - "xformers==0.0.22.post7", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu121only = [ - "xformers==0.0.22.post7", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu118onlytorch211 = [ - "xformers==0.0.23", + "xformers @ 
https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu121onlytorch211 = [ - "xformers==0.0.23", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu118onlytorch212 = [ - "xformers==0.0.23.post1", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu121onlytorch212 = [ - "xformers==0.0.23.post1", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu118onlytorch220 = [ - "xformers==0.0.24", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu121onlytorch220 = [ - "xformers==0.0.24", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] cu118onlytorch230 = [ - "xformers==0.0.26.post1", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'", ] cu121onlytorch230 = [ - "xformers==0.0.26.post1", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ 
https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'", +] +cu118onlytorch240 = [ + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'", +] +cu121onlytorch240 = [ + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'", ] - cu118 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118only]", ] cu121 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121only]", ] cu118-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch211]", ] cu121-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch211]", ] cu118-torch212 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch212]", ] cu121-torch212 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch212]", ] cu118-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch220]", ] cu121-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch220]", ] cu118-torch230 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch230]", ] cu121-torch230 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch230]", ] +cu118-torch240 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.43.3", + "unsloth[cu118onlytorch240]", +] +cu121-torch240 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.43.3", + "unsloth[cu121onlytorch240]", +] kaggle = [ "unsloth[huggingface]", ] kaggle-new = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", ] conda = [ "unsloth[huggingface]", ] colab-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch211]", ] colab-ampere-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch211]", "packaging", "ninja", @@ -155,12 +198,12 @@ colab-ampere-torch211 = [ ] colab-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch220]", ] 
colab-ampere-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch220]", "packaging", "ninja", @@ -182,10 +225,10 @@ colab-new = [ ] colab-no-deps = [ "accelerate>=0.26.1", - "trl>=0.7.9", + "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3", "peft>=0.7.1", "xformers<0.0.27", - "bitsandbytes", + "bitsandbytes>=0.43.3", "protobuf<4.0.0", ] colab = [ @@ -199,7 +242,7 @@ colab-ampere = [ ] cu118-ampere = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118only]", "packaging", "ninja", @@ -207,7 +250,7 @@ cu118-ampere = [ ] cu121-ampere = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121only]", "packaging", "ninja", @@ -215,7 +258,7 @@ cu121-ampere = [ ] cu118-ampere-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch211]", "packaging", "ninja", @@ -223,7 +266,7 @@ cu118-ampere-torch211 = [ ] cu121-ampere-torch211 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch211]", "packaging", "ninja", @@ -231,7 +274,7 @@ cu121-ampere-torch211 = [ ] cu118-ampere-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch220]", "packaging", "ninja", @@ -239,7 +282,7 @@ cu118-ampere-torch220 = [ ] cu121-ampere-torch220 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch220]", "packaging", "ninja", @@ -247,7 +290,7 @@ cu121-ampere-torch220 = [ ] cu118-ampere-torch230 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu118onlytorch230]", "packaging", "ninja", @@ -255,12 +298,28 @@ cu118-ampere-torch230 = [ ] cu121-ampere-torch230 = [ "unsloth[huggingface]", - "bitsandbytes", + "bitsandbytes>=0.43.3", "unsloth[cu121onlytorch230]", "packaging", "ninja", "flash-attn>=2.6.3", ] +cu118-ampere-torch240 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.43.3", + "unsloth[cu118onlytorch240]", + "packaging", + "ninja", + "flash-attn>=2.6.3", +] +cu121-ampere-torch240 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.43.3", + "unsloth[cu121onlytorch240]", + "packaging", + "ninja", + "flash-attn>=2.6.3", +] [project.urls] homepage = "http://www.unsloth.ai" diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 195fd5bb6..0c0057496 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -53,7 +53,9 @@ # Disable some warnings which can get annoying warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "huggingface_hub") +warnings.filterwarnings(action = "ignore", category = UserWarning, module = "trl") warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "huggingface_hub") +warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "xformers") warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "subprocess") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "transformers") warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "accelerate") @@ -133,6 +135,28 @@ def patch_mistral_nemo_config(config): pass # ============================================= +# ============================================= +# Fix KeyError: 'Cache only has 0 layers, attempted to access layer with index 0' +import transformers.cache_utils +if 
hasattr(transformers.cache_utils, "DynamicCache") and \ + transformers.cache_utils.DynamicCache.__getitem__.__name__ != "__cache_utils_getitem__": + + source = inspect.getsource(transformers.cache_utils.DynamicCache.__getitem__) + start = source.find("def") + spaces = start*" " + source = source.split("\n") + source = "\n".join(x[start:] for x in source) + where = source.find("raise KeyError") + source = source[:where] + \ + f"if len(self) == 0:\n{spaces}{spaces}"\ + " raise RuntimeError('Unsloth: You must call `FastLanguageModel.for_inference(model)` before doing inference for Unsloth models.')\n" + \ + f"{spaces}{spaces}else:\n{spaces}{spaces}{spaces}" + source[where:] + source = source.replace("__getitem__", "__cache_utils_getitem__", 1) + exec(source) + transformers.cache_utils.DynamicCache.__getitem__ = __cache_utils_getitem__ +pass +# ============================================= + # ============================================= # Get Flash Attention v2 if Ampere (RTX 30xx, A100) import bitsandbytes as bnb @@ -192,7 +216,7 @@ def patch_mistral_nemo_config(config): # Get Xformers from xformers import __version__ as xformers_version # Temporarily disable 0.0.27 and higher - inference issues -if Version(xformers_version) >= Version("0.0.27"): +if False: #Version(xformers_version) >= Version("0.0.27"): raise ImportError( "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ "then press Disconnect Runtime and then Restart it.\n"\ @@ -200,10 +224,10 @@ def patch_mistral_nemo_config(config): "%%capture\n" "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ + '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\ '\n'\ f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ - 'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"' + 'Please downgrade xformers via `pip install --force-reinstall "xformers<=0.0.27"' ) pass @@ -217,10 +241,10 @@ def patch_mistral_nemo_config(config): f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ f"Please install xformers < 0.0.26 for torch = {torch_version}." ) -elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) >= Version("0.0.27"): +elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) > Version("0.0.27"): raise ImportError( f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers < 0.0.27 for torch = {torch_version}." + f"Please install xformers <= 0.0.27 for torch = {torch_version}." ) pass @@ -241,7 +265,8 @@ def patch_mistral_nemo_config(config): # Check TRL version from trl import __version__ as trl_version -if Version(trl_version) >= Version("0.9.0"): +# Unsloth now supports all TRL versions! 
+if False:#Version(trl_version) >= Version("0.9.0"): raise ImportError( "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ "then press Disconnect Runtime and then Restart it.\n"\ @@ -249,13 +274,32 @@ def patch_mistral_nemo_config(config): "%%capture\n" "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes\n'\ + '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\ '\n'\ f"Otherwise in local machines, your TRL version of {trl_version} is too new.\n"\ - 'Please downgrade TRL via `pip install --force-reinstall "trl<0.9.0"' + 'Please downgrade TRL via `pip install --force-reinstall trl' ) pass +# ============================================= +# Fix new Xformers versions TypeError: Multiple dispatch failed for 'torch._ops.aten.to.dtype_layout' +if Version(xformers_version) >= Version("0.0.27"): + import accelerate.utils.operations + if hasattr(accelerate.utils.operations, "send_to_device") and \ + accelerate.utils.operations.send_to_device.__name__ != "_fixed_send_to_device": + from accelerate.utils.operations import * + send_to_device = inspect.getsource(accelerate.utils.operations.send_to_device) + send_to_device = re.sub( + r"([ ]{4,})return tensor\.to\(device\)", + r"\1try: return tensor.to(device)\n\1except: return tensor", + send_to_device, + ).replace("def send_to_device", "def _fixed_send_to_device") + exec(send_to_device) + accelerate.utils.operations.send_to_device = _fixed_send_to_device + pass +pass +# ============================================= + # ============================================= # Torch compile settings diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 1cbaf5b16..ea9f53e7d 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -156,6 +156,7 @@ def Gemma2Attention_fast_forward( ) A = A.reshape(bsz, q_len, n_heads*head_dim) else: + mask = causal_mask if attention_mask is None else attention_mask A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len) pass A = self.apply_o(self, A) @@ -413,7 +414,6 @@ def Gemma2Model_fast_forward_inference( SWA = attention_mask GA = attention_mask pass - next_decoder_cache = [] for idx, decoder_layer in enumerate(self.model.layers): diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index e300e07e0..2a07da6ce 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -419,7 +419,7 @@ def LlamaAttention_fast_forward( def LlamaDecoderLayer_fast_forward( self, hidden_states: torch.Tensor, - causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None, + causal_mask = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, @@ -505,7 +505,7 @@ def LlamaModel_fast_forward( return_dict: Optional[bool] = None, *args, **kwargs, ) -> Union[Tuple, BaseModelOutputWithPast]: - + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions assert(output_attentions is False) output_hidden_states = ( @@ -682,12 +682,27 @@ def LlamaModel_fast_forward( # Gemma2 has alternating SWA and global attn - if IS_GEMMA2 and not hasattr(self, "SWA_mask"): - if HAS_FLASH_ATTENTION_SOFTCAPPING: + if IS_GEMMA2: + if HAS_FLASH_ATTENTION_SOFTCAPPING and 
attention_mask is None: self.SWA_mask = True self.GA_mask = False - else: - n = self.config.max_position_embeddings + elif attention_mask is not None: + self.SWA_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window = self.config.sliding_window, + ) + self.GA_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window = None, + ) + elif not hasattr(self, "SWA_mask"): + n = self.max_seq_length # self.config.max_position_embeddings # masked_fill is making stuff slower! # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0) # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window) @@ -870,7 +885,7 @@ def _CausalLM_fast_forward( ) else: causal_mask = xformers.attn_bias.LowerTriangularMask() - + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -879,7 +894,6 @@ def _CausalLM_fast_forward( # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) self.model._has_no_labels = labels is None - outputs = self.model( input_ids=input_ids, causal_mask=causal_mask, @@ -893,7 +907,6 @@ def _CausalLM_fast_forward( return_dict=return_dict, ) pass - hidden_states = outputs[0] bsz, q_len, hd = hidden_states.shape lm_head = self.lm_head.weight diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 47152d676..cce22aebf 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -93,7 +93,7 @@ def _get_new_mapper(): pass -def _get_model_name(model_name, load_in_4bit = True): +def get_model_name(model_name, load_in_4bit = True): new_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, @@ -145,7 +145,7 @@ def from_pretrained( token = os.environ["HUGGINGFACE_TOKEN"] old_model_name = model_name - model_name = _get_model_name(model_name, load_in_4bit) + model_name = get_model_name(model_name, load_in_4bit) # First check if it's a normal model via AutoConfig from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled @@ -192,7 +192,7 @@ def from_pretrained( # Get base model for PEFT: if is_peft: # Check base model again for PEFT - model_name = _get_model_name(peft_config.base_model_name_or_path, load_in_4bit) + model_name = get_model_name(peft_config.base_model_name_or_path, load_in_4bit) model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) pass diff --git a/unsloth/save.py b/unsloth/save.py index a5904efc1..f45d8062a 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -28,12 +28,14 @@ import re from transformers.models.llama.modeling_llama import logger from .tokenizer_utils import fix_sentencepiece_gguf +from huggingface_hub import HfApi __all__ = [ "print_quantization_methods", "unsloth_save_model", "save_to_gguf", "patch_saving_functions", + "create_huggingface_repo", ] # Check environments @@ -207,8 +209,9 @@ def unsloth_save_model( ): if token is None and "HF_TOKEN" in os.environ: token = os.environ["HF_TOKEN"] - - if token is None and "HUGGINGFACE_TOKEN" in os.environ: + elif token is None and "hf_token" in os.environ: + token = os.environ["hf_token"] + elif token is None and "HUGGINGFACE_TOKEN" in os.environ: token = 
os.environ["HUGGINGFACE_TOKEN"] if commit_message is None: commit_message = "" @@ -555,7 +558,8 @@ def unsloth_save_model( logger.warning_once(f"We will save to Disk and not RAM now.") filename = os.path.join(temporary_location, f"{name}.pt") torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,) - state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True) + # weights_only = True weirdly fails? + state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True, weights_only = False) pass for item in LLAMA_LAYERNORMS: try: @@ -675,7 +679,6 @@ def unsloth_save_model( # Now manually go through each file and upload them manually! filenames = os.listdir(new_save_directory) - from huggingface_hub import HfApi hf_api = HfApi(token = save_pretrained_settings["token"]) print("Unsloth: Uploading all files... Please wait...") @@ -1312,6 +1315,49 @@ def _determine_username(save_directory, old_username, token): pass +def create_huggingface_repo( + model, + save_directory, + token = None, + private = False, +): + if token is None and "HF_TOKEN" in os.environ: + token = os.environ["HF_TOKEN"] + elif token is None and "hf_token" in os.environ: + token = os.environ["hf_token"] + elif token is None and "HUGGINGFACE_TOKEN" in os.environ: + token = os.environ["HUGGINGFACE_TOKEN"] + pass + save_directory, username = _determine_username(save_directory, "", token) + + from huggingface_hub import create_repo + try: + create_repo( + repo_id = save_directory, + token = token, + repo_type = "model", + exist_ok = False, + private = private, + ) + + # Create model card + from huggingface_hub import ModelCard + content = MODEL_CARD.format( + username = username, + base_model = model.config._name_or_path, + model_type = model.config.model_type, + method = "", + extra = "unsloth", + ) + card = ModelCard(content) + card.push_to_hub(save_directory, token = token) + except: + pass + hf_api = HfApi(token = token) + return save_directory, hf_api +pass + + def upload_to_huggingface( model, save_directory, @@ -1321,6 +1367,7 @@ def upload_to_huggingface( file_location = None, old_username = None, private = None, + create_config = True, ): save_directory, username = _determine_username(save_directory, old_username, token) @@ -1350,7 +1397,6 @@ def upload_to_huggingface( if file_location is not None: # Now upload file - from huggingface_hub import HfApi hf_api = HfApi(token = token) if "/" in file_location: @@ -1372,6 +1418,8 @@ def upload_to_huggingface( repo_type = "model", commit_message = "(Trained with Unsloth)", ) + pass + pass hf_api.upload_file( path_or_fileobj = file_location, @@ -1382,18 +1430,20 @@ def upload_to_huggingface( ) # We also upload a config.json file - import json - with open("_temporary_unsloth_config.json", "w") as file: - json.dump({"model_type" : model.config.model_type}, file, indent = 4) + if create_config: + import json + with open("_temporary_unsloth_config.json", "w") as file: + json.dump({"model_type" : model.config.model_type}, file, indent = 4) + pass + hf_api.upload_file( + path_or_fileobj = "_temporary_unsloth_config.json", + path_in_repo = "config.json", + repo_id = save_directory, + repo_type = "model", + commit_message = "(Trained with Unsloth)", + ) + os.remove("_temporary_unsloth_config.json") pass - hf_api.upload_file( - path_or_fileobj = "_temporary_unsloth_config.json", - path_in_repo = "config.json", - repo_id = save_directory, - repo_type = "model", - commit_message = "(Trained with Unsloth)", - ) - 
os.remove("_temporary_unsloth_config.json") pass return username pass From 3781a03903c6a24c929737f49a1f73b25a517ac6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 11 Aug 2024 18:26:20 -0700 Subject: [PATCH 112/147] Fix DPO stats (#906) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py --- unsloth/kernels/cross_entropy_loss.py | 1 + unsloth/models/dpo.py | 16 +++++++++++++--- unsloth/models/llama.py | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/unsloth/kernels/cross_entropy_loss.py b/unsloth/kernels/cross_entropy_loss.py index 6074a5153..b8473e60c 100644 --- a/unsloth/kernels/cross_entropy_loss.py +++ b/unsloth/kernels/cross_entropy_loss.py @@ -303,6 +303,7 @@ def backward(ctx, dlosses): pass +@torch._disable_dynamo def fast_cross_entropy_loss(logits, labels, logit_softcapping = 0): """ Arguments: diff --git a/unsloth/models/dpo.py b/unsloth/models/dpo.py index b7c7305bb..e7074350c 100644 --- a/unsloth/models/dpo.py +++ b/unsloth/models/dpo.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+__all__ = [ + "PatchDPOTrainer", +] + try: from transformers.utils.notebook import ( IntervalStrategy, @@ -22,6 +26,12 @@ except: HAS_NOTEBOOK = False pass +import torch +from ._utils import torch_compile_options +import inspect +import torch.nn as nn +from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union + DPOTrainer_metrics = [ "rewards/chosen", @@ -37,11 +47,11 @@ def NotebookProgressCallback_on_train_begin(self, args, state, control, **kwargs): - self.first_column = "Epoch" if args.evaluation_strategy == IntervalStrategy.EPOCH else "Step" + self.first_column = "Epoch" if args.eval_strategy == IntervalStrategy.EPOCH else "Step" self.training_loss = 0 self.last_log = 0 column_names = [self.first_column] + ["Training Loss"] - if args.evaluation_strategy != IntervalStrategy.NO: + if args.eval_strategy != IntervalStrategy.NO: column_names.append("Validation Loss") column_names += [x.replace("/", " / ") for x in DPOTrainer_metrics] self.training_tracker = NotebookTrainingTracker(state.max_steps, column_names) @@ -50,7 +60,7 @@ def NotebookProgressCallback_on_train_begin(self, args, state, control, **kwargs def NotebookProgressCallback_on_log(self, args, state, control, logs=None, **kwargs): # Only for when there is no evaluation - if args.evaluation_strategy == IntervalStrategy.NO and "loss" in logs: + if args.eval_strategy == IntervalStrategy.NO and "loss" in logs: values = {"Training Loss": logs["loss"]} for metric in DPOTrainer_metrics: values[metric.replace("/", " / ")] = logs[metric] diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2a07da6ce..6f1bb62c1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -961,6 +961,7 @@ def _CausalLM_fast_forward( pass +@torch._disable_dynamo def PeftModelForCausalLM_fast_forward( self, input_ids=None, From a64b8f648ad067f9745253161e73a0367bf0ca5a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 13 Aug 2024 17:54:02 -0700 Subject: [PATCH 113/147] Fix Chat Templates (#916) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update 
dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Providing more flexibility for users to customize their llama when using LoRA (#910) * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update chat_templates.py * return model * Update tokenizer_utils.py * Update chat_templates.py * Update tokenizer_utils.py --------- Co-authored-by: Po-Lung Wang --- unsloth/chat_templates.py | 222 +++++++++++++++++++++++++++++++++++-- unsloth/models/llama.py | 17 ++- unsloth/tokenizer_utils.py | 28 ++++- 3 files changed, 256 insertions(+), 11 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 5bd66bae0..07e79b180 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -508,6 +508,200 @@ CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False, phi3_ollama,) pass +# =========================================== Llama-3.1 +""" +No trimming in Llama 3.1 Instruct! +Also an extra newline for Cutting Knowledge Date +See https://colab.research.google.com/drive/1Xpqq5xpIgO-B00MQ-UccYMwN2J8QFgBM?usp=sharing + +Also should be + +import datetime +tokenizer.apply_chat_template( + messages, + add_generation_prompt = True, + tokenize = False, + date_string = datetime.today().strftime("%d %B %Y")), +) +""" + +llama31_template = \ +"""{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 July 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content'] %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content'] %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} +""" +pass + +# Ollama from https://ollama.com/library/llama3.1 (needs updating!) +llama31_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .Messages }} +{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|> +{{- if .System }} + +{{ .System }} +{{- end }} +{{- if .Tools }} + +You are a helpful assistant with tool calling capabilities. When you receive a tool call response, use the output to format an answer to the orginal use question. 
+{{- end }} +{{- end }}<|eot_id|> +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 }} +{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|> +{{- if and $.Tools $last }} + +Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. + +Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables. + +{{ $.Tools }} +{{- end }} + +{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|> + +{{ end }} +{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|> +{{- if .ToolCalls }} + +{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }} +{{- else }} + +{{ .Content }}{{ if not $last }}<|eot_id|>{{ end }} +{{- end }} +{{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|> + +{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|> + +{{ end }} +{{- end }} +{{- end }} +{{- else }} +{{- if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> + +{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> + +{{ end }}{{ .Response }}{{ if .Response }}<|eot_id|>{{ end }}""" +PARAMETER stop "<|start_header_id|>" +PARAMETER stop "<|end_header_id|>" +PARAMETER stop "<|eot_id|>" +PARAMETER stop "<|eom_id|>" +''' + +llama31_template_eos_token = "eos_token" +CHAT_TEMPLATES["llama-3.1"] = (llama31_template, llama31_template_eos_token, False, llama31_ollama,) +CHAT_TEMPLATES["llama-31"] = (llama31_template, llama31_template_eos_token, False, llama31_ollama,) +pass + def get_chat_template( tokenizer, @@ -680,21 +874,33 @@ def get_chat_template( ) pass - # For ShareGPT role -> from and content -> value - chat_template = chat_template\ - .replace("'role'", "'" + mapping["role"] + "'")\ - .replace("'content'", "'" + mapping["content"] + "'")\ - .replace("'user'", "'" + mapping["user"] + "'")\ - .replace("'assistant'", "'" + mapping["assistant"] + "'") - # Careful on Gemma # bos_token is a must or else losses become too high if IS_GEMMA and not chat_template.startswith("{{ bos_token }}"): chat_template = "{{ bos_token }}" + chat_template pass + # For ShareGPT role -> from and content -> value + new_chat_template = chat_template\ + .replace("'role'", "'" + mapping["role"] + "'")\ + .replace("'content'", "'" + mapping["content"] + "'")\ + .replace("'user'", "'" + mapping["user"] + "'")\ + .replace("'assistant'", "'" + mapping["assistant"] + "'") + _, tokenizer = patch_tokenizer(model = None, tokenizer = tokenizer) - tokenizer.padding_side = old_padding_side + tokenizer.padding_side = old_padding_side + + # If not normal HF, we add a check to make old templates work + if mapping != {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}: + chat_template = \ + "{% if 'role' in messages[0] %}" + \ + chat_template + \ + "{% else %}" + \ + new_chat_template + \ + "{% endif %}" + else: + chat_template = new_chat_template + pass tokenizer.chat_template = chat_template # Also fix up other tokens diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6f1bb62c1..6a111c934 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1873,8 +1873,17 @@ def get_peft_model( else: modules_to_save.append("embed_tokens") else: - assert(module 
in accepted_modules) - final_modules.append(module) + try: + assert(module in accepted_modules) + final_modules.append(module) + except AssertionError as e: + final_modules.append(module) + print( + "Unsloth: You added custom modules, but Unsloth hasn't optimized for this.\n"\ + "Beware - your finetuning might be noticeably slower!" + ) + pass + pass pass # Check if we added new tokens! @@ -2253,6 +2262,8 @@ def for_inference(model): if hasattr(internal_model, "_saved_temp_tokenizer"): internal_model._saved_temp_tokenizer.padding_side = "left" pass + + return model pass @@ -2291,6 +2302,8 @@ def for_training(model, use_gradient_checkpointing = True): if hasattr(internal_model, "_saved_temp_tokenizer"): internal_model._saved_temp_tokenizer.padding_side = "right" pass + + return model pass pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index c67f82c2c..9c0bc1c51 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -597,8 +597,34 @@ def fix_chat_template(tokenizer): if chat_template is None: return None ### 1. Check if add_generation_prompt works + # Check for ShareGPT style first + is_sharegpt = None + try: + messages = [ + {"role": "user", "content": "Who are you?"}, + ] + tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False) + is_sharegpt = False + except: + try: + messages = [ + {"from": "human", "value": "Who are you?"}, + ] + tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False) + is_sharegpt = True + except: + is_sharegpt = None + pass + pass + + # Not ShareGPT or HF style - just return + if is_sharegpt is None: return chat_template + + # Tokenize messages = [ - {"role": "user", "content": "Who are you?"}, + {"role": "user", "content": "Who are you?"} \ + if not is_sharegpt else \ + {"from": "human", "value": "Who are you?"} ] no = tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False) yes = tokenizer.apply_chat_template(messages, add_generation_prompt = True, tokenize = False) From a4ab920de9282602d587a40df828674bfa9d650e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 14 Aug 2024 00:58:02 -0700 Subject: [PATCH 114/147] Fix chat templates (#917) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update 
dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Providing more flexibility for users to customize their llama when using LoRA (#910) * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update chat_templates.py * return model * Update tokenizer_utils.py * Update chat_templates.py * Update tokenizer_utils.py * Train on completions --------- Co-authored-by: Po-Lung Wang --- unsloth/chat_templates.py | 165 +++++++++++++++++++++++++++++++------- 1 file changed, 138 insertions(+), 27 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 07e79b180..7070524e0 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -1458,9 +1458,10 @@ def construct_chat_template( \ ollama_eos = '\n'.join(f'PARAMETER stop "{eos}"' for eos in ollama_eos) # Ollama modelfile + part = '"""' modelfile = 'FROM {__FILE_LOCATION__}\n\n'\ - 'TEMPLATE """' + system_modelfile + input_modelfile + output_modelfile + \ - '"""\n\n' + ollama_eos + 'TEMPLATE ' + part + system_modelfile + input_modelfile + output_modelfile + \ + part + '\n\n' + ollama_eos # HF Jinja Chat template def process(part, which, content = "message['content']"): @@ -1659,6 +1660,70 @@ def formatting_prompts_func(examples): pass +# From https://www.geeksforgeeks.org/longest-common-substring-array-strings/ +# Longest Common Substring in an Array of Strings +def _longest_common_substring(arr): + n = len(arr) + s = arr[0] + l = len(s) + res = "" + for i in range(l): + for j in range(i + 1, l + 1): + stem = s[i:j] + k = 1 + for k in range(1, n): + if stem not in arr[k]: + break + if (k + 1 == n and len(res) < len(stem)): + res = stem + return res +pass + + +def _find_common_token_ids(component, tokenizer): + """ + \n### User:\n\n + \n\n### User:\n\n + etc + we need to find the middle most repeatted part. + Tokenizers can tokenize newlines or spaces as 1 token! 
+ """ + right_text = "" + if component.endswith (" "): right_text = " " + elif component.endswith("\n"): right_text = "\n" + left_text = "" + if component.startswith (" "): left_text = " " + elif component.startswith("\n"): left_text = "\n" + stripped = component.strip() + + # Add current pieces and also newlines + all_input_ids = [] + for left in range(3): + for right in range(3): + x = left*left_text + stripped + right*right_text + x = tokenizer(x, add_special_tokens = False).input_ids + all_input_ids.append(x) + + x = left*"\n" + stripped + right*"\n" + x = tokenizer(x, add_special_tokens = False).input_ids + all_input_ids.append(x) + pass + pass + substring = _longest_common_substring([str(x + [0]) for x in all_input_ids]) + substring = substring.split(", ")[:-1] + substring = [int(x) for x in substring] + + # Also get rest of tokenized string + original = tokenizer(component, add_special_tokens = False).input_ids + # Get optional left and right + for j in range(len(original)): + if original[j : j + len(substring)] == substring: break + optional_left = original[:j] + optional_right = original[j+len(substring):] + return substring, optional_left, optional_right +pass + + def train_on_responses_only( trainer, instruction_part = None, @@ -1685,41 +1750,87 @@ def train_on_responses_only( response_part = tokenizer._unsloth_output_part pass - instruction_ids = tokenizer(instruction_part, add_special_tokens = False).input_ids - response_ids = tokenizer(response_part, add_special_tokens = False).input_ids + # Get most common tokens since tokenizers can tokenize stuff differently! + Q_must, Q_left, Q_right = _find_common_token_ids(instruction_part, tokenizer) + A_must, A_left, A_right = _find_common_token_ids(response_part, tokenizer) - instruction_length = len(instruction_ids) - response_length = len(response_ids) - max_length = max(instruction_length, response_length) + # Store some temporary stuff + A_first = A_must[0] + len_A_must = len(A_must) + A_left_reversed = A_left[::-1] + A_right_forward = A_right + + Q_first = Q_must[0] + len_Q_must = len(Q_must) + Q_left_reversed = Q_left[::-1] + Q_right_forward = Q_right def _train_on_responses_only(examples): input_ids_ = examples["input_ids"] all_labels = [] for input_ids in input_ids_: - - labels = [-100] * len(input_ids) - m = len(input_ids) - max_length - first_response = response_ids[0] - first_instruction = instruction_ids[0] + n = len(input_ids) + labels = [-100] * n + n_minus_1 = n - 1 j = 0 - while j < m: - if input_ids[j] == first_response: - if input_ids[j : j+response_length] == response_ids: - j = j + response_length - start = j - while j < m: - if input_ids[j] == first_instruction and input_ids[j : j+instruction_length] == instruction_ids: - j = j + instruction_length - labels[start : j] = input_ids[start : j] - break - elif j == (m-1): - j = m - labels[start:] = input_ids[start:] - break + while j < n: + # Find + if (input_ids[j] == A_first) and \ + (input_ids[j : (k := j + len_A_must)] == A_must): + + # Now backtrack to get previous optional tokens + for optional_left in A_left_reversed: + if j < 1: break + if optional_left == input_ids[j-1]: j -= 1 + else: break + pass + # And forwards look as well + for optional_right in A_right_forward: + if k >= n_minus_1: break + if optional_right == input_ids[k+1]: k += 1 + else: break + pass + # assistant_j = j + assistant_k = k + + j = assistant_k + # Given , now find next user + while j < n: + # Find + # Also accept last final item if assistant is the last turn + if (j == n_minus_1) or \ 
+ ((input_ids[j] == Q_first) and \ + (input_ids[j : (k := j + len_Q_must)] == Q_must)): + + # Now backtrack to get previous optional tokens + for optional_left in Q_left_reversed: + if j < 1: break + if optional_left == input_ids[j-1]: j -= 1 + else: break + pass + # And forwards look as well + for optional_right in Q_right_forward: + if k >= n_minus_1: break + if optional_right == input_ids[k+1]: k += 1 + else: break + pass + user_j = j + # Account for last item + if user_j != n_minus_1: + # user_k = k + # j = user_k + j = k + else: + user_j = n + k = n pass - j += 1 + # Now copy input_ids to labels + labels[assistant_k : user_j] = input_ids[assistant_k : user_j] + # print(assistant_j, assistant_k, user_j, user_k) + break pass + j += 1 pass pass j += 1 From 5393e9e00a1e2019144698d90035ae21e03325c7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 15 Aug 2024 00:31:30 -0700 Subject: [PATCH 115/147] Bug Fixes (#920) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Providing more flexibility for users to customize their llama when using LoRA (#910) * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update chat_templates.py * return model * Update tokenizer_utils.py * Update chat_templates.py * Update tokenizer_utils.py * Train on completions * load_in_4bit=False broken --------- Co-authored-by: Po-Lung Wang --- unsloth/models/llama.py | 6 +++++- unsloth/models/loader.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6a111c934..6139115f6 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1387,11 +1387,15 @@ def from_pretrained( # RoPE Scaling's max_position_embeddings must be updated max_position_embeddings = max(max_seq_length, model_max_seq_length) kwargs.pop("attn_implementation", None); # No need since we auto call it + + # Cannot be None, since 
HF now checks for the config + if load_in_4bit: kwargs["quantization_config"] = bnb_config + model = AutoModelForCausalLM.from_pretrained( model_name, device_map = device_map, torch_dtype = dtype, - quantization_config = bnb_config, + # quantization_config = bnb_config, token = token, max_position_embeddings = max_position_embeddings, trust_remote_code = trust_remote_code, diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index cce22aebf..ad1098eda 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -42,10 +42,11 @@ def __get_model_name( INT_TO_FLOAT_MAPPER = None, FLOAT_TO_INT_MAPPER = None, ): - model_name = str(model_name) - if not SUPPORTS_FOURBIT and model_name.lower() in INT_TO_FLOAT_MAPPER: - model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] + lower_model_name = model_name.lower() + + if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER: + model_name = INT_TO_FLOAT_MAPPER[lower_model_name] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ f"4bit loading.\nThe minimum required version is 4.37.\n"\ @@ -55,16 +56,18 @@ def __get_model_name( ) return model_name - elif not load_in_4bit and model_name.lower() in INT_TO_FLOAT_MAPPER: - new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] + elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER: + new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." # ) return new_model_name - - elif load_in_4bit and SUPPORTS_FOURBIT and model_name.lower() in FLOAT_TO_INT_MAPPER: - new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()] + elif not load_in_4bit and lower_model_name in FLOAT_TO_INT_MAPPER: + new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] + return new_model_name + elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER: + new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ # f"We shall load `{new_model_name}` for 4x faster loading." 
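Not part of either patch, but a minimal sketch of the conditional `quantization_config` pattern that the llama.py hunk in [PATCH 115/147] switches to (its comment notes that HF now validates any config it receives). The NF4 settings and `device_map` below are assumptions for illustration; the real call also forwards `max_position_embeddings`, `token`, and other arguments.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_base_model(model_name, load_in_4bit = True, dtype = torch.bfloat16):
    kwargs = {}
    if load_in_4bit:
        # Assumed NF4 settings; Unsloth builds its own bnb_config elsewhere.
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit              = True,
            bnb_4bit_quant_type       = "nf4",
            bnb_4bit_compute_dtype    = dtype,
            bnb_4bit_use_double_quant = True,
        )
    # When load_in_4bit is False, no quantization_config is passed at all, which is the
    # point of moving from `quantization_config = bnb_config` to a conditional kwarg.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype = dtype,
        device_map  = "auto",  # placeholder device placement
        **kwargs,
    )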
From 53cd1e778133efa9721731834fb06589dc95b719 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 15 Aug 2024 01:15:35 -0700 Subject: [PATCH 116/147] Fix mapping (#921) * Update pyproject.toml * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * fix_tokenizer * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update loader.py * Update pyproject.toml * Update _utils.py * Update gemma2.py * Update gemma2.py * Update _utils.py * gemma 2 mask * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Torch 2.4 Xformers 0.0.27post2 * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Gemma 2 fixes * Update gemma2.py * Update llama.py * Update llama.py * Update save.py * Update save.py * Update llama.py * Update cross_entropy_loss.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Update dpo.py * Providing more flexibility for users to customize their llama when using LoRA (#910) * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update chat_templates.py * return model * Update tokenizer_utils.py * Update chat_templates.py * Update tokenizer_utils.py * Train on completions * load_in_4bit=False broken * Update llama.py * MAP_TO_UNSLOTH_16bit * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update mapper.py * works! --------- Co-authored-by: Po-Lung Wang --- unsloth/models/llama.py | 2 +- unsloth/models/loader.py | 39 +++++++++++++++++++++++++-------------- unsloth/models/mapper.py | 13 +++++++++++-- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6139115f6..6a23335c8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1390,7 +1390,7 @@ def from_pretrained( # Cannot be None, since HF now checks for the config if load_in_4bit: kwargs["quantization_config"] = bnb_config - + model = AutoModelForCausalLM.from_pretrained( model_name, device_map = device_map, diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ad1098eda..e260017fb 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -19,7 +19,7 @@ from transformers import AutoConfig from transformers import __version__ as transformers_version from peft import PeftConfig, PeftModel -from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER +from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit import os # https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading! 
@@ -39,13 +39,15 @@ def __get_model_name( model_name, load_in_4bit = True, - INT_TO_FLOAT_MAPPER = None, - FLOAT_TO_INT_MAPPER = None, + INT_TO_FLOAT_MAPPER = None, + FLOAT_TO_INT_MAPPER = None, + MAP_TO_UNSLOTH_16bit = None, ): model_name = str(model_name) lower_model_name = model_name.lower() if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER: + model_name = INT_TO_FLOAT_MAPPER[lower_model_name] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ @@ -57,16 +59,21 @@ def __get_model_name( return model_name elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER: + new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." # ) return new_model_name - elif not load_in_4bit and lower_model_name in FLOAT_TO_INT_MAPPER: - new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] + + elif not load_in_4bit and lower_model_name in MAP_TO_UNSLOTH_16bit: + + new_model_name = MAP_TO_UNSLOTH_16bit[lower_model_name] return new_model_name + elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER: + new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ @@ -86,12 +93,14 @@ def _get_new_mapper(): with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] new_mapper = new_mapper\ - .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ - .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER") + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER")\ + .replace("MAP_TO_UNSLOTH_16bit", "NEW_MAP_TO_UNSLOTH_16bit") + exec(new_mapper, globals()) - return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit except: - return {}, {} + return {}, {}, {} pass pass @@ -100,17 +109,19 @@ def get_model_name(model_name, load_in_4bit = True): new_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, - INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, - FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + MAP_TO_UNSLOTH_16bit = MAP_TO_UNSLOTH_16bit, ) if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): # Try checking if a new Unsloth version allows it! 
- NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER = _get_new_mapper() + NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit = _get_new_mapper() upgraded_model_name = __get_model_name( model_name = model_name, load_in_4bit = load_in_4bit, - INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, - FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + MAP_TO_UNSLOTH_16bit = NEW_MAP_TO_UNSLOTH_16bit, ) if upgraded_model_name is not None: raise NotImplementedError( diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 57ba67658..b8259a073 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -251,8 +251,9 @@ ), } -INT_TO_FLOAT_MAPPER = {} -FLOAT_TO_INT_MAPPER = {} +INT_TO_FLOAT_MAPPER = {} +FLOAT_TO_INT_MAPPER = {} +MAP_TO_UNSLOTH_16bit = {} for key, values in __INT_TO_FLOAT_MAPPER.items(): INT_TO_FLOAT_MAPPER[key] = values[0] @@ -261,6 +262,14 @@ FLOAT_TO_INT_MAPPER[value] = key pass + # Map to Unsloth version for 16bit versions + if len(values) == 2: + if values[0].startswith("unsloth"): + MAP_TO_UNSLOTH_16bit[values[1]] = values[0] + MAP_TO_UNSLOTH_16bit[values[1].lower()] = values[0] + pass + pass + # Get lowercased lowered_key = key.lower() INT_TO_FLOAT_MAPPER[lowered_key] = values[0].lower() From 8be73b10860fee8ac3ab84c88548de2392948492 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 15 Aug 2024 15:04:46 -0700 Subject: [PATCH 117/147] Bug fixes --- unsloth/__init__.py | 15 ++++++--------- unsloth/models/loader.py | 24 ++++++++++++++++++++---- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index db54c9a16..dd526dc3c 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import warnings -import importlib -import sys + +import warnings, importlib, sys from packaging.version import Version # # Define a list of modules to check @@ -60,9 +58,8 @@ "We have some installation instructions on our Github page.") pass -import os, re +import os, re, subprocess, inspect import numpy as np -import subprocess # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) @@ -83,12 +80,12 @@ del os.environ["PYTORCH_CUDA_ALLOC_CONF"] pass -# Torch 2.5 has including_emulation +# Torch 2.4 has including_emulation major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = (major_version >= 8) -if (major_torch == 2) and (minor_torch >= 5): - old_is_bf16_supported = torch.cuda.is_bf16_supported +old_is_bf16_supported = torch.cuda.is_bf16_supported +if "including_emulation" in str(inspect.signature(old_is_bf16_supported)): def is_bf16_supported(including_emulation = False): return old_is_bf16_supported(including_emulation) torch.cuda.is_bf16_supported = is_bf16_supported diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e260017fb..02ed00f5c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -169,13 +169,23 @@ def from_pretrained( autoconfig_error = None peft_error = None try: - model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) is_model = True except Exception as error: autoconfig_error = str(error) is_model = False try: - peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision) + peft_config = PeftConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) is_peft = True except Exception as error: peft_error = str(error) @@ -207,7 +217,12 @@ def from_pretrained( if is_peft: # Check base model again for PEFT model_name = get_model_name(peft_config.base_model_name_or_path, load_in_4bit) - model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) pass if not was_disabled: enable_progress_bars() @@ -340,10 +355,11 @@ def from_pretrained( token = token, revision = revision, is_trainable = True, + trust_remote_code = trust_remote_code, ) # Patch it as well! 
model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing) pass return model, tokenizer pass -pass +pass \ No newline at end of file From 8b80820b8b9f13ab4ecca089ec6ff92c58530bea Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 15 Aug 2024 15:07:42 -0700 Subject: [PATCH 118/147] Update __init__.py --- unsloth/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index dd526dc3c..f6ed99953 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -14,6 +14,8 @@ import warnings, importlib, sys from packaging.version import Version +import os, re, subprocess, inspect +import numpy as np # # Define a list of modules to check # MODULES_TO_CHECK = ["bitsandbytes"] @@ -58,9 +60,6 @@ "We have some installation instructions on our Github page.") pass -import os, re, subprocess, inspect -import numpy as np - # Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions) keynames = "\n" + "\n".join(os.environ.keys()) if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames: From 5e2cf1c51cab723000d0ba33a863cad8c4642a7d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 19:28:43 -0700 Subject: [PATCH 119/147] untrained tokens llama 3.1 base --- unsloth/chat_templates.py | 4 ++-- unsloth/tokenizer_utils.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 7070524e0..82f6aba14 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -876,7 +876,7 @@ def get_chat_template( # Careful on Gemma # bos_token is a must or else losses become too high - if IS_GEMMA and not chat_template.startswith("{{ bos_token }}"): + if IS_GEMMA and not chat_template.startswith(("{{ bos_token }}", "{{- bos_token }}")): chat_template = "{{ bos_token }}" + chat_template pass @@ -1553,7 +1553,7 @@ def process(part, which, content = "message['content']"): # Check jinja tempate for bos if always_bos_token: - if not jinja_template.startswith("{{ bos_token }}"): + if not jinja_template.startswith(("{{ bos_token }}", "{{- bos_token }}")): jinja_template = "{{ bos_token }}" + jinja_template pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 9c0bc1c51..a4f0b33be 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -827,7 +827,27 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # Get untrained tokens indicator_untrained1 = torch.amax(embedding_matrix, axis = 1) <= eps # Check lm_head as well + + # Does NOT work for Llama 3.1!! 
indicator_untrained2 = torch.amax(lm_head_matrix, axis = 1) <= eps + + # We instead check for repeated vectors + lm_head_where = torch.where(indicator_untrained1)[0] + lm_head_bad = lm_head_matrix[lm_head_where] + lm_head_bad = lm_head_bad.cpu().numpy().round(3) + from collections import Counter + counter = Counter() + for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 + counter = Counter({k: c for k, c in counter.items() if c >= 2}) + + lm_head_where = lm_head_where.cpu().numpy() + final_bad_lm_head = [] + for j, row in enumerate(lm_head_bad): + if hash(row.data.tobytes()) in counter: + final_bad_lm_head.append(lm_head_where[j]) + indicator_untrained2 = indicator_untrained2 | torch.zeros_like(indicator_untrained2) + indicator_untrained2[final_bad_lm_head] = True + # Combine both checks indicator_untrained = indicator_untrained1 & indicator_untrained2 From c22162b402a0e8cc8a5580f232e39a005fad02f1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 19:57:19 -0700 Subject: [PATCH 120/147] untrained tokens llama 3.1 base (#929) --- unsloth/chat_templates.py | 4 ++-- unsloth/tokenizer_utils.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 7070524e0..82f6aba14 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -876,7 +876,7 @@ def get_chat_template( # Careful on Gemma # bos_token is a must or else losses become too high - if IS_GEMMA and not chat_template.startswith("{{ bos_token }}"): + if IS_GEMMA and not chat_template.startswith(("{{ bos_token }}", "{{- bos_token }}")): chat_template = "{{ bos_token }}" + chat_template pass @@ -1553,7 +1553,7 @@ def process(part, which, content = "message['content']"): # Check jinja tempate for bos if always_bos_token: - if not jinja_template.startswith("{{ bos_token }}"): + if not jinja_template.startswith(("{{ bos_token }}", "{{- bos_token }}")): jinja_template = "{{ bos_token }}" + jinja_template pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 9c0bc1c51..a4f0b33be 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -827,7 +827,27 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # Get untrained tokens indicator_untrained1 = torch.amax(embedding_matrix, axis = 1) <= eps # Check lm_head as well + + # Does NOT work for Llama 3.1!! 
indicator_untrained2 = torch.amax(lm_head_matrix, axis = 1) <= eps + + # We instead check for repeated vectors + lm_head_where = torch.where(indicator_untrained1)[0] + lm_head_bad = lm_head_matrix[lm_head_where] + lm_head_bad = lm_head_bad.cpu().numpy().round(3) + from collections import Counter + counter = Counter() + for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 + counter = Counter({k: c for k, c in counter.items() if c >= 2}) + + lm_head_where = lm_head_where.cpu().numpy() + final_bad_lm_head = [] + for j, row in enumerate(lm_head_bad): + if hash(row.data.tobytes()) in counter: + final_bad_lm_head.append(lm_head_where[j]) + indicator_untrained2 = indicator_untrained2 | torch.zeros_like(indicator_untrained2) + indicator_untrained2[final_bad_lm_head] = True + # Combine both checks indicator_untrained = indicator_untrained1 & indicator_untrained2 From 9cb5c2eca4c7b5ea8f2a3fb3048d0b376589296e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 23:38:02 -0700 Subject: [PATCH 121/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index a4f0b33be..38d5949f4 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # We instead check for repeated vectors lm_head_where = torch.where(indicator_untrained1)[0] lm_head_bad = lm_head_matrix[lm_head_where] - lm_head_bad = lm_head_bad.cpu().numpy().round(3) + lm_head_bad = lm_head_bad.cpu().to(torch.float32).numpy().round(3) from collections import Counter counter = Counter() for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 From 487637db7bfd0d162a1932379f9dab176323689d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 23:38:43 -0700 Subject: [PATCH 122/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 38d5949f4..7316656b2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # We instead check for repeated vectors lm_head_where = torch.where(indicator_untrained1)[0] lm_head_bad = lm_head_matrix[lm_head_where] - lm_head_bad = lm_head_bad.cpu().to(torch.float32).numpy().round(3) + lm_head_bad = lm_head_bad.cpu().float().numpy().round(3) from collections import Counter counter = Counter() for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 From 52bc19d1fa4cd3557b785127fd68b5f4d1c34347 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 16 Aug 2024 23:39:44 -0700 Subject: [PATCH 123/147] Bug #930 (#931) * untrained tokens llama 3.1 base * Update tokenizer_utils.py * Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index a4f0b33be..7316656b2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # We instead check for repeated vectors lm_head_where = torch.where(indicator_untrained1)[0] lm_head_bad = lm_head_matrix[lm_head_where] - lm_head_bad = lm_head_bad.cpu().numpy().round(3) + lm_head_bad = lm_head_bad.cpu().float().numpy().round(3) from collections import Counter 
counter = Counter() for row in lm_head_bad: counter[hash(row.data.tobytes())] += 1 From 9335fa0960c40fd36e2702456415cbdbbcd847dd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 15:04:25 -0700 Subject: [PATCH 124/147] Bug fixes --- unsloth/models/_utils.py | 28 ++++++++++++++++++++++++---- unsloth/models/llama.py | 28 +++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0c0057496..d8904aa12 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -595,7 +595,6 @@ def _get_statistics(statistics = None, force_download = True): # You can disable this by commenting the below out try: n_cpus = psutil.cpu_count(logical = False) - keynames = "\n" + "\n".join(os.environ.keys()) if statistics is not None: pass elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" @@ -604,10 +603,31 @@ def _get_statistics(statistics = None, force_download = True): elif "\nRUNPOD_" in keynames: statistics = "runpod" elif "\nAWS_" in keynames: statistics = "aws" elif "\nAZURE_" in keynames: statistics = "azure" - elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" + # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" elif "\nINVOCATION_ID" in keynames: statistics = "lambda" - else: statistics = "other" - + # else: statistics = "other" + else: + def try_vllm_check(): + vendor_files = ( + "/sys/class/dmi/id/product_version", + "/sys/class/dmi/id/bios_vendor", + "/sys/class/dmi/id/product_name", + "/sys/class/dmi/id/chassis_asset_tag", + "/sys/class/dmi/id/sys_vendor", + ) + from pathlib import Path + for vendor_file in vendor_files: + path = Path(vendor_file) + if path.is_file(): + file_content = path.read_text().lower() + if "amazon" in file_content: return "aws" + elif "microsoft corporation" in file_content: return "azure" + elif "google" in file_content: return "gcp" + return "other" + pass + try: statistics = try_vllm_check() + except: statistics = "other" + pass if statistics is not None: from transformers import AutoModelForCausalLM stats_model = AutoModelForCausalLM.from_pretrained( diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6a23335c8..d18dd4ce9 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1628,7 +1628,7 @@ def post_patch(model): # Torch.compile fails on embedding matrix?? # Workaround randomnly fixes it for torch versions < 2. 
- model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) + # model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) model.config.update({"unsloth_version" : __version__}) # We also do this for the lm_head @@ -2234,6 +2234,9 @@ def for_inference(model): internal_model.gradient_checkpointing = False internal_model.training = False pass + if hasattr(internal_model, "training"): + internal_model.training = False + pass # Also check if lm_head / embeddings are trained internal_model = model @@ -2267,6 +2270,16 @@ def for_inference(model): internal_model._saved_temp_tokenizer.padding_side = "left" pass + # Also disable training for embeddings for NEFTune + if hasattr(model, "get_input_embeddings"): + embeddings = model.get_input_embeddings() + if hasattr(embeddings, "training"): embeddings.training = False + pass + if hasattr(model, "get_output_embeddings"): + embeddings = model.get_output_embeddings() + if hasattr(embeddings, "training"): embeddings.training = False + pass + return model pass @@ -2288,6 +2301,9 @@ def for_training(model, use_gradient_checkpointing = True): internal_model.gradient_checkpointing = use_gradient_checkpointing internal_model.training = True pass + if hasattr(internal_model, "training"): + internal_model.training = True + pass # Also revert model.generate if hasattr(model, "_unwrapped_old_generate"): @@ -2307,6 +2323,16 @@ def for_training(model, use_gradient_checkpointing = True): internal_model._saved_temp_tokenizer.padding_side = "right" pass + # Also re-enable training for embeddings for NEFTune + if hasattr(model, "get_input_embeddings"): + embeddings = model.get_input_embeddings() + if hasattr(embeddings, "training"): embeddings.training = True + pass + if hasattr(model, "get_output_embeddings"): + embeddings = model.get_output_embeddings() + if hasattr(embeddings, "training"): embeddings.training = True + pass + return model pass pass From 1bed78c99279f3667379e0798440ee3a94d536b4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 15:08:53 -0700 Subject: [PATCH 125/147] Update llama.py --- unsloth/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d18dd4ce9..3f42dee9c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -571,6 +571,9 @@ def LlamaModel_fast_forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + print(getattr(self.embed_tokens, "neftune_noise_alpha")) + print(getattr(self.embed_tokens, "_forward_hooks")) + print(getattr(self.embed_tokens, "_forward_pre_hooks")) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma From 2c4772b666e93404a780301a9166736fe4734c25 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 15:50:16 -0700 Subject: [PATCH 126/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7316656b2..873544007 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1109,6 +1109,7 @@ def check_nvidia(): import trl.trainer.sft_trainer from trl.trainer.sft_trainer import * from transformers.trainer import * +from trl.trainer.sft_trainer import neftune_post_forward_hook def patch_sft_trainer_tokenizer(): """ @@ -1173,6 +1174,17 @@ def patch_sft_trainer_tokenizer(): "\n"\ "fix_untrained_tokens(self.model, self.tokenizer, 
self.train_dataset, eps = 1e-16)\n\n" + # Add NEFTune since it doesn't seem to work?? We need to manually inject it + check_text += \ + "\n\n"\ + "if getattr(self.model.get_input_embeddings(), 'neftune_noise_alpha', None) is not None:\n"\ + " if hasattr(self, 'neftune_hook_handle'):\n"\ + " self.neftune_hook_handle.remove()\n"\ + " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ + "\n"\ + " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n\n"\ + "\n" + check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 7fd058fc71f7433bf55cd978feccc580fa26dab8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 15:52:13 -0700 Subject: [PATCH 127/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 873544007..a73887061 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1177,14 +1177,16 @@ def patch_sft_trainer_tokenizer(): # Add NEFTune since it doesn't seem to work?? We need to manually inject it check_text += \ "\n\n"\ + "print(1)\n"\ "if getattr(self.model.get_input_embeddings(), 'neftune_noise_alpha', None) is not None:\n"\ + " print(2)\n"\ " if hasattr(self, 'neftune_hook_handle'):\n"\ " self.neftune_hook_handle.remove()\n"\ " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ "\n"\ " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n\n"\ "\n" - + check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 6e5ad15cd73388ba694bd532b8bf4d05316b1d9a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:03:24 -0700 Subject: [PATCH 128/147] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index a73887061..b677f864a 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1176,15 +1176,15 @@ def patch_sft_trainer_tokenizer(): # Add NEFTune since it doesn't seem to work?? 
We need to manually inject it check_text += \ - "\n\n"\ - "print(1)\n"\ - "if getattr(self.model.get_input_embeddings(), 'neftune_noise_alpha', None) is not None:\n"\ - " print(2)\n"\ - " if hasattr(self, 'neftune_hook_handle'):\n"\ - " self.neftune_hook_handle.remove()\n"\ - " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ "\n"\ - " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n\n"\ + "if hasattr(self, 'neftune_hook_handle'):\n"\ + " self.neftune_hook_handle.remove()\n"\ + " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ + "\n"\ + "if getattr(self, 'neftune_noise_alpha', None) is not None:\n"\ + " self.model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha\n"\ + " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n"\ + "pass\n"\ "\n" check_text = check_text.split("\n") From 7139e57b729253c0ce1d70892dbac4f7f87d28ef Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:08:14 -0700 Subject: [PATCH 129/147] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3f42dee9c..461feb3c7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -571,9 +571,9 @@ def LlamaModel_fast_forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - print(getattr(self.embed_tokens, "neftune_noise_alpha")) - print(getattr(self.embed_tokens, "_forward_hooks")) - print(getattr(self.embed_tokens, "_forward_pre_hooks")) + # print(getattr(self.embed_tokens, "neftune_noise_alpha")) + # print(getattr(self.embed_tokens, "_forward_hooks")) + # print(getattr(self.embed_tokens, "_forward_pre_hooks")) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma From 9caaa5af78292f29aaaad2ed05d5a55564020a3e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:11:09 -0700 Subject: [PATCH 130/147] Update llama.py --- unsloth/models/llama.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 461feb3c7..d18dd4ce9 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -571,9 +571,6 @@ def LlamaModel_fast_forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - # print(getattr(self.embed_tokens, "neftune_noise_alpha")) - # print(getattr(self.embed_tokens, "_forward_hooks")) - # print(getattr(self.embed_tokens, "_forward_pre_hooks")) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) # Normalized from Gemma From be7ed9a1e60224c99fb91f01479b8b654264d8eb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:14:01 -0700 Subject: [PATCH 131/147] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d18dd4ce9..048ba6919 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1628,7 +1628,7 @@ def post_patch(model): # Torch.compile fails on embedding matrix?? # Workaround randomnly fixes it for torch versions < 2. 
- # model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) + model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) model.config.update({"unsloth_version" : __version__}) # We also do this for the lm_head From 75013ff022523729f13479f7738ec5a0e1d237b0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 16:17:52 -0700 Subject: [PATCH 132/147] Fix NEFTune (#937) * untrained tokens llama 3.1 base * Update tokenizer_utils.py * Update tokenizer_utils.py * Bug fixes * Update llama.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update llama.py * Update llama.py * Update llama.py --- unsloth/models/_utils.py | 28 ++++++++++++++++++++++++---- unsloth/models/llama.py | 26 ++++++++++++++++++++++++++ unsloth/tokenizer_utils.py | 14 ++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0c0057496..d8904aa12 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -595,7 +595,6 @@ def _get_statistics(statistics = None, force_download = True): # You can disable this by commenting the below out try: n_cpus = psutil.cpu_count(logical = False) - keynames = "\n" + "\n".join(os.environ.keys()) if statistics is not None: pass elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" @@ -604,10 +603,31 @@ def _get_statistics(statistics = None, force_download = True): elif "\nRUNPOD_" in keynames: statistics = "runpod" elif "\nAWS_" in keynames: statistics = "aws" elif "\nAZURE_" in keynames: statistics = "azure" - elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" + # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" elif "\nINVOCATION_ID" in keynames: statistics = "lambda" - else: statistics = "other" - + # else: statistics = "other" + else: + def try_vllm_check(): + vendor_files = ( + "/sys/class/dmi/id/product_version", + "/sys/class/dmi/id/bios_vendor", + "/sys/class/dmi/id/product_name", + "/sys/class/dmi/id/chassis_asset_tag", + "/sys/class/dmi/id/sys_vendor", + ) + from pathlib import Path + for vendor_file in vendor_files: + path = Path(vendor_file) + if path.is_file(): + file_content = path.read_text().lower() + if "amazon" in file_content: return "aws" + elif "microsoft corporation" in file_content: return "azure" + elif "google" in file_content: return "gcp" + return "other" + pass + try: statistics = try_vllm_check() + except: statistics = "other" + pass if statistics is not None: from transformers import AutoModelForCausalLM stats_model = AutoModelForCausalLM.from_pretrained( diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6a23335c8..048ba6919 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2234,6 +2234,9 @@ def for_inference(model): internal_model.gradient_checkpointing = False internal_model.training = False pass + if hasattr(internal_model, "training"): + internal_model.training = False + pass # Also check if lm_head / embeddings are trained internal_model = model @@ -2267,6 +2270,16 @@ def for_inference(model): internal_model._saved_temp_tokenizer.padding_side = "left" pass + # Also disable training for embeddings for NEFTune + if hasattr(model, "get_input_embeddings"): + embeddings = model.get_input_embeddings() + if hasattr(embeddings, "training"): embeddings.training = False + pass + if hasattr(model, "get_output_embeddings"): + embeddings = 
model.get_output_embeddings() + if hasattr(embeddings, "training"): embeddings.training = False + pass + return model pass @@ -2288,6 +2301,9 @@ def for_training(model, use_gradient_checkpointing = True): internal_model.gradient_checkpointing = use_gradient_checkpointing internal_model.training = True pass + if hasattr(internal_model, "training"): + internal_model.training = True + pass # Also revert model.generate if hasattr(model, "_unwrapped_old_generate"): @@ -2307,6 +2323,16 @@ def for_training(model, use_gradient_checkpointing = True): internal_model._saved_temp_tokenizer.padding_side = "right" pass + # Also re-enable training for embeddings for NEFTune + if hasattr(model, "get_input_embeddings"): + embeddings = model.get_input_embeddings() + if hasattr(embeddings, "training"): embeddings.training = True + pass + if hasattr(model, "get_output_embeddings"): + embeddings = model.get_output_embeddings() + if hasattr(embeddings, "training"): embeddings.training = True + pass + return model pass pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7316656b2..b677f864a 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1109,6 +1109,7 @@ def check_nvidia(): import trl.trainer.sft_trainer from trl.trainer.sft_trainer import * from transformers.trainer import * +from trl.trainer.sft_trainer import neftune_post_forward_hook def patch_sft_trainer_tokenizer(): """ @@ -1173,6 +1174,19 @@ def patch_sft_trainer_tokenizer(): "\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" + # Add NEFTune since it doesn't seem to work?? We need to manually inject it + check_text += \ + "\n"\ + "if hasattr(self, 'neftune_hook_handle'):\n"\ + " self.neftune_hook_handle.remove()\n"\ + " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\ + "\n"\ + "if getattr(self, 'neftune_noise_alpha', None) is not None:\n"\ + " self.model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha\n"\ + " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n"\ + "pass\n"\ + "\n" + check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 4cc20f4720ad482a2da04ce79cf5cd622c14e54e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 17:12:32 -0700 Subject: [PATCH 133/147] Create _auto_install.py --- unsloth/_auto_install.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 unsloth/_auto_install.py diff --git a/unsloth/_auto_install.py b/unsloth/_auto_install.py new file mode 100644 index 000000000..2f5b62d4c --- /dev/null +++ b/unsloth/_auto_install.py @@ -0,0 +1,16 @@ +try: import torch +except: raise ImportError('Install torch via `pip install torch`') +from packaging.version import Version as V +v = V(torch.__version__) +cuda = str(torch.version.cuda) +is_ampere = torch.cuda.get_device_capability()[0] >= 8 +if cuda != "12.1" and cuda != "11.8": raise RuntimeError(f"CUDA = {cuda} not supported!") +if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!") +elif v <= V('2.1.1'): x = 'cu{}{}-torch211' +elif v <= V('2.1.2'): x = 'cu{}{}-torch212' +elif v < V('2.3.0'): x = 'cu{}{}-torch220' +elif v < V('2.4.0'): x = 'cu{}{}-torch230' +elif v < V('2.5.0'): x = 'cu{}{}-torch240' +else: raise RuntimeError(f"Torch = {v} too new!") +x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "") +print(f'pip install "unsloth[{x}] @ 
git+https://github.com/unslothai/unsloth.git"') \ No newline at end of file From fbf50a42602bf299da2a0a99fea2f9b18550332d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 17:12:46 -0700 Subject: [PATCH 134/147] Update _auto_install.py --- unsloth/_auto_install.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/unsloth/_auto_install.py b/unsloth/_auto_install.py index 2f5b62d4c..2e6351b8d 100644 --- a/unsloth/_auto_install.py +++ b/unsloth/_auto_install.py @@ -1,3 +1,17 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + try: import torch except: raise ImportError('Install torch via `pip install torch`') from packaging.version import Version as V From d45ade257b8578d63236b389f25e73b5c22bb862 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 19 Aug 2024 17:18:30 -0700 Subject: [PATCH 135/147] Update README.md (#938) --- README.md | 105 +++++++++++++++++++++++------------------------------- 1 file changed, 44 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 86c3fbd86..b23acffcb 100644 --- a/README.md +++ b/README.md @@ -94,85 +94,68 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and ## 💾 Installation Instructions -If you have Pytorch 2.3 and CUDA 12.1, install Unsloth with `pip install unsloth[colab-new]` then `pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes` - ### Conda Installation -Select either `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. See this [Github issue](https://github.com/unslothai/unsloth/issues/73) for help on debugging Conda installs. +`⚠️Only use Conda if you have it. If not, use Pip`. Select either `pytorch-cuda=11.8,12.1` for CUDA 11.8 or CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. We support `python=3.10,3.11,3.12`. ```bash conda create --name unsloth_env \ - python=3.10 \ - pytorch-cuda=<11.8/12.1> \ + python=3.11 \ + pytorch-cuda=12.1 \ pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \ -y conda activate unsloth_env pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" - -pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes +pip install --no-deps trl peft accelerate bitsandbytes ``` +
+ If you're looking to install Conda in a Linux environment, read here, or run the below 🔽 + + ```bash + mkdir -p ~/miniconda3 + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh + bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 + rm -rf ~/miniconda3/miniconda.sh + ~/miniconda3/bin/conda init bash + ~/miniconda3/bin/conda init zsh + ``` +
+ ### Pip Installation -Do **NOT** use this if you have Anaconda. You must use the Conda install method, or else stuff will BREAK. +`⚠️Do **NOT** use this if you have Conda.` Pip is a bit more complex since there are dependency issues. The pip command is different for `torch 2.2,2.3,2.4` and CUDA versions. -1. Find your CUDA version via -```python -import torch; torch.version.cuda -``` -2. For Pytorch 2.1.0: You can update Pytorch via Pip (interchange `cu121` / `cu118`). Go to https://pytorch.org/ to learn more. Select either `cu118` for CUDA 11.8 or `cu121` for CUDA 12.1. If you have a RTX 3060 or higher (A100, H100 etc), use the `"ampere"` path. For Pytorch 2.1.1: go to step 3. For Pytorch 2.2.0: go to step 4. -```bash -pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.0 triton \ - --index-url https://download.pytorch.org/whl/cu121 -``` -```bash -pip install "unsloth[cu118] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu118-ampere] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-ampere] @ git+https://github.com/unslothai/unsloth.git" -``` -3. For Pytorch 2.1.1: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. -```bash -pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.1 triton \ - --index-url https://download.pytorch.org/whl/cu121 -``` -```bash -pip install "unsloth[cu118-torch211] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-torch211] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu118-ampere-torch211] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-ampere-torch211] @ git+https://github.com/unslothai/unsloth.git" -``` -4. For Pytorch 2.2.0: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. -```bash -pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ - --index-url https://download.pytorch.org/whl/cu121 -``` -```bash -pip install "unsloth[cu118-torch220] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-torch220] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu118-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git" -``` -5. If you get errors, try the below first, then go back to step 1: +In general, if you have `torch 2.4` and `CUDA 12.1`, use: ```bash pip install --upgrade pip +pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git" ``` -6. For Pytorch 2.2.1: -```bash -# RTX 3090, 4090 Ampere GPUs: -pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes -# Pre Ampere RTX 2080, T4, GTX 1080 GPUs: -pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes -``` -7. For Pytorch 2.3.0: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. 
+Or, run the below in a terminal to get the optional pip installation command: ```bash -pip install "unsloth[cu118-torch230] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu118-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" -pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" +wget -qO- https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/_auto_install.py | python - ``` -8. To troubleshoot installs try the below (all must succeed). Xformers should mostly all be available. + +Or, run the below manually in a Python REPL: +```python +try: import torch +except: raise ImportError("Install torch via `pip install torch`") +from packaging.version import Version as V +v = V(torch.__version__) +cuda = str(torch.version.cuda) +is_ampere = torch.cuda.get_device_capability()[0] >= 8 +if cuda != "12.1" and cuda != "11.8": raise RuntimeError(f"CUDA = {cuda} not supported!") +if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!") +elif v <= V('2.1.1'): x = 'cu{}{}-torch211' +elif v <= V('2.1.2'): x = 'cu{}{}-torch212' +elif v < V('2.3.0'): x = 'cu{}{}-torch220' +elif v < V('2.4.0'): x = 'cu{}{}-torch230' +elif v < V('2.5.0'): x = 'cu{}{}-torch240' +else: raise RuntimeError(f"Torch = {v} too new!") +x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "") +print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"') +``` + +Afterwards, confirm if `nvcc` `xformers` and `bitsandbytes` have successfully installed - if not, install them individually first until they work, then install Unsloth. ```bash nvcc python -m xformers.info From cb7fd3a09dc3ecf4066d718ab6abc1156946050f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 14:38:20 -0700 Subject: [PATCH 136/147] LongRoPE --- unsloth/models/_utils.py | 13 ++++ unsloth/models/gemma.py | 4 ++ unsloth/models/llama.py | 127 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 140 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d8904aa12..434554f67 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -969,6 +969,7 @@ def patch_llama_rope_scaling( scaled_rope_module = None, extended_rope_module = None, attention_module = None, + longrope_module = None, ): assert(\ rope_module is not None and \ @@ -1026,14 +1027,26 @@ def patch_llama_rope_scaling( max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) + elif scaling_type == "longrope": + self.rotary_emb = {longrope_rope_function}( + dim = self.head_dim, + max_position_embeddings = self.max_position_embeddings, + original_max_position_embeddings = self.original_max_position_embeddings, + base = self.rope_theta, + short_factor = self.rope_scaling['short_factor'], + long_factor = self.rope_scaling['long_factor' ], + ) else: raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") pass """ + fix_rope_function = fix_rope_function.format( rope_function = rope_module.__name__, scaled_rope_function = scaled_rope_module.__name__, extended_rope_function = extended_rope_module.__name__, + longrope_rope_function = \ + (longrope_module if longrope_module is not None else rope_module).__name__ ) rotary_emb = re.findall( "self.rotary_emb = .+?\)", function, diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index a0894ec7a..45f14c113 100644 --- 
a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -254,6 +254,10 @@ def forward(self, x, position_ids=None, seq_len=None): ) pass + def get_cached(self, seq_len = None): + return self.cos_cached, self.sin_cached + pass + def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 048ba6919..eef4f49e0 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -346,14 +346,17 @@ def LlamaAttention_fast_forward( kv_seq_len += past_key_value[0].shape[-2] # Extend RoPE dynamically to fit in VRAM - self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) + rotary_emb = self.rotary_emb + rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len) if position_ids is None: - cos = self.rotary_emb.cos_cached - sin = self.rotary_emb.sin_cached + # Useful for LongRoPE + cos, sin = rotary_emb.get_cached(kv_seq_len) + # cos = self.rotary_emb.cos_cached + # sin = self.rotary_emb.sin_cached Q, K = fast_rope_embedding(Q, K, cos, sin) else: - cos, sin = self.rotary_emb(V, seq_len = kv_seq_len) + cos, sin = rotary_emb(V, seq_len = kv_seq_len) Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids) pass @@ -1048,6 +1051,10 @@ def forward(self, x, position_ids=None, seq_len=None): ) pass + def get_cached(self, seq_len = None): + return self.cos_cached, self.sin_cached + pass + def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 @@ -1170,6 +1177,117 @@ def forward(self, x, position_ids=None, seq_len=None): ) pass + def get_cached(self, seq_len = None): + return self.cos_cached, self.sin_cached + pass + + def extend_rope_embedding(self, x, seq_len): + if seq_len <= self.current_rope_size: return + # Iteratively grow by increments of 8192 + self.current_rope_size = math.ceil(seq_len / 8192) * 8192 + self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) + pass +pass + + +class LongRopeRotaryEmbedding(torch.nn.Module): + # For Phi 3.5 128K https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/modeling_phi3.py + def __init__(self, + dim = None, + max_position_embeddings = 131072, + original_max_position_embeddings = 4096, + base = 10000, + short_factor = None, + long_factor = None, + device = None, + config = None, # [TODO] Hack to pass in config - need to remove later + ): + super().__init__() + assert(short_factor is not None) + assert(long_factor is not None) + assert(type(original_max_position_embeddings) is int) + + if config is not None: + # [TODO] Hack to pass in config - need to remove later + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads)) + device = "cuda" + max_position_embeddings = config.max_position_embeddings + pass + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.base = base + # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this + self.current_rope_size = min(original_max_position_embeddings, self.max_position_embeddings) + + # Long RoPE similar to RoPE except short sequences have 1 cos / sin + # and long sequences have another cos / sin + inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / 
self.dim + short_factor = torch.tensor(short_factor, device = "cpu", dtype = torch.float32) + long_factor = torch.tensor(long_factor, device = "cpu", dtype = torch.float32) + short_inv_freq = 1.0 / (short_factor * self.base**inv_freq_shape) + long_inv_freq = 1.0 / (long_factor * self.base**inv_freq_shape) + + # Phi-3 Scale factor + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + pass + self.scaling_factor = scaling_factor + + # Short and long inv_freq + self.register_buffer("short_inv_freq", short_inv_freq, persistent = False) + self.register_buffer("long_inv_freq", long_inv_freq, persistent = False) + # Build here to make `torch.jit.trace` work. + # self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype()) + + # Short sequences + t = torch.arange(original_max_position_embeddings, device=self.short_inv_freq.device, dtype=torch.int64).float() + freqs = torch.outer(t, self.short_inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + cos_cached = (emb.cos() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True) + sin_cached = (emb.sin() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True) + self.register_buffer("short_cos_cached", cos_cached, persistent=False) + self.register_buffer("short_sin_cached", sin_cached, persistent=False) + pass + + def _set_cos_sin_cache(self, seq_len, device, dtype): + # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and + # in FP32. They are applied (multiplied) in FP32 as well. + self.current_rope_size = seq_len + + t = torch.arange(self.current_rope_size, device=self.inv_freq.device, dtype=torch.int64).float() + # Long sequences + freqs = torch.outer(t, self.long_inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + cos_cached = (emb.cos() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True) + sin_cached = (emb.sin() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True) + self.register_buffer("long_cos_cached", cos_cached, persistent=False) + self.register_buffer("long_sin_cached", sin_cached, persistent=False) + pass + + def forward(self, x, position_ids=None, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.current_rope_size: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype = x.dtype), + self.sin_cached[:seq_len].to(dtype = x.dtype), + ) + pass + + def get_cached(self, seq_len = None): + if seq_len < original_max_position_embeddings: + return self.short_cos_cached, self.short_sin_cached + return self.long_cos_cached, self.long_sin_cached + pass + def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 @@ -1242,6 +1360,7 @@ def pre_patch(): scaled_rope_module = LlamaLinearScalingRotaryEmbedding, extended_rope_module = LlamaExtendedRotaryEmbedding, attention_module = LlamaAttention, + longrope_module = LongRopeRotaryEmbedding, ) if init_name is not None: exec(function, globals()) From ad9418f94a444daf98022c750c47ce4fd7d16a2c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 14:41:49 -0700 Subject: [PATCH 137/147] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 434554f67..90ceb25b5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1028,6 +1028,7 @@ def patch_llama_rope_scaling( base=self.rope_theta, ) elif scaling_type == "longrope": + print('## Long RoPE') self.rotary_emb = {longrope_rope_function}( dim = self.head_dim, max_position_embeddings = self.max_position_embeddings, From 592e7a483608897ef3e5e9f85be46e06a5508eb0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 14:45:36 -0700 Subject: [PATCH 138/147] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 90ceb25b5..8cc056b03 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1037,6 +1037,7 @@ def patch_llama_rope_scaling( short_factor = self.rope_scaling['short_factor'], long_factor = self.rope_scaling['long_factor' ], ) + print('## Long RoPE') else: raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") pass From 0e0b71fe5447f7b007401e4d308670523a34973b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 15:03:43 -0700 Subject: [PATCH 139/147] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 8cc056b03..e87ffe02d 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1032,7 +1032,7 @@ def patch_llama_rope_scaling( self.rotary_emb = {longrope_rope_function}( dim = self.head_dim, max_position_embeddings = self.max_position_embeddings, - original_max_position_embeddings = self.original_max_position_embeddings, + original_max_position_embeddings = self.rope_scaling['original_max_position_embeddings'], base = self.rope_theta, short_factor = self.rope_scaling['short_factor'], long_factor = self.rope_scaling['long_factor' ], From 1d724f8b8d38ebb8c125c34e471ad06e1b9874b4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 15:05:12 -0700 Subject: [PATCH 140/147] Update _utils.py --- unsloth/models/_utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index e87ffe02d..1c48e8e58 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1028,16 +1028,14 @@ def patch_llama_rope_scaling( base=self.rope_theta, ) elif scaling_type == "longrope": - print('## Long RoPE') self.rotary_emb = {longrope_rope_function}( dim = self.head_dim, max_position_embeddings = self.max_position_embeddings, - original_max_position_embeddings = self.rope_scaling['original_max_position_embeddings'], + original_max_position_embeddings = self.config.original_max_position_embeddings, base = self.rope_theta, - short_factor = self.rope_scaling['short_factor'], - long_factor = self.rope_scaling['long_factor' ], + short_factor = self.config.rope_scaling['short_factor'], + long_factor = self.config.rope_scaling['long_factor' ], ) - print('## Long RoPE') else: raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}") pass From 9ea3579f219ea3878e7e9b506b54a8889d43fd2e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 20 Aug 2024 15:08:31 -0700 Subject: [PATCH 141/147] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index eef4f49e0..c1dae2eda 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1247,6 +1247,7 @@ def __init__(self, # 
From 9ea3579f219ea3878e7e9b506b54a8889d43fd2e Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:08:31 -0700
Subject: [PATCH 141/147] Update llama.py

---
 unsloth/models/llama.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index eef4f49e0..c1dae2eda 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1247,6 +1247,7 @@ def __init__(self,
         # self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
 
         # Short sequences
+        dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
         t = torch.arange(original_max_position_embeddings, device=self.short_inv_freq.device, dtype=torch.int64).float()
         freqs = torch.outer(t, self.short_inv_freq)
         emb = torch.cat((freqs, freqs), dim=-1)

From 40cbfc5cdd1504a7680748d6e5fa13c6fc9efba8 Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:11:10 -0700
Subject: [PATCH 142/147] Update llama.py

---
 unsloth/models/llama.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index c1dae2eda..bf033a56a 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1277,10 +1277,17 @@ def forward(self, x, position_ids=None, seq_len=None):
         if seq_len > self.current_rope_size:
             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
 
-        return (
-            self.cos_cached[:seq_len].to(dtype = x.dtype),
-            self.sin_cached[:seq_len].to(dtype = x.dtype),
-        )
+        if seq_len < original_max_position_embeddings:
+            return (
+                self.short_cos_cached[:seq_len].to(dtype = x.dtype),
+                self.short_sin_cached[:seq_len].to(dtype = x.dtype),
+            )
+        else:
+            return (
+                self.long_cos_cached[:seq_len].to(dtype = x.dtype),
+                self.long_sin_cached[:seq_len].to(dtype = x.dtype),
+            )
+        pass
     pass
 
     def get_cached(self, seq_len = None):

From 200b2350930861f2fe6b69514ffffa7d20102910 Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:13:23 -0700
Subject: [PATCH 143/147] Update llama.py

---
 unsloth/models/llama.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index bf033a56a..867b724ea 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1784,11 +1784,19 @@ def post_patch(model):
             pass
         pass
         # Downcast RoPE embedding to correct data type
-        if (name.endswith("rotary_emb") or hasattr(module, "cos_cached")) \
-            and (module.cos_cached.dtype != correct_dtype):
-
-            module.cos_cached = module.cos_cached.to(correct_dtype)
-            module.sin_cached = module.sin_cached.to(correct_dtype)
+        if (name.endswith("rotary_emb") or hasattr(module, "cos_cached")):
+
+            if hasattr(module, "cos_cached") and \
+                (module.cos_cached.dtype != correct_dtype):
+
+                module.cos_cached = module.cos_cached.to(correct_dtype)
+                module.sin_cached = module.sin_cached.to(correct_dtype)
+
+            elif hasattr(module, "short_cos_cached") and \
+                (module.short_cos_cached.dtype != correct_dtype):
+
+                module.short_cos_cached = module.short_cos_cached.to(correct_dtype)
+                module.short_sin_cached = module.short_sin_cached.to(correct_dtype)
         pass
     pass
 pass
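The post_patch change in patch 143 boils down to the sketch below: modules that only expose short/long caches (the LongRope case) need their own downcasting branch, because they have no plain cos_cached attribute. `FakeRotary`, the cache shapes, and `correct_dtype` here are stand-ins invented for illustration, not unsloth internals.

import torch

class FakeRotary(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # LongRope-style module: only short_* / long_* caches, no cos_cached.
        self.register_buffer("short_cos_cached", torch.randn(4096, 96, dtype=torch.float32), persistent=False)
        self.register_buffer("short_sin_cached", torch.randn(4096, 96, dtype=torch.float32), persistent=False)

correct_dtype = torch.float16
module = FakeRotary()

# Plain cos_cached/sin_cached modules are handled first; LongRope-style modules
# fall through to the elif, mirroring the structure of the patched post_patch.
if hasattr(module, "cos_cached") and module.cos_cached.dtype != correct_dtype:
    module.cos_cached = module.cos_cached.to(correct_dtype)
    module.sin_cached = module.sin_cached.to(correct_dtype)
elif hasattr(module, "short_cos_cached") and module.short_cos_cached.dtype != correct_dtype:
    module.short_cos_cached = module.short_cos_cached.to(correct_dtype)
    module.short_sin_cached = module.short_sin_cached.to(correct_dtype)

print(module.short_cos_cached.dtype)  # torch.float16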
From f35f9b03560e9631286e67590fa3c45cc3c2282d Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:15:09 -0700
Subject: [PATCH 144/147] Update llama.py

---
 unsloth/models/llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 867b724ea..42f01b7eb 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1277,7 +1277,7 @@ def forward(self, x, position_ids=None, seq_len=None):
         if seq_len > self.current_rope_size:
             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
 
-        if seq_len < original_max_position_embeddings:
+        if seq_len < self.original_max_position_embeddings:
             return (
                 self.short_cos_cached[:seq_len].to(dtype = x.dtype),
                 self.short_sin_cached[:seq_len].to(dtype = x.dtype),
@@ -1291,7 +1291,7 @@ def forward(self, x, position_ids=None, seq_len=None):
     pass
 
     def get_cached(self, seq_len = None):
-        if seq_len < original_max_position_embeddings:
+        if seq_len < self.original_max_position_embeddings:
             return self.short_cos_cached, self.short_sin_cached
         return self.long_cos_cached, self.long_sin_cached
     pass

From 9266c1c5cbac3f18baba807a627c3c122ad7dc26 Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 15:21:18 -0700
Subject: [PATCH 145/147] Update llama.py

---
 unsloth/models/llama.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 42f01b7eb..376b4b4eb 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -187,8 +187,9 @@ def LlamaAttention_fast_forward_inference(
 
         # cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len)
         # Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids)
-        cos = self.rotary_emb.cos_cached[position_ids].unsqueeze(1)
-        sin = self.rotary_emb.sin_cached[position_ids].unsqueeze(1)
+        cos, sin = self.rotary_emb.get_cached(kv_seq_len)
+        cos = cos[position_ids].unsqueeze(1)
+        sin = sin[position_ids].unsqueeze(1)
         h = self.half_head_dim
 
         RH_Q = self.RH_Q

From 647bbdbb2934a71240dd63845d4a715eb2d06caf Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 16:29:11 -0700
Subject: [PATCH 146/147] Update mapper.py

---
 unsloth/models/mapper.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py
index b8259a073..3f49c9655 100644
--- a/unsloth/models/mapper.py
+++ b/unsloth/models/mapper.py
@@ -249,6 +249,10 @@
         "unsloth/gemma-2-2b-it",
         "google/gemma-2-2b-it",
     ),
+    "unsloth/Phi-3.5-mini-instruct-bnb-4bit" : (
+        "unsloth/Phi-3.5-mini-instruct",
+        "microsoft/Phi-3.5-mini-instruct",
+    ),
 }
 
 INT_TO_FLOAT_MAPPER = {}

From be8b3d8528c6e2cb34f8909e65e5ab420b597edc Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Tue, 20 Aug 2024 16:46:26 -0700
Subject: [PATCH 147/147] Phi 3.5

---
 README.md                 | 2 +-
 unsloth/chat_templates.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index b23acffcb..0590415f4 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 
-### Finetune Llama 3.1, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory!
+### Finetune Llama 3.1, Mistral, Phi-3.5 & Gemma 2-5x faster with 80% less memory!
 
 ![](https://i.ibb.co/sJ7RhGG/image-41.png)
diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py
index 82f6aba14..f83df579b 100644
--- a/unsloth/chat_templates.py
+++ b/unsloth/chat_templates.py
@@ -473,7 +473,7 @@
 # =========================================== Phi-3
 phi3_template = \
-    "{{ bos_token }}"\
+    # "{{ bos_token }}"\ # Phi-3.5 removes BOS?
     "{% for message in messages %}"\
     "{% if message['role'] == 'user' %}"\
     "{{'<|user|>\n' + message['content'] + '<|end|>\n'}}"\
@@ -505,7 +505,9 @@
 '''
 
 phi3_template_eos_token = "<|end|>"
-CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False, phi3_ollama,)
+CHAT_TEMPLATES["phi-3"]   = (phi3_template, phi3_template_eos_token, False, phi3_ollama,)
+CHAT_TEMPLATES["phi-35"]  = CHAT_TEMPLATES["phi-3"]
+CHAT_TEMPLATES["phi-3.5"] = CHAT_TEMPLATES["phi-3"]
 pass
 
 # =========================================== Llama-3.1
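As a rough usage illustration of the Phi-3 / Phi-3.5 turn structure that the chat template above encodes, the hand-rolled formatter below produces the same <|user|> / <|assistant|> / <|system|> / <|end|> layout. It is not the Jinja template unsloth actually registers under "phi-3", "phi-35", and "phi-3.5"; it is only a sketch of the rendered output.

def render_phi3(messages, add_generation_prompt=True):
    # Mirrors the template: each turn is "<|role|>\n" + content + "<|end|>\n",
    # with unknown roles falling back to <|system|>, and an optional trailing
    # "<|assistant|>\n" when a generation prompt is requested.
    role_tags = {"user": "<|user|>", "assistant": "<|assistant|>", "system": "<|system|>"}
    out = []
    for m in messages:
        tag = role_tags.get(m["role"], "<|system|>")
        out.append(f"{tag}\n{m['content']}<|end|>\n")
    if add_generation_prompt:
        out.append("<|assistant|>\n")
    return "".join(out)

print(render_phi3([
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
    {"role": "user", "content": "What is 2+2?"},
]))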