
Commit 9d07be0

ignored labels
1 parent 02437a8 commit 9d07be0

File tree

unsloth/models/gemma.py
unsloth/models/gemma2.py
unsloth/models/llama.py

3 files changed: +37 -16 lines changed

unsloth/models/gemma.py

Lines changed: 12 additions & 1 deletion
@@ -339,7 +339,18 @@ def pre_patch():
 
 
     @staticmethod
-    def post_patch(model, tokenizer):
+    def post_patch(model, tokenizer, max_seq_length):
+        # Add max_seq_length to all modules
+        extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda:0")
+        internal_model = model
+        while hasattr(internal_model, "model"):
+            internal_model.max_seq_length = max_seq_length
+            internal_model.extra_ignored_labels = extra_ignored_labels
+            internal_model = internal_model.model
+        pass
+        internal_model.max_seq_length = max_seq_length
+        internal_model.extra_ignored_labels = extra_ignored_labels
+
         # Torch.compile fails on embedding matrix??
         # Workaround randomly fixes it for torch versions < 2.2
         model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight)
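
For context, extra_ignored_labels is a pre-allocated column of -100 values (the default ignore index of torch.nn.CrossEntropyLoss), sized to the longest sequence the model will see. Below is a minimal standalone sketch of the label-shifting pattern such a buffer supports in causal-LM training; the shapes and the labels tensor are illustrative and not taken from this commit, and unsloth's actual loss path may differ in detail.

    import torch

    max_seq_length = 8   # illustrative; the commit sizes this to the model's max sequence length
    batch_size = 2

    # Pre-allocated column of ignore-index values, as in the diff
    # (the diff hard-codes device = "cuda:0"; CPU is used here so the sketch runs anywhere).
    extra_ignored_labels = torch.full((max_seq_length, 1), -100)

    # Hypothetical labels for a causal-LM batch (not from the commit).
    labels = torch.randint(0, 10, (batch_size, max_seq_length))

    # Causal-LM loss compares position t's logits with position t+1's label,
    # so the labels are shifted left by one and the final position is padded
    # with -100, which CrossEntropyLoss ignores by default.
    shift_labels = torch.hstack((labels[..., 1:], extra_ignored_labels[:batch_size]))

    loss_fct = torch.nn.CrossEntropyLoss()  # ignore_index defaults to -100
    # Logits of shape (batch_size, max_seq_length, vocab_size) would be flattened
    # together with shift_labels and passed to loss_fct.

Pre-allocating the column once on the model avoids rebuilding it on every forward pass; slicing [:batch_size] reuses the same storage for any batch size up to max_seq_length.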

unsloth/models/gemma2.py

Lines changed: 12 additions & 1 deletion
@@ -490,7 +490,18 @@ def pre_patch():
 
 
     @staticmethod
-    def post_patch(model, tokenizer):
+    def post_patch(model, tokenizer, max_seq_length):
+        # Add max_seq_length to all modules
+        extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda:0")
+        internal_model = model
+        while hasattr(internal_model, "model"):
+            internal_model.max_seq_length = max_seq_length
+            internal_model.extra_ignored_labels = extra_ignored_labels
+            internal_model = internal_model.model
+        pass
+        internal_model.max_seq_length = max_seq_length
+        internal_model.extra_ignored_labels = extra_ignored_labels
+
         # Torch.compile fails on embedding matrix??
         # Workaround randomly fixes it for torch versions < 2.2
         model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight)
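
The same while hasattr(internal_model, "model") walk appears in all three files: it descends the wrapper chain (e.g. the causal-LM wrapper around the decoder stack), tags every level, and then tags the innermost module once the loop exits. A minimal sketch of that traversal with dummy modules (class names here are made up for illustration):

    import torch.nn as nn

    class Inner(nn.Module):      # stands in for the decoder stack
        pass

    class Middle(nn.Module):     # stands in for the causal-LM wrapper
        def __init__(self):
            super().__init__()
            self.model = Inner()

    class Outer(nn.Module):      # stands in for an outer wrapper
        def __init__(self):
            super().__init__()
            self.model = Middle()

    model = Outer()
    max_seq_length = 4096

    # Same traversal pattern as post_patch: tag every wrapper level while a
    # nested .model exists, then tag the innermost module after the loop exits.
    internal_model = model
    while hasattr(internal_model, "model"):
        internal_model.max_seq_length = max_seq_length
        internal_model = internal_model.model
    internal_model.max_seq_length = max_seq_length

    assert model.max_seq_length == model.model.max_seq_length == model.model.model.max_seq_length == 4096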

unsloth/models/llama.py

Lines changed: 13 additions & 14 deletions
@@ -1621,7 +1621,7 @@ def from_pretrained(
         )
 
         model, tokenizer = patch_tokenizer(model, tokenizer)
-        model, tokenizer = model_patcher.post_patch(model, tokenizer)
+        model, tokenizer = model_patcher.post_patch(model, tokenizer, max_position_embeddings)
 
         # Patch up QKV / O and MLP
         for idx, layer in enumerate(model.model.layers):
@@ -1827,7 +1827,18 @@ def from_pretrained(
 
 
     @staticmethod
-    def post_patch(model, tokenizer):
+    def post_patch(model, tokenizer, max_seq_length):
+        # Add max_seq_length to all modules
+        extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda:0")
+        internal_model = model
+        while hasattr(internal_model, "model"):
+            internal_model.max_seq_length = max_seq_length
+            internal_model.extra_ignored_labels = extra_ignored_labels
+            internal_model = internal_model.model
+        pass
+        internal_model.max_seq_length = max_seq_length
+        internal_model.extra_ignored_labels = extra_ignored_labels
+
         # Torch.compile fails on embedding matrix??
         try: old_input_embedding = model.get_input_embeddings ().weight
         except: return model, tokenizer
@@ -2459,18 +2470,6 @@ def patch_peft_model(
         )
         patch_saving_functions(model)
 
-        # Patch cross entropy loss labels
-        # Fixes https://github.com/unslothai/unsloth/issues/10
-        max_seq_length = model.max_seq_length
-        extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda:0")
-        model.model.extra_ignored_labels = extra_ignored_labels
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            internal_model.max_seq_length = max_seq_length
-            internal_model = internal_model.model
-        pass
-        internal_model.max_seq_length = max_seq_length
-
         # Patch tokenizer to pad to the right
         internal_model = model
         while hasattr(internal_model, "model"):
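
Net effect of the llama.py changes: max_seq_length and the ignored-labels buffer are now attached during from_pretrained (post_patch receives max_position_embeddings) rather than inside patch_peft_model, so they exist even when no PEFT adapter is ever added. A hedged sketch of checking this right after loading, assuming unsloth's public FastLanguageModel.from_pretrained entry point (the checkpoint name is illustrative):

    from unsloth import FastLanguageModel

    max_seq_length = 2048

    # Loading alone now runs post_patch(model, tokenizer, max_position_embeddings),
    # so the attributes exist before any LoRA / PEFT patching.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name     = "unsloth/llama-3-8b-bnb-4bit",  # illustrative checkpoint
        max_seq_length = max_seq_length,
        load_in_4bit   = True,
    )

    # Walk the wrapper chain and confirm every level was tagged by post_patch.
    internal_model = model
    while hasattr(internal_model, "model"):
        assert hasattr(internal_model, "extra_ignored_labels")
        assert hasattr(internal_model, "max_seq_length")
        internal_model = internal_model.model
    assert hasattr(internal_model, "extra_ignored_labels")

Because post_patch now receives max_position_embeddings, the stored max_seq_length attribute may differ from the max_seq_length the caller passed in.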
