@@ -73,14 +73,13 @@ def _cross_entropy_forward(
     mask = col_offsets < VOCAB_SIZE
 
     label_idx = tl.load(labels_ptr).to(tl.int32)
-    logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
+    logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
 
     # Do logit scaling for Cohere: t * x
     if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
     # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
-    if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits.to(tl.float32) / SOFTCAP).to(logits.dtype)
-
-    logits = logits.to(tl.float32)
+    if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
+
     c = tl.max(logits, 0)
     logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))
 
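Note: with the upcast folded into `tl.load(...).to(tl.float32)`, everything downstream (scaling, softcapping, the logsumexp reduction) runs in float32 without per-op casts. A rough PyTorch equivalent of one row of the forward pass, for reference only (the helper name and signature are illustrative, not from this patch):

    import torch

    def ce_forward_row(logits_row, label, logit_scale = None, softcap = None):
        # Mirrors the kernel: upcast once, then scale/softcap, then a
        # numerically stable logsumexp (subtract the row max before exp).
        x = logits_row.float()                     # matches .to(tl.float32) at load
        if logit_scale is not None:
            x = logit_scale * x                    # Cohere: t * x
        if softcap is not None:
            x = softcap * torch.tanh(x / softcap)  # Gemma 2: t * tanh(x / t)
        c = x.max()
        logsumexp = c + torch.log(torch.exp(x - c).sum())
        return logsumexp - x[label]                # loss = logsumexp - logit[label]

Subtracting the max `c` before `exp` keeps the sum finite even when scaled logits are large, which is why the kernel computes `c = tl.max(logits, 0)` first.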
@@ -152,14 +151,13 @@ def _chunked_cross_entropy_forward(
     mask = col_offsets < VOCAB_SIZE
 
     label_idx = tl.load(labels_ptr).to(tl.int32)
-    logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
+    logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
 
     # Do logit scaling for Cohere: t * x
     if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
     # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
-    if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits.to(tl.float32) / SOFTCAP).to(logits.dtype)
+    if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
 
-    logits = logits.to(tl.float32)
     c = tl.max(logits, 0)
     logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))
 
@@ -229,7 +227,7 @@ def _cross_entropy_backward(
     else:
         dloss = 0.0
 
-    x = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
+    x = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
 
     # Do logit scaling for Cohere
     if DO_LOGIT_SCALING:
@@ -241,12 +239,12 @@ def _cross_entropy_backward(
     partial = x
     if DO_SOFTCAPPING:
         # d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
-        partial = triton_tanh(x.to(tl.float32) / SOFTCAP).to(x.dtype)
+        partial = triton_tanh(x / SOFTCAP)
         x = SOFTCAP * partial
     pass
 
     logsumexp = tl.load(logsumexp_ptr + row_idx)
-    y = tl.exp(x.to(tl.float32) - logsumexp)
+    y = tl.exp(x - logsumexp)
     y = tl.where(
         col_offsets == label_idx,
         y - 1.0, # exp(x - logsumexp) - 1
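Note: `partial` is kept around because the softcapping derivative reuses the tanh value. A rough float32 reference for one row of the backward pass (illustrative helper, not from this patch; omits the `DO_LOGIT_SCALING` branch):

    import torch

    def ce_backward_row(logits_row, label, logsumexp, dloss, softcap = None):
        x = logits_row.float()                      # same upcast-at-load as the kernel
        partial = None
        if softcap is not None:
            partial = torch.tanh(x / softcap)       # saved for the chain rule below
            x = softcap * partial
        y = torch.exp(x - logsumexp)                # softmax(x) via the saved logsumexp
        y[label] -= 1.0                             # dCE/dx = softmax(x) - one_hot(label)
        if softcap is not None:
            y = y * (1.0 - partial * partial)       # d/dx [t*tanh(x/t)] = 1 - tanh^2(x/t)
        return dloss * y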
@@ -337,6 +335,7 @@ def forward(ctx, logits, labels, logit_softcapping : float = 0, logit_scaling :
         return losses
     pass
 
+
     @staticmethod
     def backward(ctx, dlosses):
         logits, logsumexp, labels = ctx.saved_tensors
@@ -345,6 +344,8 @@ def backward(ctx, dlosses):
         n_rows, vocab_size = logits.shape
 
         BLOCK_SIZE : int = 4096
+        div : int
+        mod : int
         div, mod = divmod(vocab_size, BLOCK_SIZE)
         n_blocks : int = div + (mod != 0)
 
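Note: the new annotations only pre-declare `div` and `mod`; the block count itself is a ceiling division over the vocabulary. A standalone sanity check of that arithmetic (vocab sizes here are assumed examples, not from the patch):

    BLOCK_SIZE = 4096
    for vocab_size in (32000, 128256, 4096):
        div, mod = divmod(vocab_size, BLOCK_SIZE)
        n_blocks = div + (mod != 0)                  # ceil(vocab_size / BLOCK_SIZE)
        assert n_blocks == -(-vocab_size // BLOCK_SIZE)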