Fixes

danielhanchen · danielhanchen · commit c2dbc03f1433 · 2025-10-26T22:39:38.000-07:00
diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py
@@ -150,8 +150,8 @@ def patch_ipykernel_hf_xet():
         Version(importlib_version("ipykernel")) == Version("7.0.0")
     ):
         print(
-            "#### Unsloth: `hf_xet==1.1.10` and `ipykernel>6.30.1` breaks progress bars. Disabling for now in XET.\n"\
-            "#### Unsloth: To re-enable progress bars, please downgrade to `ipykernel==6.30.1` or wait for a fix to\n"\
+            "#### Unsloth: `hf_xet==1.1.10` and `ipykernel==7.0.0` breaks progress bars. Disabling for now in XET.\n"\
+            "#### Unsloth: To re-enable progress bars, please upgrade to `ipykernel>7.0.0` or wait for a fix to\n"\
             "https://github.com/huggingface/xet-core/issues/526"
         )
         from huggingface_hub.utils import disable_progress_bars
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
@@ -1203,12 +1203,8 @@ def _CausalLM_fast_forward(
         else:
             RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1"
             # < 1024 Normal Unsloth uses less VRAM!
-            if DEVICE_TYPE == "hip":
-                # [TODO] AMD GPUs fail on chunked_cross_entropy loss!
-                # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument
-                RETURN_LOGITS = False
-            elif bsz*q_len <= 1024:
-                # Uses 800MB more VRAM it seems than fused CE Loss
+            if bsz * q_len <= 1024 and not RETURN_LOGITS:
+                # Keep using unsloth_fused_ce_loss, which calculates the best chunk size to reduce VRAM usage (RETURN_LOGITS is already False in this branch, so the assignment below is a no-op)
                 RETURN_LOGITS = False
 
             if not RETURN_LOGITS and labels is not None:
diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py
@@ -298,7 +298,9 @@ def MistralForCausalLM_fast_forward(
     else:
         RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1"
         # < 1024 Normal Unsloth uses less VRAM!
-        if bsz * q_len <= 1024: RETURN_LOGITS = True
+        if bsz * q_len <= 1024 and not RETURN_LOGITS:
+            # Keep using unsloth_fused_ce_loss, which calculates the best chunk size to reduce VRAM usage (RETURN_LOGITS is already False in this branch, so the assignment below is a no-op)
+            RETURN_LOGITS = False
 
         if not RETURN_LOGITS and labels is not None:
             n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None)