Commit fc178b5

FP8 training enhancements (#3496)
* Fix FP8 for models whose weight dimensions are not multiples of 8
* Patch FP8 forward methods for compiled models
* Patch the HF quantizers for FP8
* Fail-safe imports of FbgemmFp8Linear and FP8Linear
* Beautify
1 parent fe8d426 commit fc178b5

File tree

3 files changed, +73 -16 lines


unsloth/kernels/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@
     apply_lora_o,
     fast_lora_forward,
 )
+from .fp8 import *  # Ensure FbgemmFp8Linear's and FP8Linear's forward methods are patched before model creation, so the patch also applies to compiled, non fast-inference models
 from .utils import fast_dequantize, fast_gemv, QUANT_STATE, fast_linear_forward, matmul_lora

 from .flex_attention import (
unsloth/kernels/fp8.py

Lines changed: 50 additions & 16 deletions
@@ -17,6 +17,20 @@
 import triton.language as tl
 from torch.nn import functional as F
 import math
+from unsloth_zoo.log import logger
+
+try:
+    from transformers.integrations.finegrained_fp8 import FP8Linear
+except ImportError:
+    FP8Linear = None
+    logger.log("Unsloth: FP8 models need `FP8Linear` from `transformers.integrations.finegrained_fp8`, but it could not be imported.")
+
+try:
+    from transformers.integrations.fbgemm_fp8 import FbgemmFp8Linear
+except ImportError:
+    FbgemmFp8Linear = None
+    logger.log("Unsloth: FP8 models need `FbgemmFp8Linear` from `transformers.integrations.fbgemm_fp8`, but it could not be imported.")
+
 try:
     from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import triton_quantize_fp8_block
 except ImportError:
@@ -329,7 +343,7 @@ def forward(ctx, X, weight, weight_scale):
     @staticmethod
     def backward(ctx, grad_output):
         W_deq = weight_dequant(ctx.weight, ctx.weight_scale)
-        grad_X = torch_matmul(grad_output, W_deq.t())
+        grad_X = torch_matmul(grad_output, W_deq)
         del W_deq
         return grad_X, None, None
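
For illustration only (not part of the diff): the backward changes in this commit all drop the `.t()` on the dequantized weight. Assuming the FP8 weight is stored as [out_features, in_features] (the nn.Linear convention) and the forward computes Y = X @ W_deq.T, the input gradient is dX = dY @ W_deq, with no transpose. A quick numerical check of that identity:

import torch

X = torch.randn(2, 8, requires_grad = True)
W_deq = torch.randn(16, 8)        # assumed [out_features, in_features] layout
Y = X @ W_deq.t()                 # forward pass under that assumption
Y.backward(torch.ones_like(Y))

# Matches the patched backward: grad_X = torch_matmul(grad_output, W_deq)
print(torch.allclose(X.grad, torch.ones(2, 16) @ W_deq))   # True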

@@ -338,20 +352,17 @@ def fp8_block_quant_forward(X, weight, weight_scale):
     return FP8BlockQuantLinear.apply(X, weight, weight_scale)


-class FbgemmFp8Linear(torch.autograd.Function):
+class FbgemmFp8Linear_matmul(torch.autograd.Function):

     @staticmethod
     def forward(ctx, x, weight, weight_scale, bias=None):
-        if weight.shape[0] != weight_scale.shape[0]:
-            if weight.shape[1] == weight_scale.shape[0]:
-                # This is generally the case when we do backward pass. The only way is to dequantize as there is no column wise fp8 matmul
-                W_deq = weight_dequant(weight, weight_scale).T
-                x = torch_matmul(x, W_deq)
-                del W_deq
-                return x
-            else:
-                raise ValueError(f"Shapes are incompatible {weight.shape=}, {weight_scale.shape=}, {x.shape=}")
-        else:
+
+        if weight.shape[0] == weight_scale.shape[0] and (weight.shape[0] % 8 == 0 and weight.shape[1] % 8 == 0):
+            # The kernel seems to expect weight dimensions divisible by 8; otherwise it throws `RuntimeError: cutlass cannot implement`.
+            # One option is to pad the weight and weight scale to a multiple of 8 and perform an f8f8bf16 matmul,
+            # but benchmarking showed that dequantize + bf16 matmul is significantly faster than padding + f8f8bf16 matmul, so we go that route instead.
+            # So essentially, f8f8bf16_rowwise only runs when the shapes are proper (no transposes) and divisible by 8.
+
             # quantize_fp8_per_row will squash the leading dimensions, so save the desired shape here
             output_shape = (*x.shape[:-1], -1)
             # x_quantized and x_scale are not necessarily on the same device as x, this is an issue.
@@ -378,6 +389,16 @@ def forward(ctx, x, weight, weight_scale, bias=None):
             output = output.to(x.device, x.dtype)
             output = output.reshape(output_shape)
             del x_quantized, x_scale
+        elif (weight.shape[0] != weight_scale.shape[0] and weight.shape[1] == weight_scale.shape[0]) or (weight.shape[0] % 8 != 0 or weight.shape[1] % 8 != 0):
+            # Either the weight/scale is transposed or the weight's shape is not divisible by 8. In both cases, dequantizing is the preferred path.
+            # The transposed case generally shows up in the backward pass, where we compute dY @ W instead of X @ W.T as in the forward pass.
+            # The shape case shows up, for example, in the MLP of Qwen 2.5 VL 7B, where the gate proj has shape (3420, 1280) and 3420 / 8 = 427.5.
+
+            W_deq = weight_dequant(weight, weight_scale).T
+            output = torch_matmul(x, W_deq)
+            del W_deq
+        else:
+            raise ValueError(f"Shapes are incompatible {weight.shape=}, {weight_scale.shape=}, {x.shape=}")

         ctx.weight = weight
         ctx.weight_scale = weight_scale
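
To make the new dispatch concrete: per the comments, a row-quantized weight of shape (out_features, in_features) carries one scale per output row, so weight.shape[0] == weight_scale.shape[0] in the forward direction, while the backward-style call sees the scale matching weight.shape[1] instead. The helper below is illustrative only (it is not in the codebase); it mirrors the branch selection above on the shapes mentioned in the comments:

def pick_path(weight_shape, scale_rows):
    # Illustrative stand-in for the if/elif/else in FbgemmFp8Linear_matmul.forward
    rows, cols = weight_shape
    if rows == scale_rows and rows % 8 == 0 and cols % 8 == 0:
        return "f8f8bf16 rowwise kernel"
    elif (rows != scale_rows and cols == scale_rows) or (rows % 8 != 0 or cols % 8 != 0):
        return "dequantize + bf16 matmul"
    else:
        return "incompatible shapes"

print(pick_path((4096, 4096), 4096))   # f8f8bf16 rowwise kernel
print(pick_path((3420, 1280), 3420))   # dequantize + bf16 matmul (3420 is not a multiple of 8)
print(pick_path((1280, 3420), 3420))   # dequantize + bf16 matmul (transposed, backward-style call)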
@@ -386,13 +407,13 @@ def forward(ctx, x, weight, weight_scale, bias=None):
     @staticmethod
     def backward(ctx, grad_output):
         W_deq = weight_dequant(ctx.weight, ctx.weight_scale)
-        grad_X = torch_matmul(grad_output, W_deq.t())
+        grad_X = torch_matmul(grad_output, W_deq)
         del W_deq
         return grad_X, None, None, None, None

 @torch_compile
-def fbgemm_fp8_linear(X, weight, weight_scale, bias=None, ):
-    return FbgemmFp8Linear.apply(X, weight, weight_scale, bias)
+def fbgemm_fp8_linear(X, weight, weight_scale, bias=None):
+    return FbgemmFp8Linear_matmul.apply(X, weight, weight_scale, bias)


 class FP8_torch_linear(torch.autograd.Function):
@@ -437,7 +458,7 @@ def forward(ctx, X, weight, weight_scale, bias=None):
     @staticmethod
     def backward(ctx, grad_output):
         W_deq = weight_dequant(ctx.weight, ctx.weight_scale)
-        grad_X = torch_matmul(grad_output, W_deq.t())
+        grad_X = torch_matmul(grad_output, W_deq)
         del W_deq
         return grad_X, None, None, None, None

@@ -459,3 +480,16 @@ def fp8_linear(X, weight, weight_scale, bias=None):
         # Row quantized FP8
         out = fbgemm_fp8_linear(X, weight, weight_scale, bias)
     return out
+
+
+def module_forward_patch(forward_function, scale_attr='weight_scale'):
+    def patched_forward(self, X):
+        return forward_function(X, self.weight, getattr(self, scale_attr))
+    return patched_forward
+
+
+# Patch the forward functions of the layers (for compiled models)
+if FbgemmFp8Linear is not None:
+    FbgemmFp8Linear.forward = module_forward_patch(fbgemm_fp8_linear, 'weight_scale')
+if FP8Linear is not None:
+    FP8Linear.forward = module_forward_patch(fp8_block_quant_forward, 'weight_scale_inv')
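
What module_forward_patch does in practice: it builds an unbound forward that reads the layer's weight plus the named scale attribute ('weight_scale' for FbgemmFp8Linear, 'weight_scale_inv' for FP8Linear) and hands them to the corresponding Unsloth kernel. A self-contained sketch with a toy module and toy kernel standing in for the real classes and kernels:

import torch

def module_forward_patch(forward_function, scale_attr='weight_scale'):
    # Same closure as in unsloth/kernels/fp8.py
    def patched_forward(self, X):
        return forward_function(X, self.weight, getattr(self, scale_attr))
    return patched_forward

def toy_kernel(X, weight, scale):
    # Stand-in for fp8_block_quant_forward / fbgemm_fp8_linear
    return X @ (weight * scale).t()

class ToyFP8Linear(torch.nn.Module):
    # Stand-in for transformers' FP8Linear, whose scale attribute is `weight_scale_inv`
    def __init__(self):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(4, 3))
        self.weight_scale_inv = torch.nn.Parameter(torch.ones(4, 1))

ToyFP8Linear.forward = module_forward_patch(toy_kernel, 'weight_scale_inv')
print(ToyFP8Linear()(torch.randn(2, 3)).shape)   # torch.Size([2, 4])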

unsloth/models/_utils.py

Lines changed: 22 additions & 0 deletions
@@ -73,6 +73,7 @@
     "patch_peft_fast_inference",
     "error_out_no_vllm",
     "dequantize_module_weight",
+    "patch_hf_quantizer",
 ]

 import torch
@@ -1814,3 +1815,24 @@ def _prepare_model_for_qat(model: torch.nn.Module, qat_scheme: Union[str, TorchA
     quantize_(model, QATConfig(base_config, step = "prepare"), filter_fn = filter_fn)
     return model
 pass
+
+def patch_hf_quantizer():
+    # Tell the HF Trainer that the quantized model is trainable
+    def make_trainable(self):
+        return True
+    try:
+        from transformers.quantizers.quantizer_finegrained_fp8 import FineGrainedFP8HfQuantizer
+        FineGrainedFP8HfQuantizer.is_trainable = property(make_trainable)
+        FineGrainedFP8HfQuantizer.is_qat_trainable = property(make_trainable)
+    except Exception as e:
+        logger.warning(f"Failed to patch FineGrainedFP8HfQuantizer. Error: {e}")
+
+    try:
+        from transformers.quantizers.quantizer_fbgemm_fp8 import FbgemmFp8HfQuantizer
+        FbgemmFp8HfQuantizer.is_trainable = property(make_trainable)
+        FbgemmFp8HfQuantizer.is_qat_trainable = property(make_trainable)
+    except Exception as e:
+        logger.warning(f"Failed to patch FbgemmFp8HfQuantizer. Error: {e}")
+    pass
+
+patch_hf_quantizer()
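
The quantizer patch works by rebinding is_trainable (and is_qat_trainable) as a property on the quantizer classes, which, per the comment above, is what tells the HF Trainer that the quantized model can be trained. A minimal sketch of that property override, using a hypothetical ToyQuantizer in place of the real classes:

class ToyQuantizer:
    # Hypothetical stand-in for FineGrainedFP8HfQuantizer / FbgemmFp8HfQuantizer
    @property
    def is_trainable(self):
        return False

def make_trainable(self):
    return True

print(ToyQuantizer().is_trainable)                     # False: training would be refused
ToyQuantizer.is_trainable = property(make_trainable)   # the same rebinding patch_hf_quantizer applies
print(ToyQuantizer().is_trainable)                     # True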
