Merged
Changes from 1 commit
Commits
116 commits
7b81ca5
Update _utils.py
danielhanchen Aug 22, 2024
94f2d34
Update _utils.py
danielhanchen Aug 22, 2024
7c5222d
Update _utils.py
danielhanchen Aug 22, 2024
15d4417
Update _utils.py
danielhanchen Aug 22, 2024
1ea463c
Update _utils.py
danielhanchen Aug 22, 2024
cf929e2
Update tokenizer_utils.py
danielhanchen Aug 22, 2024
5a7be98
Update tokenizer_utils.py
danielhanchen Aug 22, 2024
2590b4c
Update tokenizer_utils.py
danielhanchen Aug 22, 2024
621e65b
update token retrieval logic (#952)
not-lain Aug 23, 2024
b62e5cd
Update llama.py
danielhanchen Aug 23, 2024
fb9dd65
Merge branch 'nightly' of https://github.com/unslothai/unsloth into n…
danielhanchen Aug 23, 2024
3b49609
get_token
danielhanchen Aug 24, 2024
9c8875e
Update README.md
danielhanchen Aug 24, 2024
c25de14
Merge branch 'main' into nightly
danielhanchen Aug 25, 2024
646a27b
Merge branch 'main' into nightly
danielhanchen Aug 27, 2024
a44357d
Update gemma2.py
danielhanchen Aug 30, 2024
7ed1c16
Update rms_layernorm.py
danielhanchen Aug 30, 2024
d7ef49e
synchronize
danielhanchen Aug 30, 2024
9a69548
Update gemma2.py
danielhanchen Aug 30, 2024
e6dadb4
Update rms_layernorm.py
danielhanchen Aug 30, 2024
f8e77cf
Update rms_layernorm.py
danielhanchen Aug 30, 2024
cfbaa97
Update rms_layernorm.py
danielhanchen Aug 30, 2024
32b2f3f
layernorm
danielhanchen Aug 30, 2024
9e7057d
Update rms_layernorm.py
danielhanchen Aug 30, 2024
a193508
Update gemma2.py
danielhanchen Aug 30, 2024
65eaa2d
Update rms_layernorm.py
danielhanchen Aug 30, 2024
1beeb22
Update rms_layernorm.py
danielhanchen Aug 30, 2024
1eb7705
revert
danielhanchen Aug 30, 2024
c3fe972
Gemma
danielhanchen Aug 31, 2024
75dbfba
Update rms_layernorm.py
danielhanchen Aug 31, 2024
332b091
Update rms_layernorm.py
danielhanchen Aug 31, 2024
4ecc119
Update rms_layernorm.py
danielhanchen Aug 31, 2024
07a1246
Update rms_layernorm.py
danielhanchen Aug 31, 2024
e3239e4
Update rms_layernorm.py
danielhanchen Aug 31, 2024
6ae1ac2
Update rms_layernorm.py
danielhanchen Aug 31, 2024
4d89f27
Update rms_layernorm.py
danielhanchen Aug 31, 2024
c76be22
Update rms_layernorm.py
danielhanchen Aug 31, 2024
ace509c
Update rms_layernorm.py
danielhanchen Aug 31, 2024
e474cfe
Update rms_layernorm.py
danielhanchen Aug 31, 2024
1576a1e
Update rms_layernorm.py
danielhanchen Aug 31, 2024
a2c4691
Update rms_layernorm.py
danielhanchen Aug 31, 2024
1a02e75
Update rms_layernorm.py
danielhanchen Aug 31, 2024
a26e1d1
Update rms_layernorm.py
danielhanchen Aug 31, 2024
afdb443
Update rms_layernorm.py
danielhanchen Sep 1, 2024
c3e14d8
Update rms_layernorm.py
danielhanchen Sep 1, 2024
1830bdd
Update rms_layernorm.py
danielhanchen Sep 1, 2024
6abf66a
Update rms_layernorm.py
danielhanchen Sep 1, 2024
f5cf796
Update rms_layernorm.py
danielhanchen Sep 1, 2024
b191530
Update rms_layernorm.py
danielhanchen Sep 1, 2024
512c61f
Update rms_layernorm.py
danielhanchen Sep 1, 2024
f5d50ef
Update rms_layernorm.py
danielhanchen Sep 1, 2024
d791bb9
Update rms_layernorm.py
danielhanchen Sep 1, 2024
9225608
Update gemma2.py
danielhanchen Sep 1, 2024
f61869c
Change UnslothTrainingArguments base class to SFTConfig (#979)
vTuanpham Sep 2, 2024
73d49ad
Cohere
danielhanchen Sep 2, 2024
86b6236
Merge branch 'nightly' of https://github.com/unslothai/unsloth into n…
danielhanchen Sep 2, 2024
edef5ca
Update trainer.py
danielhanchen Sep 2, 2024
6d4300c
Cohere
danielhanchen Sep 2, 2024
754e670
Cohere
danielhanchen Sep 2, 2024
d242866
New models
danielhanchen Sep 3, 2024
0b7e973
Update llama.py
danielhanchen Sep 3, 2024
19549f2
Update llama.py
danielhanchen Sep 3, 2024
8823e13
Update cohere.py
danielhanchen Sep 3, 2024
90050b7
Update llama.py
danielhanchen Sep 3, 2024
4c1ec3a
Update cohere.py
danielhanchen Sep 3, 2024
97b3956
retry
danielhanchen Sep 3, 2024
fd615ea
Update fast_lora.py
danielhanchen Sep 3, 2024
fe45990
Update llama.py
danielhanchen Sep 3, 2024
f564b8a
Update fast_lora.py
danielhanchen Sep 3, 2024
b26da84
Update llama.py
danielhanchen Sep 3, 2024
61be6a3
Update llama.py
danielhanchen Sep 3, 2024
ea48761
Update cross_entropy_loss.py
danielhanchen Sep 3, 2024
6e795c6
_apply_lora_mlp
danielhanchen Sep 3, 2024
dacba39
Update _utils.py
danielhanchen Sep 3, 2024
5074427
Gemma fixes
danielhanchen Sep 3, 2024
743ba55
Update llama.py
danielhanchen Sep 3, 2024
315136a
Merge branch 'main' into nightly
danielhanchen Sep 3, 2024
7ea6395
Update flex_attention.py
danielhanchen Sep 3, 2024
91d6773
Merge branch 'main' into nightly
danielhanchen Sep 4, 2024
df06a04
Update llama.py
danielhanchen Sep 4, 2024
7f139f1
layernorm
danielhanchen Sep 4, 2024
068fc0d
Update llama.py
danielhanchen Sep 4, 2024
4eaccb0
Update llama.py
danielhanchen Sep 4, 2024
4f909fc
Flex Attention
danielhanchen Sep 5, 2024
efef0ee
Update gemma2.py
danielhanchen Sep 5, 2024
6e8951f
Update __init__.py
danielhanchen Sep 5, 2024
d60a18c
Update flex_attention.py
danielhanchen Sep 5, 2024
1b4132e
Update flex_attention.py
danielhanchen Sep 5, 2024
f5d11dc
Update flex_attention.py
danielhanchen Sep 5, 2024
2454659
Update flex_attention.py
danielhanchen Sep 5, 2024
984d217
Update flex_attention.py
danielhanchen Sep 5, 2024
e3846f5
Update flex_attention.py
danielhanchen Sep 5, 2024
2d29299
Update flex_attention.py
danielhanchen Sep 5, 2024
03310b9
Update flex_attention.py
danielhanchen Sep 5, 2024
eb37676
Update flex_attention.py
danielhanchen Sep 5, 2024
cb6a835
Update flex_attention.py
danielhanchen Sep 5, 2024
cbd6a6a
Update flex_attention.py
danielhanchen Sep 5, 2024
712deaa
Update flex_attention.py
danielhanchen Sep 5, 2024
6e74563
Update flex_attention.py
danielhanchen Sep 5, 2024
0703ce8
Update flex_attention.py
danielhanchen Sep 5, 2024
e2cafc4
Update flex_attention.py
danielhanchen Sep 5, 2024
25fb059
Update flex_attention.py
danielhanchen Sep 5, 2024
6ddcd60
Update flex_attention.py
danielhanchen Sep 6, 2024
a806b20
Update chat_templates.py (#999)
AgainstEntropy Sep 7, 2024
a690e5e
Update key from "from" to "user" (#1000)
wa008 Sep 7, 2024
6693712
Update chat_templates.py
danielhanchen Sep 7, 2024
fabda63
Also patch the KTO trainer (#1001)
corbt Sep 7, 2024
f9b8a73
flex attention
danielhanchen Sep 7, 2024
2fa9979
Update llama.py
danielhanchen Sep 7, 2024
86017d3
Update flex_attention.py
danielhanchen Sep 7, 2024
130c739
Update flex_attention.py
danielhanchen Sep 7, 2024
528c673
Update _utils.py
danielhanchen Sep 8, 2024
7380ac5
Update _utils.py
danielhanchen Sep 8, 2024
4e1a50c
Update flex_attention.py
danielhanchen Sep 8, 2024
6e9d3de
Update gemma2.py
danielhanchen Sep 8, 2024
879fc88
Update gemma2.py
danielhanchen Sep 8, 2024
Cohere
danielhanchen committed Sep 2, 2024
commit 754e670daf6b53bf8fe92c5f07bae25a96aa67f1
123 changes: 84 additions & 39 deletions unsloth/kernels/cross_entropy_loss.py
@@ -19,17 +19,22 @@
from transformers.models.llama.modeling_llama import logger


@triton.heuristics({"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING"],})
@triton.heuristics({
"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING" ],
"DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _cross_entropy_forward(
logits_ptr, logits_row_stride,
loss_ptr,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
DO_LOGIT_SCALING: tl.constexpr,
LOGIT_SCALE : tl.constexpr,
):
"""
Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
@@ -62,17 +67,22 @@ def _cross_entropy_forward(

label_idx = tl.load(labels_ptr).to(tl.int32)
logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))

# Go logit scaling for Cohere: t * x
if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)

logits = logits.to(tl.float32)
c = tl.max(logits, 0)
logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))

if label_idx != -100:
x = tl.load(logits_ptr + label_idx)
# Go logit scaling for Cohere: t * x
if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
loss = logsumexp - x.to(tl.float32)
else:
loss = 0.0
@@ -81,18 +91,23 @@
pass


@triton.heuristics({"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING"],})
@triton.heuristics({
"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING" ],
"DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _chunked_cross_entropy_forward(
logits_ptr, logits_row_stride,
loss_ptr,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
N_CHUNKS : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
VOCAB_SIZE : tl.constexpr,
N_CHUNKS : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
DO_LOGIT_SCALING: tl.constexpr,
LOGIT_SCALE : tl.constexpr,
):
"""
256K vocab divided in 4 chunks
@@ -130,8 +145,11 @@ def _chunked_cross_entropy_forward(

label_idx = tl.load(labels_ptr).to(tl.int32)
logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))

# Go logit scaling for Cohere: t * x
if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)

logits = logits.to(tl.float32)
c = tl.max(logits, 0)
@@ -142,8 +160,10 @@
# Do the -x separately
if label_idx != -100:
x = tl.load(logits_ptr + label_idx).to(tl.float32)
# Go logit scaling for Cohere: t * x
if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
loss = -1.0 * x.to(tl.float32)
else:
loss = 0.0
@@ -153,17 +173,22 @@
pass


@triton.heuristics({"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING"],})
@triton.heuristics({
"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING" ],
"DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _cross_entropy_backward(
logits_ptr, logits_row_stride,
dloss_ptr, dloss_row_stride,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
DO_LOGIT_SCALING: tl.constexpr,
LOGIT_SCALE : tl.constexpr,
):
"""
CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
@@ -210,6 +235,11 @@ def _cross_entropy_backward(
y, # exp(x - logsumexp)
)

if DO_LOGIT_SCALING:
# d/dx [s * x] = s
y = y * LOGIT_SCALE
pass

if DO_SOFTCAPPING:
# d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
y = y * (1.0 - partial*partial)
@@ -224,14 +254,15 @@

class Fast_CrossEntropyLoss(torch.autograd.Function):
@staticmethod
def forward(ctx, logits, labels, logit_softcapping = 0):
def forward(ctx, logits, labels, logit_softcapping = 0, logit_scaling = 0):
n_rows, vocab_size = logits.shape

div, mod = divmod(vocab_size, MAX_FUSED_SIZE)
n_chunks = div + (mod != 0)
losses = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")

DO_SOFTCAPPING = (logit_softcapping != 0)
DO_SOFTCAPPING = (logit_softcapping != 0)
DO_LOGIT_SCALING = (logit_scaling != 0)

if n_chunks == 1:
# For small vocabs <= 65336 like Llama, Mistral
@@ -243,11 +274,13 @@ def forward(ctx, logits, labels, logit_softcapping = 0):
losses,
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
DO_SOFTCAPPING = DO_SOFTCAPPING,
SOFTCAP = logit_softcapping,
num_warps = num_warps,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
DO_SOFTCAPPING = DO_SOFTCAPPING,
SOFTCAP = logit_softcapping,
DO_LOGIT_SCALING = DO_LOGIT_SCALING,
LOGIT_SCALE = logit_scaling,
num_warps = num_warps,
)
else:
# For large vocabs > 65336 like Gemma 256K
@@ -258,12 +291,14 @@ def forward(ctx, logits, labels, logit_softcapping = 0):
losses,
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
N_CHUNKS = n_chunks,
BLOCK_SIZE = MAX_FUSED_SIZE,
DO_SOFTCAPPING = DO_SOFTCAPPING,
SOFTCAP = logit_softcapping,
num_warps = 32,
VOCAB_SIZE = vocab_size,
N_CHUNKS = n_chunks,
BLOCK_SIZE = MAX_FUSED_SIZE,
DO_SOFTCAPPING = DO_SOFTCAPPING,
SOFTCAP = logit_softcapping,
DO_LOGIT_SCALING = DO_LOGIT_SCALING,
LOGIT_SCALE = logit_scaling,
num_warps = 32,
)
# logsumexp(chunked_logsumexp) - x
# Do the -x separately
@@ -275,6 +310,8 @@ def forward(ctx, logits, labels, logit_softcapping = 0):
ctx.save_for_backward(logits, logsumexp, labels)
ctx.DO_SOFTCAPPING = DO_SOFTCAPPING
ctx.logit_softcapping = logit_softcapping
ctx.DO_LOGIT_SCALING = DO_LOGIT_SCALING
ctx.logit_scaling = logit_scaling
return losses
pass

@@ -292,19 +329,26 @@ def backward(ctx, dlosses):
dlosses, dlosses.stride(0),
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
DO_SOFTCAPPING = ctx.DO_SOFTCAPPING,
SOFTCAP = ctx.logit_softcapping,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
DO_SOFTCAPPING = ctx.DO_SOFTCAPPING,
SOFTCAP = ctx.logit_softcapping,
DO_LOGIT_SCALING = ctx.DO_LOGIT_SCALING,
LOGIT_SCALE = ctx.logit_scaling,
num_warps = 8,
)
return logits, None, None,
return logits, None, None, None,
pass
pass


@torch._disable_dynamo
def fast_cross_entropy_loss(logits, labels, logit_softcapping = 0):
def fast_cross_entropy_loss(
logits,
labels,
logit_softcapping = 0,
logit_scaling = 0,
):
"""
Arguments:
logits: (batch, seq_len, vocab_size)
@@ -319,6 +363,7 @@ def fast_cross_entropy_loss(logits, labels, logit_softcapping = 0):
logits.view(batch*seq_len, d),
labels.view(-1),
logit_softcapping,
logit_scaling,
)
n_items = torch.count_nonzero(labels != -100)
return loss.sum() / n_items
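For reference, here is a minimal PyTorch sketch of what the fused forward kernel now computes: Cohere-style logit scaling (t * x) is applied first, then Gemma 2-style softcapping (t * tanh(x / t)), then ordinary per-row cross entropy with label -100 ignored. The helper name and the commented sanity check are illustrative assumptions, not code from this PR.

import torch
import torch.nn.functional as F

def reference_cross_entropy(logits, labels, logit_softcapping = 0.0, logit_scaling = 0.0):
    # Mirror the kernel's order of operations on the logits.
    logits = logits.float()
    if logit_scaling != 0:
        logits = logit_scaling * logits
    if logit_softcapping != 0:
        logits = logit_softcapping * torch.tanh(logits / logit_softcapping)
    # Per-row losses; rows with label -100 come back as 0, matching the kernel.
    return F.cross_entropy(logits, labels, ignore_index = -100, reduction = "none")

# Hypothetical check against the fused path (argument order assumed from the diff above):
# losses = Fast_CrossEntropyLoss.apply(logits.cuda(), labels.cuda(), 0.0, 0.0625)
# torch.testing.assert_close(losses.cpu(), reference_cross_entropy(logits, labels, logit_scaling = 0.0625))
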
49 changes: 39 additions & 10 deletions unsloth/models/llama.py
@@ -305,6 +305,20 @@ def fast_rms_layernorm_inference_gemma(self, X, out_weight = None):
pass


# Normal layernorm with mean removal
@torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def fast_layernorm_compiled(layernorm, X):
old_dtype = X.dtype
X = X.float()
mean = X.mean(-1, keepdim = True)
Xbar = X - mean
X = Xbar * torch.rsqrt(Xbar.square().mean(-1, keepdim = True) + \
layernorm.variance_epsilon) * \
layernorm.weight.float()
return X.to(old_dtype)
pass


# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L320
def LlamaAttention_fast_forward(
self,
@@ -597,6 +611,7 @@ def LlamaModel_fast_forward(
# Normalized from Gemma
IS_GEMMA = self.config.model_type.startswith("gemma")
IS_GEMMA2 = self.config.model_type.startswith("gemma2")
IS_COHERE = self.config.model_type.startswith("cohere")
train_embed_tokens = self.embed_tokens.weight.requires_grad

if IS_GEMMA:
@@ -802,8 +817,11 @@ def custom_forward(*inputs):

# Final layernorm
if use_cache:
hidden_states = (fast_rms_layernorm_inference_gemma if IS_GEMMA else fast_rms_layernorm_inference)\
hidden_states = \
(fast_rms_layernorm_inference_gemma if IS_GEMMA else fast_rms_layernorm_inference)\
(self.norm, hidden_states)
elif IS_COHERE:
hidden_states = fast_layernorm_compiled(self.norm, hidden_states)
else:
hidden_states = fast_rms_layernorm(self.norm, hidden_states, gemma = IS_GEMMA)
pass
@@ -943,6 +961,7 @@ def _CausalLM_fast_forward(

loss = None
logit_softcapping = getattr(self.config, "final_logit_softcapping", 0)
logit_scaling = getattr(self.config, "logit_scale", 0)
if labels is not None:
shift_logits = logits
if not hasattr(self, "extra_ignored_labels"):
@@ -955,16 +974,26 @@
logits = shift_logits,
labels = shift_labels,
logit_softcapping = logit_softcapping,
logit_scaling = logit_scaling,
)
elif logit_softcapping != 0:
if logits.requires_grad:
logits = (1.0 / logit_softcapping) * logits
logits = torch.tanh(logits)
logits = logit_softcapping * logits
else:
logits *= (1.0 / logit_softcapping)
torch.tanh(logits, out = logits)
logits *= logit_softcapping
else:
if logit_scaling != 0:
if logits.requires_grad:
logits = logit_scaling * logits
else:
logits *= logit_scaling
pass
pass
if logit_softcapping != 0:
if logits.requires_grad:
logits = (1.0 / logit_softcapping) * logits
logits = torch.tanh(logits)
logits = logit_softcapping * logits
else:
logits *= (1.0 / logit_softcapping)
torch.tanh(logits, out = logits)
logits *= logit_softcapping
pass
pass
pass

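As a cross-check for the new fast_layernorm_compiled path used for Cohere's final norm, the same mean-removal layernorm can be written in plain PyTorch. This sketch assumes a bias-free layernorm module exposing .weight and .variance_epsilon, as in the compiled helper above; the equivalence against F.layer_norm is an illustrative assumption, not code from this PR.

import torch
import torch.nn.functional as F

def reference_layernorm(layernorm, X):
    # Subtract the mean, normalize by rsqrt of the (biased) variance of the
    # mean-removed values, then scale by the learned weight: the same math as
    # fast_layernorm_compiled, just without torch.compile.
    old_dtype = X.dtype
    X = X.float()
    Xbar = X - X.mean(-1, keepdim = True)
    X = Xbar * torch.rsqrt(Xbar.square().mean(-1, keepdim = True) + layernorm.variance_epsilon)
    return (X * layernorm.weight.float()).to(old_dtype)

# Should agree with a bias-free F.layer_norm (assuming the module really has no bias term):
# out = F.layer_norm(X.float(), (X.shape[-1],), weight = layernorm.weight.float(),
#                    eps = layernorm.variance_epsilon).to(X.dtype)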