Treat cu_seqlen_k as per-sequence lengths only when a block table is used (#5836) (#5839)

baodii · geqinling · web-flow · commit 7259fd81b92d · 2025-10-29T11:09:25.000+08:00
Signed-off-by: baodii <di.bao@intel.com>
Co-authored-by: Ge Qinling <qinling.ge@intel.com>
diff --git a/csrc/gpu/aten/operators/xetla/kernels/SDP/fmha_forward.hpp b/csrc/gpu/aten/operators/xetla/kernels/SDP/fmha_forward.hpp
@@ -358,19 +358,15 @@ class fmha_forward_t {
         mem_desc_Oi.init(
             args.O_ptr, {end_x, end_y, ld_qo}, {start_acc, start_y});
 
-        // get current location for kv
-        kv_offset_y = 0;
-        for (int32_t i = 0; i <= static_cast<int32_t>(batch_id) - 1; ++i) {
-          kv_offset_y = kv_offset_y + args.cu_seqlen_k[i];
-        }
-
         // for local attention
         if constexpr (kIsLocal) {
           if constexpr (kIsCausal) {
             args.w_right = 0;
           }
           int32_t startF = item.get_group(1) * kBr;
-          uint32_t real_T = args.cu_seqlen_k[batch_id];
+          uint32_t real_T = args.block_tables == nullptr
+              ? args.cu_seqlen_k[batch_id + 1] - args.cu_seqlen_k[batch_id]
+              : args.cu_seqlen_k[batch_id];
           uint32_t real_F =
               args.cu_seqlen_q[batch_id + 1] - args.cu_seqlen_q[batch_id];
           uint32_t seq_diff = real_T - real_F;
@@ -458,9 +454,9 @@ class fmha_forward_t {
           remain_T = remain_T < args.block_size ? remain_T : args.block_size;
           end_x = start_x + remain_T;
         } else {
-          start_x = startT + kv_offset_y;
+          start_x = startT + args.cu_seqlen_k[batch_id];
           end_x = start_x + kBc;
-          int32_t limit_x = kv_offset_y + args.cu_seqlen_k[batch_id];
+          int32_t limit_x = args.cu_seqlen_k[batch_id + 1];
           end_x = end_x < limit_x ? end_x : limit_x;
         }
         int32_t start_acc = head_id * args.uNkv / args.uN * args.uH;
@@ -701,7 +697,9 @@ class fmha_forward_t {
     }
     uint32_t real_T = args.uT;
     if constexpr (kVarlen) {
-      real_T = args.cu_seqlen_k[ctx.batch_id];
+      real_T = args.block_tables == nullptr
+          ? args.cu_seqlen_k[ctx.batch_id + 1] - args.cu_seqlen_k[ctx.batch_id]
+          : args.cu_seqlen_k[ctx.batch_id];
     }
     uint32_t remainT = std::max(int(real_T) - int(sg_startT), 0);
     if constexpr (kIsLocal) {
@@ -1066,7 +1064,9 @@ class fmha_forward_t {
     int32_t actual_seqlen_k = 0;
     int32_t seqlen_diff = 0;
     if constexpr (kVarlen) {
-      actual_seqlen_k = args.cu_seqlen_k[batch_id];
+      actual_seqlen_k = args.block_tables == nullptr
+          ? args.cu_seqlen_k[batch_id + 1] - args.cu_seqlen_k[batch_id]
+          : args.cu_seqlen_k[batch_id];
       seqlen_diff = actual_seqlen_k - actual_seqlen_q;
     }
 
diff --git a/csrc/gpu/aten/operators/xetla/kernels/SDP/fmha_forward_v3.hpp b/csrc/gpu/aten/operators/xetla/kernels/SDP/fmha_forward_v3.hpp
@@ -363,19 +363,15 @@ class fmha_forward_v3_t {
           mem_desc_Oi[i].init(
               args.O_ptr, {end_x, end_y, ld_qo}, {start_acc, start_y});
         }
-        // get current kv location
-        kv_offset_y = 0;
-        for (int32_t i = 0; i <= static_cast<int>(batch_id) - 1; ++i) {
-          kv_offset_y += args.cu_seqlen_k[i];
-        }
-
         // for local attention
         if constexpr (kIsLocal) {
           if constexpr (kIsCausal) {
             args.w_right = 0;
           }
           int32_t startF = item.get_group(1) * kBr;
-          uint32_t real_T = args.cu_seqlen_k[batch_id];
+          uint32_t real_T = args.block_tables == nullptr
+              ? args.cu_seqlen_k[batch_id + 1] - args.cu_seqlen_k[batch_id]
+              : args.cu_seqlen_k[batch_id];
           uint32_t real_F =
               args.cu_seqlen_q[batch_id + 1] - args.cu_seqlen_q[batch_id];
           uint32_t seq_diff = real_T - real_F;
@@ -473,9 +469,9 @@ class fmha_forward_v3_t {
           remain_T = remain_T < args.block_size ? remain_T : args.block_size;
           end_x = start_x + remain_T;
         } else {
-          start_x = startT + kv_offset_y;
+          start_x = startT + args.cu_seqlen_k[batch_id];
           end_x = start_x + kBc;
-          int32_t limit_x = kv_offset_y + args.cu_seqlen_k[batch_id];
+          int32_t limit_x = args.cu_seqlen_k[batch_id + 1];
           end_x = end_x < limit_x ? end_x : limit_x;
         }
         int32_t start_acc = head_id_kv * args.uH;
@@ -664,7 +660,9 @@ class fmha_forward_v3_t {
     }
     uint32_t real_T = args.uT;
     if constexpr (kVarlen) {
-      real_T = args.cu_seqlen_k[ctx.batch_id];
+      real_T = args.block_tables == nullptr
+          ? args.cu_seqlen_k[ctx.batch_id + 1] - args.cu_seqlen_k[ctx.batch_id]
+          : args.cu_seqlen_k[ctx.batch_id];
     }
     uint32_t remainT = std::max(int(real_T) - int(sg_startT), 0);
     if constexpr (kIsLocal) {
@@ -920,7 +918,9 @@ class fmha_forward_v3_t {
     int32_t actual_seqlen_k = 0;
     int32_t seqlen_diff = 0;
     if constexpr (kVarlen) {
-      actual_seqlen_k = args.cu_seqlen_k[batch_id];
+      actual_seqlen_k = args.block_tables == nullptr
+          ? args.cu_seqlen_k[batch_id + 1] - args.cu_seqlen_k[batch_id]
+          : args.cu_seqlen_k[batch_id];
       seqlen_diff = actual_seqlen_k - actual_seqlen_q;
     }
 
diff --git a/tests/gpu/examples/test_varlen_fwd.py b/tests/gpu/examples/test_varlen_fwd.py
@@ -113,9 +113,9 @@ def varlen_fwd_reference(
     seqlen_q_ = seqlen_q.clone()
     seqlen_q_[:batch_size] = seqlen_q[1:]
     seqlen_q = (seqlen_q_ - seqlen_q)[:batch_size]
-    # seqlen_k_ = seqlen_k.clone()
-    # seqlen_k_[:batch_size] = seqlen_k[1:]
-    # seqlen_k = (seqlen_k_ - seqlen_k)[:batch_size]
+    seqlen_k_ = seqlen_k.clone()
+    seqlen_k_[:batch_size] = seqlen_k[1:]
+    seqlen_k = (seqlen_k_ - seqlen_k)[:batch_size]
 
     pad_q = torch.zeros(
         [batch_size, max_seqlen_q, num_head, head_size],
@@ -263,8 +263,6 @@ def test_varlen_fwd(
     cu_seqlen = (
         torch.cat([torch.tensor([0]), cu_seqlen], dim=0).to(torch.int32).to("xpu")
     )
-    seqlen_list = seqlen_list.to("xpu")
-    print(f"seqlen_list: {seqlen_list} cu_seqlen: {cu_seqlen}")
 
     query = torch.randn(
         [cu_seqlen[-1], num_heads_query, head_dim], dtype=dtype, device="xpu"
@@ -294,7 +292,7 @@ def test_varlen_fwd(
         value,
         out,
         cu_seqlen,
-        seqlen_list,
+        cu_seqlen,
         None,
         None,
         alibi_slopes,
@@ -316,7 +314,7 @@ def test_varlen_fwd(
         value,
         out_ref,
         cu_seqlen,
-        seqlen_list,
+        cu_seqlen,
         max_seqlen,
         max_seqlen,
         alibi_slopes,
@@ -336,7 +334,7 @@ def test_varlen_fwd(
         value,
         out,
         cu_seqlen,
-        seqlen_list,
+        cu_seqlen,
         alibi_slopes,
         max_seqlen,
         max_seqlen,
@@ -376,7 +374,6 @@ def test_varlen_attention_softcap(
     cu_seqlen = (
         torch.cat([torch.tensor([0]), cu_seqlen], dim=0).to(torch.int32).to("xpu")
     )
-    seqlen_list = seqlen_list.to("xpu")
 
     query = torch.randn(
         [cu_seqlen[-1], num_heads_query, head_dim], dtype=dtype, device="xpu"
@@ -395,7 +392,7 @@ def test_varlen_attention_softcap(
         value,
         out,
         cu_seqlen,
-        seqlen_list,
+        cu_seqlen,
         None,
         max_seqlen,
         max_seqlen,