Commit d9684a6

[LLM serving] Add Mamba/Jamba model kernel (#3576)
1 parent 776a1ce commit d9684a6

File tree

11 files changed: +1475 -447 lines changed


csrc/cpu/aten/Conv.cpp

Lines changed: 43 additions & 2 deletions
@@ -9,6 +9,7 @@ namespace torch_ipex {
 namespace cpu {
 
 IPEX_DEFINE_DISPATCH(causal_conv1d_update_kernel_stub);
+IPEX_DEFINE_DISPATCH(causal_conv1d_fn_kernel_stub);
 std::vector<int64_t> calc_conv_output_size(
     at::IntArrayRef input_size,
     at::IntArrayRef kernel_size,
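IPEX_DEFINE_DISPATCH here pairs with the IPEX_DECLARE_DISPATCH added to Conv.h below; the remaining piece, registering the actual CPU implementation against the stub, lives in a kernel translation unit that is not part of this excerpt. A hedged sketch of what that registration conventionally looks like, where the kernel name and body are hypothetical:

// Hypothetical kernel TU sketch following the IPEX DispatchStub convention;
// causal_conv1d_fn_kernel_impl is an illustrative name, not from this diff.
namespace torch_ipex {
namespace cpu {

std::tuple<at::Tensor, at::Tensor> causal_conv1d_fn_kernel_impl(
    const at::Tensor& x,
    const at::Tensor& conv_weights,
    const c10::optional<at::Tensor>& conv_bias,
    const c10::optional<at::Tensor>& initial_states,
    const c10::optional<at::Tensor>& final_states_out,
    bool silu_activation) {
  // ... vectorized causal depthwise conv would go here; placeholder return ...
  return std::make_tuple(at::Tensor(), at::Tensor());
}

// Binds the implementation to the stub so that
// causal_conv1d_fn_kernel_stub(kCPU, ...) dispatches to it.
IPEX_REGISTER_DISPATCH(causal_conv1d_fn_kernel_stub, &causal_conv1d_fn_kernel_impl);

} // namespace cpu
} // namespace torch_ipex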
@@ -514,21 +515,55 @@ at::Tensor convolution_forward(
  * @param conv_weights (dim, width)
  * @param conv_bias (dim,)
  * @param silu_activation If true, apply the SiLU activation function.
+ * @param cache_seqlens (batch,) or None
  * @return (hidden_states, conv_states)
  */
 std::tuple<at::Tensor, at::Tensor> causal_conv1d_update(
     const at::Tensor& hidden_states,
     const at::Tensor& conv_states,
     const at::Tensor& conv_weights,
     const c10::optional<at::Tensor>& conv_bias,
-    bool silu_activation) {
+    bool silu_activation,
+    const c10::optional<at::Tensor>& cache_seqlens) {
   RECORD_FUNCTION("causal_conv1d_update", c10::ArrayRef<c10::IValue>({}));
   return causal_conv1d_update_kernel_stub(
       kCPU,
       hidden_states,
       conv_states,
       conv_weights,
       conv_bias,
+      silu_activation,
+      cache_seqlens);
+}
+
+/**
+ * Official Python implementation: causal_conv1d_ref:
+ * https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py#L133
+ * @param x (batch, dim, seqlen)
+ * @param conv_weights (dim, width)
+ * @param conv_bias (dim,)
+ * @param initial_states (batch, dim, width - 1)
+ * @param final_states_out (batch, dim, width - 1)
+ * @param silu_activation If true, apply the SiLU activation function.
+ * @return (out, final_states_out)
+ *   out: (batch, dim, seqlen)
+ *   final_states_out: (batch, dim, width - 1)
+ */
+std::tuple<at::Tensor, at::Tensor> causal_conv1d_fn(
+    const at::Tensor& x,
+    const at::Tensor& conv_weights,
+    const c10::optional<at::Tensor>& conv_bias,
+    const c10::optional<at::Tensor>& initial_states,
+    const c10::optional<at::Tensor>& final_states_out,
+    bool silu_activation) {
+  RECORD_FUNCTION("causal_conv1d_fn", c10::ArrayRef<c10::IValue>({}));
+  return causal_conv1d_fn_kernel_stub(
+      kCPU,
+      x,
+      conv_weights,
+      conv_bias,
+      initial_states,
+      final_states_out,
       silu_activation);
 }
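For orientation, the computation the new wrapper dispatches to can be written as a handful of stock ATen calls. The sketch below mirrors the causal_conv1d_ref implementation cited in the doc comment above; it is a naive reference under assumed shapes, not this commit's optimized kernel. causal_conv1d_fn_reference is a name made up for illustration, and the in-place final_states_out argument is simplified to a returned tensor.

// A minimal reference sketch, assuming the semantics of causal_conv1d_ref.
#include <ATen/ATen.h>

std::tuple<at::Tensor, at::Tensor> causal_conv1d_fn_reference(
    const at::Tensor& x,             // (batch, dim, seqlen)
    const at::Tensor& conv_weights,  // (dim, width)
    const c10::optional<at::Tensor>& conv_bias,      // (dim,)
    const c10::optional<at::Tensor>& initial_states, // (batch, dim, width - 1)
    bool silu_activation) {
  const auto dim = x.size(1);
  const auto seqlen = x.size(2);
  const auto width = conv_weights.size(1);
  // Prepend the carried-over window (zeros if absent) so the first outputs
  // see the previous chunk's tail instead of implicit zero padding.
  auto init = initial_states.has_value()
      ? initial_states.value()
      : at::zeros({x.size(0), dim, width - 1}, x.options());
  auto x_padded = at::cat({init, x}, /*dim=*/2); // (batch, dim, seqlen + width - 1)
  // Depthwise conv (groups == dim) with no extra padding: output position t
  // only ever reads x positions <= t, which is what makes it causal.
  auto out = at::conv1d(
      x_padded,
      conv_weights.unsqueeze(1), // (dim, 1, width)
      conv_bias,
      /*stride=*/1,
      /*padding=*/0,
      /*dilation=*/1,
      /*groups=*/dim);
  // The last width - 1 inputs are the state carried into the next chunk.
  auto final_states = x_padded.slice(/*dim=*/2, seqlen, seqlen + width - 1);
  if (silu_activation) {
    out = at::silu(out);
  }
  return std::make_tuple(out, final_states.contiguous());
}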

@@ -589,11 +624,17 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) {
       c10::DispatchKey::CPU,
       torch_ipex::cpu::convolution_forward_impl);
   m.def(
-      "causal_conv1d_update(Tensor hidden_states, Tensor conv_states, Tensor conv_weights, Tensor? conv_bias, bool silu_activation) -> (Tensor, Tensor)");
+      "causal_conv1d_update(Tensor hidden_states, Tensor conv_states, Tensor conv_weights, Tensor? conv_bias, bool silu_activation, Tensor? cache_seqlens=None) -> (Tensor, Tensor)");
   m.impl(
       "causal_conv1d_update",
       c10::DispatchKey::CPU,
       torch_ipex::cpu::causal_conv1d_update);
+  m.def(
+      "causal_conv1d_fn(Tensor x, Tensor conv_weights, Tensor? conv_bias, Tensor? initial_states, Tensor? final_states_out, bool silu_activation) -> (Tensor, Tensor)");
+  m.impl(
+      "causal_conv1d_fn",
+      c10::DispatchKey::CPU,
+      torch_ipex::cpu::causal_conv1d_fn);
   // bw
   m.def(
       "convolution_backward(Tensor input, Tensor weight, Tensor? bias, Tensor grad_output, bool[3] out_mask, "

csrc/cpu/aten/Conv.h

Lines changed: 18 additions & 0 deletions
@@ -57,6 +57,15 @@ std::tuple<at::Tensor, at::Tensor> causal_conv1d_update(
     const at::Tensor& conv_states,
     const at::Tensor& conv_weights,
     const c10::optional<at::Tensor>& conv_bias,
+    bool silu_activation,
+    const c10::optional<at::Tensor>& cache_seqlens);
+
+std::tuple<at::Tensor, at::Tensor> causal_conv1d_fn(
+    const at::Tensor& x,
+    const at::Tensor& conv_weights,
+    const c10::optional<at::Tensor>& conv_bias,
+    const c10::optional<at::Tensor>& initial_states,
+    const c10::optional<at::Tensor>& final_states_out,
     bool silu_activation);
 
 // IPEX customized convolution OP with n-D packed weight
@@ -108,9 +117,18 @@ using causal_conv1d_update_kernel_fn = std::tuple<at::Tensor, at::Tensor> (*)(
     const at::Tensor& conv_states,
     const at::Tensor& conv_weights,
     const c10::optional<at::Tensor>& conv_bias,
+    bool silu_activation,
+    const c10::optional<at::Tensor>& cache_seqlens);
+using causal_conv1d_fn_kernel_fn = std::tuple<at::Tensor, at::Tensor> (*)(
+    const at::Tensor& x,
+    const at::Tensor& conv_weights,
+    const c10::optional<at::Tensor>& conv_bias,
+    const c10::optional<at::Tensor>& initial_states,
+    const c10::optional<at::Tensor>& final_states_out,
     bool silu_activation);
 IPEX_DECLARE_DISPATCH(
     causal_conv1d_update_kernel_fn,
     causal_conv1d_update_kernel_stub);
+IPEX_DECLARE_DISPATCH(causal_conv1d_fn_kernel_fn, causal_conv1d_fn_kernel_stub);
 } // namespace cpu
 } // namespace torch_ipex
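As a companion to the declarations above, here is a naive ATen reference for the single-token decode semantics of causal_conv1d_update: roll the cached window left, append the new token, and take a per-channel dot product with the filter taps. This is a sketch under assumed shapes; the cache_seqlens variable-length path that the real kernel now accepts is deliberately omitted, and the function name is hypothetical.

// Hypothetical single-token reference; not this commit's optimized kernel.
#include <ATen/ATen.h>

std::tuple<at::Tensor, at::Tensor> causal_conv1d_update_reference(
    const at::Tensor& hidden_states, // (batch, dim), one new token
    at::Tensor conv_states,          // (batch, dim, width), rolling window
    const at::Tensor& conv_weights,  // (dim, width)
    const c10::optional<at::Tensor>& conv_bias, // (dim,)
    bool silu_activation) {
  const auto width = conv_states.size(2);
  // Shift the cached window one step left, then write the new token into
  // the freed last slot.
  conv_states.copy_(at::roll(conv_states, /*shifts=*/{-1}, /*dims=*/{2}));
  conv_states.select(/*dim=*/2, width - 1).copy_(hidden_states);
  // Per-channel dot product with the filter taps:
  // out[b, d] = sum_w conv_states[b, d, w] * conv_weights[d, w] (+ bias[d])
  auto out = (conv_states * conv_weights.unsqueeze(0)).sum(/*dim=*/2);
  if (conv_bias.has_value()) {
    out = out + conv_bias.value();
  }
  if (silu_activation) {
    out = at::silu(out);
  }
  return std::make_tuple(std::move(out), std::move(conv_states));
}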
