
Commit e39803d

Sync back ipexcpu 2.5 rc2 final code from ipexgpu:releases/2.5.10+xpu_rc to master (#5074)
* Sync IPEX CPU 2.5 RC2 code (#4951)
* rebase frontend from ipex-cpu release/2.5 to ipex-gpu releases/2.5.10+xpu_rc (#4957)
* [CPU] Fix test/cpu/test_ipex_llm_quantization.py and test_ipex_optimize_transformers.py (#5016)
  * fix test_ipex_llm_quantization.py
  * fix test_ipex_optimize_transformers.py
  * change int4 parameter order in test

---------

Co-authored-by: Xu Han <xu.han@intel.com>
1 parent f7919a0 commit e39803d

166 files changed: +23305 −5417 lines


cmake/cpu/Options.cmake

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ include(CMakeDependentOption)
 
 option(BUILD_LIBXSMM_VIA_CMAKE "Build LIBXSMM via CMake" ON)
 option(USE_LIBXSMM "Enable LIBXSMM" ON)
-option(USE_DNNL_GRAPH_COMPILER "Build with DNNL Graph Compiler" ON)
+option(USE_DNNL_GRAPH_COMPILER "Build with DNNL Graph Compiler" ON)
 if(WIN32)
   set(USE_LIBXSMM ON)
 endif()

csrc/cpu/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,10 @@ if(BUILD_CPU_WITH_ONECCL)
   find_package(oneCCL REQUIRED)
   list(APPEND DEPENDS_LIB oneCCL)
   list(APPEND DEPENDS_LIB mpi)
+  set(RPATH_VALUE)
+  list(APPEND RPATH_VALUE "$ORIGIN")
+  list(APPEND RPATH_VALUE "$ORIGIN/../opt/mpi/lib")
+  set(CMAKE_INSTALL_RPATH "${RPATH_VALUE}")
 endif()
 
 # TODO: Once llga is merged into oneDNN, use oneDNN directly as the third_party of IPEX

csrc/cpu/aten/Linear.cpp

Lines changed: 114 additions & 13 deletions
@@ -398,16 +398,8 @@ at::Tensor woq_linear_pack_weight(
   // Note that weight is already compressed
   int64_t K_int4_compressed = K / 2;
   int64_t N_int4 = N % block_n ? N / block_n * block_n + block_n : N;
-  at::Tensor weight_int4 = at::empty(
-      {N_int4, K_int4_compressed}, device(c10::kCPU).dtype(c10::kByte));
-  int64_t weight_size_bytes = weight.numel();
-  int64_t weight_int4_size_bytes = weight_int4.numel();
-  int64_t pad_size_bytes = weight_int4_size_bytes - weight_size_bytes;
-  std::memcpy(weight_int4.data_ptr(), weight.data_ptr(), weight_size_bytes);
-  std::fill_n(
-      (uint8_t*)weight_int4.data_ptr() + weight_size_bytes,
-      pad_size_bytes,
-      0);
+  at::Tensor weight_int4 =
+      at::pad(weight, {0, 0, 0, N_int4 - N}, "constant", 0);
   return woq_tpp_gemm_packB_stub(
       kCPU, weight_int4, weight_dtype, block_n, block_k, lowp_mode);
 }

@@ -491,7 +483,9 @@ at::Tensor woq_linear_kernel(
     int64_t lowp_mode,
     int64_t act_quant_mode,
     const c10::optional<at::Tensor>& compensation) {
-  int64_t quant_w_mode = group_size > 0 ? 1 : 0;
+  int64_t quant_w_mode = zps_list[0].defined()
+      ? (group_size > 0 ? QUANT_W_PER_K_BLOCK : QUANT_W_PER_CHANNEL)
+      : (group_size > 0 ? QUANT_W_PER_K_BLOCK_SYM : QUANT_W_PER_CHANNEL_SYM);
   auto K = self.size(-1);
   auto M = self.numel() / K;
   auto in = self;

@@ -533,6 +527,63 @@ at::Tensor woq_linear_forward(
       ->run(input);
 }
 
+at::Tensor woq_linear_forward_v2(
+    const at::Tensor& input,
+    const at::Tensor& qweight,
+    const c10::string_view& weight_dtype,
+    const std::vector<int64_t>& weight_shape,
+    const std::vector<at::Tensor>& weight_scales,
+    const c10::optional<std::vector<at::Tensor>>& weight_zeros,
+    const c10::optional<std::vector<at::Tensor>>& bias,
+    const c10::optional<at::Tensor>& g_idx,
+    int64_t group_size,
+    int64_t lowp_mode,
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation) {
+  static const std::map<c10::string_view, int64_t> WOQ_DTYPE_MAP = {
+      {"int8", WOQ_DTYPE_INT8},
+      {"int4", WOQ_DTYPE_INT4},
+      {"nf4", WOQ_DTYPE_NF4},
+  };
+  TORCH_CHECK(
+      WOQ_DTYPE_MAP.find(weight_dtype) != WOQ_DTYPE_MAP.end(),
+      "Unsupported weight dtype: ",
+      weight_dtype);
+  if (WOQ_DTYPE_MAP.at(weight_dtype) == WOQ_DTYPE_INT8 && lowp_mode == 3) {
+    TORCH_CHECK(compensation.has_value() && compensation.value().defined());
+  }
+  static const at::Tensor empty_tensor = at::Tensor();
+  // zp list of all dtypes = {fp32, fp16, bf16, int8}
+  static const std::vector<at::Tensor> empty_zp_list = {
+      empty_tensor, empty_tensor, empty_tensor, empty_tensor};
+  // bias list of all dtypes = {fp32, fp16, bf16}
+  static const std::vector<at::Tensor> empty_bias_list = {
+      empty_tensor, empty_tensor, empty_tensor};
+  if (weight_zeros.has_value()) {
+    TORCH_CHECK(
+        weight_zeros.value().size() == 4,
+        "IPEX WOQ: expect list of zeros has length 4");
+  }
+  auto& zeros_list =
+      weight_zeros.has_value() ? weight_zeros.value() : empty_zp_list;
+  if (bias.has_value()) {
+    TORCH_CHECK(
+        bias.value().size() == 3, "IPEX WOQ: expect list of bias has length 3");
+  }
+  auto& bias_list = bias.has_value() ? bias.value() : empty_bias_list;
+  return woq_linear_kernel(
+      input,
+      qweight,
+      WOQ_DTYPE_MAP.at(weight_dtype),
+      weight_scales,
+      zeros_list,
+      bias_list,
+      group_size,
+      lowp_mode,
+      act_quant_mode,
+      compensation);
+}
+
 at::Tensor woq_linear_unary_kernel(
     const at::Tensor& self,
     const at::Tensor& weight,

@@ -559,7 +610,9 @@ at::Tensor woq_linear_unary_kernel(
   } else if (post_op == "silu") {
     post_op_fusion_type = WOQ_FUSE_SILU;
   }
-  int64_t quant_w_mode = group_size > 0 ? 1 : 0;
+  int64_t quant_w_mode = zps_list[0].defined()
+      ? (group_size > 0 ? QUANT_W_PER_K_BLOCK : QUANT_W_PER_CHANNEL)
+      : (group_size > 0 ? QUANT_W_PER_K_BLOCK_SYM : QUANT_W_PER_CHANNEL_SYM);
   auto K = self.size(-1);
   auto M = self.numel() / K;
   auto in = self;

@@ -648,7 +701,9 @@ at::Tensor woq_linear_binary_kernel(
   } else if (post_op == "mul") {
     post_op_fusion_type = WOQ_FUSE_MUL;
   }
-  int64_t quant_w_mode = group_size > 0 ? 1 : 0;
+  int64_t quant_w_mode = zps_list[0].defined()
+      ? (group_size > 0 ? QUANT_W_PER_K_BLOCK : QUANT_W_PER_CHANNEL)
+      : (group_size > 0 ? QUANT_W_PER_K_BLOCK_SYM : QUANT_W_PER_CHANNEL_SYM);
   auto K = self.size(-1);
   auto M = self.numel() / K;
   auto in = self;

@@ -782,6 +837,39 @@ at::Tensor woq_linear_forward(
   return op.call(cpu_cached_cast(target_type, input), op_context);
 }
 
+at::Tensor woq_linear_forward_v2(
+    const at::Tensor& input,
+    const at::Tensor& qweight,
+    const c10::string_view& weight_dtype,
+    const std::vector<int64_t>& weight_shape,
+    const std::vector<at::Tensor>& weight_scales,
+    const c10::optional<std::vector<at::Tensor>>& weight_zeros,
+    const c10::optional<std::vector<at::Tensor>>& bias,
+    const c10::optional<at::Tensor>& g_idx,
+    int64_t group_size,
+    int64_t lowp_mode,
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocastCPU(DispatchKey::AutocastCPU);
+  static auto op = torch::Dispatcher::singleton()
+                       .findSchemaOrThrow("torch_ipex::woq_linear", "")
+                       .typed<decltype(woq_linear_forward_v2)>();
+  auto target_type = get_autocast_dtype();
+  return op.call(
+      cpu_cached_cast(target_type, input),
+      qweight,
+      weight_dtype,
+      weight_shape,
+      weight_scales,
+      weight_zeros,
+      bias,
+      g_idx,
+      group_size,
+      lowp_mode,
+      act_quant_mode,
+      compensation);
+}
+
 at::Tensor woq_linear_gelu_forward(
     const at::Tensor& input,
     const at::Tensor& op_context) {

@@ -964,6 +1052,19 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) {
       "woq_linear_mul",
       c10::DispatchKey::AutocastCPU,
       torch_ipex::autocast::woq_linear_mul_forward);
+  // the version without op_context
+  m.def(
+      "woq_linear(Tensor input, Tensor qweight, str weight_dtype, int[] weight_shape, Tensor[] weight_scales, "
+      "Tensor[]? weight_zeros, Tensor[]? bias, Tensor? g_idx, int group_size, int lowp_mode, int act_quant_mode, "
+      "Tensor? compensation = None) -> Tensor");
+  m.impl(
+      "woq_linear",
+      c10::DispatchKey::CPU,
+      torch_ipex::cpu::woq_linear_forward_v2);
+  m.impl(
+      "woq_linear",
+      c10::DispatchKey::AutocastCPU,
+      torch_ipex::autocast::woq_linear_forward_v2);
 #endif
   // fuse eltwise
   m.def(
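
For illustration, a minimal sketch of how the newly defined torch_ipex::woq_linear schema could be invoked through the dispatcher. Everything below is an assumption for demonstration: the shapes, mode values, and tensor contents are placeholders, and a real call needs a weight actually packed by IPEX's WOQ packing path.

# Hedged sketch of calling the newly registered torch_ipex::woq_linear schema.
# Assumptions: an IPEX build containing this commit is importable; qweight and
# scales here are placeholders, not a usable packed int4 weight.
import torch
import intel_extension_for_pytorch  # noqa: F401  (registers torch_ipex ops)

M, K, N, group_size = 4, 64, 128, 32
x = torch.randn(M, K)
qweight = torch.zeros(N, K // 2, dtype=torch.uint8)   # packed int4 placeholder
scales = [torch.ones(N, K // group_size)]             # weight_scales (Tensor[])

out = torch.ops.torch_ipex.woq_linear(
    x,
    qweight,
    "int4",        # weight_dtype: one of "int8", "int4", "nf4"
    [N, K],        # weight_shape
    scales,
    None,          # weight_zeros -> symmetric quantization path
    None,          # bias
    None,          # g_idx
    group_size,
    0,             # lowp_mode
    0,             # act_quant_mode
)

The argument order follows the m.def string above; passing None for weight_zeros and bias exercises the empty_zp_list / empty_bias_list defaults in woq_linear_forward_v2.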

csrc/cpu/aten/Linear.h

Lines changed: 20 additions & 0 deletions
@@ -84,6 +84,20 @@ at::Tensor woq_linear_forward(
     const at::Tensor& input,
     const at::Tensor& op_context);
 
+at::Tensor woq_linear_forward_v2(
+    const at::Tensor& input,
+    const at::Tensor& qweight,
+    const c10::string_view& weight_dtype,
+    const std::vector<int64_t>& weight_shape,
+    const std::vector<at::Tensor>& weight_scales,
+    const c10::optional<std::vector<at::Tensor>>& weight_zeros,
+    const c10::optional<std::vector<at::Tensor>>& bias,
+    const c10::optional<at::Tensor>& g_idx,
+    int64_t group_size,
+    int64_t lowp_mode,
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation);
+
 at::Tensor woq_linear_gelu_forward(
     const at::Tensor& input,
     const at::Tensor& op_context);

@@ -252,6 +266,12 @@ IPEX_DECLARE_DISPATCH(
 #define WOQ_FUSE_ADD_ADD 0x20
 #define WOQ_FUSE_MUL 0x30
 
+// weight quant mode
+#define QUANT_W_PER_CHANNEL 0
+#define QUANT_W_PER_K_BLOCK 1
+#define QUANT_W_PER_CHANNEL_SYM 2
+#define QUANT_W_PER_K_BLOCK_SYM 3
+
 #define WOQ_N_BLOCK_SIZE 32
 
 #endif
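
The four QUANT_W_* modes pair with the selection logic added in Linear.cpp: zero points present selects the asymmetric modes, and group_size > 0 selects the per-K-block variants. A small illustrative helper (hypothetical, not part of this commit's sources) mirroring that mapping:

# Illustrative helper mirroring the quant_w_mode selection in woq_linear_kernel.
QUANT_W_PER_CHANNEL = 0
QUANT_W_PER_K_BLOCK = 1
QUANT_W_PER_CHANNEL_SYM = 2
QUANT_W_PER_K_BLOCK_SYM = 3

def select_quant_w_mode(has_zero_points: bool, group_size: int) -> int:
    # Asymmetric (zero points defined) vs. symmetric, crossed with
    # per-K-block (group_size > 0) vs. per-channel quantization.
    if has_zero_points:
        return QUANT_W_PER_K_BLOCK if group_size > 0 else QUANT_W_PER_CHANNEL
    return QUANT_W_PER_K_BLOCK_SYM if group_size > 0 else QUANT_W_PER_CHANNEL_SYM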

csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp

Lines changed: 15 additions & 18 deletions
@@ -130,27 +130,24 @@ inline Vectorized<scalar_t> exp_u20(Vectorized<scalar_t> data) {
 inline Vectorized<float> exp_u20(Vectorized<float> data) {
   __m512 values = __m512(data);
   // A faster version of exp with ULP=20
-  static __m512 vec_factorial_1 =
-      _mm512_set1_ps(0.999999701f); // 1/factorial(1)
-  static __m512 vec_factorial_2 =
-      _mm512_set1_ps(0.499991506f); // 1/factorial(2)
-  static __m512 vec_factorial_3 =
-      _mm512_set1_ps(0.166676521f); // 1/factorial(3)
-  static __m512 vec_factorial_4 =
+  const __m512 vec_factorial_1 = _mm512_set1_ps(0.999999701f); // 1/factorial(1)
+  const __m512 vec_factorial_2 = _mm512_set1_ps(0.499991506f); // 1/factorial(2)
+  const __m512 vec_factorial_3 = _mm512_set1_ps(0.166676521f); // 1/factorial(3)
+  const __m512 vec_factorial_4 =
       _mm512_set1_ps(0.0418978221f); // 1/factorial(4)
-  static __m512 vec_factorial_5 =
+  const __m512 vec_factorial_5 =
       _mm512_set1_ps(0.00828929059f); // 1/factorial(5)
-  static __m512 vec_exp_log2ef =
+  const __m512 vec_exp_log2ef =
       (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e)
-  static __m512 vec_half = _mm512_set1_ps(0.5f);
-  static __m512 vec_one = _mm512_set1_ps(1.f);
-  static __m512 vec_zero = _mm512_set1_ps(0.f);
-  static __m512 vec_two = _mm512_set1_ps(2.f);
-  static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2)
-  static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50);
-  static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218);
-  static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
-  static int n_mantissa_bits = 23;
+  const __m512 vec_half = _mm512_set1_ps(0.5f);
+  const __m512 vec_one = _mm512_set1_ps(1.f);
+  const __m512 vec_zero = _mm512_set1_ps(0.f);
+  const __m512 vec_two = _mm512_set1_ps(2.f);
+  const __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2)
+  const __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50);
+  const __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218);
+  const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
+  const int n_mantissa_bits = 23;
 
   // exp(x) =
   // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem
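
For reference, a sketch of the range-reduction approximation these constants implement, following the in-code comment:

  exp(x) = 2^n · exp(r),   n = round(x · log2(e)),   r = x − n · ln(2)
  exp(r) ≈ 1 + r/1! + r²/2! + r³/3! + r⁴/4! + r⁵/5!

The vec_factorial_k values are slightly tuned 1/k! coefficients, vec_ln_flt_min / vec_ln_flt_max clamp x to the range where a float exp stays finite, and 2^n is presumably assembled from the IEEE-754 exponent bits, which is why vec_127 (the exponent bias) and n_mantissa_bits = 23 appear. Only the constants' storage class changes in this hunk (static to const); the math is unchanged.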
