[tensorexpr] Fix isNLC segfault (pytorch#72786)

IvanKobzarev · pytorchmergebot · commit 67cd98fad4d8 · 2022-02-15T20:31:56.000Z
Summary: Pull Request resolved: pytorch#72786 Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D34204523 Pulled By: IvanKobzarev fbshipit-source-id: 9a0f2ce0a1921e261932029c3ebd842330fdf528 (cherry picked from commit b832606)
diff --git a/test/cpp/tensorexpr/test_quantization.cpp b/test/cpp/tensorexpr/test_quantization.cpp
@@ -90,6 +90,38 @@ TEST_F(Quantization, QuantDequantUInt8) {
   CHECK_EQ(check, 1);
 }
 
+TEST_F(Quantization, QuantDequantUInt8_NLC) {
+  const auto graph_string = R"IR(
+      graph(%x.1 : Float(1, 2, 2, strides=[4, 1, 2], device=cpu)):
+        %2 : int = prim::Constant[value=13]()
+        %3 : int = prim::Constant[value=122]()
+        %4 : float = prim::Constant[value=0.1]()
+        %q.1 : QUInt8(1, 2, 2) = aten::quantize_per_tensor(%x.1, %4, %3, %2)
+        %6 : Float(1, 2, 2) = aten::dequantize(%q.1)
+        return (%6))IR";
+  auto graph = std::make_shared<Graph>();
+  parseIR(graph_string, &*graph);
+
+  auto x = 2 * at::rand({1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
+  x.unsafeGetTensorImpl()->set_sizes_and_strides({1, 2, 2}, {4, 1, 2});
+  auto q = at::quantize_per_tensor(x, 0.1f, 122, at::kQUInt8);
+  auto y_expected = at::dequantize(q);
+  TensorExprKernel k(graph);
+  std::vector<at::Tensor> inputs = {x};
+  StmtPtr s = k.getCodeGenStmt();
+
+  std::vector<IValue> stack = fmap<IValue>(inputs);
+  k.run(stack);
+  auto y = stack[0].toTensor();
+  bool check = at::allclose(y_expected, y);
+  if (!check) {
+    std::cout << "x:\n" << x << std::endl;
+    std::cout << "y_expected:\n" << y_expected << std::endl;
+    std::cout << "y:\n" << y << std::endl;
+  }
+  CHECK_EQ(check, 1);
+}
+
 at::Tensor quantized_add(
     at::Tensor x1,
     at::Tensor x2,
diff --git a/torch/csrc/jit/tensorexpr/operators/quantization.cpp b/torch/csrc/jit/tensorexpr/operators/quantization.cpp
@@ -39,20 +39,7 @@ bool isQuantized(const BufHandle& qx) {
   return qx.node()->qscale() && qx.node()->qzero();
 }
 
-BufHandle makeQBufHandleNCHW(
-    const std::string& name,
-    const std::vector<ExprHandle>& dims,
-    Dtype dtype,
-    const ExprPtr qscale,
-    const ExprPtr qzero) {
-  BufHandle ResultBuf(name, dims, dtype);
-  ResultBuf.node()->set_qscale(qscale);
-  ResultBuf.node()->set_qzero(qzero);
-  ResultBuf.node()->set_strides(make_contiguous_strides(dims));
-  return ResultBuf;
-}
-
-BufHandle makeQBufHandleNHWC(
+BufHandle makeQBufHandleChannelsLast(
     const std::string& name,
     const std::vector<ExprHandle>& dims,
     Dtype dtype,
@@ -65,21 +52,21 @@ BufHandle makeQBufHandleNHWC(
   return ResultBuf;
 }
 
-BufHandle makeQBufHandleNHWC(
+BufHandle makeQBufHandleChannelsLast(
     const std::string& name,
     const std::vector<ExprHandle>& dims,
     Dtype dtype,
     const double qscale,
     const int64_t qzero) {
-  return makeQBufHandleNHWC(
+  return makeQBufHandleChannelsLast(
       name,
       dims,
       dtype,
       DoubleImm::make(qscale).node(),
       LongImm::make(qzero).node());
 }
 
-BufHandle makeQBufHandleNLC(
+BufHandle makeQBufHandleContiguous(
     const std::string& name,
     const std::vector<ExprHandle>& dims,
     Dtype dtype,
@@ -88,62 +75,37 @@ BufHandle makeQBufHandleNLC(
   BufHandle ResultBuf(name, dims, dtype);
   ResultBuf.node()->set_qscale(qscale);
   ResultBuf.node()->set_qzero(qzero);
-  ResultBuf.node()->set_strides(make_channels_last_strides(dims));
+  ResultBuf.node()->set_strides(make_contiguous_strides(dims));
   return ResultBuf;
 }
 
-BufHandle makeQBufHandleNLC(
+BufHandle makeQBufHandleContiguous(
     const std::string& name,
     const std::vector<ExprHandle>& dims,
     Dtype dtype,
     const double qscale,
     const int64_t qzero) {
-  return makeQBufHandleNLC(
+  return makeQBufHandleContiguous(
       name,
       dims,
       dtype,
       DoubleImm::make(qscale).node(),
       LongImm::make(qzero).node());
 }
 
-BufHandle makeQBufHandleNCHW(
-    const std::string& name,
-    const std::vector<ExprHandle>& dims,
-    Dtype dtype,
-    const double qscale,
-    const int64_t qzero) {
-  return makeQBufHandleNCHW(
-      name,
-      dims,
-      dtype,
-      DoubleImm::make(qscale).node(),
-      LongImm::make(qzero).node());
-}
-
-bool isNHWC(const BufHandle& buf) {
+bool isChannelsLast(const BufHandle& buf) {
   const auto& strides = buf.node()->strides();
   const auto& dims = buf.node()->dims();
-  if (strides.size() != 4) {
+  const auto rank = dims.size();
+  if (rank < 3) {
     return false;
   }
-  auto dims1 = to<LongImm>(IRSimplifier::simplify(dims[1]))->value();
-  auto strides1 = to<LongImm>(IRSimplifier::simplify(strides[1]))->value();
-  auto strides3 = to<LongImm>(IRSimplifier::simplify(strides[3]))->value();
+  auto dimsC = to<LongImm>(IRSimplifier::simplify(dims[1]))->value();
+  auto stridesC = to<LongImm>(IRSimplifier::simplify(strides[1]))->value();
+  auto stridesLast =
+      to<LongImm>(IRSimplifier::simplify(strides[rank - 1]))->value();
 
-  return ((strides3 == dims1) && (strides1 == 1));
-}
-
-bool isNLC(const BufHandle& buf) {
-  const auto& strides = buf.node()->strides();
-  const auto& dims = buf.node()->dims();
-  if (strides.size() != 3) {
-    return false;
-  }
-  auto dims1 = to<LongImm>(IRSimplifier::simplify(dims[1]))->value();
-  auto strides1 = to<LongImm>(IRSimplifier::simplify(strides[1]))->value();
-  auto strides3 = to<LongImm>(IRSimplifier::simplify(strides[3]))->value();
-
-  return ((strides3 == dims1) && (strides1 == 1));
+  return ((stridesLast == dimsC) && (stridesC == 1));
 }
 
 ExprHandle quant(
@@ -273,15 +235,11 @@ Tensor computeQuantizePerTensorExternalCall(
     throw malformed_input("Expected quantized dtype");
   }(qdtype);
   auto ResultBuf = [&]() {
-    if (isNHWC(x)) {
-      return makeQBufHandleNHWC(
-          "quantize_per_tensor", outputShape, dtype, qscale, qzero);
-    }
-    if (isNLC(x)) {
-      return makeQBufHandleNLC(
+    if (isChannelsLast(x)) {
+      return makeQBufHandleChannelsLast(
           "quantize_per_tensor", outputShape, dtype, qscale, qzero);
     }
-    return makeQBufHandleNCHW(
+    return makeQBufHandleContiguous(
         "quantize_per_tensor", outputShape, dtype, qscale, qzero);
   }();
   StmtPtr s = ExternalCall::make(
@@ -376,7 +334,7 @@ Tensor computeQuantizedConv1d(
   const auto out_qzero = c10::get<int64_t>(inputs[3]);
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qx);
-  auto ResultBuf = makeQBufHandleNLC(
+  auto ResultBuf = makeQBufHandleChannelsLast(
       "quantized_conv1d",
       outputShape,
       Dtype(out_qdtype),
@@ -407,7 +365,7 @@ Tensor computeQuantizedConv2d(
   const auto out_qzero = c10::get<int64_t>(inputs[3]);
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qx);
-  auto ResultBuf = makeQBufHandleNHWC(
+  auto ResultBuf = makeQBufHandleChannelsLast(
       "quantized_conv2d",
       outputShape,
       Dtype(out_qdtype),
@@ -438,7 +396,7 @@ Tensor computeQuantizedConv2dRelu(
   const auto out_qzero = c10::get<int64_t>(inputs[3]);
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qx);
-  auto ResultBuf = makeQBufHandleNHWC(
+  auto ResultBuf = makeQBufHandleChannelsLast(
       "quantized_conv2d_relu",
       outputShape,
       Dtype(out_qdtype),
@@ -469,7 +427,7 @@ Tensor computeQuantizedLinear(
   const auto out_qzero = c10::get<int64_t>(inputs[3]);
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qx);
-  auto ResultBuf = makeQBufHandleNCHW(
+  auto ResultBuf = makeQBufHandleContiguous(
       "quantized_linear",
       outputShape,
       Dtype(out_qdtype),
@@ -500,7 +458,7 @@ Tensor computeQuantizedLinearRelu(
   const auto out_qzero = c10::get<int64_t>(inputs[3]);
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qx);
-  auto ResultBuf = makeQBufHandleNCHW(
+  auto ResultBuf = makeQBufHandleContiguous(
       "quantized_linear_relu",
       outputShape,
       Dtype(out_qdtype),
@@ -531,16 +489,16 @@ Tensor computeQuantizedAddExternalCall(
   const auto out_qzero = c10::get<int64_t>(inputs[3]);
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qa);
-  const bool isQAChannelsLast = isNHWC(qa);
-  const bool isQBChannelsLast = isNHWC(qb);
+  const bool isQAChannelsLast = isChannelsLast(qa);
+  const bool isQBChannelsLast = isChannelsLast(qb);
   auto ResultBuf = (isQAChannelsLast || isQBChannelsLast)
-      ? makeQBufHandleNHWC(
+      ? makeQBufHandleChannelsLast(
             "quantized_add",
             outputShape,
             Dtype(out_qdtype),
             out_qscale,
             out_qzero)
-      : makeQBufHandleNCHW(
+      : makeQBufHandleContiguous(
             "quantized_add",
             outputShape,
             Dtype(out_qdtype),
@@ -574,7 +532,7 @@ Tensor computeQuantizedMul(
   const auto out_qzero = c10::get<int64_t>(inputs[3]);
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qa);
-  auto ResultBuf = makeQBufHandleNCHW(
+  auto ResultBuf = makeQBufHandleContiguous(
       "quantized_mul", outputShape, Dtype(out_qdtype), out_qscale, out_qzero);
   StmtPtr s = ExternalCall::make(
       ResultBuf,
@@ -603,7 +561,7 @@ Tensor computeQuantizedMulScalar(
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qa);
   double scale1 = immQScale(qa);
-  auto ResultBuf = makeQBufHandleNCHW(
+  auto ResultBuf = makeQBufHandleContiguous(
       "quantized_mul_scalar",
       outputShape,
       Dtype(out_qdtype),
@@ -626,14 +584,14 @@ Tensor computeQuantizedRelu(
     at::Device device) {
   const BufHandle& qa = c10::get<BufHandle>(inputs[0]);
   const auto out_qdtype = immQDType(qa);
-  const bool isQAChannelsLast = isNHWC(qa);
-  auto ResultBuf = isQAChannelsLast ? makeQBufHandleNHWC(
+  const bool isQAChannelsLast = isChannelsLast(qa);
+  auto ResultBuf = isQAChannelsLast ? makeQBufHandleChannelsLast(
                                           "quantized_relu",
                                           outputShape,
                                           Dtype(out_qdtype),
                                           immQScale(qa),
                                           immQZero(qa))
-                                    : makeQBufHandleNCHW(
+                                    : makeQBufHandleContiguous(
                                           "quantized_relu",
                                           outputShape,
                                           Dtype(out_qdtype),
@@ -674,7 +632,7 @@ Tensor computeQuantizedCat(
   extra_args.emplace_back(argDim);
   extra_args.emplace_back(out_qscale);
   extra_args.emplace_back(out_qzero);
-  auto ResultBuf = makeQBufHandleNCHW(
+  auto ResultBuf = makeQBufHandleContiguous(
       "quantized_cat",
       outputShape,
       Dtype(immQDType(inputList[0])),
@@ -793,7 +751,7 @@ Tensor computeUpsampleNearest2dExternalCall(
 
   BufHandle ResultBuf = [&]() {
     if (isQuantized(x)) {
-      return makeQBufHandleNHWC(
+      return makeQBufHandleChannelsLast(
           "upsample_nearest2d",
           outputShape,
           Dtype(immQDType(x)),
@@ -829,7 +787,7 @@ Tensor computeQuantizedSigmoidExternalCall(
   const double out_qscale = 1.0f / 256.0f;
   const int64_t out_qzero = (out_qdtype == ScalarType::QInt8) ? -128 : 0;
 
-  auto ResultBuf = makeQBufHandleNHWC(
+  auto ResultBuf = makeQBufHandleChannelsLast(
       "quantized_sigmoid",
       outputShape,
       Dtype(out_qdtype),