
Commit 4ec6b36

2021-03-12 nightly release (997f05c)

Author: chronos_secgrp_pytorch_oss_ci_oncall
1 parent: f8c286b

File tree: 7 files changed, +85 −8 lines

aten/src/ATen/TensorIterator.cpp
Lines changed: 12 additions & 2 deletions

@@ -1288,7 +1288,12 @@ void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntAr
   // for the is_meta_ test.
   TORCH_INTERNAL_ASSERT(op.original_tensor.is_same(t));
   TORCH_INTERNAL_ASSERT(!op.tensor.is_same(t));
-  at::native::resize_output(op.tensor, sizes);
+  // fastpath CPU to skip a dispatcher trip
+  if (op.tensor.device().is_cpu()) {
+    at::native::resize_output_cpu(op.tensor, sizes);
+  } else {
+    at::native::resize_output(op.tensor, sizes);
+  }
   if (!strides.empty()) {
     TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
     op.tensor.as_strided_(sizes, strides);

@@ -1314,7 +1319,12 @@ void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayR
     }
     op.current_dtype = op.target_dtype;
   } else if (op.will_resize) {
-    at::native::resize_output(op.tensor, sizes);
+    // fastpath CPU to skip a dispatcher trip
+    if (op.tensor.device().is_cpu()) {
+      at::native::resize_output_cpu(op.tensor, sizes);
+    } else {
+      at::native::resize_output(op.tensor, sizes);
+    }
     if (!strides.empty()) {
       TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
       op.tensor.as_strided_(sizes, strides);
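The pattern in both hunks is a dispatcher-bypass fastpath: check the tensor's device tag and, on the hot CPU path, call the native resize implementation directly instead of re-entering the dispatcher. A minimal standalone sketch of the same idea (the types and helpers below are illustrative stand-ins, not the ATen API):

#include <iostream>
#include <vector>

// Hypothetical stand-ins for the two resize entry points.
enum class Device { CPU, CUDA };

struct Tensor {
  Device device;
  std::vector<long> sizes;
};

// Slow path: in ATen this goes through the dispatcher so that
// backend-specific resize_ kernels (CUDA, XLA, ...) get picked up.
void resize_output(Tensor& t, const std::vector<long>& shape) {
  std::cout << "dispatched resize\n";
  t.sizes = shape;
}

// Fast path: a direct call into the CPU kernel, skipping dispatch.
void resize_output_cpu(Tensor& t, const std::vector<long>& shape) {
  std::cout << "direct CPU resize\n";
  t.sizes = shape;
}

void set_output(Tensor& t, const std::vector<long>& shape) {
  // Same branch shape as in TensorIteratorBase::set_output above.
  if (t.device == Device::CPU) {
    resize_output_cpu(t, shape);
  } else {
    resize_output(t, shape);
  }
}

int main() {
  Tensor cpu{Device::CPU, {}};
  Tensor gpu{Device::CUDA, {}};
  set_output(cpu, {2, 3});  // prints "direct CPU resize"
  set_output(gpu, {2, 3});  // prints "dispatched resize"
}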

aten/src/ATen/native/Resize.cpp
Lines changed: 13 additions & 1 deletion

@@ -6,7 +6,7 @@
 
 namespace at { namespace native {
 
-void resize_output(Tensor& output, IntArrayRef shape) {
+void resize_output_check(Tensor& output, IntArrayRef shape) {
   // Tests for resizing of tensors with one or more elements
   if (output.numel() != 0 && !output.sizes().equals(shape)) {
     TORCH_WARN(
@@ -18,10 +18,22 @@ void resize_output(Tensor& output, IntArrayRef shape) {
       "reuse an out tensor t by resizing it, inplace, to zero elements with ",
       "t.resize_(0).");
   }
+}
 
+void resize_output(Tensor& output, IntArrayRef shape) {
+  resize_output_check(output, shape);
   output.resize_(shape);
 }
 
+// This is a performance escape hatch for resize_output.
+// It's CPU-only and it skips the dispatcher.
+// Ideally, once external backends have access to meta functions,
+// we can write one for resize_ and get rid of this.
+void resize_output_cpu(Tensor& output, IntArrayRef shape) {
+  resize_output_check(output, shape);
+  at::native::resize_(output, shape);
+}
+
 // Call the sparse implementation in SparseTensor.cpp directly.
 // A dynamic dispatch here is NOT necessary, so I didn't put
 // this function in native_functions.yaml
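After this refactor, resize_output_check carries the deprecation warning that fires when a non-empty out= tensor has the wrong shape, and both resize_output and the new CPU fastpath share it. A quick illustration of the behavior being guarded, using the ATen C++ API (shapes are arbitrary; assumes a libtorch build containing this commit):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor a = at::ones({4});
  at::Tensor b = at::ones({4});
  at::Tensor out = at::empty({3});  // non-empty and the wrong shape

  // add_out resizes `out` to {4}; along the way resize_output_check
  // emits the deprecation warning about resizing a non-empty out tensor.
  at::add_out(out, a, b);
  std::cout << out.sizes() << "\n";  // [4]
}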

aten/src/ATen/native/Resize.h
Lines changed: 2 additions & 0 deletions

@@ -18,6 +18,8 @@ namespace at { namespace native {
 // NOTE: In the future the warning will become an error
 TORCH_API void resize_output(Tensor& output, IntArrayRef shape);
 
+TORCH_API void resize_output_cpu(Tensor& output, IntArrayRef shape);
+
 // These functions are called by native::resize_ as well as (legacy) TH resize.
 // They are not in TH/THTensor.cpp because the at namespace is easier
 // to benchmark than TH; I can't get gbenchmark to call fns from THTensor.cpp

caffe2/opt/onnxifi_op.cc
Lines changed: 1 addition & 2 deletions

@@ -684,12 +684,11 @@ bool OnnxifiOp<CPUContext>::RunOnDevice() {
       output_desc_.data(),
       &output_fence,
       traces_.get());
-  const string statusString = mapOnnxStatusToString(status);
   CAFFE_ENFORCE_EQ(
       status,
       ONNXIFI_STATUS_SUCCESS,
       "Reason: onnxSetIOAndRunGraph returned status code ",
-      statusString);
+      mapOnnxStatusToString(status));
 
   current_batch_size = extractOutputBatchSizes();
   onnxEventState eventState;

test/cpp/tensorexpr/test_reductions.cpp
Lines changed: 29 additions & 0 deletions

@@ -1978,5 +1978,34 @@ TEST(Reductions, ReductionVectorizeRfactor) {
   ASSERT_EQ(out_before[0], out_after[0]);
 }
 
+TEST(Reductions, InitFunction) {
+  KernelScope ks;
+  constexpr int M = 32;
+  constexpr int N = 16;
+  Placeholder A("A", kFloat, {M, N});
+  Placeholder B("B", kFloat, {N});
+  Tensor* C = Reduce(
+      "C",
+      {{N, "n"}},
+      Sum(),
+      [&](const std::vector<VarHandle>& v) { return B.load(v[0]); },
+      [&](const std::vector<VarHandle>& v) { return A.load(v[1], v[0]); },
+      {{M, "m"}});
+  LoopNest nest({C});
+  nest.prepareForCodegen();
+  Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+  std::ostringstream oss;
+  oss << *s << "\n";
+  const std::string& expected_ir =
+      R"IR(
+#CHECK: for (int n = 0; n < 16; n++) {
+#CHECK:   C[n] = B[n];
+#CHECK:   for (int m = 0; m < 32; m++) {
+#CHECK:     C[n] = (C[n]) + (A[n + 16 * m]);
+#CHECK:   }
+#CHECK: }
+)IR";
+  torch::jit::testing::FileCheck().run(expected_ir, oss.str());
+}
 } // namespace jit
 } // namespace torch
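The FileCheck pattern pins down the semantics the new init_func enables: each output element starts from B[n] instead of the reducer's constant initializer, i.e. C[n] = B[n] + Σ_m A[m][n]. A plain C++ reference loop equivalent to the generated IR (shapes taken from the test; this is ordinary C++, not tensorexpr code):

#include <array>
#include <cstdio>

int main() {
  constexpr int M = 32;
  constexpr int N = 16;
  std::array<std::array<float, N>, M> A{};  // A[m][n], zero-initialized
  std::array<float, N> B{};                 // per-element init values
  std::array<float, N> C{};

  for (int n = 0; n < N; n++) {
    C[n] = B[n];               // init_func: seed the accumulator from B
    for (int m = 0; m < M; m++) {
      C[n] += A[m][n];         // body_func: accumulate over the reduce axis
    }
  }
  std::printf("C[0] = %f\n", C[0]);
  return 0;
}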

tools/codegen/dest/register_dispatch_key.py
Lines changed: 8 additions & 1 deletion

@@ -277,9 +277,16 @@ def gen_class_set_output_body(self, k: SchemaKind) -> str:
         elif k is SchemaKind.inplace:
             return maybe_set_guard
         elif k is SchemaKind.out:
+            if self.dispatch_key == DispatchKey.CPU:
+                resize_impl = "resize_output_cpu"
+            else:
+                # Only bothering to include a resize_output fastpath for CPU for now.
+                # We can add one in for the perf if we need to. But it'll be easier when
+                # external backends have access to meta functions, and we can write one for resize_.
+                resize_impl = "resize_output"
             return f"""
 {maybe_set_guard}
-at::native::resize_output(outputs_[output_idx], sizes);
+at::native::{resize_impl}(outputs_[output_idx], sizes);
 if (!strides.empty()) {{
     TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
     at::native::as_strided_(outputs_[output_idx], sizes, strides);
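The codegen change only swaps the function name in the emitted set_output body; everything else stays identical across dispatch keys. A small illustrative mock of that decision (written in C++ for consistency with the other sketches here; the real logic is the Python above):

#include <iostream>
#include <string>

// Mock of gen_class_set_output_body's choice: CPU gets the fastpath,
// every other key keeps the dispatched resize_output.
std::string gen_resize_line(const std::string& dispatch_key) {
  const std::string resize_impl =
      dispatch_key == "CPU" ? "resize_output_cpu" : "resize_output";
  return "at::native::" + resize_impl + "(outputs_[output_idx], sizes);";
}

int main() {
  std::cout << gen_resize_line("CPU") << "\n";   // fastpath line
  std::cout << gen_resize_line("CUDA") << "\n";  // dispatched line
}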

torch/csrc/jit/tensorexpr/tensor.h
Lines changed: 20 additions & 2 deletions

@@ -173,11 +173,12 @@ inline void unpack_dim_args(
 }
 
 // Handle reductions over a Reducer and a body_func which produces values.
-template <typename BodyFunc>
+template <typename InitFunc, typename BodyFunc>
 Tensor* Reduce(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
+    const InitFunc& init_func,
     const BodyFunc& body_func,
     const std::vector<DimArg>& reduce_args) {
   std::vector<const Expr*> dims;

@@ -195,7 +196,8 @@ Tensor* Reduce(
   ExprHandle body =
       Reducer::getReduceBody(body_func, VarVectorToVarHandleVector(all_vars));
   std::vector<const Expr*> output_args(vars.begin(), vars.end());
-  const Expr* init_expr = new Cast(body.dtype(), reducer.initializer());
+  const Expr* init_expr = new Cast(
+      body.dtype(), init_func(VarVectorToVarHandleVector(vars)).node());
   Buf* func_result = new Buf(func_name, dims, body.dtype(), init_expr);
   const ReduceOp* reduce_op =
       reducer(func_result, body, output_args, reduce_vars);

@@ -204,6 +206,22 @@ Tensor* Reduce(
   return t;
 }
 
+template <typename BodyFunc>
+Tensor* Reduce(
+    const std::string& func_name,
+    const std::vector<DimArg>& dim_args,
+    const Reducer& reducer,
+    const BodyFunc& body_func,
+    const std::vector<DimArg>& reduce_args) {
+  return Reduce(
+      func_name,
+      dim_args,
+      reducer,
+      [&](ParameterList p) { return ExprHandle(reducer.initializer()); },
+      body_func,
+      reduce_args);
+}
+
 // Overload which allows inline lambda functions for the body_func.
 template <typename BodyFunc>
 Tensor* Reduce(
