
Commit 4ec6b36

2021-03-12 nightly release (997f05c)

Author: chronos_secgrp_pytorch_oss_ci_oncall
1 parent: f8c286b

File tree: 7 files changed, +85 −8 lines

aten/src/ATen/TensorIterator.cpp
Lines changed: 12 additions & 2 deletions

@@ -1288,7 +1288,12 @@ void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntAr
   // for the is_meta_ test.
   TORCH_INTERNAL_ASSERT(op.original_tensor.is_same(t));
   TORCH_INTERNAL_ASSERT(!op.tensor.is_same(t));
-  at::native::resize_output(op.tensor, sizes);
+  // fastpath CPU to skip a dispatcher trip
+  if (op.tensor.device().is_cpu()) {
+    at::native::resize_output_cpu(op.tensor, sizes);
+  } else {
+    at::native::resize_output(op.tensor, sizes);
+  }
   if (!strides.empty()) {
     TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
     op.tensor.as_strided_(sizes, strides);

@@ -1314,7 +1319,12 @@ void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayR
     }
     op.current_dtype = op.target_dtype;
   } else if (op.will_resize) {
-    at::native::resize_output(op.tensor, sizes);
+    // fastpath CPU to skip a dispatcher trip
+    if (op.tensor.device().is_cpu()) {
+      at::native::resize_output_cpu(op.tensor, sizes);
+    } else {
+      at::native::resize_output(op.tensor, sizes);
+    }
     if (!strides.empty()) {
       TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
       op.tensor.as_strided_(sizes, strides);
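The pattern in both hunks is a dispatcher-bypass fastpath: check the tensor's device tag and, on the hot CPU path, call the native resize implementation directly instead of re-entering the dispatcher. A minimal standalone sketch of the same idea (the types and helpers below are illustrative stand-ins, not the ATen API):

#include <iostream>
#include <vector>

// Hypothetical stand-ins for the two resize entry points.
enum class Device { CPU, CUDA };

struct Tensor {
  Device device;
  std::vector<long> sizes;
};

// Slow path: in ATen this goes through the dispatcher so that
// backend-specific resize_ kernels (CUDA, XLA, ...) get picked up.
void resize_output(Tensor& t, const std::vector<long>& shape) {
  std::cout << "dispatched resize\n";
  t.sizes = shape;
}

// Fast path: a direct call into the CPU kernel, skipping dispatch.
void resize_output_cpu(Tensor& t, const std::vector<long>& shape) {
  std::cout << "direct CPU resize\n";
  t.sizes = shape;
}

void set_output(Tensor& t, const std::vector<long>& shape) {
  // Same branch shape as in TensorIteratorBase::set_output above.
  if (t.device == Device::CPU) {
    resize_output_cpu(t, shape);
  } else {
    resize_output(t, shape);
  }
}

int main() {
  Tensor cpu{Device::CPU, {}};
  Tensor gpu{Device::CUDA, {}};
  set_output(cpu, {2, 3});  // prints "direct CPU resize"
  set_output(gpu, {2, 3});  // prints "dispatched resize"
}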

aten/src/ATen/native/Resize.cpp
Lines changed: 13 additions & 1 deletion

@@ -6,7 +6,7 @@
 
 namespace at { namespace native {
 
-void resize_output(Tensor& output, IntArrayRef shape) {
+void resize_output_check(Tensor& output, IntArrayRef shape) {
   // Tests for resizing of tensors with one or more elements
   if (output.numel() != 0 && !output.sizes().equals(shape)) {
     TORCH_WARN(
@@ -18,10 +18,22 @@ void resize_output(Tensor& output, IntArrayRef shape) {
       "reuse an out tensor t by resizing it, inplace, to zero elements with ",
       "t.resize_(0).");
   }
+}
 
+void resize_output(Tensor& output, IntArrayRef shape) {
+  resize_output_check(output, shape);
   output.resize_(shape);
 }
 
+// This is a performance escape hatch for resize_output.
+// It's CPU-only and it skips the dispatcher.
+// Ideally, once external backends have access to meta functions,
+// we can write one for resize_ and get rid of this.
+void resize_output_cpu(Tensor& output, IntArrayRef shape) {
+  resize_output_check(output, shape);
+  at::native::resize_(output, shape);
+}
+
 // Call the sparse implementation in SparseTensor.cpp directly.
 // A dynamic dispatch here is NOT necessary, so I didn't put
 // this function in native_functions.yaml
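After this refactor, resize_output_check carries the deprecation warning that fires when a non-empty out= tensor has the wrong shape, and both resize_output and the new CPU fastpath share it. A quick illustration of the behavior being guarded, using the ATen C++ API (shapes are arbitrary; assumes a libtorch build containing this commit):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor a = at::ones({4});
  at::Tensor b = at::ones({4});
  at::Tensor out = at::empty({3});  // non-empty and the wrong shape

  // add_out resizes `out` to {4}; along the way resize_output_check
  // emits the deprecation warning about resizing a non-empty out tensor.
  at::add_out(out, a, b);
  std::cout << out.sizes() << "\n";  // [4]
}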

aten/src/ATen/native/Resize.h
Lines changed: 2 additions & 0 deletions

@@ -18,6 +18,8 @@ namespace at { namespace native {
 // NOTE: In the future the warning will become an error
 TORCH_API void resize_output(Tensor& output, IntArrayRef shape);
 
+TORCH_API void resize_output_cpu(Tensor& output, IntArrayRef shape);
+
 // These functions are called by native::resize_ as well as (legacy) TH resize.
 // They are not in TH/THTensor.cpp because the at namespace is easier
 // to benchmark than TH; I can't get gbenchmark to call fns from THTensor.cpp

caffe2/opt/onnxifi_op.cc
Lines changed: 1 addition & 2 deletions

@@ -684,12 +684,11 @@ bool OnnxifiOp<CPUContext>::RunOnDevice() {
       output_desc_.data(),
       &output_fence,
       traces_.get());
-  const string statusString = mapOnnxStatusToString(status);
   CAFFE_ENFORCE_EQ(
       status,
       ONNXIFI_STATUS_SUCCESS,
       "Reason: onnxSetIOAndRunGraph returned status code ",
-      statusString);
+      mapOnnxStatusToString(status));
 
   current_batch_size = extractOutputBatchSizes();
   onnxEventState eventState;

test/cpp/tensorexpr/test_reductions.cpp
Lines changed: 29 additions & 0 deletions

@@ -1978,5 +1978,34 @@ TEST(Reductions, ReductionVectorizeRfactor) {
   ASSERT_EQ(out_before[0], out_after[0]);
 }
 
+TEST(Reductions, InitFunction) {
+  KernelScope ks;
+  constexpr int M = 32;
+  constexpr int N = 16;
+  Placeholder A("A", kFloat, {M, N});
+  Placeholder B("B", kFloat, {N});
+  Tensor* C = Reduce(
+      "C",
+      {{N, "n"}},
+      Sum(),
+      [&](const std::vector<VarHandle>& v) { return B.load(v[0]); },
+      [&](const std::vector<VarHandle>& v) { return A.load(v[1], v[0]); },
+      {{M, "m"}});
+  LoopNest nest({C});
+  nest.prepareForCodegen();
+  Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+  std::ostringstream oss;
+  oss << *s << "\n";
+  const std::string& expected_ir =
+      R"IR(
+#CHECK: for (int n = 0; n < 16; n++) {
+#CHECK:   C[n] = B[n];
+#CHECK:   for (int m = 0; m < 32; m++) {
+#CHECK:     C[n] = (C[n]) + (A[n + 16 * m]);
+#CHECK:   }
+#CHECK: }
+)IR";
+  torch::jit::testing::FileCheck().run(expected_ir, oss.str());
+}
 } // namespace jit
 } // namespace torch
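The FileCheck pattern pins down the semantics the new init_func enables: each output element starts from B[n] instead of the reducer's constant initializer, i.e. C[n] = B[n] + Σ_m A[m][n]. A plain C++ reference loop equivalent to the generated IR (shapes taken from the test; this is ordinary C++, not tensorexpr code):

#include <array>
#include <cstdio>

int main() {
  constexpr int M = 32;
  constexpr int N = 16;
  std::array<std::array<float, N>, M> A{};  // A[m][n], zero-initialized
  std::array<float, N> B{};                 // per-element init values
  std::array<float, N> C{};

  for (int n = 0; n < N; n++) {
    C[n] = B[n];               // init_func: seed the accumulator from B
    for (int m = 0; m < M; m++) {
      C[n] += A[m][n];         // body_func: accumulate over the reduce axis
    }
  }
  std::printf("C[0] = %f\n", C[0]);
  return 0;
}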

tools/codegen/dest/register_dispatch_key.py
Lines changed: 8 additions & 1 deletion

@@ -277,9 +277,16 @@ def gen_class_set_output_body(self, k: SchemaKind) -> str:
         elif k is SchemaKind.inplace:
             return maybe_set_guard
         elif k is SchemaKind.out:
+            if self.dispatch_key == DispatchKey.CPU:
+                resize_impl = "resize_output_cpu"
+            else:
+                # Only bothering to include a resize_output fastpath for CPU for now.
+                # We can add one in for the perf if we need to. But it'll be easier when
+                # external backends have access to meta functions, and we can write one for resize_.
+                resize_impl = "resize_output"
             return f"""
 {maybe_set_guard}
-at::native::resize_output(outputs_[output_idx], sizes);
+at::native::{resize_impl}(outputs_[output_idx], sizes);
 if (!strides.empty()) {{
     TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
     at::native::as_strided_(outputs_[output_idx], sizes, strides);
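The codegen change only swaps the function name in the emitted set_output body; everything else stays identical across dispatch keys. A small illustrative mock of that decision (written in C++ for consistency with the other sketches here; the real logic is the Python above):

#include <iostream>
#include <string>

// Mock of gen_class_set_output_body's choice: CPU gets the fastpath,
// every other key keeps the dispatched resize_output.
std::string gen_resize_line(const std::string& dispatch_key) {
  const std::string resize_impl =
      dispatch_key == "CPU" ? "resize_output_cpu" : "resize_output";
  return "at::native::" + resize_impl + "(outputs_[output_idx], sizes);";
}

int main() {
  std::cout << gen_resize_line("CPU") << "\n";   // fastpath line
  std::cout << gen_resize_line("CUDA") << "\n";  // dispatched line
}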

torch/csrc/jit/tensorexpr/tensor.h
Lines changed: 20 additions & 2 deletions

@@ -173,11 +173,12 @@ inline void unpack_dim_args(
 }
 
 // Handle reductions over a Reducer and a body_func which produces values.
-template <typename BodyFunc>
+template <typename InitFunc, typename BodyFunc>
 Tensor* Reduce(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
+    const InitFunc& init_func,
     const BodyFunc& body_func,
     const std::vector<DimArg>& reduce_args) {
   std::vector<const Expr*> dims;

@@ -195,7 +196,8 @@ Tensor* Reduce(
   ExprHandle body =
       Reducer::getReduceBody(body_func, VarVectorToVarHandleVector(all_vars));
   std::vector<const Expr*> output_args(vars.begin(), vars.end());
-  const Expr* init_expr = new Cast(body.dtype(), reducer.initializer());
+  const Expr* init_expr = new Cast(
+      body.dtype(), init_func(VarVectorToVarHandleVector(vars)).node());
   Buf* func_result = new Buf(func_name, dims, body.dtype(), init_expr);
   const ReduceOp* reduce_op =
       reducer(func_result, body, output_args, reduce_vars);

@@ -204,6 +206,22 @@ Tensor* Reduce(
   return t;
 }
 
+template <typename BodyFunc>
+Tensor* Reduce(
+    const std::string& func_name,
+    const std::vector<DimArg>& dim_args,
+    const Reducer& reducer,
+    const BodyFunc& body_func,
+    const std::vector<DimArg>& reduce_args) {
+  return Reduce(
+      func_name,
+      dim_args,
+      reducer,
+      [&](ParameterList p) { return ExprHandle(reducer.initializer()); },
+      body_func,
+      reduce_args);
+}
+
 // Overload which allows inline lambda functions for the body_func.
 template <typename BodyFunc>
 Tensor* Reduce(
