#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>

#include <limits>
#include <optional>

namespace c10::cuda {
namespace {
// returns -1 on failure
int32_t driver_version() {
int driver_version = -1;
C10_CUDA_IGNORE_ERROR(cudaDriverGetVersion(&driver_version));
return driver_version;
}
int device_count_impl(bool fail_if_no_driver) {
int count = 0;
auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDeviceCount(&count));
if (err == cudaSuccess) {
return count;
}
// Clear out the error state, so we don't spuriously trigger someone else.
// (This shouldn't really matter, since we won't be running very much CUDA
// code in this regime.)
cudaError_t last_err C10_UNUSED = cudaGetLastError();
switch (err) {
case cudaErrorNoDevice:
// Zero devices is ok here
count = 0;
break;
case cudaErrorInsufficientDriver: {
auto version = driver_version();
if (version <= 0) {
if (!fail_if_no_driver) {
// No CUDA driver means no devices
count = 0;
break;
}
TORCH_CHECK(
false,
"Found no NVIDIA driver on your system. Please check that you "
"have an NVIDIA GPU and installed a driver from "
"http://www.nvidia.com/Download/index.aspx");
} else {
TORCH_CHECK(
false,
"The NVIDIA driver on your system is too old (found version ",
version,
"). Please update your GPU driver by downloading and installing "
"a new version from the URL: "
"http://www.nvidia.com/Download/index.aspx Alternatively, go to: "
"https://pytorch.org to install a PyTorch version that has been "
"compiled with your version of the CUDA driver.");
}
} break;
case cudaErrorInitializationError:
TORCH_CHECK(
false,
"CUDA driver initialization failed, you might not "
"have a CUDA gpu.");
break;
case cudaErrorUnknown:
TORCH_CHECK(
false,
"CUDA unknown error - this may be due to an "
"incorrectly set up environment, e.g. changing env "
"variable CUDA_VISIBLE_DEVICES after program start. "
"Setting the available devices to be zero.");
break;
#if C10_ASAN_ENABLED
case cudaErrorMemoryAllocation:
// In ASAN mode, we know that a cudaErrorMemoryAllocation error will
// pop up if compiled with NVCC (clang-cuda is fine)
TORCH_CHECK(
false,
"Got 'out of memory' error while trying to initialize CUDA. "
"CUDA with nvcc does not work well with ASAN and it's probably "
"the reason. We will simply shut down CUDA support. If you "
"would like to use GPUs, turn off ASAN.");
break;
#endif // C10_ASAN_ENABLED
default:
TORCH_CHECK(
false,
"Unexpected error from cudaGetDeviceCount(). Did you run "
"some cuda functions before calling NumCudaDevices() "
"that might have already set an error? Error ",
err,
": ",
cudaGetErrorString(err));
}
return count;
}
} // namespace
DeviceIndex device_count() noexcept {
// initialize number of devices only once
static int count = []() {
try {
auto result = device_count_impl(/*fail_if_no_driver=*/false);
TORCH_INTERNAL_ASSERT(
result <= std::numeric_limits<DeviceIndex>::max(),
"Too many CUDA devices, DeviceIndex overflowed");
return result;
} catch (const c10::Error& ex) {
// We don't want to fail, but still log the warning
// msg() returns the message without the stack trace
TORCH_WARN("CUDA initialization: ", ex.msg());
return 0;
}
}();
return static_cast<DeviceIndex>(count);
}
DeviceIndex device_count_ensure_non_zero() {
// Call the implementation every time to throw the exception
int count = device_count_impl(/*fail_if_no_driver=*/true);
// A zero device count doesn't produce a warning in `device_count`, but we fail here
TORCH_CHECK(count, "No CUDA GPUs are available");
TORCH_INTERNAL_ASSERT(
count <= std::numeric_limits<DeviceIndex>::max(),
"Too many CUDA devices, DeviceIndex overflowed");
return static_cast<DeviceIndex>(count);
}
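// A minimal usage sketch (illustrative only, not part of this file): callers
// that can fall back to CPU use `device_count()`, which warns and returns 0
// on initialization failure, while callers that require a GPU use
// `device_count_ensure_non_zero()`, which throws instead.
//
// ```
// if (c10::cuda::device_count() == 0) {
//   // proceed without CUDA
// }
// auto n = c10::cuda::device_count_ensure_non_zero(); // throws if no GPU
// ```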
DeviceIndex current_device() {
DeviceIndex cur_device = -1;
C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device));
return cur_device;
}
void set_device(DeviceIndex device) {
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
}
void device_synchronize() {
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_device_synchronization(c10::kCUDA);
}
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
// This function must be called by callers that perform synchronizing CUDA
// operations, so that the proper error or warning is raised.
void warn_or_error_on_sync() {
if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) {
TORCH_CHECK(false, "called a synchronizing CUDA operation");
} else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) {
TORCH_WARN("called a synchronizing CUDA operation");
}
}
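// An illustrative sketch of the expected call pattern (the wrapper name is
// hypothetical, not an API in this file): a synchronizing operation reports
// itself before blocking, so sync_debug_mode can warn or throw.
//
// ```
// void my_stream_synchronize(cudaStream_t stream) {
//   c10::cuda::warn_or_error_on_sync();
//   C10_CUDA_CHECK(cudaStreamSynchronize(stream));
// }
// ```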
std::optional<DeviceIndex> getDeviceIndexWithPrimaryContext() {
// check current device first
auto current_device_index = current_device();
if (current_device_index >= 0) {
if (hasPrimaryContext(current_device_index)) {
return current_device_index;
}
}
for (const auto device_index : c10::irange(device_count())) {
if (device_index == current_device_index)
continue;
if (hasPrimaryContext(device_index)) {
return device_index;
}
}
return std::nullopt;
}
namespace _internal {
bool dummyHasPrimaryContext(C10_UNUSED DeviceIndex device_index) {
TORCH_CHECK(false, "Should never been called");
}
bool (*hasPrimaryContext)(DeviceIndex) = dummyHasPrimaryContext;
// Private API to be called from CUDAHooks.cpp
C10_CUDA_API void setHasPrimaryContext(bool (*func)(DeviceIndex)) {
hasPrimaryContext = func ? func : dummyHasPrimaryContext;
}
} // namespace _internal
bool hasPrimaryContext(DeviceIndex device_index) {
return _internal::hasPrimaryContext(device_index);
}
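// An illustrative registration sketch (`hasPrimaryContextImpl` is a
// hypothetical name; per the comment above, the real call site is
// CUDAHooks.cpp, which can query the driver for the primary context state):
//
// ```
// static bool hasPrimaryContextImpl(c10::DeviceIndex device_index) {
//   bool active = /* query the driver's primary context state */ false;
//   return active;
// }
// c10::cuda::_internal::setHasPrimaryContext(&hasPrimaryContextImpl);
// ```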
// Wrappers for raw CUDA device management functions
cudaError_t GetDeviceCount(int* dev_count) {
return cudaGetDeviceCount(dev_count);
}
// This is a codepath for CUDA 12 that comes with a critical change in the
// behavior of `cudaSetDevice`. Unlike previous CUDA versions, which allocate
// the context lazily, CUDA 12.x eagerly allocates the primary context the
// moment `cudaSetDevice` is called. This can have dramatic consequences and
// pollute device memory in distributed runs. To avoid unnecessary context
// creation, a new function called `MaybeSetDevice` was introduced. It is to
// be called in the device guard destructor and at the exit of the
// torch.cuda.device context manager. Its behavior is quite simple: it calls
// `cudaSetDevice` if a context already exists on the targeted device;
// otherwise it just saves the device index. This way we keep PyTorch
// backward compatible for applications like this:
//
// ```
// import torch
// x = torch.empty(1, device="cuda:1")  # no CUDA context on cuda:0 after this call
// y = torch.empty(1, device="cuda")    # CUDA context is created on cuda:0
// ```
#if CUDA_VERSION >= 12000
thread_local DeviceIndex targetDeviceIndex = -1;
cudaError_t GetDevice(DeviceIndex* device) {
if (targetDeviceIndex >= 0) {
*device = targetDeviceIndex;
return cudaSuccess;
}
int tmp_device = -1;
auto err = cudaGetDevice(&tmp_device);
if (err == cudaSuccess) {
TORCH_INTERNAL_ASSERT(
tmp_device >= 0 &&
tmp_device <= std::numeric_limits<DeviceIndex>::max(),
"cudaGetDevice returns invalid device ",
tmp_device);
*device = static_cast<DeviceIndex>(tmp_device);
}
return err;
}
cudaError_t SetDevice(DeviceIndex device) {
TORCH_CHECK(device >= 0, "device id must be non-negative!", device);
targetDeviceIndex = -1;
int cur_device = -1;
C10_CUDA_CHECK(cudaGetDevice(&cur_device));
if (device == cur_device) {
return cudaSuccess;
}
return cudaSetDevice(device);
}
cudaError_t MaybeSetDevice(DeviceIndex device) {
if (hasPrimaryContext(device)) {
return c10::cuda::SetDevice(device);
}
targetDeviceIndex = device;
return cudaSuccess;
}
// This function always initializes the CUDA context
// on to_device
DeviceIndex ExchangeDevice(DeviceIndex to_device) {
auto cur_device = targetDeviceIndex;
targetDeviceIndex = -1;
if (cur_device < 0) {
int tmp_device = -1;
C10_CUDA_CHECK(cudaGetDevice(&tmp_device));
cur_device = static_cast<DeviceIndex>(tmp_device);
if (to_device == cur_device) {
return cur_device;
}
}
C10_CUDA_CHECK(cudaSetDevice(to_device));
return cur_device;
}
// This function does not initialize the CUDA context
// on to_device if it does not already exist
DeviceIndex MaybeExchangeDevice(DeviceIndex to_device) {
int tmp_cur_device = -1;
C10_CUDA_CHECK(cudaGetDevice(&tmp_cur_device));
TORCH_INTERNAL_ASSERT(
tmp_cur_device >= 0 &&
tmp_cur_device <= std::numeric_limits<DeviceIndex>::max(),
"cudaGetDevice returns invalid device ",
tmp_cur_device);
auto cur_device = static_cast<DeviceIndex>(tmp_cur_device);
if (to_device == tmp_cur_device) {
return cur_device;
}
if (hasPrimaryContext(to_device)) {
C10_CUDA_CHECK(cudaSetDevice(to_device));
} else {
targetDeviceIndex = to_device;
}
return cur_device;
}
void SetTargetDevice() {
if (targetDeviceIndex >= 0) {
C10_CUDA_CHECK(c10::cuda::SetDevice(targetDeviceIndex));
}
}
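// An illustrative sequence for the deferred mechanism above (a sketch,
// assuming CUDA >= 12 and no primary context on device 1 yet):
//
// ```
// c10::cuda::MaybeSetDevice(1);  // no context on device 1: records the target
// c10::DeviceIndex d = -1;
// c10::cuda::GetDevice(&d);      // reports d == 1 without creating a context
// c10::cuda::SetTargetDevice();  // now actually calls cudaSetDevice(1)
// ```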
#else
cudaError_t GetDevice(DeviceIndex* device) {
int tmp_device = -1;
auto err = cudaGetDevice(&tmp_device);
if (err == cudaSuccess) {
TORCH_INTERNAL_ASSERT(
tmp_device >= 0 &&
tmp_device <= std::numeric_limits<DeviceIndex>::max(),
"cudaGetDevice returns invalid device ",
tmp_device);
*device = static_cast<DeviceIndex>(tmp_device);
}
return err;
}
cudaError_t SetDevice(DeviceIndex device) {
TORCH_CHECK(device >= 0, "device id must be non-negative!", device);
int cur_device = -1;
C10_CUDA_CHECK(cudaGetDevice(&cur_device));
if (device == cur_device) {
return cudaSuccess;
}
return cudaSetDevice(device);
}
cudaError_t MaybeSetDevice(DeviceIndex device) {
return c10::cuda::SetDevice(device);
}
DeviceIndex ExchangeDevice(DeviceIndex to_device) {
DeviceIndex cur_device = -1;
C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device));
if (to_device == cur_device) {
return cur_device;
}
C10_CUDA_CHECK(cudaSetDevice(to_device));
return cur_device;
}
DeviceIndex MaybeExchangeDevice(DeviceIndex to_device) {
return c10::cuda::ExchangeDevice(to_device);
}
void SetTargetDevice() {
// no-op on CUDA version < 12.x
}
#endif
} // namespace c10::cuda