97 lines
4.0 KiB
C
97 lines
4.0 KiB
C
|
#pragma once
|
||
|
|
||
|
#include <c10/cuda/CUDAException.h>
|
||
|
#include <c10/macros/Macros.h>
|
||
|
|
||
|
namespace c10::cuda {
|
||
|
|
||
|
#ifdef TORCH_USE_CUDA_DSA
|
||
|
// Copy string from `src` to `dst`
|
||
|
static __device__ void dstrcpy(char* dst, const char* src) {
|
||
|
int i = 0;
|
||
|
// Copy string from source to destination, ensuring that it
|
||
|
// isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
|
||
|
while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
|
||
|
*dst++ = *src++;
|
||
|
}
|
||
|
*dst = '\0';
|
||
|
}
|
||
|
|
||
|
static __device__ void dsa_add_new_assertion_failure(
|
||
|
DeviceAssertionsData* assertions_data,
|
||
|
const char* assertion_msg,
|
||
|
const char* filename,
|
||
|
const char* function_name,
|
||
|
const int line_number,
|
||
|
const uint32_t caller,
|
||
|
const dim3 block_id,
|
||
|
const dim3 thread_id) {
|
||
|
// `assertions_data` may be nullptr if device-side assertion checking
|
||
|
// is disabled at run-time. If it is disabled at compile time this
|
||
|
// function will never be called
|
||
|
if (!assertions_data) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// Atomically increment so other threads can fail at the same time
|
||
|
// Note that incrementing this means that the CPU can observe that
|
||
|
// a failure has happened and can begin to respond before we've
|
||
|
// written information about that failure out to the buffer.
|
||
|
const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
|
||
|
|
||
|
if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
|
||
|
// At this point we're ran out of assertion buffer space.
|
||
|
// We could print a message about this, but that'd get
|
||
|
// spammy if a lot of threads did it, so we just silently
|
||
|
// ignore any other assertion failures. In most cases the
|
||
|
// failures will all probably be analogous anyway.
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// Write information about the assertion failure to memory.
|
||
|
// Note that this occurs only after the `assertion_count`
|
||
|
// increment broadcasts that there's been a problem.
|
||
|
auto& self = assertions_data->assertions[nid];
|
||
|
dstrcpy(self.assertion_msg, assertion_msg);
|
||
|
dstrcpy(self.filename, filename);
|
||
|
dstrcpy(self.function_name, function_name);
|
||
|
self.line_number = line_number;
|
||
|
self.caller = caller;
|
||
|
self.block_id[0] = block_id.x;
|
||
|
self.block_id[1] = block_id.y;
|
||
|
self.block_id[2] = block_id.z;
|
||
|
self.thread_id[0] = thread_id.x;
|
||
|
self.thread_id[1] = thread_id.y;
|
||
|
self.thread_id[2] = thread_id.z;
|
||
|
}
|
||
|
|
||
|
// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
|
||
|
// so you should assume everything the kernel produces is garbage if there's an
|
||
|
// assertion failure.
|
||
|
// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
|
||
|
// arguments of the kernel and therefore accessible.
|
||
|
#define CUDA_KERNEL_ASSERT2(condition) \
|
||
|
do { \
|
||
|
if (C10_UNLIKELY(!(condition))) { \
|
||
|
/* Has an atomic element so threads can fail at the same time */ \
|
||
|
c10::cuda::dsa_add_new_assertion_failure( \
|
||
|
assertions_data, \
|
||
|
C10_STRINGIZE(condition), \
|
||
|
__FILE__, \
|
||
|
__FUNCTION__, \
|
||
|
__LINE__, \
|
||
|
assertion_caller_id, \
|
||
|
blockIdx, \
|
||
|
threadIdx); \
|
||
|
/* Now that the kernel has failed we early exit the kernel, but */ \
|
||
|
/* otherwise keep going and rely on the host to check UVM and */ \
|
||
|
/* determine we've had a problem */ \
|
||
|
return; \
|
||
|
} \
|
||
|
} while (false)
|
||
|
#else
|
||
|
#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
|
||
|
#endif
|
||
|
|
||
|
} // namespace c10::cuda
|