ai-content-maker/.venv/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h

97 lines
4.0 KiB
C++

#pragma once
#include <c10/cuda/CUDAException.h>
#include <c10/macros/Macros.h>
namespace c10::cuda {
#ifdef TORCH_USE_CUDA_DSA
// Copy string from `src` to `dst`
static __device__ void dstrcpy(char* dst, const char* src) {
int i = 0;
// Copy string from source to destination, ensuring that it
// isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
*dst++ = *src++;
}
*dst = '\0';
}
static __device__ void dsa_add_new_assertion_failure(
DeviceAssertionsData* assertions_data,
const char* assertion_msg,
const char* filename,
const char* function_name,
const int line_number,
const uint32_t caller,
const dim3 block_id,
const dim3 thread_id) {
// `assertions_data` may be nullptr if device-side assertion checking
// is disabled at run-time. If it is disabled at compile time this
// function will never be called
if (!assertions_data) {
return;
}
// Atomically increment so other threads can fail at the same time
// Note that incrementing this means that the CPU can observe that
// a failure has happened and can begin to respond before we've
// written information about that failure out to the buffer.
const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
// At this point we're ran out of assertion buffer space.
// We could print a message about this, but that'd get
// spammy if a lot of threads did it, so we just silently
// ignore any other assertion failures. In most cases the
// failures will all probably be analogous anyway.
return;
}
// Write information about the assertion failure to memory.
// Note that this occurs only after the `assertion_count`
// increment broadcasts that there's been a problem.
auto& self = assertions_data->assertions[nid];
dstrcpy(self.assertion_msg, assertion_msg);
dstrcpy(self.filename, filename);
dstrcpy(self.function_name, function_name);
self.line_number = line_number;
self.caller = caller;
self.block_id[0] = block_id.x;
self.block_id[1] = block_id.y;
self.block_id[2] = block_id.z;
self.thread_id[0] = thread_id.x;
self.thread_id[1] = thread_id.y;
self.thread_id[2] = thread_id.z;
}
// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
// so you should assume everything the kernel produces is garbage if there's an
// assertion failure.
// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
// arguments of the kernel and therefore accessible.
#define CUDA_KERNEL_ASSERT2(condition) \
do { \
if (C10_UNLIKELY(!(condition))) { \
/* Has an atomic element so threads can fail at the same time */ \
c10::cuda::dsa_add_new_assertion_failure( \
assertions_data, \
C10_STRINGIZE(condition), \
__FILE__, \
__FUNCTION__, \
__LINE__, \
assertion_caller_id, \
blockIdx, \
threadIdx); \
/* Now that the kernel has failed we early exit the kernel, but */ \
/* otherwise keep going and rely on the host to check UVM and */ \
/* determine we've had a problem */ \
return; \
} \
} while (false)
#else
#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
#endif
} // namespace c10::cuda