#pragma once

#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/macros/Macros.h>

namespace c10::cuda {

#ifdef TORCH_USE_CUDA_DSA
// Copy string from `src` to `dst`
static __device__ void dstrcpy(char* dst, const char* src) {
  int i = 0;
  // Copy string from source to destination, ensuring that it
  // isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
  while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
    *dst++ = *src++;
  }
  *dst = '\0';
}

static __device__ void dsa_add_new_assertion_failure(
    DeviceAssertionsData* assertions_data,
    const char* assertion_msg,
    const char* filename,
    const char* function_name,
    const int line_number,
    const uint32_t caller,
    const dim3 block_id,
    const dim3 thread_id) {
  // `assertions_data` may be nullptr if device-side assertion checking
  // is disabled at run-time. If it is disabled at compile time, this
  // function will never be called.
  if (!assertions_data) {
    return;
  }

  // Atomically increment so other threads can fail at the same time.
  // Note that incrementing this means that the CPU can observe that
  // a failure has happened and can begin to respond before we've
  // written information about that failure out to the buffer.
  const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);

  if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
    // At this point we've run out of assertion buffer space.
    // We could print a message about this, but that'd get
    // spammy if a lot of threads did it, so we just silently
    // ignore any other assertion failures. In most cases the
    // failures will all probably be analogous anyway.
    return;
  }

  // Write information about the assertion failure to memory.
  // Note that this occurs only after the `assertion_count`
  // increment broadcasts that there's been a problem.
  auto& self = assertions_data->assertions[nid];
  dstrcpy(self.assertion_msg, assertion_msg);
  dstrcpy(self.filename, filename);
  dstrcpy(self.function_name, function_name);
  self.line_number = line_number;
  self.caller = caller;
  self.block_id[0] = block_id.x;
  self.block_id[1] = block_id.y;
  self.block_id[2] = block_id.z;
  self.thread_id[0] = thread_id.x;
  self.thread_id[1] = thread_id.y;
  self.thread_id[2] = thread_id.z;
}

// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
// so you should assume everything the kernel produces is garbage if there's an
// assertion failure.
// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
// arguments of the kernel and therefore accessible.
#define CUDA_KERNEL_ASSERT2(condition)                                   \
  do {                                                                   \
    if (C10_UNLIKELY(!(condition))) {                                    \
      /* Has an atomic element so threads can fail at the same time */   \
      c10::cuda::dsa_add_new_assertion_failure(                          \
          assertions_data,                                               \
          C10_STRINGIZE(condition),                                      \
          __FILE__,                                                      \
          __FUNCTION__,                                                  \
          __LINE__,                                                      \
          assertion_caller_id,                                           \
          blockIdx,                                                      \
          threadIdx);                                                    \
      /* Now that the kernel has failed we early exit the kernel, but */ \
      /* otherwise keep going and rely on the host to check UVM and */   \
      /* determine we've had a problem */                                \
      return;                                                            \
    }                                                                    \
  } while (false)
#else
#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
#endif

} // namespace c10::cuda
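
// Example usage (illustrative sketch only; `sqrt_kernel`, its parameters, and
// the precondition shown are hypothetical and not part of this header).
// `CUDA_KERNEL_ASSERT2` expands to code that references `assertions_data` and
// `assertion_caller_id`, so both must be in scope at the assertion site,
// typically passed in as extra kernel arguments by the host-side launch code:
//
//   __global__ void sqrt_kernel(
//       const float* in,
//       float* out,
//       int n,
//       c10::cuda::DeviceAssertionsData* assertions_data,
//       uint32_t assertion_caller_id) {
//     const int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i >= n) {
//       return;
//     }
//     // Record a failure and early-return from this thread if the input
//     // violates the kernel's precondition; the host later reads the
//     // assertion buffer (via UVM) to surface the error.
//     CUDA_KERNEL_ASSERT2(in[i] >= 0.0f);
//     out[i] = sqrtf(in[i]);
//   }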