ai-content-maker/.venv/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h

#pragma once

#include <c10/cuda/CUDAException.h>
#include <c10/macros/Macros.h>

namespace c10::cuda {

#ifdef TORCH_USE_CUDA_DSA
// Bounded device-side string copy from `src` into `dst`.
// At most `C10_CUDA_DSA_MAX_STR_LEN - 1` characters are copied and a
// terminating NUL is always written after the last copied character.
static __device__ void dstrcpy(char* dst, const char* src) {
  // Largest number of non-NUL characters we are willing to copy
  const int char_limit = C10_CUDA_DSA_MAX_STR_LEN - 1;
  int pos = 0;
  // Stop at the end of `src` or once the limit is reached,
  // whichever happens first
  for (; src[pos] != '\0' && pos < char_limit; ++pos) {
    dst[pos] = src[pos];
  }
  // Terminate the (possibly truncated) copy
  dst[pos] = '\0';
}

// Records one device-side assertion failure into `assertions_data`.
//
// Parameters:
//   assertions_data - buffer that receives the failure record; may be
//                     nullptr, in which case the call does nothing
//   assertion_msg   - text of the failed condition (copied via `dstrcpy`)
//   filename        - source file of the assertion (copied via `dstrcpy`)
//   function_name   - enclosing function name (copied via `dstrcpy`)
//   line_number     - source line of the assertion
//   caller          - caller id stored verbatim in the record
//   block_id        - x/y/z block index of the failing thread
//   thread_id       - x/y/z thread index of the failing thread
//
// Once `C10_CUDA_DSA_ASSERTION_COUNT` slots have been claimed, further
// failures are still counted but no longer recorded.
static __device__ void dsa_add_new_assertion_failure(
    DeviceAssertionsData* assertions_data,
    const char* assertion_msg,
    const char* filename,
    const char* function_name,
    const int line_number,
    const uint32_t caller,
    const dim3 block_id,
    const dim3 thread_id) {
  // `assertions_data` may be nullptr if device-side assertion checking
  // is disabled at run-time. If it is disabled at compile time this
  // function will never be called
  if (!assertions_data) {
    return;
  }

  // Atomically increment so other threads can fail at the same time
  // Note that incrementing this means that the CPU can observe that
  // a failure has happened and can begin to respond before we've
  // written information about that failure out to the buffer.
  const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);

  // Compare as unsigned: if the counter is a signed integer (assumed here,
  // TODO confirm against the `DeviceAssertionsData` declaration) and enough
  // failures have been counted for it to wrap around, `nid` is negative.
  // A plain signed `>=` would let that through and index before the start
  // of `assertions`; converted to unsigned, a negative value becomes huge
  // and is rejected like any other out-of-range slot.
  if (static_cast<unsigned long long>(nid) >=
      static_cast<unsigned long long>(C10_CUDA_DSA_ASSERTION_COUNT)) {
    // At this point we've run out of assertion buffer space.
    // We could print a message about this, but that'd get
    // spammy if a lot of threads did it, so we just silently
    // ignore any other assertion failures. In most cases the
    // failures will all probably be analogous anyway.
    return;
  }

  // Write information about the assertion failure to memory.
  // Note that this occurs only after the `assertion_count`
  // increment broadcasts that there's been a problem.
  auto& self = assertions_data->assertions[nid];
  dstrcpy(self.assertion_msg, assertion_msg);
  dstrcpy(self.filename, filename);
  dstrcpy(self.function_name, function_name);
  self.line_number = line_number;
  self.caller = caller;
  self.block_id[0] = block_id.x;
  self.block_id[1] = block_id.y;
  self.block_id[2] = block_id.z;
  self.thread_id[0] = thread_id.x;
  self.thread_id[1] = thread_id.y;
  self.thread_id[2] = thread_id.z;
}

// Emulates a kernel assertion. When `condition` is false, the failure is
// recorded via `dsa_add_new_assertion_failure` and the enclosing function
// returns early. Nothing in this macro aborts the rest of the launch, so
// you should assume everything the kernel produces is garbage if there's an
// assertion failure.
// NOTE: This assumes that `assertions_data` and  `assertion_caller_id` are
//       arguments of the kernel and therefore accessible.
// NOTE: The expansion contains a bare `return;`, so the macro can only be
//       used directly inside a function returning `void`.
#define CUDA_KERNEL_ASSERT2(condition)                                   \
  do {                                                                   \
    if (C10_UNLIKELY(!(condition))) {                                    \
      /* Has an atomic element so threads can fail at the same time */   \
      c10::cuda::dsa_add_new_assertion_failure(                          \
          assertions_data,                                               \
          C10_STRINGIZE(condition),                                      \
          __FILE__,                                                      \
          __FUNCTION__,                                                  \
          __LINE__,                                                      \
          assertion_caller_id,                                           \
          blockIdx,                                                      \
          threadIdx);                                                    \
      /* Now that the kernel has failed we early exit the kernel, but */ \
      /* otherwise keep going and rely on the host to check UVM and */   \
      /* determine we've had a problem */                                \
      return;                                                            \
    }                                                                    \
  } while (false)
#else
// Without TORCH_USE_CUDA_DSA the macro is a plain `assert(condition)`: no
// failure record is written and no early `return` is inserted.
#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
#endif

} // namespace c10::cuda
first commit 2024-05-03 04:18:51 +03:00			`#pragma once`

			`#include <c10/cuda/CUDAException.h>`
			`#include <c10/macros/Macros.h>`

			`namespace c10::cuda {`

			`#ifdef TORCH_USE_CUDA_DSA`
			// Copy string from `src` to `dst`
			`static __device__ void dstrcpy(char* dst, const char* src) {`
			`int i = 0;`
			`// Copy string from source to destination, ensuring that it`
			// isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
			`while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {`
			`*dst++ = *src++;`
			`}`
			`*dst = '\0';`
			`}`

			`static __device__ void dsa_add_new_assertion_failure(`
			`DeviceAssertionsData* assertions_data,`
			`const char* assertion_msg,`
			`const char* filename,`
			`const char* function_name,`
			`const int line_number,`
			`const uint32_t caller,`
			`const dim3 block_id,`
			`const dim3 thread_id) {`
			// `assertions_data` may be nullptr if device-side assertion checking
			`// is disabled at run-time. If it is disabled at compile time this`
			`// function will never be called`
			`if (!assertions_data) {`
			`return;`
			`}`

			`// Atomically increment so other threads can fail at the same time`
			`// Note that incrementing this means that the CPU can observe that`
			`// a failure has happened and can begin to respond before we've`
			`// written information about that failure out to the buffer.`
			`const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);`

			`if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {`
			`// At this point we're ran out of assertion buffer space.`
			`// We could print a message about this, but that'd get`
			`// spammy if a lot of threads did it, so we just silently`
			`// ignore any other assertion failures. In most cases the`
			`// failures will all probably be analogous anyway.`
			`return;`
			`}`

			`// Write information about the assertion failure to memory.`
			// Note that this occurs only after the `assertion_count`
			`// increment broadcasts that there's been a problem.`
			`auto& self = assertions_data->assertions[nid];`
			`dstrcpy(self.assertion_msg, assertion_msg);`
			`dstrcpy(self.filename, filename);`
			`dstrcpy(self.function_name, function_name);`
			`self.line_number = line_number;`
			`self.caller = caller;`
			`self.block_id[0] = block_id.x;`
			`self.block_id[1] = block_id.y;`
			`self.block_id[2] = block_id.z;`
			`self.thread_id[0] = thread_id.x;`
			`self.thread_id[1] = thread_id.y;`
			`self.thread_id[2] = thread_id.z;`
			`}`

			`// Emulates a kernel assertion. The assertion won't stop the kernel's progress,`
			`// so you should assume everything the kernel produces is garbage if there's an`
			`// assertion failure.`
			// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
			`// arguments of the kernel and therefore accessible.`
			`#define CUDA_KERNEL_ASSERT2(condition) \`
			`do { \`
			`if (C10_UNLIKELY(!(condition))) { \`
			`/* Has an atomic element so threads can fail at the same time */ \`
			`c10::cuda::dsa_add_new_assertion_failure( \`
			`assertions_data, \`
			`C10_STRINGIZE(condition), \`
			`__FILE__, \`
			`__FUNCTION__, \`
			`__LINE__, \`
			`assertion_caller_id, \`
			`blockIdx, \`
			`threadIdx); \`
			`/* Now that the kernel has failed we early exit the kernel, but */ \`
			`/* otherwise keep going and rely on the host to check UVM and */ \`
			`/* determine we've had a problem */ \`
			`return; \`
			`} \`
			`} while (false)`
			`#else`
			`#define CUDA_KERNEL_ASSERT2(condition) assert(condition)`
			`#endif`

			`} // namespace c10::cuda`