ai-content-maker/.venv/Lib/site-packages/torch/include/ATen/Context.h

#pragma once

#include <ATen/CPUGeneratorImpl.h>
#include <ATen/DeviceAccelerator.h>
#include <ATen/LinalgBackend.h>
#include <ATen/core/ATenGeneral.h>
#include <ATen/core/DeprecatedTypeProperties.h>
#include <ATen/core/Generator.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/detail/HIPHooksInterface.h>
#include <ATen/detail/IPUHooksInterface.h>
#include <ATen/detail/MPSHooksInterface.h>
#include <ATen/detail/MTIAHooksInterface.h>
#include <ATen/detail/ORTHooksInterface.h>
#include <ATen/detail/PrivateUse1HooksInterface.h>
#include <ATen/detail/XPUHooksInterface.h>
#include <c10/core/QEngine.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>

#include <cstdint>
#include <mutex>

namespace at {

class Tensor;

// Precision level for float32 matrix multiplication. The current value is
// stored in Context::float32_matmul_precision and accessed through
// Context::float32MatmulPrecision() / Context::setFloat32MatmulPrecision().
// NOTE(review): the numerical meaning of each level is decided by the
// implementation, not by this header -- confirm before relying on it.
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };

// Holder for ATen's runtime configuration flags (cuDNN, TF32, determinism,
// SDP kernel selection, quantized engine, ...) together with backend
// availability queries and the lazy, once-only backend initializers.
// The free functions later in this header reach it through globalContext().
class TORCH_API Context {
 public:
  Context();

  // Returns the default random number generator for `device`.
  // CUDA and HIP are lazily initialized first when `device` has that type.
  // Device types without a branch below end in AT_ERROR; note that there is
  // no kHIP branch, so a HIP device type reaches the error after init.
  // NOTE(review): unlike getDeviceFromPtr(), XPU is not lazily initialized
  // here even though kXPU is handled -- confirm this is intentional.
  const Generator& defaultGenerator(Device device) {
    c10::DeviceType device_type = device.type();
    initCUDAIfNeeded(device_type);
    initHIPIfNeeded(device_type);
    if (device_type == at::kCPU) {
      return at::detail::getDefaultCPUGenerator();
    } else if (device_type == at::kCUDA) {
      return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index());
    } else if (device_type == at::kMPS) {
      return at::detail::getMPSHooks().getDefaultMPSGenerator();
    } else if (device_type == at::kXPU) {
      return at::detail::getXPUHooks().getDefaultXPUGenerator(device.index());
    } else if (device_type == at::kIPU) {
      return at::detail::getIPUHooks().getDefaultIPUGenerator(device.index());
    } else if (device_type == at::kPrivateUse1) {
      return at::GetPrivateUse1HooksInterface()->getDefaultGenerator(
          device.index());
    } else {
      AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
    }
  }
  // Returns the accelerator hooks for `opt_device_type`, or for the result of
  // at::getAccelerator(true) when no device type is passed.
  // Only CUDA, MPS and PrivateUse1 are handled; any other type ends in
  // AT_ERROR.
  // NOTE(review): `.value()` assumes at::getAccelerator(true) never returns
  // an empty optional -- presumably `true` requests a checked lookup; confirm.
  const AcceleratorHooksInterface& getAcceleratorHooksInterface(
      c10::optional<c10::DeviceType> opt_device_type = c10::nullopt) {
    c10::DeviceType device_type = opt_device_type.has_value()
        ? opt_device_type.value()
        : at::getAccelerator(true).value();
    if (device_type == at::kCUDA) {
      return at::detail::getCUDAHooks();
    } else if (device_type == at::kMPS) {
      return at::detail::getMPSHooks();
    } else if (device_type == at::kPrivateUse1) {
      return at::detail::getPrivateUse1Hooks();
    } else {
      AT_ERROR(
          c10::DeviceTypeName(device_type), " device type not an accelerator.");
    }
  }
  // Returns the Device that owns the memory at `data`, given the device type
  // the pointer belongs to. CUDA, HIP and XPU are lazily initialized first
  // when `device_type` matches. CPU pointers map to the CPU device without a
  // lookup; CUDA, XPU and PrivateUse1 delegate to their hooks; every other
  // type (including HIP, which has no branch) ends in AT_ERROR.
  Device getDeviceFromPtr(void* data, c10::DeviceType device_type) {
    initCUDAIfNeeded(device_type);
    initHIPIfNeeded(device_type);
    initXPUIfNeeded(device_type);
    if (device_type == at::kCPU) {
      return c10::DeviceType::CPU;
    } else if (device_type == at::kCUDA) {
      return at::detail::getCUDAHooks().getDeviceFromPtr(data);
    } else if (device_type == at::kXPU) {
      return at::detail::getXPUHooks().getDeviceFromPtr(data);
    } else if (device_type == at::kPrivateUse1) {
      return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data);
    } else {
      AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
    }
  }
  // Asks the CUDA hooks whether `data` is pinned memory.
  static bool isPinnedPtr(const void* data) {
    return detail::getCUDAHooks().isPinnedPtr(data);
  }
  // Availability / version queries.
  // The inline ones either forward to the hooks object of the respective
  // backend (CUDA, MTIA, HIP, MPS, XPU) or, for IPU / XLA / Lazy / ORT, report
  // whether a device guard implementation is registered for that device type.
  static bool hasOpenMP();
  static bool hasMKL();
  static bool hasLAPACK();
  static bool hasMKLDNN();
  static bool hasMAGMA() {
    return detail::getCUDAHooks().hasMAGMA();
  }
  static bool hasCUDA() {
    return detail::getCUDAHooks().hasCUDA();
  }
  static bool hasMTIA() {
    return detail::getMTIAHooks().hasMTIA();
  }
  static bool hasCUDART() {
    return detail::getCUDAHooks().hasCUDART();
  }
  static long versionCUDART() {
    return detail::getCUDAHooks().versionCUDART();
  }
  static bool hasCuDNN() {
    return detail::getCUDAHooks().hasCuDNN();
  }
  static long versionCuDNN() {
    return detail::getCUDAHooks().versionCuDNN();
  }
  static bool hasCuSOLVER() {
    return detail::getCUDAHooks().hasCuSOLVER();
  }
  static bool hasHIP() {
    return detail::getHIPHooks().hasHIP();
  }
  static bool hasMPS() {
    return detail::getMPSHooks().hasMPS();
  }
  static bool hasIPU() {
    return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU);
  }
  static bool hasXLA() {
    return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA);
  }
  static bool hasXPU() {
    return detail::getXPUHooks().hasXPU();
  }
  static bool hasLazy() {
    return c10::impl::hasDeviceGuardImpl(c10::DeviceType::Lazy);
  }
  static bool hasORT() {
    return c10::impl::hasDeviceGuardImpl(c10::DeviceType::ORT);
  }
  // One-time backend initializers. Each runs its hook's init function under
  // c10::call_once with a per-backend once_flag member of this Context.
  //
  // defined in header so that getNonVariableType has ability to inline
  // call_once check. getNonVariableType is called fairly frequently
  void lazyInitCUDA() {
    c10::call_once(thc_init, [&] { detail::getCUDAHooks().initCUDA(); });
  }
  void lazyInitHIP() {
    c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); });
  }
  void lazyInitXPU() {
    c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); });
  }
  // The PrivateUse1 init is skipped when no PrivateUse1 hooks are registered;
  // the once_flag is still consumed by that first call.
  void lazyInitPrivateUse1() {
    c10::call_once(thp_init, [&] {
      if (isPrivateUse1HooksRegistered()) {
        at::GetPrivateUse1HooksInterface()->initPrivateUse1();
      }
    });
  }
  // Returns the NVRTC interface exposed by the CUDA hooks.
  static const at::cuda::NVRTC& getNVRTC() {
    return detail::getCUDAHooks().nvrtc();
  }

  static bool setFlushDenormal(bool on);

  // NB: This method is *purely* whether or not a user requested
  // that CuDNN was enabled, it doesn't actually say anything about
  // whether or not CuDNN is actually usable.  Use cudnn_is_acceptable
  // to test this instead
  bool userEnabledCuDNN() const;
  void setUserEnabledCuDNN(bool e);
  bool userEnabledMkldnn() const;
  void setUserEnabledMkldnn(bool e);
  bool benchmarkCuDNN() const;
  void setBenchmarkCuDNN(bool);
  int benchmarkLimitCuDNN() const;
  void setBenchmarkLimitCuDNN(int);
  bool deterministicCuDNN() const;
  void setDeterministicCuDNN(bool);
  bool userEnabledNNPACK() const;
  void setUserEnabledNNPACK(bool e);

  // Note [Disabling Fused SDP Kernels]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // Flash and Memory Efficient SDP kernels are enabled by default.
  // However, they can be disabled by calling
  // at::globalContext().setSDPUseFlash(false) or
  // at::globalContext().setSDPUseMemEfficient(false).
  // This is useful for debugging purposes. For example, if you want to
  // compare the performance of the flash SDP kernels with the unfused
  // kernel, you can disable the flash SDP kernels. By disabling
  // the math SDP kernel, you can force your code to use flash kernels.
  // The math SDP kernel can be disabled by calling
  // at::globalContext().setSDPUseMath(false).
  void setSDPUseFlash(bool);
  bool userEnabledFlashSDP() const;

  void setSDPUseMemEfficient(bool);
  bool userEnabledMemEfficientSDP() const;

  void setSDPUseMath(bool);
  bool userEnabledMathSDP() const;

  void setSDPUseCuDNN(bool);
  bool userEnabledCuDNNSDP() const;

  at::LinalgBackend linalgPreferredBackend() const;
  void setLinalgPreferredBackend(at::LinalgBackend);

  // Note [Enabling Deterministic Operations]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // Operations in PyTorch that normally act nondeterministically, but have an
  // alternate deterministic implementation, should satisfy the following
  // requirements:
  //
  // * Include this comment: "See Note [Enabling Deterministic Operations]"
  //
  // * Check the value of `at::globalContext().deterministicAlgorithms()` to
  //   toggle between nondeterministic and deterministic implementations.
  //
  // * Have an entry in the list of PyTorch operations that toggle between
  //   nondeterministic and deterministic implementations, in the docstring of
  //   `use_deterministic_algorithms()` in torch/__init__.py
  //
  // `example_func()` below shows an example of toggling between
  // nondeterministic and deterministic implementations:
  //
  //    void example_func() {
  //      // See Note [Enabling Deterministic Operations]
  //      if (at::globalContext().deterministicAlgorithms()) {
  //        example_func_deterministic();
  //      } else {
  //        example_func_nondeterministic();
  //      }
  //    }

  bool deterministicAlgorithms() const;
  bool deterministicAlgorithmsWarnOnly() const;
  void setDeterministicAlgorithms(bool, bool);
  bool deterministicFillUninitializedMemory() const;
  void setDeterministicFillUninitializedMemory(bool);

  // Note [Writing Nondeterministic Operations]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // Operations in PyTorch that act nondeterministically and do not have an
  // alternate deterministic implementation should satisfy the following
  // requirements:
  //
  // * Include this comment: "See Note [Writing Nondeterministic Operations]"
  //
  // * Include a comment explaining why the operation is nondeterministic.
  //
  // * Throw an error when `Context::deterministicAlgorithms()` is true. Most
  //   of the time, this should be accomplished by calling
  //   `at::globalContext().alertNotDeterministic()`.  However, if the
  //   nondeterministic behavior is caused by the CuBLAS workspace
  //   configuration in CUDA >= 10.2,
  //   `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
  //   called instead (in this case, a comment explaining why the operation is
  //   nondeterministic is not necessary). See below for details on these
  //   methods.
  //
  // * Have an entry in the list of nondeterministic PyTorch operations in the
  //   docstring of `use_deterministic_algorithms()` in torch/__init__.py
  //
  // * Have a test function in `test/test_torch.py` whose name begins with
  //   `test_nondeterministic_alert_`. Alternatively, if CuBLAS workspace
  //   configuration is the reason for nondeterminism, the operation should be
  //   included in the `test_cublas_config_nondeterministic_alert` test. Any new
  //   tests should ideally follow a pattern similar to the existing ones.
  //
  // `example_func()` below shows an example of the comments and error-throwing
  // code for a nondeterministic operation:
  //
  //    void example_func() {
  //      // See Note [Writing Nondeterministic Operations]
  //      // Nondeterministic because <reason>
  //      at::globalContext().alertNotDeterministic("example_func");
  //      ...
  //    }

  // Throws an error if `Context::deterministicAlgorithms()` is true
  static void alertNotDeterministic(c10::string_view const& caller);

  // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
  // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
  // ":4096:8". For more details:
  // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
  void alertCuBLASConfigNotDeterministic() const;

  void setFloat32MatmulPrecision(const std::string& s);
  bool allowTF32CuDNN() const;
  void setAllowTF32CuDNN(bool);
  bool allowTF32CuBLAS() const;
  void setAllowTF32CuBLAS(bool);
  Float32MatmulPrecision float32MatmulPrecision() const;
  void setFloat32MatmulPrecision(Float32MatmulPrecision p);
  bool allowFP16ReductionCuBLAS() const;
  void setAllowFP16ReductionCuBLAS(bool);
  bool allowBF16ReductionCuBLAS() const;
  void setAllowBF16ReductionCuBLAS(bool);
  at::QEngine qEngine() const;
  void setQEngine(at::QEngine e);
  static const std::vector<at::QEngine>& supportedQEngines();
  static bool isXNNPACKAvailable();
  void setCheckSparseTensorInvariants(bool e);
  bool checkSparseTensorInvariants() const;
  // This method is used to release the original weight after pre-packing.
  // It should be called once before loading/running the model.
  // NB: By default it is set to true for mobile builds.
  void setReleaseWeightsWhenPrepacking(bool e);
  bool releaseWeightsWhenPrepacking() const;

  void setDisplayVmapFallbackWarnings(bool enabled);
  bool areVmapFallbackWarningsEnabled() const;

  void setDefaultMobileCPUAllocator();
  void unsetDefaultMobileCPUAllocator();
  bool allowFP16ReductionCPU() const;
  void setAllowFP16ReductionCPU(bool);

 private:
  // Each helper triggers the matching lazyInit* only when `p` is that
  // backend's device type, and is a no-op otherwise.
  void initCUDAIfNeeded(c10::DeviceType p) {
    if (p == c10::DeviceType::CUDA) {
      lazyInitCUDA();
    }
  }
  void initHIPIfNeeded(c10::DeviceType p) {
    if (p == c10::DeviceType::HIP) {
      lazyInitHIP();
    }
  }
  void initXPUIfNeeded(c10::DeviceType p) {
    if (p == c10::DeviceType::XPU) {
      lazyInitXPU();
    }
  }
  static bool checkCuBLASConfigDeterministic();
  // once_flags consumed by lazyInitCUDA / lazyInitHIP / lazyInitXPU /
  // lazyInitPrivateUse1 respectively.
  c10::once_flag thc_init;
  c10::once_flag thh_init;
  c10::once_flag thx_init;
  c10::once_flag thp_init;
  // Initial values of the configuration flags exposed by the accessors above.
  bool enabled_cudnn = true;
  bool deterministic_cudnn = false;
  bool _deterministic_algorithms = false;
  bool _deterministic_algorithms_warn_only = false;
  bool _deterministic_fill_uninitialized_memory = true;
  bool enabled_flashSDP = true;
  bool enabled_mem_efficientSDP = true;
  bool enabled_mathSDP = true;
  bool enabled_cudnnSDP = false;
  // cuDNN benchmarking starts enabled only when built with USE_ROCM.
#ifdef USE_ROCM
  bool benchmark_cudnn = true;
#else
  bool benchmark_cudnn = false;
#endif
  // Starts at HIGH when the TORCH_ALLOW_TF32_CUBLAS_OVERRIDE environment
  // check yields true, otherwise at HIGHEST.
  Float32MatmulPrecision float32_matmul_precision =
      c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
      ? at::Float32MatmulPrecision::HIGH
      : at::Float32MatmulPrecision::HIGHEST;
  int benchmark_limit_cudnn = 10;
  bool allow_tf32_cudnn = true;
  bool allow_fp16_reduction_cublas = true;
  bool allow_bf16_reduction_cublas = true;
  bool enabled_mkldnn = true;
  bool enabled_nnpack = true;
  // Starts at Cusolver when the TORCH_LINALG_PREFER_CUSOLVER environment
  // check yields true, otherwise at Default.
  at::LinalgBackend linalg_preferred_backend =
      c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true
      ? at::LinalgBackend::Cusolver
      : at::LinalgBackend::Default;
  // Releasing original weights after pre-packing defaults to on only for
  // C10_MOBILE builds.
#ifdef C10_MOBILE
  bool release_original_weights = true;
#else
  bool release_original_weights = false;
#endif
  bool display_vmap_fallback_warnings_ = false;
  c10::optional<at::QEngine> quantized_engine = c10::nullopt;
  bool enable_sparse_tensor_invariant_checks = false;
  bool allow_fp16_reduction_cpu = false;

  // NOTE(review): presumably the allocator saved by
  // setDefaultMobileCPUAllocator() so unsetDefaultMobileCPUAllocator() can
  // restore it -- verify against the implementation.
  Allocator* prev_allocator_ptr_{nullptr};
};

// Returns a reference to the Context instance that the free-function helpers
// in this header (init(), hasCUDA(), manual_seed(), ...) operate on.
// Only declared here; the definition lives outside this header.
TORCH_API Context& globalContext();

// Calls globalContext() and deliberately discards the returned reference.
static inline void init() {
  static_cast<void>(globalContext());
}

// Returns a pointer to an Allocator. Only declared here.
// NOTE(review): by its name this is the allocator used for CPU memory;
// ownership and lifetime of the returned pointer are not visible from this
// header -- confirm against the definition.
TORCH_API Allocator* getCPUAllocator();

// Looks up the DeprecatedTypeProperties entry for backend `p` and scalar type
// `s` in the global DeprecatedTypeProperties registry.
static inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
    Backend p,
    ScalarType s) {
  auto&& registry = globalDeprecatedTypePropertiesRegistry();
  return registry.getDeprecatedTypeProperties(p, s);
}

// Registry lookup of the DeprecatedTypeProperties for scalar type `s` on the
// CPU backend.
static inline DeprecatedTypeProperties& CPU(ScalarType s) {
  auto&& registry = globalDeprecatedTypePropertiesRegistry();
  return registry.getDeprecatedTypeProperties(Backend::CPU, s);
}

// Registry lookup of the DeprecatedTypeProperties for scalar type `s` on the
// CUDA backend.
static inline DeprecatedTypeProperties& CUDA(ScalarType s) {
  auto&& registry = globalDeprecatedTypePropertiesRegistry();
  return registry.getDeprecatedTypeProperties(Backend::CUDA, s);
}

// Registry lookup of the DeprecatedTypeProperties for scalar type `s` on the
// HIP backend.
static inline DeprecatedTypeProperties& HIP(ScalarType s) {
  auto&& registry = globalDeprecatedTypePropertiesRegistry();
  return registry.getDeprecatedTypeProperties(Backend::HIP, s);
}

// Registry lookup of the DeprecatedTypeProperties for scalar type `s` on the
// MPS backend.
static inline DeprecatedTypeProperties& MPS(ScalarType s) {
  auto&& registry = globalDeprecatedTypePropertiesRegistry();
  return registry.getDeprecatedTypeProperties(Backend::MPS, s);
}

// Forwards to Context::hasCUDA() on the global context.
static inline bool hasCUDA() {
  auto& ctx = globalContext();
  return ctx.hasCUDA();
}

// Forwards to Context::hasMTIA() on the global context.
static inline bool hasMTIA() {
  auto& ctx = globalContext();
  return ctx.hasMTIA();
}

// Forwards to Context::hasHIP() on the global context.
static inline bool hasHIP() {
  auto& ctx = globalContext();
  return ctx.hasHIP();
}

// Forwards to Context::hasIPU() on the global context.
static inline bool hasIPU() {
  auto& ctx = globalContext();
  return ctx.hasIPU();
}

// Forwards to Context::hasXLA() on the global context.
static inline bool hasXLA() {
  auto& ctx = globalContext();
  return ctx.hasXLA();
}

// Forwards to Context::hasMPS() on the global context.
static inline bool hasMPS() {
  auto& ctx = globalContext();
  return ctx.hasMPS();
}

// Forwards to Context::hasORT() on the global context.
static inline bool hasORT() {
  auto& ctx = globalContext();
  return ctx.hasORT();
}

// Forwards to Context::hasXPU() on the global context.
static inline bool hasXPU() {
  auto& ctx = globalContext();
  return ctx.hasXPU();
}

// Despite its name, this function returns the number of *CUDA* GPUs.
static inline size_t getNumGPUs() {
  // Reports the device count from the CUDA hooks when CUDA is present, from
  // the HIP hooks when HIP is present, and 0 when neither is. Having both at
  // once is rejected with std::runtime_error.
  //
  // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
  // FUNCTION.  If you are interested in interrogating the number of
  // devices for a specific device type, add that function to the
  // relevant library (e.g., similar to at::cuda::device_count())
  if (hasCUDA() && hasHIP()) {
    throw std::runtime_error(
        "Enabling both CUDA and HIP in ATen is not supported, as HIP masquerades "
        "to be CUDA (e.g., when you say CUDA, on a HIP build of ATen, this actually "
        "means HIP.  Rebuild PyTorch with one or the other disabled.");
  }
  if (hasCUDA()) {
    return detail::getCUDAHooks().getNumGPUs();
  }
  if (hasHIP()) {
    return detail::getHIPHooks().getNumGPUs();
  }
  return 0;
}

// Forwards to Context::hasOpenMP() on the global context.
static inline bool hasOpenMP() {
  auto& ctx = globalContext();
  return ctx.hasOpenMP();
}

// Forwards to Context::hasMKL() on the global context.
static inline bool hasMKL() {
  auto& ctx = globalContext();
  return ctx.hasMKL();
}

// Forwards to Context::hasLAPACK() on the global context.
static inline bool hasLAPACK() {
  auto& ctx = globalContext();
  return ctx.hasLAPACK();
}

// Forwards to Context::hasMAGMA() on the global context.
static inline bool hasMAGMA() {
  auto& ctx = globalContext();
  return ctx.hasMAGMA();
}

// Forwards to Context::hasMKLDNN() on the global context.
static inline bool hasMKLDNN() {
  auto& ctx = globalContext();
  return ctx.hasMKLDNN();
}

// Applies `seed` to the default generator of every backend that is present:
// the CPU generator first, then one generator per CUDA device, one per XPU
// device, and finally the MPS generator.
static inline void manual_seed(uint64_t seed) {
  // See Note [Acquire lock when using random generators]
  // Receives a by-value copy of the Generator object, holds that generator's
  // mutex while the seed is written, and releases it on return.
  const auto reseed = [seed](auto generator) {
    std::lock_guard<std::mutex> guard(generator.mutex());
    generator.set_current_seed(seed);
  };

  reseed(globalContext().defaultGenerator(c10::DeviceType::CPU));

  // NB: Sometimes we build with CUDA, but we don't have any GPUs
  // available. In that case, we must not seed CUDA; it will fail!
  const auto cuda_device_count = detail::getCUDAHooks().getNumGPUs();
  if (hasCUDA() && cuda_device_count > 0) {
    for (const auto idx : c10::irange(cuda_device_count)) {
      reseed(globalContext().defaultGenerator(
          Device(at::kCUDA, static_cast<c10::DeviceIndex>(idx))));
    }
  }

  // Same treatment for XPU: seed only when XPU is present and the hooks
  // report a non-zero device count.
  const auto xpu_device_count = detail::getXPUHooks().getNumGPUs();
  if (hasXPU() && xpu_device_count) {
    for (const auto idx : c10::irange(xpu_device_count)) {
      reseed(globalContext().defaultGenerator(
          Device(at::kXPU, static_cast<c10::DeviceIndex>(idx))));
    }
  }

  // MPS exposes a single default generator, so no per-device loop is needed.
  if (hasMPS()) {
    reseed(globalContext().defaultGenerator(c10::DeviceType::MPS));
  }
}

// When the global flag `allow_tf32` is set to true, cuBLAS handles are
// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH.
// For some operators, such as addmv, TF32 offers no performance improvement
// but causes precision loss. To help this case, this class implements
// a RAII guard that can be used to quickly disable TF32 within its scope.
//
// Usage:
//     NoTF32Guard disable_tf32;
// Scope guard type: construct an instance to disable TF32 for the lifetime of
// the object (see the usage comment above). The constructor, destructor and
// should_disable_tf32() are only declared here; their definitions live
// outside this header.
struct TORCH_API NoTF32Guard {
  NoTF32Guard();
  ~NoTF32Guard();
  // NOTE(review): presumably reports whether a guard is currently active so
  // callers can skip TF32 -- confirm against the implementation.
  static bool should_disable_tf32();

 private:
  // NOTE(review): presumably records whether this instance actually changed
  // the TF32 state, so the destructor only undoes its own change -- confirm.
  bool changed = false;
};

// Guard type with a constructor/destructor pair and a static query; all
// three are only declared here.
// NOTE(review): by its name this marks the enclosing scope as running a
// backward pass on ROCm builds, with is_backward_pass() reporting that
// state -- the behaviour is not visible from this header; confirm against
// the implementation.
struct TORCH_API ROCmBackwardPassGuard {
  ROCmBackwardPassGuard();
  ~ROCmBackwardPassGuard();
  static bool is_backward_pass();
};

} // namespace at
first commit 2024-05-03 04:18:51 +03:00			`#pragma once`

			`#include <ATen/CPUGeneratorImpl.h>`
			`#include <ATen/DeviceAccelerator.h>`
			`#include <ATen/LinalgBackend.h>`
			`#include <ATen/core/ATenGeneral.h>`
			`#include <ATen/core/DeprecatedTypeProperties.h>`
			`#include <ATen/core/Generator.h>`
			`#include <ATen/core/LegacyTypeDispatch.h>`
			`#include <ATen/detail/AcceleratorHooksInterface.h>`
			`#include <ATen/detail/CUDAHooksInterface.h>`
			`#include <ATen/detail/HIPHooksInterface.h>`
			`#include <ATen/detail/IPUHooksInterface.h>`
			`#include <ATen/detail/MPSHooksInterface.h>`
			`#include <ATen/detail/MTIAHooksInterface.h>`
			`#include <ATen/detail/ORTHooksInterface.h>`
			`#include <ATen/detail/PrivateUse1HooksInterface.h>`
			`#include <ATen/detail/XPUHooksInterface.h>`
			`#include <c10/core/QEngine.h>`
			`#include <c10/core/impl/DeviceGuardImplInterface.h>`
			`#include <c10/util/CallOnce.h>`
			`#include <c10/util/Exception.h>`
			`#include <c10/util/env.h>`
			`#include <c10/util/irange.h>`

			`#include <cstdint>`
			`#include <mutex>`

			`namespace at {`

			`class Tensor;`

			`enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };`

			`class TORCH_API Context {`
			`public:`
			`Context();`

			`const Generator& defaultGenerator(Device device) {`
			`c10::DeviceType device_type = device.type();`
			`initCUDAIfNeeded(device_type);`
			`initHIPIfNeeded(device_type);`
			`if (device_type == at::kCPU) {`
			`return at::detail::getDefaultCPUGenerator();`
			`} else if (device_type == at::kCUDA) {`
			`return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index());`
			`} else if (device_type == at::kMPS) {`
			`return at::detail::getMPSHooks().getDefaultMPSGenerator();`
			`} else if (device_type == at::kXPU) {`
			`return at::detail::getXPUHooks().getDefaultXPUGenerator(device.index());`
			`} else if (device_type == at::kIPU) {`
			`return at::detail::getIPUHooks().getDefaultIPUGenerator(device.index());`
			`} else if (device_type == at::kPrivateUse1) {`
			`return at::GetPrivateUse1HooksInterface()->getDefaultGenerator(`
			`device.index());`
			`} else {`
			`AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");`
			`}`
			`}`
			`const AcceleratorHooksInterface& getAcceleratorHooksInterface(`
			`c10::optional<c10::DeviceType> opt_device_type = c10::nullopt) {`
			`c10::DeviceType device_type = opt_device_type.has_value()`
			`? opt_device_type.value()`
			`: at::getAccelerator(true).value();`
			`if (device_type == at::kCUDA) {`
			`return at::detail::getCUDAHooks();`
			`} else if (device_type == at::kMPS) {`
			`return at::detail::getMPSHooks();`
			`} else if (device_type == at::kPrivateUse1) {`
			`return at::detail::getPrivateUse1Hooks();`
			`} else {`
			`AT_ERROR(`
			`c10::DeviceTypeName(device_type), " device type not an accelerator.");`
			`}`
			`}`
			`Device getDeviceFromPtr(void* data, c10::DeviceType device_type) {`
			`initCUDAIfNeeded(device_type);`
			`initHIPIfNeeded(device_type);`
			`initXPUIfNeeded(device_type);`
			`if (device_type == at::kCPU) {`
			`return c10::DeviceType::CPU;`
			`} else if (device_type == at::kCUDA) {`
			`return at::detail::getCUDAHooks().getDeviceFromPtr(data);`
			`} else if (device_type == at::kXPU) {`
			`return at::detail::getXPUHooks().getDeviceFromPtr(data);`
			`} else if (device_type == at::kPrivateUse1) {`
			`return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data);`
			`} else {`
			`AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");`
			`}`
			`}`
			`static bool isPinnedPtr(const void* data) {`
			`return detail::getCUDAHooks().isPinnedPtr(data);`
			`}`
			`static bool hasOpenMP();`
			`static bool hasMKL();`
			`static bool hasLAPACK();`
			`static bool hasMKLDNN();`
			`static bool hasMAGMA() {`
			`return detail::getCUDAHooks().hasMAGMA();`
			`}`
			`static bool hasCUDA() {`
			`return detail::getCUDAHooks().hasCUDA();`
			`}`
			`static bool hasMTIA() {`
			`return detail::getMTIAHooks().hasMTIA();`
			`}`
			`static bool hasCUDART() {`
			`return detail::getCUDAHooks().hasCUDART();`
			`}`
			`static long versionCUDART() {`
			`return detail::getCUDAHooks().versionCUDART();`
			`}`
			`static bool hasCuDNN() {`
			`return detail::getCUDAHooks().hasCuDNN();`
			`}`
			`static long versionCuDNN() {`
			`return detail::getCUDAHooks().versionCuDNN();`
			`}`
			`static bool hasCuSOLVER() {`
			`return detail::getCUDAHooks().hasCuSOLVER();`
			`}`
			`static bool hasHIP() {`
			`return detail::getHIPHooks().hasHIP();`
			`}`
			`static bool hasMPS() {`
			`return detail::getMPSHooks().hasMPS();`
			`}`
			`static bool hasIPU() {`
			`return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU);`
			`}`
			`static bool hasXLA() {`
			`return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA);`
			`}`
			`static bool hasXPU() {`
			`return detail::getXPUHooks().hasXPU();`
			`}`
			`static bool hasLazy() {`
			`return c10::impl::hasDeviceGuardImpl(c10::DeviceType::Lazy);`
			`}`
			`static bool hasORT() {`
			`return c10::impl::hasDeviceGuardImpl(c10::DeviceType::ORT);`
			`}`
			`// defined in header so that getNonVariableType has ability to inline`
			`// call_once check. getNonVariableType is called fairly frequently`
			`void lazyInitCUDA() {`
			`c10::call_once(thc_init, [&] { detail::getCUDAHooks().initCUDA(); });`
			`}`
			`void lazyInitHIP() {`
			`c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); });`
			`}`
			`void lazyInitXPU() {`
			`c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); });`
			`}`
			`void lazyInitPrivateUse1() {`
			`c10::call_once(thp_init, [&] {`
			`if (isPrivateUse1HooksRegistered()) {`
			`at::GetPrivateUse1HooksInterface()->initPrivateUse1();`
			`}`
			`});`
			`}`
			`static const at::cuda::NVRTC& getNVRTC() {`
			`return detail::getCUDAHooks().nvrtc();`
			`}`

			`static bool setFlushDenormal(bool on);`

			`// NB: This method is purely whether or not a user requested`
			`// that CuDNN was enabled, it doesn't actually say anything about`
			`// whether or not CuDNN is actually usable. Use cudnn_is_acceptable`
			`// to test this instead`
			`bool userEnabledCuDNN() const;`
			`void setUserEnabledCuDNN(bool e);`
			`bool userEnabledMkldnn() const;`
			`void setUserEnabledMkldnn(bool e);`
			`bool benchmarkCuDNN() const;`
			`void setBenchmarkCuDNN(bool);`
			`int benchmarkLimitCuDNN() const;`
			`void setBenchmarkLimitCuDNN(int);`
			`bool deterministicCuDNN() const;`
			`void setDeterministicCuDNN(bool);`
			`bool userEnabledNNPACK() const;`
			`void setUserEnabledNNPACK(bool e);`

			`// Note [Disabling Fused SDP Kernels]`
			`// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
			`// Flash and Memory Efficient SDP kernels are enabled by default.`
			`// However, they can be disabled by setting`
			`// at::globalContext().setUserEnabledFlashSDP(false) flag.`
			`// This is useful for debugging purposes. For example, if you want to`
			`// compare the performance of the flash SDP kernels with the unfused`
			`// kernel, you can disable the flash SDP kernels. By disabling`
			`// the math SDP kernel, you can force your code to use flash kernels.`
			`// The math SDP kernel can be disabled by setting`
			`// at::globalContext().setUserEnabledMathSDP(false) flag.`
			`void setSDPUseFlash(bool);`
			`bool userEnabledFlashSDP() const;`

			`void setSDPUseMemEfficient(bool);`
			`bool userEnabledMemEfficientSDP() const;`

			`void setSDPUseMath(bool);`
			`bool userEnabledMathSDP() const;`

			`void setSDPUseCuDNN(bool);`
			`bool userEnabledCuDNNSDP() const;`

			`at::LinalgBackend linalgPreferredBackend() const;`
			`void setLinalgPreferredBackend(at::LinalgBackend);`

			`// Note [Enabling Deterministic Operations]`
			`// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
			`// Operations in PyTorch that normally act nondeterministically, but have an`
			`// alternate deterministic implementation, should satisfy the following`
			`// requirements:`
			`//`
			`// * Include this comment: "See Note [Enabling Deterministic Operations]"`
			`//`
			// * Check the value of `at::globalContext().deterministicAlgorithms()` to
			`// toggle`
			`// between nondeterministic and deterministic implementations.`
			`//`
			`// * Have an entry in the list of PyTorch operations that toggle between`
			`// nondeterministic`
			`// and deterministic implementations, in the docstring of`
			// `use_deterministic_algorithms()` in torch/__init__.py
			`//`
			// `example_func()` below shows an example of toggling between
			`// nondeterministic and deterministic implementations:`
			`//`
			`// void example_func() {`
			`// // See Note [Enabling Deterministic Operations]`
			`// if (at::globalContext().deterministicAlgorithms()) {`
			`// example_func_deterministic();`
			`// } else {`
			`// example_func_nondeterministic();`
			`// }`
			`// }`

			`bool deterministicAlgorithms() const;`
			`bool deterministicAlgorithmsWarnOnly() const;`
			`void setDeterministicAlgorithms(bool, bool);`
			`bool deterministicFillUninitializedMemory() const;`
			`void setDeterministicFillUninitializedMemory(bool);`

			`// Note [Writing Nondeterministic Operations]`
			`// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
			`// Operations in PyTorch that act nondeterministically and do not have an`
			`// alternate deterministic implementation should satisfy the following`
			`// requirements:`
			`//`
			`// * Include this comment: "See Note [Writing Nondeterministic Operations]"`
			`//`
			`// * Include a comment explaining why the operation is nondeterministic.`
			`//`
			// * Throw an error when `Context::deterministicAlgorithms()` is true. Most
			`// of the time, this should be accomplished by calling`
			// `at::globalContext().alertNotDeterminstic()`. However, if the
			`// nondeterministic behavior is caused by the CuBLAS workspace`
			`// configuration in CUDA >= 10.2,`
			// `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
			`// called instead (in this case, a comment explaining why the operation is`
			`// nondeterministic is not necessary). See below for details on these`
			`// methods.`
			`//`
			`// * Have an entry in the list of nondeterministic PyTorch operations in the`
			// docstring of `use_deterministic_algorithms()` in torch/__init__.py
			`//`
			// * Have a test function in `test/test_torch.py` whose name begins with
			// `test_nondeterministic_alert_`. Alternatively, if CuBLAS workspace
			`// configuration is the reason for nondeterminism, the operation should be`
			// included in the `test_cublas_config_nondeterministic_alert` test. Any new
			`// tests should ideally follow a pattern similar to the existing ones.`
			`//`
			// `example_func()` below shows an example of the comments and error-throwing
			`// code for a nondeterministic operation:`
			`//`
			`// void example_func() {`
			`// // See Note [Writing Nondeterministic Operations]`
			`// // Nondeterministic because <reason>`
			`// at::globalContext().alertNondeterministic("example_func");`
			`// ...`
			`// }`

			// Throws an error if `Context::deterministicAlgorithms()` is true
			`static void alertNotDeterministic(c10::string_view const& caller);`

			// Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
			`// >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or`
			`// ":4096:8". For more details:`
			`// https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility`
			`void alertCuBLASConfigNotDeterministic() const;`

			`void setFloat32MatmulPrecision(const std::string& s);`
			`bool allowTF32CuDNN() const;`
			`void setAllowTF32CuDNN(bool);`
			`bool allowTF32CuBLAS() const;`
			`void setAllowTF32CuBLAS(bool);`
			`Float32MatmulPrecision float32MatmulPrecision() const;`
			`void setFloat32MatmulPrecision(Float32MatmulPrecision p);`
			`bool allowFP16ReductionCuBLAS() const;`
			`void setAllowFP16ReductionCuBLAS(bool);`
			`bool allowBF16ReductionCuBLAS() const;`
			`void setAllowBF16ReductionCuBLAS(bool);`
			`at::QEngine qEngine() const;`
			`void setQEngine(at::QEngine e);`
			`static const std::vector<at::QEngine>& supportedQEngines();`
			`static bool isXNNPACKAvailable();`
			`void setCheckSparseTensorInvariants(bool e);`
			`bool checkSparseTensorInvariants() const;`
			`// This method is used to release the original weight after pre-packing.`
			`// It should be called once before loading/running the model.`
			`// NB: By default it is set to true for mobile builds.`
			`void setReleaseWeightsWhenPrepacking(bool e);`
			`bool releaseWeightsWhenPrepacking() const;`

			// Query / toggle for vmap fallback warnings.
			bool areVmapFallbackWarningsEnabled() const;
			void setDisplayVmapFallbackWarnings(bool enabled);

			// Install / remove the default mobile CPU allocator.
			// NOTE(review): presumably `unset` restores whatever allocator `set`
			// replaced -- confirm against the out-of-line implementation.
			void setDefaultMobileCPUAllocator();
			void unsetDefaultMobileCPUAllocator();

			// FP16 reduction flag for CPU.
			bool allowFP16ReductionCPU() const;
			void setAllowFP16ReductionCPU(bool);

			`private:`
			// Calls lazyInitCUDA() when `p` is the CUDA device type; otherwise does
			// nothing.
			void initCUDAIfNeeded(c10::DeviceType p) {
			  if (p != c10::DeviceType::CUDA) {
			    return;
			  }
			  lazyInitCUDA();
			}
			// Calls lazyInitHIP() when `p` is the HIP device type; otherwise does
			// nothing.
			void initHIPIfNeeded(c10::DeviceType p) {
			  if (p != c10::DeviceType::HIP) {
			    return;
			  }
			  lazyInitHIP();
			}
			// Calls lazyInitXPU() when `p` is the XPU device type; otherwise does
			// nothing.
			void initXPUIfNeeded(c10::DeviceType p) {
			  if (p != c10::DeviceType::XPU) {
			    return;
			  }
			  lazyInitXPU();
			}
			`static bool checkCuBLASConfigDeterministic();`
			`c10::once_flag thc_init;`
			`c10::once_flag thh_init;`
			`c10::once_flag thx_init;`
			`c10::once_flag thp_init;`
			`bool enabled_cudnn = true;`
			`bool deterministic_cudnn = false;`
			`bool _deterministic_algorithms = false;`
			`bool _deterministic_algorithms_warn_only = false;`
			`bool _deterministic_fill_uninitialized_memory = true;`
			`bool enabled_flashSDP = true;`
			`bool enabled_mem_efficientSDP = true;`
			`bool enabled_mathSDP = true;`
			`bool enabled_cudnnSDP = false;`
			`#ifdef USE_ROCM`
			`bool benchmark_cudnn = true;`
			`#else`
			`bool benchmark_cudnn = false;`
			`#endif`
			`Float32MatmulPrecision float32_matmul_precision =`
			`c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true`
			`? at::Float32MatmulPrecision::HIGH`
			`: at::Float32MatmulPrecision::HIGHEST;`
			`int benchmark_limit_cudnn = 10;`
			`bool allow_tf32_cudnn = true;`
			`bool allow_fp16_reduction_cublas = true;`
			`bool allow_bf16_reduction_cublas = true;`
			`bool enabled_mkldnn = true;`
			`bool enabled_nnpack = true;`
			`at::LinalgBackend linalg_preferred_backend =`
			`c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true`
			`? at::LinalgBackend::Cusolver`
			`: at::LinalgBackend::Default;`
			`#ifdef C10_MOBILE`
			`bool release_original_weights = true;`
			`#else`
			`bool release_original_weights = false;`
			`#endif`
			`bool display_vmap_fallback_warnings_ = false;`
			`c10::optional<at::QEngine> quantized_engine = c10::nullopt;`
			`bool enable_sparse_tensor_invariant_checks = false;`
			`bool allow_fp16_reduction_cpu = false;`

			`Allocator* prev_allocator_ptr_{nullptr};`
			`};`

			// Accessor for the global Context object; defined out of line.
			TORCH_API Context& globalContext();

			// Calls globalContext() and discards the result.
			// NOTE(review): presumably exists to force construction of the global
			// Context -- confirm.
			static inline void init() {
			  static_cast<void>(globalContext());
			}

			// Returns a pointer to the CPU allocator; defined out of line.
			TORCH_API Allocator* getCPUAllocator();

			// Looks up the DeprecatedTypeProperties for backend `p` and scalar type
			// `s` in the global registry.
			static inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
			    Backend p,
			    ScalarType s) {
			  auto&& registry = globalDeprecatedTypePropertiesRegistry();
			  return registry.getDeprecatedTypeProperties(p, s);
			}

			// Global-registry lookup of the DeprecatedTypeProperties for
			// Backend::CPU and scalar type `s`.
			static inline DeprecatedTypeProperties& CPU(ScalarType s) {
			  auto&& registry = globalDeprecatedTypePropertiesRegistry();
			  return registry.getDeprecatedTypeProperties(Backend::CPU, s);
			}

			// Global-registry lookup of the DeprecatedTypeProperties for
			// Backend::CUDA and scalar type `s`.
			static inline DeprecatedTypeProperties& CUDA(ScalarType s) {
			  auto&& registry = globalDeprecatedTypePropertiesRegistry();
			  return registry.getDeprecatedTypeProperties(Backend::CUDA, s);
			}

			// Global-registry lookup of the DeprecatedTypeProperties for
			// Backend::HIP and scalar type `s`.
			static inline DeprecatedTypeProperties& HIP(ScalarType s) {
			  auto&& registry = globalDeprecatedTypePropertiesRegistry();
			  return registry.getDeprecatedTypeProperties(Backend::HIP, s);
			}

			// Global-registry lookup of the DeprecatedTypeProperties for
			// Backend::MPS and scalar type `s`.
			static inline DeprecatedTypeProperties& MPS(ScalarType s) {
			  auto&& registry = globalDeprecatedTypePropertiesRegistry();
			  return registry.getDeprecatedTypeProperties(Backend::MPS, s);
			}

			// Free-function shorthand for `globalContext().hasCUDA()`.
			static inline bool hasCUDA() {
			  auto& ctx = globalContext();
			  return ctx.hasCUDA();
			}

			// Free-function shorthand for `globalContext().hasMTIA()`.
			static inline bool hasMTIA() {
			  auto& ctx = globalContext();
			  return ctx.hasMTIA();
			}

			// Free-function shorthand for `globalContext().hasHIP()`.
			static inline bool hasHIP() {
			  auto& ctx = globalContext();
			  return ctx.hasHIP();
			}

			// Free-function shorthand for `globalContext().hasIPU()`.
			static inline bool hasIPU() {
			  auto& ctx = globalContext();
			  return ctx.hasIPU();
			}

			// Free-function shorthand for `globalContext().hasXLA()`.
			static inline bool hasXLA() {
			  auto& ctx = globalContext();
			  return ctx.hasXLA();
			}

			// Free-function shorthand for `globalContext().hasMPS()`.
			static inline bool hasMPS() {
			  auto& ctx = globalContext();
			  return ctx.hasMPS();
			}

			// Free-function shorthand for `globalContext().hasORT()`.
			static inline bool hasORT() {
			  auto& ctx = globalContext();
			  return ctx.hasORT();
			}

			// Free-function shorthand for `globalContext().hasXPU()`.
			static inline bool hasXPU() {
			  auto& ctx = globalContext();
			  return ctx.hasXPU();
			}

			// Despite its name, this function returns the number of CUDA GPUs (the
			// count comes from the HIP hooks instead when only hasHIP() is true).
			// Returns 0 when neither backend is reported; throws std::runtime_error
			// when both are.
			static inline size_t getNumGPUs() {
			  // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
			  // FUNCTION. If you are interested in interrogating the number of
			  // devices for a specific device type, add that function to the
			  // relevant library (e.g., similar to at::cuda::device_count())
			  if (hasCUDA() && hasHIP()) {
			    throw std::runtime_error(
			        "Enabling both CUDA and HIP in ATen is not supported, as HIP masquerades "
			        "to be CUDA (e.g., when you say CUDA, on a HIP build of ATen, this actually "
			        "means HIP. Rebuild PyTorch with one or the other disabled.");
			  }
			  if (hasCUDA()) {
			    return detail::getCUDAHooks().getNumGPUs();
			  }
			  if (hasHIP()) {
			    return detail::getHIPHooks().getNumGPUs();
			  }
			  return 0;
			}

			// Free-function shorthand for `globalContext().hasOpenMP()`.
			static inline bool hasOpenMP() {
			  auto& ctx = globalContext();
			  return ctx.hasOpenMP();
			}

			// Free-function shorthand for `globalContext().hasMKL()`.
			static inline bool hasMKL() {
			  auto& ctx = globalContext();
			  return ctx.hasMKL();
			}

			// Free-function shorthand for `globalContext().hasLAPACK()`.
			static inline bool hasLAPACK() {
			  auto& ctx = globalContext();
			  return ctx.hasLAPACK();
			}

			// Free-function shorthand for `globalContext().hasMAGMA()`.
			static inline bool hasMAGMA() {
			  auto& ctx = globalContext();
			  return ctx.hasMAGMA();
			}

			// Free-function shorthand for `globalContext().hasMKLDNN()`.
			static inline bool hasMKLDNN() {
			  auto& ctx = globalContext();
			  return ctx.hasMKLDNN();
			}

			`static inline void manual_seed(uint64_t seed) {`
			`auto gen = globalContext().defaultGenerator(c10::DeviceType::CPU);`
			`{`
			`// See Note [Acquire lock when using random generators]`
			`std::lock_guard<std::mutex> lock(gen.mutex());`
			`gen.set_current_seed(seed);`
			`}`
			`// NB: Sometimes we build with CUDA, but we don't have any GPUs`
			`// available. In that case, we must not seed CUDA; it will fail!`
			`const auto cuda_num_gpus = detail::getCUDAHooks().getNumGPUs();`
			`if (hasCUDA() && cuda_num_gpus > 0) {`
			`for (const auto i : c10::irange(cuda_num_gpus)) {`
			`auto cuda_gen = globalContext().defaultGenerator(`
			`Device(at::kCUDA, static_cast<c10::DeviceIndex>(i)));`
			`{`
			`// See Note [Acquire lock when using random generators]`
			`std::lock_guard<std::mutex> lock(cuda_gen.mutex());`
			`cuda_gen.set_current_seed(seed);`
			`}`
			`}`
			`}`

			`const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs();`
			`if (hasXPU() && xpu_num_gpus) {`
			`for (const auto i : c10::irange(xpu_num_gpus)) {`
			`auto xpu_gen = globalContext().defaultGenerator(`
			`Device(at::kXPU, static_cast<c10::DeviceIndex>(i)));`
			`{`
			`// See Note [Acquire lock when using random generators]`
			`std::lock_guard<std::mutex> lock(xpu_gen.mutex());`
			`xpu_gen.set_current_seed(seed);`
			`}`
			`}`
			`}`

			`if (hasMPS()) {`
			`auto mps_gen = globalContext().defaultGenerator(c10::DeviceType::MPS);`
			`// See Note [Acquire lock when using random generators]`
			`std::lock_guard<std::mutex> lock(mps_gen.mutex());`
			`mps_gen.set_current_seed(seed);`
			`}`
			`}`

			// RAII guard that quickly disables TF32 within its scope.
			//
			// Background: when the global flag `allow_tf32` is set to true, cuBLAS
			// handles are automatically configured to use math mode
			// CUBLAS_TF32_TENSOR_OP_MATH. For some operators, such as addmv, TF32
			// offers no performance improvement but causes precision loss; this
			// guard exists to help that case.
			//
			// Usage:
			//     NoTF32Guard disable_tf32;
			struct TORCH_API NoTF32Guard {
			  NoTF32Guard();
			  ~NoTF32Guard();
			  static bool should_disable_tf32();

			 private:
			  // NOTE(review): presumably records whether this guard instance changed
			  // the state and so must restore it on destruction -- confirm.
			  bool changed{false};
			};

			// Guard type with a constructor/destructor pair and a static query.
			// NOTE(review): presumably marks the enclosing scope as a backward pass
			// on ROCm builds, with is_backward_pass() reporting that state --
			// confirm against the out-of-line implementation.
			struct TORCH_API ROCmBackwardPassGuard {
			  ROCmBackwardPassGuard();
			  ~ROCmBackwardPassGuard();
			  static bool is_backward_pass();
			};

			`} // namespace at`