""" This is a direct translation of nvvm.h """ import logging import re import sys import warnings from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref, c_char) import threading from llvmlite import ir from .error import NvvmError, NvvmSupportError, NvvmWarning from .libs import get_libdevice, open_libdevice, open_cudalib from numba.core import cgutils, config logger = logging.getLogger(__name__) ADDRSPACE_GENERIC = 0 ADDRSPACE_GLOBAL = 1 ADDRSPACE_SHARED = 3 ADDRSPACE_CONSTANT = 4 ADDRSPACE_LOCAL = 5 # Opaque handle for compilation unit nvvm_program = c_void_p # Result code nvvm_result = c_int RESULT_CODE_NAMES = ''' NVVM_SUCCESS NVVM_ERROR_OUT_OF_MEMORY NVVM_ERROR_PROGRAM_CREATION_FAILURE NVVM_ERROR_IR_VERSION_MISMATCH NVVM_ERROR_INVALID_INPUT NVVM_ERROR_INVALID_PROGRAM NVVM_ERROR_INVALID_IR NVVM_ERROR_INVALID_OPTION NVVM_ERROR_NO_MODULE_IN_PROGRAM NVVM_ERROR_COMPILATION '''.split() for i, k in enumerate(RESULT_CODE_NAMES): setattr(sys.modules[__name__], k, i) # Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support. _datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-' 'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-' 'v64:64:64-v128:128:128-n16:32:64') _datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-' 'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-' 'v64:64:64-v128:128:128-n16:32:64') def is_available(): """ Return if libNVVM is available """ try: NVVM() except NvvmSupportError: return False else: return True _nvvm_lock = threading.Lock() class NVVM(object): '''Process-wide singleton. ''' _PROTOTYPES = { # nvvmResult nvvmVersion(int *major, int *minor) 'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)), # nvvmResult nvvmCreateProgram(nvvmProgram *cu) 'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)), # nvvmResult nvvmDestroyProgram(nvvmProgram *cu) 'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)), # nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer, # size_t size, const char *name) 'nvvmAddModuleToProgram': ( nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p), # nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu, # const char* buffer, # size_t size, # const char *name) 'nvvmLazyAddModuleToProgram': ( nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p), # nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions, # const char **options) 'nvvmCompileProgram': ( nvvm_result, nvvm_program, c_int, POINTER(c_char_p)), # nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu, # size_t *bufferSizeRet) 'nvvmGetCompiledResultSize': ( nvvm_result, nvvm_program, POINTER(c_size_t)), # nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer) 'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p), # nvvmResult nvvmGetProgramLogSize(nvvmProgram cu, # size_t *bufferSizeRet) 'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)), # nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer) 'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p), # nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg, # int* minorDbg ) 'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int), POINTER(c_int), POINTER(c_int)), # nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions, # const char** options) 'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int, POINTER(c_char_p)) } # Singleton reference __INSTANCE = None def __new__(cls): with _nvvm_lock: if cls.__INSTANCE is None: cls.__INSTANCE = inst = object.__new__(cls) try: inst.driver = open_cudalib('nvvm') except OSError as e: cls.__INSTANCE = None errmsg = ("libNVVM cannot be found. Do `conda install " "cudatoolkit`:\n%s") raise NvvmSupportError(errmsg % e) # Find & populate functions for name, proto in inst._PROTOTYPES.items(): func = getattr(inst.driver, name) func.restype = proto[0] func.argtypes = proto[1:] setattr(inst, name, func) return cls.__INSTANCE def __init__(self): ir_versions = self.get_ir_version() self._majorIR = ir_versions[0] self._minorIR = ir_versions[1] self._majorDbg = ir_versions[2] self._minorDbg = ir_versions[3] self._supported_ccs = get_supported_ccs() @property def data_layout(self): if (self._majorIR, self._minorIR) < (1, 8): return _datalayout_original else: return _datalayout_i128 @property def supported_ccs(self): return self._supported_ccs def get_version(self): major = c_int() minor = c_int() err = self.nvvmVersion(byref(major), byref(minor)) self.check_error(err, 'Failed to get version.') return major.value, minor.value def get_ir_version(self): majorIR = c_int() minorIR = c_int() majorDbg = c_int() minorDbg = c_int() err = self.nvvmIRVersion(byref(majorIR), byref(minorIR), byref(majorDbg), byref(minorDbg)) self.check_error(err, 'Failed to get IR version.') return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value def check_error(self, error, msg, exit=False): if error: exc = NvvmError(msg, RESULT_CODE_NAMES[error]) if exit: print(exc) sys.exit(1) else: raise exc class CompilationUnit(object): def __init__(self): self.driver = NVVM() self._handle = nvvm_program() err = self.driver.nvvmCreateProgram(byref(self._handle)) self.driver.check_error(err, 'Failed to create CU') def __del__(self): driver = NVVM() err = driver.nvvmDestroyProgram(byref(self._handle)) driver.check_error(err, 'Failed to destroy CU', exit=True) def add_module(self, buffer): """ Add a module level NVVM IR to a compilation unit. - The buffer should contain an NVVM module IR either in the bitcode representation (LLVM3.0) or in the text representation. """ err = self.driver.nvvmAddModuleToProgram(self._handle, buffer, len(buffer), None) self.driver.check_error(err, 'Failed to add module') def lazy_add_module(self, buffer): """ Lazily add an NVVM IR module to a compilation unit. The buffer should contain NVVM module IR either in the bitcode representation or in the text representation. """ err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer, len(buffer), None) self.driver.check_error(err, 'Failed to add module') def compile(self, **options): """Perform Compilation. Compilation options are accepted as keyword arguments, with the following considerations: - Underscores (`_`) in option names are converted to dashes (`-`), to match NVVM's option name format. - Options that take a value will be emitted in the form "-=". - Booleans passed as option values will be converted to integers. - Options which take no value (such as `-gen-lto`) should have a value of `None` passed in and will be emitted in the form "-". For documentation on NVVM compilation options, see the CUDA Toolkit Documentation: https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc """ def stringify_option(k, v): k = k.replace('_', '-') if v is None: return f'-{k}' if isinstance(v, bool): v = int(v) return f'-{k}={v}' options = [stringify_option(k, v) for k, v in options.items()] c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8')) for x in options]) # verify err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts) self._try_error(err, 'Failed to verify\n') # compile err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts) self._try_error(err, 'Failed to compile\n') # get result reslen = c_size_t() err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen)) self._try_error(err, 'Failed to get size of compiled result.') ptxbuf = (c_char * reslen.value)() err = self.driver.nvvmGetCompiledResult(self._handle, ptxbuf) self._try_error(err, 'Failed to get compiled result.') # get log self.log = self.get_log() if self.log: warnings.warn(self.log, category=NvvmWarning) return ptxbuf[:] def _try_error(self, err, msg): self.driver.check_error(err, "%s\n%s" % (msg, self.get_log())) def get_log(self): reslen = c_size_t() err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen)) self.driver.check_error(err, 'Failed to get compilation log size.') if reslen.value > 1: logbuf = (c_char * reslen.value)() err = self.driver.nvvmGetProgramLog(self._handle, logbuf) self.driver.check_error(err, 'Failed to get compilation log.') return logbuf.value.decode('utf8') # populate log attribute return '' COMPUTE_CAPABILITIES = ( (3, 5), (3, 7), (5, 0), (5, 2), (5, 3), (6, 0), (6, 1), (6, 2), (7, 0), (7, 2), (7, 5), (8, 0), (8, 6), (8, 7), (8, 9), (9, 0) ) # Maps CTK version -> (min supported cc, max supported cc) inclusive CTK_SUPPORTED = { (11, 2): ((3, 5), (8, 6)), (11, 3): ((3, 5), (8, 6)), (11, 4): ((3, 5), (8, 7)), (11, 5): ((3, 5), (8, 7)), (11, 6): ((3, 5), (8, 7)), (11, 7): ((3, 5), (8, 7)), (11, 8): ((3, 5), (9, 0)), (12, 0): ((5, 0), (9, 0)), (12, 1): ((5, 0), (9, 0)), (12, 2): ((5, 0), (9, 0)), (12, 3): ((5, 0), (9, 0)), } def ccs_supported_by_ctk(ctk_version): try: # For supported versions, we look up the range of supported CCs min_cc, max_cc = CTK_SUPPORTED[ctk_version] return tuple([cc for cc in COMPUTE_CAPABILITIES if min_cc <= cc <= max_cc]) except KeyError: # For unsupported CUDA toolkit versions, all we can do is assume all # non-deprecated versions we are aware of are supported. return tuple([cc for cc in COMPUTE_CAPABILITIES if cc >= config.CUDA_DEFAULT_PTX_CC]) def get_supported_ccs(): try: from numba.cuda.cudadrv.runtime import runtime cudart_version = runtime.get_version() except: # noqa: E722 # We can't support anything if there's an error getting the runtime # version (e.g. if it's not present or there's another issue) _supported_cc = () return _supported_cc # Ensure the minimum CTK version requirement is met min_cudart = min(CTK_SUPPORTED) if cudart_version < min_cudart: _supported_cc = () ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}" unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - " f"{min_cudart[0]}.{min_cudart[1]} is the minimum " "required version.") warnings.warn(unsupported_ver) return _supported_cc _supported_cc = ccs_supported_by_ctk(cudart_version) return _supported_cc def find_closest_arch(mycc): """ Given a compute capability, return the closest compute capability supported by the CUDA toolkit. :param mycc: Compute capability as a tuple ``(MAJOR, MINOR)`` :return: Closest supported CC as a tuple ``(MAJOR, MINOR)`` """ supported_ccs = NVVM().supported_ccs if not supported_ccs: msg = "No supported GPU compute capabilities found. " \ "Please check your cudatoolkit version matches your CUDA version." raise NvvmSupportError(msg) for i, cc in enumerate(supported_ccs): if cc == mycc: # Matches return cc elif cc > mycc: # Exceeded if i == 0: # CC lower than supported msg = "GPU compute capability %d.%d is not supported" \ "(requires >=%d.%d)" % (mycc + cc) raise NvvmSupportError(msg) else: # return the previous CC return supported_ccs[i - 1] # CC higher than supported return supported_ccs[-1] # Choose the highest def get_arch_option(major, minor): """Matches with the closest architecture option """ if config.FORCE_CUDA_CC: arch = config.FORCE_CUDA_CC else: arch = find_closest_arch((major, minor)) return 'compute_%d%d' % arch MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file. Please ensure you have package cudatoolkit >= 11.0 Install package by: conda install cudatoolkit ''' class LibDevice(object): _cache_ = None def __init__(self): if self._cache_ is None: if get_libdevice() is None: raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG) self._cache_ = open_libdevice() self.bc = self._cache_ def get(self): return self.bc cas_nvvm = """ %cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic %cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0 """ # noqa: E501 # Translation of code from CUDA Programming Guide v6.5, section B.12 ir_numba_atomic_binary_template = """ define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{ entry: %iptr = bitcast {T}* %ptr to {Ti}* %old2 = load volatile {Ti}, {Ti}* %iptr br label %attempt attempt: %old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ] %dold = bitcast {Ti} %old to {T} %dnew = {OP} {T} %dold, %val %new = bitcast {T} %dnew to {Ti} {CAS} %repeat = icmp ne {Ti} %cas, %old br i1 %repeat, label %attempt, label %done done: %result = bitcast {Ti} %old to {T} ret {T} %result }} """ # noqa: E501 ir_numba_atomic_inc_template = """ define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{ entry: %old2 = load volatile {T}, {T}* %iptr br label %attempt attempt: %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ] %bndchk = icmp ult {T} %old, %val %inc = add {T} %old, 1 %new = select i1 %bndchk, {T} %inc, {T} 0 {CAS} %repeat = icmp ne {T} %cas, %old br i1 %repeat, label %attempt, label %done done: ret {T} %old }} """ # noqa: E501 ir_numba_atomic_dec_template = """ define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{ entry: %old2 = load volatile {T}, {T}* %iptr br label %attempt attempt: %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ] %dec = add {T} %old, -1 %bndchk = icmp ult {T} %dec, %val %new = select i1 %bndchk, {T} %dec, {T} %val {CAS} %repeat = icmp ne {T} %cas, %old br i1 %repeat, label %attempt, label %done done: ret {T} %old }} """ # noqa: E501 ir_numba_atomic_minmax_template = """ define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{ entry: %ptrval = load volatile {T}, {T}* %ptr ; Return early when: ; - For nanmin / nanmax when val is a NaN ; - For min / max when val or ptr is a NaN %early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val br i1 %early_return, label %done, label %lt_check lt_check: %dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ] ; Continue attempts if dold less or greater than val (depending on whether min or max) ; or if dold is NaN (for nanmin / nanmax) %cmp = fcmp {OP} {T} %dold, %val br i1 %cmp, label %attempt, label %done attempt: ; Attempt to swap in the value %old = bitcast {T} %dold to {Ti} %iptr = bitcast {T}* %ptr to {Ti}* %new = bitcast {T} %val to {Ti} {CAS} %dcas = bitcast {Ti} %cas to {T} br label %lt_check done: ret {T} %ptrval }} """ # noqa: E501 def ir_cas(Ti): return cas_nvvm.format(Ti=Ti) def ir_numba_atomic_binary(T, Ti, OP, FUNC): params = dict(T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti)) return ir_numba_atomic_binary_template.format(**params) def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC): params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL, FUNC=FUNC, CAS=ir_cas(Ti)) return ir_numba_atomic_minmax_template.format(**params) def ir_numba_atomic_inc(T, Tu): return ir_numba_atomic_inc_template.format(T=T, Tu=Tu, CAS=ir_cas(T)) def ir_numba_atomic_dec(T, Tu): return ir_numba_atomic_dec_template.format(T=T, Tu=Tu, CAS=ir_cas(T)) def llvm_replace(llvmir): replacements = [ ('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501 ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')), ('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501 ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')), ('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501 ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')), ('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")', ir_numba_atomic_inc(T='i64', Tu='u64')), ('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")', ir_numba_atomic_dec(T='i64', Tu='u64')), ('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501 ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt', PTR_OR_VAL='ptr', FUNC='max')), ('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501 ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt', PTR_OR_VAL='ptr', FUNC='max')), ('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501 ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt', PTR_OR_VAL='ptr', FUNC='min')), ('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501 ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt', PTR_OR_VAL='ptr', FUNC='min')), ('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501 ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult', PTR_OR_VAL='', FUNC='max')), ('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501 ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult', PTR_OR_VAL='', FUNC='max')), ('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501 ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt', PTR_OR_VAL='', FUNC='min')), ('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501 ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt', PTR_OR_VAL='', FUNC='min')), ('immarg', '') ] for decl, fn in replacements: llvmir = llvmir.replace(decl, fn) llvmir = llvm140_to_70_ir(llvmir) return llvmir def llvm_to_ptx(llvmir, **opts): if isinstance(llvmir, str): llvmir = [llvmir] if opts.pop('fastmath', False): opts.update({ 'ftz': True, 'fma': True, 'prec_div': False, 'prec_sqrt': False, }) cu = CompilationUnit() libdevice = LibDevice() for mod in llvmir: mod = llvm_replace(mod) cu.add_module(mod.encode('utf8')) cu.lazy_add_module(libdevice.get()) return cu.compile(**opts) re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }") def llvm140_to_70_ir(ir): """ Convert LLVM 14.0 IR for LLVM 7.0. """ buf = [] for line in ir.splitlines(): if line.startswith('attributes #'): # Remove function attributes unsupported by LLVM 7.0 m = re_attributes_def.match(line) attrs = m.group(1).split() attrs = ' '.join(a for a in attrs if a != 'willreturn') line = line.replace(m.group(1), attrs) buf.append(line) return '\n'.join(buf) def set_cuda_kernel(function): """ Mark a function as a CUDA kernel. Kernels have the following requirements: - Metadata that marks them as a kernel. - Addition to the @llvm.used list, so that they will not be discarded. - The noinline attribute is not permitted, because this causes NVVM to emit a warning, which counts as failing IR verification. Presently it is assumed that there is one kernel per module, which holds for Numba-jitted functions. If this changes in future or this function is to be used externally, this function may need modification to add to the @llvm.used list rather than creating it. """ module = function.module # Add kernel metadata mdstr = ir.MetaDataString(module, "kernel") mdvalue = ir.Constant(ir.IntType(32), 1) md = module.add_metadata((function, mdstr, mdvalue)) nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations') nmd.add(md) # Create the used list ptrty = ir.IntType(8).as_pointer() usedty = ir.ArrayType(ptrty, 1) fnptr = function.bitcast(ptrty) llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used') llvm_used.linkage = 'appending' llvm_used.section = 'llvm.metadata' llvm_used.initializer = ir.Constant(usedty, [fnptr]) # Remove 'noinline' if it is present. function.attributes.discard('noinline') def add_ir_version(mod): """Add NVVM IR version to module""" # We specify the IR version to match the current NVVM's IR version i32 = ir.IntType(32) ir_versions = [i32(v) for v in NVVM().get_ir_version()] md_ver = mod.add_metadata(ir_versions) mod.add_named_metadata('nvvmir.version', md_ver)