703 lines
23 KiB
703 lines
23 KiB
"""This is a direct translation of nvvm.h"""
import logging
import re
import sys
import warnings
from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
import threading
from llvmlite import ir
from .error import NvvmError, NvvmSupportError, NvvmWarning
from .libs import get_libdevice, open_libdevice, open_cudalib
from numba.core import cgutils, config
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Opaque handle for compilation unit
nvvm_program = c_void_p

# Result code
nvvm_result = c_int

# Publish every result-code name as a module attribute whose value is the
# name's position in RESULT_CODE_NAMES.
for i, k in enumerate(RESULT_CODE_NAMES):
    setattr(sys.modules[__name__], k, i)
# Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.
_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
def is_available():
    """
    Return if libNVVM is available

    :return: ``True`` when the NVVM binding can be instantiated, ``False``
             when doing so raises ``NvvmSupportError``.
    """
    # NOTE(review): the ``try`` and the guarded call were absent from the
    # block as found (a bare ``except`` clause is not valid Python).
    # Instantiating NVVM is the operation in this module that raises
    # NvvmSupportError when the library cannot be opened - confirm.
    try:
        NVVM()
    except NvvmSupportError:
        return False
    return True
# Lock held by NVVM.__new__ while it checks for, and creates, the singleton.
_nvvm_lock = threading.Lock()
class NVVM(object):
'''Process-wide singleton.
# nvvmResult nvvmVersion(int *major, int *minor)
'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmCreateProgram(nvvmProgram *cu)
'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
# size_t size, const char *name)
'nvvmAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
# const char* buffer,
# size_t size,
# const char *name)
'nvvmLazyAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
# const char **options)
'nvvmCompileProgram': (
nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),
# nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetCompiledResultSize': (
nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
# int* minorDbg )
'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
# const char** options)
'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
# Singleton reference
def __new__(cls):
with _nvvm_lock:
if cls.__INSTANCE is None:
cls.__INSTANCE = inst = object.__new__(cls)
inst.driver = open_cudalib('nvvm')
except OSError as e:
cls.__INSTANCE = None
errmsg = ("libNVVM cannot be found. Do `conda install "
raise NvvmSupportError(errmsg % e)
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(inst.driver, name)
func.restype = proto[0]
func.argtypes = proto[1:]
setattr(inst, name, func)
return cls.__INSTANCE
def __init__(self):
    """Record the NVVM IR / debug-metadata versions and the supported CCs.

    The four values returned by ``get_ir_version()`` are stored, in order,
    as ``_majorIR``, ``_minorIR``, ``_majorDbg`` and ``_minorDbg``; the
    result of ``get_supported_ccs()`` is stored as ``_supported_ccs``.
    """
    versions = self.get_ir_version()
    attribute_names = ('_majorIR', '_minorIR', '_majorDbg', '_minorDbg')
    for position, attribute in enumerate(attribute_names):
        setattr(self, attribute, versions[position])
    self._supported_ccs = get_supported_ccs()
def data_layout(self):
    """Return the data layout string matching this NVVM's IR version.

    IR versions older than 1.8 get ``_datalayout_original``; 1.8 and newer
    get ``_datalayout_i128``.
    """
    ir_version = (self._majorIR, self._minorIR)
    return _datalayout_original if ir_version < (1, 8) else _datalayout_i128
@property
def supported_ccs(self):
    """Compute capabilities stored on this instance by ``__init__``.

    Exposed as a property because ``find_closest_arch`` reads
    ``NVVM().supported_ccs`` as a plain attribute and iterates over it; as
    an ordinary method that expression would yield a bound method, which is
    always truthy and cannot be enumerated.
    """
    return self._supported_ccs
def get_version(self):
    """Query ``nvvmVersion`` and return ``(major, minor)`` as Python ints.

    The status code is passed to ``check_error`` with the message
    ``'Failed to get version.'``.
    """
    slots = [c_int(), c_int()]
    status = self.nvvmVersion(*(byref(slot) for slot in slots))
    self.check_error(status, 'Failed to get version.')
    return tuple(slot.value for slot in slots)
def get_ir_version(self):
    """Query ``nvvmIRVersion`` and return its four version numbers.

    :return: ``(majorIR, minorIR, majorDbg, minorDbg)`` as Python ints.

    The status code is passed to ``check_error`` with the message
    ``'Failed to get IR version.'``.
    """
    # Output slots, in the order the C function fills them:
    # majorIR, minorIR, majorDbg, minorDbg.
    slots = [c_int() for _ in range(4)]
    status = self.nvvmIRVersion(*(byref(slot) for slot in slots))
    self.check_error(status, 'Failed to get IR version.')
    return tuple(slot.value for slot in slots)
def check_error(self, error, msg, exit=False):
    """Raise ``NvvmError`` when *error* is a non-zero NVVM result code.

    :param error: Result code returned by an NVVM API call; a falsy value
                  means success and nothing happens.
    :param msg: Description of the operation that failed.
    :param exit: Accepted for backward compatibility. The error is raised
                 whether or not it is set.
    :raises NvvmError: with *msg* and the symbolic name of the result code.
    """
    if not error:
        return
    # The block as found raised only when ``exit`` was true, so every caller
    # relying on the default (module loading, compilation, log retrieval)
    # silently ignored failures. Raise unconditionally instead.
    # NOTE(review): confirm whether ``exit=True`` was meant to do more than
    # raise (for example terminate the process).
    raise NvvmError(msg, RESULT_CODE_NAMES[error])
class CompilationUnit(object):
    """An NVVM program handle together with the operations performed on it.

    A program is created on construction and destroyed in ``__del__``.
    Modules are attached with :meth:`add_module` / :meth:`lazy_add_module`
    and :meth:`compile` verifies and compiles them.
    """

    def __init__(self):
        self.driver = NVVM()
        self._handle = nvvm_program()
        err = self.driver.nvvmCreateProgram(byref(self._handle))
        self.driver.check_error(err, 'Failed to create CU')

    def __del__(self):
        # __del__ also runs for instances whose __init__ raised, in which
        # case ``_handle`` may be missing or still null; there is then no
        # program to destroy.
        handle = getattr(self, '_handle', None)
        if handle is None or not handle.value:
            return
        driver = NVVM()
        err = driver.nvvmDestroyProgram(byref(handle))
        driver.check_error(err, 'Failed to destroy CU', exit=True)

    def add_module(self, buffer):
        """
        Add a module level NVVM IR to a compilation unit.
        - The buffer should contain an NVVM module IR either in the bitcode
          representation (LLVM3.0) or in the text representation.
        """
        err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
                                                 len(buffer), None)
        self.driver.check_error(err, 'Failed to add module')

    def lazy_add_module(self, buffer):
        """
        Lazily add an NVVM IR module to a compilation unit.
        The buffer should contain NVVM module IR either in the bitcode
        representation or in the text representation.
        """
        err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
                                                     len(buffer), None)
        self.driver.check_error(err, 'Failed to add module')

    def compile(self, **options):
        """Perform Compilation.

        Compilation options are accepted as keyword arguments, with the
        following considerations:

        - Underscores (`_`) in option names are converted to dashes (`-`), to
          match NVVM's option name format.
        - Options that take a value will be emitted in the form
          "-<name>=<value>".
        - Booleans passed as option values will be converted to integers.
        - Options which take no value (such as `-gen-lto`) should have a value
          of `None` passed in and will be emitted in the form "-<name>".

        For documentation on NVVM compilation options, see the CUDA Toolkit
        documentation.

        The program is verified, then compiled. Any non-empty program log is
        stored in ``self.log`` and reported as an ``NvvmWarning``.

        :return: The contents of the compiled-result buffer.
        """

        def stringify_option(k, v):
            k = k.replace('_', '-')

            if v is None:
                return f'-{k}'

            if isinstance(v, bool):
                v = int(v)

            return f'-{k}={v}'

        options = [stringify_option(k, v) for k, v in options.items()]

        c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
                                             for x in options])
        # verify
        err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
        self._try_error(err, 'Failed to verify\n')

        # compile
        err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
        self._try_error(err, 'Failed to compile\n')

        # get result
        reslen = c_size_t()
        err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))
        self._try_error(err, 'Failed to get size of compiled result.')

        ptxbuf = (c_char * reslen.value)()
        err = self.driver.nvvmGetCompiledResult(self._handle, ptxbuf)
        self._try_error(err, 'Failed to get compiled result.')

        # get log
        self.log = self.get_log()
        if self.log:
            warnings.warn(self.log, category=NvvmWarning)

        return ptxbuf[:]

    def _try_error(self, err, msg):
        """Report *err* through the driver, appending the program log."""
        # Only fetch the log when there is an error to report; retrieving it
        # takes further driver calls that are wasted on the success path.
        if err:
            self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))

    def get_log(self):
        """Return the program log as text, or ``''`` when it is empty."""
        reslen = c_size_t()
        err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
        self.driver.check_error(err, 'Failed to get compilation log size.')

        # Sizes of 0 or 1 are treated as "no log".
        if reslen.value > 1:
            logbuf = (c_char * reslen.value)()
            err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
            self.driver.check_error(err, 'Failed to get compilation log.')

            return logbuf.value.decode('utf8')

        return ''
(3, 5), (3, 7),
(5, 0), (5, 2), (5, 3),
(6, 0), (6, 1), (6, 2),
(7, 0), (7, 2), (7, 5),
(8, 0), (8, 6), (8, 7), (8, 9),
(9, 0)
# Maps CTK version -> (min supported cc, max supported cc) inclusive
(11, 2): ((3, 5), (8, 6)),
(11, 3): ((3, 5), (8, 6)),
(11, 4): ((3, 5), (8, 7)),
(11, 5): ((3, 5), (8, 7)),
(11, 6): ((3, 5), (8, 7)),
(11, 7): ((3, 5), (8, 7)),
(11, 8): ((3, 5), (9, 0)),
(12, 0): ((5, 0), (9, 0)),
(12, 1): ((5, 0), (9, 0)),
(12, 2): ((5, 0), (9, 0)),
(12, 3): ((5, 0), (9, 0)),
def ccs_supported_by_ctk(ctk_version):
    """Return the known compute capabilities usable with *ctk_version*.

    :param ctk_version: CUDA toolkit version, used as a key into
                        ``CTK_SUPPORTED``.
    :return: Tuple of the entries of ``COMPUTE_CAPABILITIES`` within the
             toolkit's inclusive (min, max) range, or - for a toolkit version
             with no entry - all entries that are at least
             ``config.CUDA_DEFAULT_PTX_CC``.
    """
    # The ``try`` was absent from the block as found; it is restored around
    # the lookup only, the one statement that can raise KeyError here.
    try:
        # For supported versions, we look up the range of supported CCs
        min_cc, max_cc = CTK_SUPPORTED[ctk_version]
    except KeyError:
        # For unsupported CUDA toolkit versions, all we can do is assume all
        # non-deprecated versions we are aware of are supported.
        return tuple(cc for cc in COMPUTE_CAPABILITIES
                     if cc >= config.CUDA_DEFAULT_PTX_CC)

    return tuple(cc for cc in COMPUTE_CAPABILITIES
                 if min_cc <= cc <= max_cc)
def get_supported_ccs():
    """Return the compute capabilities supported by the CUDA runtime in use.

    :return: ``()`` when the runtime version cannot be determined or is
             older than the oldest entry of ``CTK_SUPPORTED`` (a warning is
             issued in the latter case); otherwise the result of
             ``ccs_supported_by_ctk`` for the runtime version.
    """
    try:
        from numba.cuda.cudadrv.runtime import runtime
        cudart_version = runtime.get_version()
    except Exception:
        # We can't support anything if there's an error getting the runtime
        # version (e.g. if it's not present or there's another issue).
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return ()

    # Ensure the minimum CTK version requirement is met
    min_cudart = min(CTK_SUPPORTED)
    if cudart_version < min_cudart:
        ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
        unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
                           f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
                           "required version.")
        # The message was built but never surfaced in the block as found.
        warnings.warn(unsupported_ver)
        return ()

    return ccs_supported_by_ctk(cudart_version)
def find_closest_arch(mycc):
    """
    Given a compute capability, return the closest compute capability supported
    by the CUDA toolkit.

    :param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
    :return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
    :raises NvvmSupportError: when no compute capabilities are supported, or
                              when *mycc* is lower than the first supported
                              one.
    """
    supported_ccs = NVVM().supported_ccs

    if not supported_ccs:
        msg = "No supported GPU compute capabilities found. " \
              "Please check your cudatoolkit version matches your CUDA version."
        raise NvvmSupportError(msg)

    # Walk the supported CCs in order, remembering the last one that did not
    # exceed the request.
    previous = None
    for cc in supported_ccs:
        if cc == mycc:
            # Matches
            return cc
        if cc > mycc:
            # Exceeded
            if previous is None:
                # CC lower than supported. A space now separates the two
                # halves of the message, which previously ran together as
                # "supported(requires".
                msg = "GPU compute capability %d.%d is not supported " \
                      "(requires >=%d.%d)" % (mycc + cc)
                raise NvvmSupportError(msg)
            # return the previous CC
            return previous
        previous = cc

    # CC higher than supported
    return supported_ccs[-1]  # Choose the highest
def get_arch_option(major, minor):
    """Matches with the closest architecture option

    :param major: Major part of the device compute capability.
    :param minor: Minor part of the device compute capability.
    :return: ``'compute_<MAJOR><MINOR>'`` for ``config.FORCE_CUDA_CC`` when
             that is set, otherwise for the closest supported capability.
    """
    if config.FORCE_CUDA_CC:
        arch = config.FORCE_CUDA_CC
    else:
        # Without this ``else`` the forced value was always overwritten by
        # the lookup below, making FORCE_CUDA_CC a no-op.
        arch = find_closest_arch((major, minor))
    return 'compute_%d%d' % arch
MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
Please ensure you have package cudatoolkit >= 11.0
Install package by:
conda install cudatoolkit
class LibDevice(object):
    """Accessor for the libdevice contents returned by ``open_libdevice()``.

    The contents are loaded on first construction and shared by every later
    instance through the class attribute ``_cache_``.
    """

    # Class-level cache of the value returned by open_libdevice().
    _cache_ = None

    def __init__(self):
        # Read and write the cache on the class. Assigning through ``self``
        # would only create an instance attribute, leaving the class-level
        # cache empty so that every instance reloaded libdevice.
        if LibDevice._cache_ is None:
            if get_libdevice() is None:
                # NOTE(review): the block as found did not raise here, which
                # left MISSING_LIBDEVICE_FILE_MSG unused; the exception type
                # is an assumption - confirm.
                raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
            LibDevice._cache_ = open_libdevice()

        self.bc = LibDevice._cache_

    def get(self):
        """Return the cached libdevice contents."""
        return self.bc
# IR fragment: a volatile cmpxchg on %iptr (expected %old, replacement %new,
# monotonic ordering for both success and failure), with element 0 of the
# result pair bound to %cas. ``{Ti}`` is filled in by ir_cas().
cas_nvvm = """
%cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
%cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
""" # noqa: E501
# Translation of code from CUDA Programming Guide v6.5, section B.12
#
# Template for ``___numba_atomic_{T}_{FUNC}``: applies ``{OP}`` to the old
# value and %val, repeating while %cas differs from %old. Formatted by
# ir_numba_atomic_binary() with T, Ti, OP, FUNC and CAS.
# NOTE(review): the block labels referenced by the br/phi instructions
# (%entry, %attempt, %done), a ``{CAS}`` placeholder for the CAS argument the
# formatter passes, and the closing ``}}`` do not appear in this text - the
# template looks truncated; confirm against upstream. The same applies to the
# three templates that follow.
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
%iptr = bitcast {T}* %ptr to {Ti}*
%old2 = load volatile {Ti}, {Ti}* %iptr
br label %attempt
%old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
%dold = bitcast {Ti} %old to {T}
%dnew = {OP} {T} %dold, %val
%new = bitcast {T} %dnew to {Ti}
%repeat = icmp ne {Ti} %cas, %old
br i1 %repeat, label %attempt, label %done
%result = bitcast {Ti} %old to {T}
ret {T} %result
""" # noqa: E501
# Template for ``___numba_atomic_{Tu}_inc``: the new value is %old + 1 when
# %old is unsigned-less-than %val, otherwise 0; returns %old. Formatted by
# ir_numba_atomic_inc() with T, Tu and CAS.
ir_numba_atomic_inc_template = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%bndchk = icmp ult {T} %old, %val
%inc = add {T} %old, 1
%new = select i1 %bndchk, {T} %inc, {T} 0
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
ret {T} %old
""" # noqa: E501
# Template for ``___numba_atomic_{Tu}_dec``: the new value is %old - 1 when
# that is unsigned-less-than %val, otherwise %val; returns %old. Formatted by
# ir_numba_atomic_dec() with T, Tu and CAS.
ir_numba_atomic_dec_template = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%dec = add {T} %old, -1
%bndchk = icmp ult {T} %dec, %val
%new = select i1 %bndchk, {T} %dec, {T} %val
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
ret {T} %old
""" # noqa: E501
# Template for ``___numba_atomic_{T}_{NAN}{FUNC}`` (floating-point min / max
# and their NaN-aware variants). Formatted by ir_numba_atomic_minmax() with
# T, Ti, NAN, OP, PTR_OR_VAL, FUNC and CAS.
ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
%ptrval = load volatile {T}, {T}* %ptr
; Return early when:
; - For nanmin / nanmax when val is a NaN
; - For min / max when val or ptr is a NaN
%early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
br i1 %early_return, label %done, label %lt_check
%dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
; Continue attempts if dold less or greater than val (depending on whether min or max)
; or if dold is NaN (for nanmin / nanmax)
%cmp = fcmp {OP} {T} %dold, %val
br i1 %cmp, label %attempt, label %done
; Attempt to swap in the value
%old = bitcast {T} %dold to {Ti}
%iptr = bitcast {T}* %ptr to {Ti}*
%new = bitcast {T} %val to {Ti}
%dcas = bitcast {Ti} %cas to {T}
br label %lt_check
ret {T} %ptrval
""" # noqa: E501
def ir_cas(Ti):
    """Return the ``cas_nvvm`` fragment formatted for integer type *Ti*."""
    substitutions = {'Ti': Ti}
    return cas_nvvm.format(**substitutions)
def ir_numba_atomic_binary(T, Ti, OP, FUNC):
    """Format the atomic binary-operation template.

    ``T``, ``Ti``, ``OP`` and ``FUNC`` are substituted as given; ``CAS`` is
    the fragment produced by ``ir_cas(Ti)``.
    """
    cas_fragment = ir_cas(Ti)
    return ir_numba_atomic_binary_template.format(
        T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=cas_fragment)
def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
    """Format the atomic min / max template.

    All six arguments are substituted as given; ``CAS`` is the fragment
    produced by ``ir_cas(Ti)``.
    """
    cas_fragment = ir_cas(Ti)
    return ir_numba_atomic_minmax_template.format(
        T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL, FUNC=FUNC,
        CAS=cas_fragment)
def ir_numba_atomic_inc(T, Tu):
    """Format the atomic increment template.

    ``T`` and ``Tu`` are substituted as given; ``CAS`` is the fragment
    produced by ``ir_cas(T)``.
    """
    substitutions = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_inc_template.format(**substitutions)
def ir_numba_atomic_dec(T, Tu):
    """Format the atomic decrement template.

    ``T`` and ``Tu`` are substituted as given; ``CAS`` is the fragment
    produced by ``ir_cas(T)``.
    """
    substitutions = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_dec_template.format(**substitutions)
def llvm_replace(llvmir):
    """Substitute NVVM-compatible IR into *llvmir* and return the result.

    Each known ``declare`` line for a ``___numba_atomic_*`` helper is
    replaced by the text generated for that helper, every occurrence of
    ``immarg`` is removed, and the result is passed through
    ``llvm140_to_70_ir``.

    :param llvmir: LLVM IR as a string.
    :return: The rewritten IR as a string.
    """
    # (text to find, replacement) pairs, applied in order. The closing
    # bracket of this list was absent from the block as found.
    replacements = [
        ('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")',    # noqa: E501
         ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
        ('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")',    # noqa: E501
         ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
        ('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")',    # noqa: E501
         ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
        ('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
         ir_numba_atomic_inc(T='i64', Tu='u64')),
        ('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
         ir_numba_atomic_dec(T='i64', Tu='u64')),
        ('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")',    # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
                                PTR_OR_VAL='ptr', FUNC='max')),
        ('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")',    # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
                                PTR_OR_VAL='ptr', FUNC='max')),
        ('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")',    # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
                                PTR_OR_VAL='ptr', FUNC='min')),
        ('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")',    # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
                                PTR_OR_VAL='ptr', FUNC='min')),
        ('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")',    # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
                                PTR_OR_VAL='', FUNC='max')),
        ('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")',    # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
                                PTR_OR_VAL='', FUNC='max')),
        ('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")',    # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
                                PTR_OR_VAL='', FUNC='min')),
        ('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")',    # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
                                PTR_OR_VAL='', FUNC='min')),
        ('immarg', ''),
    ]

    for decl, fn in replacements:
        llvmir = llvmir.replace(decl, fn)

    llvmir = llvm140_to_70_ir(llvmir)

    return llvmir
def llvm_to_ptx(llvmir, **opts):
    """Compile LLVM IR with NVVM and return the compilation result.

    :param llvmir: A single IR module as a string, or an iterable of such
                   strings.
    :param opts: Options forwarded to ``CompilationUnit.compile``. A truthy
                 ``fastmath`` is not forwarded itself; it sets ``ftz`` and
                 ``fma`` to ``True`` and ``prec_div`` and ``prec_sqrt`` to
                 ``False``.
    :return: Whatever ``CompilationUnit.compile`` returns.
    """
    if isinstance(llvmir, str):
        llvmir = [llvmir]

    if opts.pop('fastmath', False):
        # NOTE(review): the call wrapping these entries was absent from the
        # block as found; ``update`` (overriding explicitly passed values)
        # is assumed - confirm.
        opts.update({
            'ftz': True,
            'fma': True,
            'prec_div': False,
            'prec_sqrt': False,
        })

    cu = CompilationUnit()
    libdevice = LibDevice()

    # NOTE(review): the block as found rewrote each module but never added
    # it to the compilation unit, and never used ``libdevice``. The two
    # calls below restore that; the UTF-8 encoding and the lazy addition of
    # libdevice after the user modules are assumptions - confirm.
    for mod in llvmir:
        mod = llvm_replace(mod)
        cu.add_module(mod.encode('utf8'))
    cu.lazy_add_module(libdevice.get())

    return cu.compile(**opts)
re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")


def llvm140_to_70_ir(ir):
    """
    Convert LLVM 14.0 IR for LLVM 7.0.

    ``willreturn`` is dropped from every ``attributes #N = { ... }`` line
    that matches ``re_attributes_def``; all other lines pass through
    unchanged.

    :param ir: LLVM IR as a string.
    :return: The converted IR, with lines joined by ``'\\n'``.
    """
    buf = []
    for line in ir.splitlines():
        if line.startswith('attributes #'):
            # Remove function attributes unsupported by LLVM 7.0
            m = re_attributes_def.match(line)
            # Attribute groups the pattern does not cover (for instance
            # quoted "key"="value" attributes) are left untouched instead of
            # failing on a ``None`` match.
            if m is not None:
                attrs = ' '.join(a for a in m.group(1).split()
                                 if a != 'willreturn')
                # Splice by position so only the matched attribute list is
                # rewritten.
                line = line[:m.start(1)] + attrs + line[m.end(1):]
        # Every line is kept; without this append the result was always ''.
        buf.append(line)
    return '\n'.join(buf)
def set_cuda_kernel(function):
    """
    Mark a function as a CUDA kernel. Kernels have the following requirements:

    - Metadata that marks them as a kernel.
    - Addition to the @llvm.used list, so that they will not be discarded.
    - The noinline attribute is not permitted, because this causes NVVM to emit
      a warning, which counts as failing IR verification.

    Presently it is assumed that there is one kernel per module, which holds
    for Numba-jitted functions. If this changes in future or this function is
    to be used externally, this function may need modification to add to the
    @llvm.used list rather than creating it.

    :param function: The IR function to mark; its ``module`` attribute is the
                     module that receives the metadata and ``llvm.used``.
    """
    module = function.module

    # Add kernel metadata
    mdstr = ir.MetaDataString(module, "kernel")
    mdvalue = ir.Constant(ir.IntType(32), 1)
    md = module.add_metadata((function, mdstr, mdvalue))

    nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations')
    # NOTE(review): the block as found created ``md`` and fetched ``nmd``
    # but never linked them, so the annotation was never recorded - confirm.
    nmd.add(md)

    # Create the used list
    ptrty = ir.IntType(8).as_pointer()
    usedty = ir.ArrayType(ptrty, 1)

    fnptr = function.bitcast(ptrty)

    llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used')
    llvm_used.linkage = 'appending'
    llvm_used.section = 'llvm.metadata'
    llvm_used.initializer = ir.Constant(usedty, [fnptr])

    # Remove 'noinline' if it is present.
    # NOTE(review): only the comment was present in the block as found;
    # ``discard`` is used so that an absent attribute is not an error.
    function.attributes.discard('noinline')
def add_ir_version(mod):
    """Add NVVM IR version to module"""
    # The version operands come from the NVVM library in use, so the module
    # always declares the IR version that library reports.
    int32 = ir.IntType(32)
    version_operands = list(map(int32, NVVM().get_ir_version()))
    version_node = mod.add_metadata(version_operands)
    mod.add_named_metadata('nvvmir.version', version_node)