383 lines
14 KiB
Python
383 lines
14 KiB
Python
|
from llvmlite import ir
|
||
|
from numba.core.typing.templates import ConcreteTemplate
|
||
|
from numba.core import types, typing, funcdesc, config, compiler, sigutils
|
||
|
from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
|
||
|
DefaultPassBuilder, Flags, Option,
|
||
|
CompileResult)
|
||
|
from numba.core.compiler_lock import global_compiler_lock
|
||
|
from numba.core.compiler_machinery import (LoweringPass,
|
||
|
PassManager, register_pass)
|
||
|
from numba.core.errors import NumbaInvalidConfigWarning
|
||
|
from numba.core.typed_passes import (IRLegalization, NativeLowering,
|
||
|
AnnotateTypes)
|
||
|
from warnings import warn
|
||
|
from numba.cuda.api import get_current_device
|
||
|
from numba.cuda.target import CUDACABICallConv
|
||
|
|
||
|
|
||
|
def _nvvm_options_type(x):
|
||
|
if x is None:
|
||
|
return None
|
||
|
|
||
|
else:
|
||
|
assert isinstance(x, dict)
|
||
|
return x
|
||
|
|
||
|
|
||
|
class CUDAFlags(Flags):
    """Compiler flags extended with two CUDA-specific options."""

    # NVVM options: either None (the default) or a dict, as checked by
    # _nvvm_options_type.
    nvvm_options = Option(
        doc="NVVM options",
        default=None,
        type=_nvvm_options_type,
    )
    # Compute capability, declared with type ``tuple``; None by default.
    compute_capability = Option(
        doc="Compute Capability",
        default=None,
        type=tuple,
    )
|
||
|
|
||
|
|
||
|
# The CUDACompileResult (CCR) has a specially-defined entry point equal to its
# id. This is because the entry point is used as a key into a dict of
# overloads by the base dispatcher. The id of the CCR is the only small and
# unique property of a CompileResult in the CUDA target (cf. the CPU target,
# which uses its entry_point, which is a pointer value).
#
# This does feel a little hackish, and there are two ways in which this could
# be improved:
#
# 1. We could change the core of Numba so that each CompileResult has its own
#    unique ID that can be used as a key - e.g. a count, similar to the way in
#    which types have unique counts.
# 2. At some future time when kernel launch uses a compiled function, the entry
#    point will no longer need to be a synthetic value, but will instead be a
#    pointer to the compiled function as in the CPU target.
|
||
|
|
||
|
class CUDACompileResult(CompileResult):
    """CompileResult whose ``entry_point`` is the ``id`` of the instance."""

    @property
    def entry_point(self):
        """Return ``id(self)`` as a synthetic entry point value."""
        return id(self)
|
||
|
|
||
|
|
||
|
def cuda_compile_result(**entries):
    """Build a CUDACompileResult from keyword entries.

    The entries are passed through ``sanitize_compile_result_entries`` before
    being used to construct the result.
    """
    sanitized = sanitize_compile_result_entries(entries)
    return CUDACompileResult(**sanitized)
|
||
|
|
||
|
|
||
|
@register_pass(mutates_CFG=True, analysis_only=False)
class CUDABackend(LoweringPass):
    """Lowering pass that packages the lowering output in a compile result."""

    _name = "cuda_backend"

    def __init__(self):
        LoweringPass.__init__(self)

    def run_pass(self, state):
        """
        Back-end: Packages lowering output in a compile result
        """
        # Output stored under 'cr' by the preceding lowering step; it is
        # replaced below by the packaged CUDA compile result.
        lowering_output = state['cr']
        func_signature = typing.signature(state.return_type, *state.args)

        result_entries = dict(
            typing_context=state.typingctx,
            target_context=state.targetctx,
            typing_error=state.status.fail_reason,
            type_annotation=state.type_annotation,
            library=state.library,
            call_helper=lowering_output.call_helper,
            signature=func_signature,
            fndesc=lowering_output.fndesc,
        )
        state.cr = cuda_compile_result(**result_entries)
        return True
|
||
|
|
||
|
|
||
|
@register_pass(mutates_CFG=False, analysis_only=False)
class CreateLibrary(LoweringPass):
    """
    Create a CUDACodeLibrary for the NativeLowering pass to populate. The
    NativeLowering pass will create a code library if none exists, but we need
    to set it up with nvvm_options from the flags if they are present.
    """

    _name = "create_library"

    def __init__(self):
        LoweringPass.__init__(self)

    def run_pass(self, state):
        """Create the library for ``state`` and store it on the state."""
        # The library is named after the function's qualified name and
        # receives the NVVM options carried by the flags.
        state.library = library = state.targetctx.codegen().create_library(
            state.func_id.func_qualname,
            nvvm_options=state.flags.nvvm_options,
        )
        # Enable object caching upfront so that the library can be serialized.
        library.enable_object_caching()

        return True
|
||
|
|
||
|
|
||
|
class CUDACompiler(CompilerBase):
    """Compiler that defines the pass pipeline used for the CUDA target."""

    def define_pipelines(self):
        """Return a one-element list holding the finalized 'cuda' pipeline.

        The pipeline is the default untyped passes, then the default typed
        passes, then the CUDA lowering passes.
        """
        pm = PassManager('cuda')

        # Each builder is called with the compiler state, in this order, and
        # its passes are appended to the combined pipeline.
        stage_builders = (
            DefaultPassBuilder.define_untyped_pipeline,
            DefaultPassBuilder.define_typed_pipeline,
            self.define_cuda_lowering_pipeline,
        )
        for build_stage in stage_builders:
            pm.passes.extend(build_stage(self.state).passes)

        pm.finalize()
        return [pm]

    def define_cuda_lowering_pipeline(self, state):
        """Return the finalized 'cuda_lowering' PassManager."""
        pm = PassManager('cuda_lowering')

        lowering_steps = (
            # legalise
            (IRLegalization, "ensure IR is legal prior to lowering"),
            (AnnotateTypes, "annotate types"),
            # lower
            (CreateLibrary, "create library"),
            (NativeLowering, "native lowering"),
            (CUDABackend, "cuda backend"),
        )
        for pass_cls, description in lowering_steps:
            pm.add_pass(pass_cls, description)

        pm.finalize()
        return pm
|
||
|
|
||
|
|
||
|
@global_compiler_lock
def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
                 inline=False, fastmath=False, nvvm_options=None,
                 cc=None):
    """Run the CUDA compilation pipeline on ``pyfunc``.

    :param pyfunc: The Python function to compile.
    :param return_type: The return type passed on to the pipeline.
    :param args: The argument types passed on to the pipeline.
    :param debug: Turn on debug info and select the ``'python'`` error model
                  (otherwise the ``'numpy'`` error model is used).
    :param lineinfo: Turn on debug info restricted to debug directives.
    :param inline: Set the ``forceinline`` flag.
    :param fastmath: Set the ``fastmath`` flag.
    :param nvvm_options: Options stored in the ``nvvm_options`` flag when
                         truthy.
    :param cc: Compute capability stored in the ``compute_capability`` flag.
               Must not be ``None``.
    :return: The compile result, with its library finalized.
    :raises ValueError: If ``cc`` is ``None``.
    """
    if cc is None:
        raise ValueError('Compute Capability must be supplied')

    from .descriptor import cuda_target
    typing_context = cuda_target.typing_context
    target_context = cuda_target.target_context

    flags = CUDAFlags()
    # Do not compile (generate native code), just lower (to LLVM)
    flags.no_compile = True
    flags.no_cpython_wrapper = True
    flags.no_cfunc_wrapper = True

    # Both debug and lineinfo turn on debug information in the compiled code,
    # but we keep them separate arguments in case we later want to overload
    # some other behavior on the debug flag. In particular, -opt=3 is not
    # supported with debug enabled, and enabling only lineinfo should not
    # affect the error model.
    wants_debuginfo = debug or lineinfo
    if wants_debuginfo:
        flags.debuginfo = True

    if lineinfo:
        flags.dbg_directives_only = True

    flags.error_model = 'python' if debug else 'numpy'

    # The remaining optional flags are assigned only when requested, so they
    # are otherwise left untouched on the flags object.
    if inline:
        flags.forceinline = True
    if fastmath:
        flags.fastmath = True
    if nvvm_options:
        flags.nvvm_options = nvvm_options
    flags.compute_capability = cc

    # Run compilation pipeline
    from numba.core.target_extension import target_override
    with target_override('cuda'):
        cres = compiler.compile_extra(
            typingctx=typing_context,
            targetctx=target_context,
            func=pyfunc,
            args=args,
            return_type=return_type,
            flags=flags,
            locals={},
            pipeline_class=CUDACompiler,
        )

    cres.library.finalize()

    return cres
|
||
|
|
||
|
|
||
|
def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
                       nvvm_options):
    """
    Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.

    The C ABI wrapper is named ``wrapper_function_name``. It is placed in a
    new library that links to ``lib``; that new library is finalized and
    returned.
    """
    # The wrapper will be contained in a new library that links to the wrapped
    # function's library
    wrapper_library = lib.codegen.create_library(
        f'{lib.name}_function_',
        entry_name=wrapper_function_name,
        nvvm_options=nvvm_options,
    )
    wrapper_library.add_linking_library(lib)

    # Determine the wrapper (C ABI) and callee (Numba ABI) function types
    argtypes = fndesc.argtypes
    restype = fndesc.restype
    cabi_fnty = CUDACABICallConv(context).get_function_type(restype, argtypes)
    numba_fnty = context.call_conv.get_function_type(restype, argtypes)

    # Create a new module and declare the callee
    module = context.create_module("cuda.cabi.wrapper")
    callee = ir.Function(module, numba_fnty, fndesc.llvm_func_name)

    # Define the wrapper - populate it with a call to the callee and return
    # its return value
    wrapper = ir.Function(module, cabi_fnty, wrapper_function_name)
    entry_block = wrapper.append_basic_block('')
    builder = ir.IRBuilder(entry_block)

    packer = context.get_arg_packer(argtypes)
    call_args = packer.from_arguments(builder, wrapper.args)
    # We get (status, return_value), but we ignore the status since we
    # can't propagate it through the C ABI anyway
    _status, result = context.call_conv.call_function(
        builder, callee, restype, argtypes, call_args)
    builder.ret(result)

    wrapper_library.add_ir_module(module)
    wrapper_library.finalize()
    return wrapper_library
|
||
|
|
||
|
|
||
|
@global_compiler_lock
def compile_ptx(pyfunc, sig, debug=False, lineinfo=False, device=False,
                fastmath=False, cc=None, opt=True, abi="numba", abi_info=None):
    """Compile a Python function to PTX for a given set of argument types.

    :param pyfunc: The Python function to compile.
    :param sig: The signature representing the function's input and output
                types.
    :param debug: Whether to include debug info in the generated PTX.
    :type debug: bool
    :param lineinfo: Whether to include a line mapping from the generated PTX
                     to the source code. Usually this is used with optimized
                     code (since debug mode would automatically include this),
                     so we want debug info in the LLVM but only the line
                     mapping in the final PTX.
    :type lineinfo: bool
    :param device: Whether to compile a device function. Defaults to ``False``,
                   to compile global kernel functions.
    :type device: bool
    :param fastmath: Whether to enable fast math flags (ftz=1, prec_sqrt=0,
                     prec_div=0, and fma=1)
    :type fastmath: bool
    :param cc: Compute capability to compile for, as a tuple
               ``(MAJOR, MINOR)``. Defaults to
               ``config.CUDA_DEFAULT_PTX_CC``.
    :type cc: tuple
    :param opt: Enable optimizations. Defaults to ``True``.
    :type opt: bool
    :param abi: The ABI for a compiled function - either ``"numba"`` or
                ``"c"``. Note that the Numba ABI is not considered stable.
                The C ABI is only supported for device functions at present.
    :type abi: str
    :param abi_info: A dict of ABI-specific options. The ``"c"`` ABI supports
                     one option, ``"abi_name"``, for providing the wrapper
                     function's name. The ``"numba"`` ABI has no options.
    :type abi_info: dict
    :return: (ptx, resty): The PTX code and inferred return type
    :rtype: tuple
    :raises NotImplementedError: If ``abi`` is not ``"numba"`` or ``"c"``, or
                                 if the C ABI is requested for a kernel.
    :raises TypeError: If a kernel has a return type other than void.
    """
    if abi not in ("numba", "c"):
        raise NotImplementedError(f'Unsupported ABI: {abi}')

    if abi == 'c' and not device:
        raise NotImplementedError('The C ABI is not supported for kernels')

    if debug and opt:
        msg = ("debug=True with opt=True (the default) "
               "is not supported by CUDA. This may result in a crash"
               " - set debug=False or opt=False.")
        warn(NumbaInvalidConfigWarning(msg))

    if not abi_info:
        abi_info = {}

    nvvm_options = dict(fastmath=fastmath, opt=3 if opt else 0)

    args, return_type = sigutils.normalize_signature(sig)

    if not cc:
        cc = config.CUDA_DEFAULT_PTX_CC

    cres = compile_cuda(pyfunc, return_type, args, debug=debug,
                        lineinfo=lineinfo, fastmath=fastmath,
                        nvvm_options=nvvm_options, cc=cc)
    resty = cres.signature.return_type

    if resty and not device and resty != types.void:
        raise TypeError("CUDA kernel must have void return type.")

    tgt = cres.target_context

    if not device:
        # Kernel: only the library returned by kernel preparation is needed.
        code = pyfunc.__code__
        lib, _ = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
                                         lineinfo, nvvm_options,
                                         code.co_filename,
                                         code.co_firstlineno)
    elif abi == "c":
        # Device function with the C ABI: emit PTX for a wrapper library.
        wrapper_name = abi_info.get('abi_name', pyfunc.__name__)
        lib = cabi_wrap_function(tgt, cres.library, cres.fndesc,
                                 wrapper_name, nvvm_options)
    else:
        # Device function with the Numba ABI: use the compiled library as is.
        lib = cres.library

    return lib.get_asm_str(cc=cc), resty
|
||
|
|
||
|
|
||
|
def compile_ptx_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
                                   device=False, fastmath=False, opt=True,
                                   abi="numba", abi_info=None):
    """Compile a Python function to PTX for a given set of argument types for
    the current device's compute capability. This calls :func:`compile_ptx`
    with an appropriate ``cc`` value for the current device."""
    current_cc = get_current_device().compute_capability
    return compile_ptx(
        pyfunc,
        sig,
        debug=debug,
        lineinfo=lineinfo,
        device=device,
        fastmath=fastmath,
        cc=current_cc,
        opt=opt,
        abi=abi,
        abi_info=abi_info,
    )
|
||
|
|
||
|
|
||
|
def declare_device_function(name, restype, argtypes):
    """Declare an external device function and return its template's key.

    The key is the ``key`` attribute of the template produced by
    ``declare_device_function_template`` for the same arguments.
    """
    template = declare_device_function_template(name, restype, argtypes)
    return template.key
|
||
|
|
||
|
|
||
|
def declare_device_function_template(name, restype, argtypes):
    """Register an external device function and return its typing template.

    An ``ExternFunction`` carrying ``name`` and the signature
    ``restype(*argtypes)`` is inserted as a user function into the CUDA
    typing context (with a ``ConcreteTemplate`` subclass) and into the CUDA
    target context (with an ``ExternalFunctionDescriptor``).

    :return: The ``ConcreteTemplate`` subclass; its ``key`` is the
             ``ExternFunction`` instance.
    """
    from .descriptor import cuda_target
    typing_context = cuda_target.typing_context
    target_context = cuda_target.target_context

    signature = typing.signature(restype, *argtypes)
    extern = ExternFunction(name, signature)

    class device_function_template(ConcreteTemplate):
        key = extern
        cases = [signature]

    descriptor = funcdesc.ExternalFunctionDescriptor(
        name=name, restype=restype, argtypes=argtypes)
    typing_context.insert_user_function(extern, device_function_template)
    target_context.insert_user_function(extern, descriptor)

    return device_function_template
|
||
|
|
||
|
|
||
|
class ExternFunction:
    """Holder for an external function's name and signature."""

    def __init__(self, name, sig):
        # Both values are stored as given, without validation.
        self.name, self.sig = name, sig
|