240 lines
8.9 KiB
Python
240 lines
8.9 KiB
Python
|
from math import sqrt
|
||
|
from numba import cuda, float32, int16, int32, int64, uint32, void
|
||
|
from numba.cuda import compile_ptx, compile_ptx_for_current_device
|
||
|
|
||
|
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
|
||
|
|
||
|
|
||
|
# Module-scope test function: ensures the C ABI name is resolved correctly
# whether a function lives at module or local scope.
def f_module(x, y):
    """Return the sum of *x* and *y*."""
    total = x + y
    return total
|
||
|
|
||
|
|
||
|
@skip_on_cudasim('Compilation unsupported in the simulator')
class TestCompileToPTX(unittest.TestCase):
    """Tests for ``compile_ptx``: kernel vs. device-function output, fastmath
    code generation, debug / line info emission, and C ABI handling."""

    def test_global_kernel(self):
        def f(r, x, y):
            i = cuda.grid(1)
            if i < len(r):
                r[i] = x[i] + y[i]

        args = (float32[:], float32[:], float32[:])
        ptx, resty = compile_ptx(f, args)

        # Kernels should not have a func_retval parameter
        self.assertNotIn('func_retval', ptx)
        # .visible .func is used to denote a device function
        self.assertNotIn('.visible .func', ptx)
        # .visible .entry would denote the presence of a global function
        self.assertIn('.visible .entry', ptx)
        # Return type for kernels should always be void
        self.assertEqual(resty, void)

    def test_device_function(self):
        def add(x, y):
            return x + y

        args = (float32, float32)
        ptx, resty = compile_ptx(add, args, device=True)

        # Device functions take a func_retval parameter for storing the
        # returned value in by reference
        self.assertIn('func_retval', ptx)
        # .visible .func is used to denote a device function
        self.assertIn('.visible .func', ptx)
        # .visible .entry would denote the presence of a global function
        self.assertNotIn('.visible .entry', ptx)
        # Inferred return type as expected?
        self.assertEqual(resty, float32)

        # Check that function's output matches signature
        sig_int32 = int32(int32, int32)
        ptx, resty = compile_ptx(add, sig_int32, device=True)
        self.assertEqual(resty, int32)

        sig_int16 = int16(int16, int16)
        ptx, resty = compile_ptx(add, sig_int16, device=True)
        self.assertEqual(resty, int16)
        # Using string as signature
        sig_string = "uint32(uint32, uint32)"
        ptx, resty = compile_ptx(add, sig_string, device=True)
        self.assertEqual(resty, uint32)

    def test_fastmath(self):
        def f(x, y, z, d):
            return sqrt((x * y + z) / d)

        args = (float32, float32, float32, float32)
        ptx, resty = compile_ptx(f, args, device=True)

        # Without fastmath, fma contraction is enabled by default, but ftz and
        # approximate div / sqrt is not.
        self.assertIn('fma.rn.f32', ptx)
        self.assertIn('div.rn.f32', ptx)
        self.assertIn('sqrt.rn.f32', ptx)

        ptx, resty = compile_ptx(f, args, device=True, fastmath=True)

        # With fastmath, ftz and approximate div / sqrt are enabled
        self.assertIn('fma.rn.ftz.f32', ptx)
        self.assertIn('div.approx.ftz.f32', ptx)
        self.assertIn('sqrt.approx.ftz.f32', ptx)

    def check_debug_info(self, ptx):
        """Assert that *ptx* carries a debug_info section and a .file
        directive naming this module's source file."""
        # A debug_info section should exist in the PTX. Whitespace varies
        # between CUDA toolkit versions.
        self.assertRegex(ptx, r'\.section\s+\.debug_info')
        # A .file directive should be produced and include the name of the
        # source. The path and whitespace may vary, so we accept anything
        # ending in the filename of this module.
        self.assertRegex(ptx, r'\.file.*test_compiler.py"')

    def test_device_function_with_debug(self):
        # See Issue #6719 - this ensures that compilation with debug succeeds
        # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
        # IR version metadata was not added when compiling device functions,
        # and NVVM assumed DBG version 1.0 if not specified, which is
        # incompatible with the 3.0 IR we use. This was specified only for
        # kernels.
        def f():
            pass

        ptx, resty = compile_ptx(f, (), device=True, debug=True)
        self.check_debug_info(ptx)

    def test_kernel_with_debug(self):
        # Inspired by (but not originally affected by) Issue #6719
        def f():
            pass

        ptx, resty = compile_ptx(f, (), debug=True)
        self.check_debug_info(ptx)

    def check_line_info(self, ptx):
        """Assert that *ptx* carries a .file directive naming this module's
        source file (emitted when lineinfo is requested)."""
        # A .file directive should be produced and include the name of the
        # source. The path and whitespace may vary, so we accept anything
        # ending in the filename of this module.
        self.assertRegex(ptx, r'\.file.*test_compiler.py"')

    def test_device_function_with_line_info(self):
        def f():
            pass

        ptx, resty = compile_ptx(f, (), device=True, lineinfo=True)
        self.check_line_info(ptx)

    def test_kernel_with_line_info(self):
        def f():
            pass

        ptx, resty = compile_ptx(f, (), lineinfo=True)
        self.check_line_info(ptx)

    def test_non_void_return_type(self):
        def f(x, y):
            return x[0] + y[0]

        with self.assertRaisesRegex(TypeError, 'must have void return type'):
            compile_ptx(f, (uint32[::1], uint32[::1]))

    def test_c_abi_disallowed_for_kernel(self):
        def f(x, y):
            return x + y

        with self.assertRaisesRegex(NotImplementedError,
                                    "The C ABI is not supported for kernels"):
            compile_ptx(f, (int32, int32), abi="c")

    def test_unsupported_abi(self):
        def f(x, y):
            return x + y

        with self.assertRaisesRegex(NotImplementedError,
                                    "Unsupported ABI: fastcall"):
            compile_ptx(f, (int32, int32), abi="fastcall")

    def test_c_abi_device_function(self):
        def f(x, y):
            return x + y

        ptx, resty = compile_ptx(f, int32(int32, int32), device=True, abi="c")
        # There should be no more than two parameters.
        # BUGFIX: the arguments were previously reversed -
        # assertNotIn(ptx, "param_2") - which tested whether the whole PTX
        # text was a substring of "param_2" and therefore always passed.
        # The correct order is assertNotIn(member, container).
        self.assertNotIn("param_2", ptx)

        # The function name should match the Python function name (not the
        # qualname, which includes additional info), and its return value
        # should be 32 bits
        self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
                              r"func_retval0\)\s+f\(")

        # If we compile for 64-bit integers, the return type should be 64 bits
        # wide
        ptx, resty = compile_ptx(f, int64(int64, int64), device=True, abi="c")
        self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b64")

    def test_c_abi_device_function_module_scope(self):
        ptx, resty = compile_ptx(f_module, int32(int32, int32), device=True,
                                 abi="c")

        # The function name should match the Python function name, and its
        # return value should be 32 bits
        self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
                              r"func_retval0\)\s+f_module\(")

    def test_c_abi_with_abi_name(self):
        abi_info = {'abi_name': '_Z4funcii'}
        ptx, resty = compile_ptx(f_module, int32(int32, int32), device=True,
                                 abi="c", abi_info=abi_info)

        # The function name should match the one given in the ABI info, and its
        # return value should be 32 bits
        self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
                              r"func_retval0\)\s+_Z4funcii\(")
|
||
|
|
||
|
|
||
|
@skip_on_cudasim('Compilation unsupported in the simulator')
class TestCompileToPTXForCurrentDevice(CUDATestCase):
    """Checks that compile_ptx_for_current_device targets the running GPU."""

    def test_compile_ptx_for_current_device(self):
        def add(x, y):
            return x + y

        ptx, resty = compile_ptx_for_current_device(
            add, (float32, float32), device=True)

        # The generated PTX must target the current device's compute
        # capability, or the closest capability supported by the installed
        # toolkit.
        device_cc = cuda.get_current_device().compute_capability
        major, minor = cuda.cudadrv.nvvm.find_closest_arch(device_cc)
        self.assertIn(f'.target sm_{major}{minor}', ptx)
|
||
|
|
||
|
|
||
|
@skip_on_cudasim('Compilation unsupported in the simulator')
class TestCompileOnlyTests(unittest.TestCase):
    '''For tests where we can only check correctness by examining the compiler
    output rather than observing the effects of execution.'''

    def test_nanosleep(self):
        def use_nanosleep(x):
            # Sleep for a constant time
            cuda.nanosleep(32)
            # Sleep for a variable time
            cuda.nanosleep(x)

        ptx, resty = compile_ptx(use_nanosleep, (uint32,), cc=(7, 0))

        # One nanosleep instruction is expected per call site above.
        expected = 2
        nanosleep_count = sum(
            1 for line in ptx.split('\n') if 'nanosleep.u32' in line)

        self.assertEqual(expected, nanosleep_count,
                         (f'Got {nanosleep_count} nanosleep instructions, '
                          f'expected {expected}'))
|
||
|
|
||
|
|
||
|
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
|