import itertools
import numpy as np
import operator
import re

from numba import cuda, int64
from numba.cuda import compile_ptx
from numba.core.errors import TypingError
from numba.core.types import f2
from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim,
                                skip_unless_cc_53)


def simple_threadidx(ary):
    i = cuda.threadIdx.x
    ary[0] = i


def fill_threadidx(ary):
    i = cuda.threadIdx.x
    ary[i] = i


def fill3d_threadidx(ary):
    i = cuda.threadIdx.x
    j = cuda.threadIdx.y
    k = cuda.threadIdx.z

    ary[i, j, k] = (i + 1) * (j + 1) * (k + 1)


def simple_grid1d(ary):
    i = cuda.grid(1)
    ary[i] = i


def simple_grid2d(ary):
    i, j = cuda.grid(2)
    ary[i, j] = i + j


def simple_gridsize1d(ary):
    i = cuda.grid(1)
    x = cuda.gridsize(1)
    if i == 0:
        ary[0] = x


def simple_gridsize2d(ary):
    i, j = cuda.grid(2)
    x, y = cuda.gridsize(2)
    if i == 0 and j == 0:
        ary[0] = x
        ary[1] = y


def intrinsic_forloop_step(c):
    startX, startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x
    gridY = cuda.gridDim.y * cuda.blockDim.y
    height, width = c.shape

    for x in range(startX, width, gridX):
        for y in range(startY, height, gridY):
            c[y, x] = x + y


def simple_popc(ary, c):
    ary[0] = cuda.popc(c)


def simple_fma(ary, a, b, c):
    ary[0] = cuda.fma(a, b, c)


def simple_hadd(ary, a, b):
    ary[0] = cuda.fp16.hadd(a[0], b[0])


def simple_hadd_scalar(ary, a, b):
    ary[0] = cuda.fp16.hadd(a, b)


def simple_hfma(ary, a, b, c):
    ary[0] = cuda.fp16.hfma(a[0], b[0], c[0])


def simple_hfma_scalar(ary, a, b, c):
    ary[0] = cuda.fp16.hfma(a, b, c)


def simple_hsub(ary, a, b):
    ary[0] = cuda.fp16.hsub(a[0], b[0])


def simple_hsub_scalar(ary, a, b):
    ary[0] = cuda.fp16.hsub(a, b)


def simple_hmul(ary, a, b):
    ary[0] = cuda.fp16.hmul(a[0], b[0])


def simple_hmul_scalar(ary, a, b):
    ary[0] = cuda.fp16.hmul(a, b)


def simple_hdiv_scalar(ary, a, b):
    ary[0] = cuda.fp16.hdiv(a, b)


def simple_hdiv_kernel(ary, array_a, array_b):
    i = cuda.grid(1)
    if i < ary.size:
        a = array_a[i]
        b = array_b[i]
        ary[i] = cuda.fp16.hdiv(a, b)


def simple_hneg(ary, a):
    ary[0] = cuda.fp16.hneg(a[0])


def simple_hneg_scalar(ary, a):
    ary[0] = cuda.fp16.hneg(a)


def simple_habs(ary, a):
    ary[0] = cuda.fp16.habs(a[0])


def simple_habs_scalar(ary, a):
    ary[0] = cuda.fp16.habs(a)


def simple_heq_scalar(ary, a, b):
    ary[0] = cuda.fp16.heq(a, b)


def simple_hne_scalar(ary, a, b):
    ary[0] = cuda.fp16.hne(a, b)


def simple_hge_scalar(ary, a, b):
    ary[0] = cuda.fp16.hge(a, b)


def simple_hgt_scalar(ary, a, b):
    ary[0] = cuda.fp16.hgt(a, b)


def simple_hle_scalar(ary, a, b):
    ary[0] = cuda.fp16.hle(a, b)


def simple_hlt_scalar(ary, a, b):
    ary[0] = cuda.fp16.hlt(a, b)


@cuda.jit(device=True)
def hlt_func_1(x, y):
    return cuda.fp16.hlt(x, y)


@cuda.jit(device=True)
def hlt_func_2(x, y):
    return cuda.fp16.hlt(x, y)


def test_multiple_hcmp_1(r, a, b, c):
    # float16 predicates used in two separate functions
    r[0] = hlt_func_1(a, b) and hlt_func_2(b, c)


def test_multiple_hcmp_2(r, a, b, c):
    # The same float16 predicate used in the caller and callee
    r[0] = hlt_func_1(a, b) and cuda.fp16.hlt(b, c)


def test_multiple_hcmp_3(r, a, b, c):
    # Different float16 predicates used in the caller and callee
    r[0] = hlt_func_1(a, b) and cuda.fp16.hge(c, b)


def test_multiple_hcmp_4(r, a, b, c):
    # The same float16 predicates used twice in a function
    r[0] = cuda.fp16.hlt(a, b) and cuda.fp16.hlt(b, c)


def test_multiple_hcmp_5(r, a, b, c):
    # Different float16 predicates used in a function
    r[0] = cuda.fp16.hlt(a, b) and cuda.fp16.hge(c, b)


def simple_hmax_scalar(ary, a, b):
    ary[0] = cuda.fp16.hmax(a, b)


def simple_hmin_scalar(ary, a, b):
    ary[0] = cuda.fp16.hmin(a, b)


def simple_hsin(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hsin(x[i])


def simple_hcos(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hcos(x[i])


def simple_hlog(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hlog(x[i])


def simple_hlog2(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hlog2(x[i])


def simple_hlog10(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hlog10(x[i])


def simple_hexp(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hexp(x[i])


def simple_hexp2(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hexp2(x[i])


def simple_hsqrt(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hsqrt(x[i])


def simple_hrsqrt(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hrsqrt(x[i])


def numpy_hrsqrt(x, dtype):
    # Reference for hrsqrt. The unused dtype argument keeps the call
    # signature compatible with the NumPy ufuncs it is zipped with in
    # test_fp16_intrinsics_common.
    return x ** -0.5


def simple_hceil(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hceil(x[i])


def simple_hfloor(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hfloor(x[i])


def simple_hrcp(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hrcp(x[i])


def simple_htrunc(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.htrunc(x[i])


def simple_hrint(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = cuda.fp16.hrint(x[i])


def simple_cbrt(ary, a):
    ary[0] = cuda.cbrt(a)


def simple_brev(ary, c):
    ary[0] = cuda.brev(c)


def simple_clz(ary, c):
    ary[0] = cuda.clz(c)


def simple_ffs(ary, c):
    ary[0] = cuda.ffs(c)


def simple_round(ary, c):
    ary[0] = round(c)


def simple_round_to(ary, c, ndigits):
    ary[0] = round(c, ndigits)


def branching_with_ifs(a, b, c):
    i = cuda.grid(1)

    if a[i] > 4:
        if b % 2 == 0:
            a[i] = c[i]
        else:
            a[i] = 13
    else:
        a[i] = 3


def branching_with_selps(a, b, c):
    i = cuda.grid(1)

    inner = cuda.selp(b % 2 == 0, c[i], 13)
    a[i] = cuda.selp(a[i] > 4, inner, 3)


def simple_laneid(ary):
    i = cuda.grid(1)
    ary[i] = cuda.laneid


def simple_warpsize(ary):
    ary[0] = cuda.warpsize


def nonliteral_grid(x):
    cuda.grid(x)


def nonliteral_gridsize(x):
    cuda.gridsize(x)


class TestCudaIntrinsic(CUDATestCase):
    def setUp(self):
        super().setUp()
        np.random.seed(0)

    def test_simple_threadidx(self):
        compiled = cuda.jit("void(int32[:])")(simple_threadidx)
        ary = np.ones(1, dtype=np.int32)
        compiled[1, 1](ary)
        self.assertTrue(ary[0] == 0)

    def test_fill_threadidx(self):
        compiled = cuda.jit("void(int32[:])")(fill_threadidx)
        N = 10
        ary = np.ones(N, dtype=np.int32)
        exp = np.arange(N, dtype=np.int32)
        compiled[1, N](ary)
        self.assertTrue(np.all(ary == exp))

    def test_fill3d_threadidx(self):
        X, Y, Z = 4, 5, 6

        def c_contiguous():
            compiled = cuda.jit("void(int32[:,:,::1])")(fill3d_threadidx)
            ary = np.zeros((X, Y, Z), dtype=np.int32)
            compiled[1, (X, Y, Z)](ary)
            return ary

        def f_contiguous():
            compiled = cuda.jit("void(int32[::1,:,:])")(fill3d_threadidx)
            ary = np.asfortranarray(np.zeros((X, Y, Z), dtype=np.int32))
            compiled[1, (X, Y, Z)](ary)
            return ary

        c_res = c_contiguous()
        f_res = f_contiguous()
        self.assertTrue(np.all(c_res == f_res))

    @skip_on_cudasim('Cudasim does not check types')
    def test_nonliteral_grid_error(self):
        with self.assertRaisesRegex(TypingError, 'RequireLiteralValue'):
            cuda.jit('void(int32)')(nonliteral_grid)

    @skip_on_cudasim('Cudasim does not check types')
    def test_nonliteral_gridsize_error(self):
        with self.assertRaisesRegex(TypingError, 'RequireLiteralValue'):
            cuda.jit('void(int32)')(nonliteral_gridsize)

    def test_simple_grid1d(self):
        compiled = cuda.jit("void(int32[::1])")(simple_grid1d)
        ntid, nctaid = 3, 7
        nelem = ntid * nctaid
        ary = np.empty(nelem, dtype=np.int32)
        compiled[nctaid, ntid](ary)
        self.assertTrue(np.all(ary == np.arange(nelem)))
    def test_simple_grid2d(self):
        compiled = cuda.jit("void(int32[:,::1])")(simple_grid2d)
        ntid = (4, 3)
        nctaid = (5, 6)
        shape = (ntid[0] * nctaid[0], ntid[1] * nctaid[1])
        ary = np.empty(shape, dtype=np.int32)
        exp = ary.copy()
        compiled[nctaid, ntid](ary)

        for i in range(ary.shape[0]):
            for j in range(ary.shape[1]):
                exp[i, j] = i + j

        self.assertTrue(np.all(ary == exp))

    def test_simple_gridsize1d(self):
        compiled = cuda.jit("void(int32[::1])")(simple_gridsize1d)
        ntid, nctaid = 3, 7
        ary = np.zeros(1, dtype=np.int32)
        compiled[nctaid, ntid](ary)
        self.assertEqual(ary[0], nctaid * ntid)

    @skip_on_cudasim('Requires too many threads')
    def test_issue_9229(self):
        # Ensure that grid and grid size are correct - #9229 showed that they
        # overflowed an int32.
        @cuda.jit
        def f(grid_error, gridsize_error):
            i1 = cuda.grid(1)
            i2 = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
            gs1 = cuda.gridsize(1)
            gs2 = cuda.blockDim.x * cuda.gridDim.x
            if i1 != i2:
                grid_error[0] = 1
            if gs1 != gs2:
                gridsize_error[0] = 1

        grid_error = np.zeros(1, dtype=np.uint64)
        gridsize_error = np.zeros(1, dtype=np.uint64)

        # A large enough grid for thread IDs to overflow an int32
        # (22121216 * 256 = 5663031296, which is greater than 2 ** 32)
        f[22121216, 256](grid_error, gridsize_error)

        self.assertEqual(grid_error[0], 0)
        self.assertEqual(gridsize_error[0], 0)

    @skip_on_cudasim('Tests PTX emission')
    def test_selp(self):
        sig = (int64[:], int64, int64[:])
        cu_branching_with_ifs = cuda.jit(sig)(branching_with_ifs)
        cu_branching_with_selps = cuda.jit(sig)(branching_with_selps)

        n = 32
        b = 6
        c = np.full(shape=32, fill_value=17, dtype=np.int64)
        expected = c.copy()
        expected[:5] = 3

        a = np.arange(n, dtype=np.int64)
        cu_branching_with_ifs[n, 1](a, b, c)
        ptx = cu_branching_with_ifs.inspect_asm(sig)
        self.assertEqual(2, len(re.findall(r'\s+bra\s+', ptx)))
        np.testing.assert_array_equal(a, expected, err_msg='branching')

        a = np.arange(n, dtype=np.int64)
        cu_branching_with_selps[n, 1](a, b, c)
        ptx = cu_branching_with_selps.inspect_asm(sig)
        self.assertEqual(0, len(re.findall(r'\s+bra\s+', ptx)))
        np.testing.assert_array_equal(a, expected, err_msg='selp')

    def test_simple_gridsize2d(self):
        compiled = cuda.jit("void(int32[::1])")(simple_gridsize2d)
        ntid = (4, 3)
        nctaid = (5, 6)
        ary = np.zeros(2, dtype=np.int32)
        compiled[nctaid, ntid](ary)

        self.assertEqual(ary[0], nctaid[0] * ntid[0])
        self.assertEqual(ary[1], nctaid[1] * ntid[1])

    def test_intrinsic_forloop_step(self):
        compiled = cuda.jit("void(int32[:,::1])")(intrinsic_forloop_step)
        ntid = (4, 3)
        nctaid = (5, 6)
        shape = (ntid[0] * nctaid[0], ntid[1] * nctaid[1])
        ary = np.empty(shape, dtype=np.int32)

        compiled[nctaid, ntid](ary)

        gridX, gridY = shape
        height, width = ary.shape
        for i, j in zip(range(ntid[0]), range(ntid[1])):
            startX, startY = gridX + i, gridY + j
            for x in range(startX, width, gridX):
                for y in range(startY, height, gridY):
                    self.assertTrue(ary[y, x] == x + y,
                                    (ary[y, x], x + y))

    def test_3dgrid(self):
        @cuda.jit
        def foo(out):
            x, y, z = cuda.grid(3)
            a, b, c = cuda.gridsize(3)
            out[x, y, z] = a * b * c

        arr = np.zeros(9 ** 3, dtype=np.int32).reshape(9, 9, 9)
        foo[(3, 3, 3), (3, 3, 3)](arr)

        np.testing.assert_equal(arr, 9 ** 3)

    def test_3dgrid_2(self):
        @cuda.jit
        def foo(out):
            x, y, z = cuda.grid(3)
            a, b, c = cuda.gridsize(3)
            grid_is_right = (
                x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x and
                y == cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y and
                z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z
            )
            gridsize_is_right = (a == cuda.blockDim.x * cuda.gridDim.x and
                                 b == cuda.blockDim.y * cuda.gridDim.y and
                                 c == cuda.blockDim.z * cuda.gridDim.z)
            out[x, y, z] = grid_is_right and gridsize_is_right

        x, y, z = (4 * 3, 3 * 2, 2 * 4)
        arr = np.zeros((x * y * z), dtype=np.bool_).reshape(x, y, z)
        foo[(4, 3, 2), (3, 2, 4)](arr)

        self.assertTrue(np.all(arr))

    def test_popc_u4(self):
        compiled = cuda.jit("void(int32[:], uint32)")(simple_popc)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0xF0)
        self.assertEqual(ary[0], 4)

    def test_popc_u8(self):
        compiled = cuda.jit("void(int32[:], uint64)")(simple_popc)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0xF00000000000)
        self.assertEqual(ary[0], 4)

    def test_fma_f4(self):
        compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma)
        ary = np.zeros(1, dtype=np.float32)
        compiled[1, 1](ary, 2., 3., 4.)
        np.testing.assert_allclose(ary[0], 2 * 3 + 4)

    def test_fma_f8(self):
        compiled = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma)
        ary = np.zeros(1, dtype=np.float64)
        compiled[1, 1](ary, 2., 3., 4.)
        np.testing.assert_allclose(ary[0], 2 * 3 + 4)

    @skip_unless_cc_53
    def test_hadd(self):
        compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hadd)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.array([3.], dtype=np.float16)
        arg2 = np.array([4.], dtype=np.float16)
        compiled[1, 1](ary, arg1, arg2)
        np.testing.assert_allclose(ary[0], arg1 + arg2)

    @skip_unless_cc_53
    def test_hadd_scalar(self):
        compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hadd_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(3.1415926)
        arg2 = np.float16(3.)
        compiled[1, 1](ary, arg1, arg2)
        ref = arg1 + arg2
        np.testing.assert_allclose(ary[0], ref)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_hadd_ptx(self):
        args = (f2[:], f2, f2)
        ptx, _ = compile_ptx(simple_hadd_scalar, args, cc=(5, 3))
        self.assertIn('add.f16', ptx)

    @skip_unless_cc_53
    def test_hfma(self):
        compiled = cuda.jit("void(f2[:], f2[:], f2[:], f2[:])")(simple_hfma)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.array([2.], dtype=np.float16)
        arg2 = np.array([3.], dtype=np.float16)
        arg3 = np.array([4.], dtype=np.float16)
        compiled[1, 1](ary, arg1, arg2, arg3)
        np.testing.assert_allclose(ary[0], arg1 * arg2 + arg3)

    @skip_unless_cc_53
    def test_hfma_scalar(self):
        compiled = cuda.jit("void(f2[:], f2, f2, f2)")(simple_hfma_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(2.)
        arg2 = np.float16(3.)
        arg3 = np.float16(4.)
        compiled[1, 1](ary, arg1, arg2, arg3)
        ref = arg1 * arg2 + arg3
        np.testing.assert_allclose(ary[0], ref)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_hfma_ptx(self):
        args = (f2[:], f2, f2, f2)
        ptx, _ = compile_ptx(simple_hfma_scalar, args, cc=(5, 3))
        self.assertIn('fma.rn.f16', ptx)

    @skip_unless_cc_53
    def test_hsub(self):
        compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hsub)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.array([3.], dtype=np.float16)
        arg2 = np.array([4.], dtype=np.float16)
        compiled[1, 1](ary, arg1, arg2)
        np.testing.assert_allclose(ary[0], arg1 - arg2)

    @skip_unless_cc_53
    def test_hsub_scalar(self):
        compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hsub_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(3.1415926)
        arg2 = np.float16(1.57)
        compiled[1, 1](ary, arg1, arg2)
        ref = arg1 - arg2
        np.testing.assert_allclose(ary[0], ref)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_hsub_ptx(self):
        args = (f2[:], f2, f2)
        ptx, _ = compile_ptx(simple_hsub_scalar, args, cc=(5, 3))
        self.assertIn('sub.f16', ptx)

    @skip_unless_cc_53
    def test_hmul(self):
        compiled = cuda.jit()(simple_hmul)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.array([3.], dtype=np.float16)
        arg2 = np.array([4.], dtype=np.float16)
        compiled[1, 1](ary, arg1, arg2)
        np.testing.assert_allclose(ary[0], arg1 * arg2)

    @skip_unless_cc_53
    def test_hmul_scalar(self):
        compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmul_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(3.1415926)
        arg2 = np.float16(1.57)
        compiled[1, 1](ary, arg1, arg2)
        ref = arg1 * arg2
        np.testing.assert_allclose(ary[0], ref)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_hmul_ptx(self):
        args = (f2[:], f2, f2)
        ptx, _ = compile_ptx(simple_hmul_scalar, args, cc=(5, 3))
        self.assertIn('mul.f16', ptx)

    @skip_unless_cc_53
    def test_hdiv_scalar(self):
        compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hdiv_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(3.1415926)
        arg2 = np.float16(1.57)
        compiled[1, 1](ary, arg1, arg2)
        ref = arg1 / arg2
        np.testing.assert_allclose(ary[0], ref)

    @skip_unless_cc_53
    def test_hdiv(self):
        compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hdiv_kernel)
        arry1 = np.random.randint(-65504, 65505, size=500).astype(np.float16)
        arry2 = np.random.randint(-65504, 65505, size=500).astype(np.float16)
        ary = np.zeros_like(arry1, dtype=np.float16)

        compiled.forall(ary.size)(ary, arry1, arry2)
        ref = arry1 / arry2
        np.testing.assert_allclose(ary, ref)

    @skip_unless_cc_53
    def test_hneg(self):
        compiled = cuda.jit("void(f2[:], f2[:])")(simple_hneg)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.array([3.], dtype=np.float16)
        compiled[1, 1](ary, arg1)
        np.testing.assert_allclose(ary[0], -arg1)

    @skip_unless_cc_53
    def test_hneg_scalar(self):
        compiled = cuda.jit("void(f2[:], f2)")(simple_hneg_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(3.1415926)
        compiled[1, 1](ary, arg1)
        ref = -arg1
        np.testing.assert_allclose(ary[0], ref)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_hneg_ptx(self):
        args = (f2[:], f2)
        ptx, _ = compile_ptx(simple_hneg_scalar, args, cc=(5, 3))
        self.assertIn('neg.f16', ptx)

    @skip_unless_cc_53
    def test_habs(self):
        compiled = cuda.jit()(simple_habs)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.array([-3.], dtype=np.float16)
        compiled[1, 1](ary, arg1)
        np.testing.assert_allclose(ary[0], abs(arg1))

    @skip_unless_cc_53
    def test_habs_scalar(self):
        compiled = cuda.jit("void(f2[:], f2)")(simple_habs_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(-3.1415926)
        compiled[1, 1](ary, arg1)
        ref = abs(arg1)
        np.testing.assert_allclose(ary[0], ref)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_habs_ptx(self):
        args = (f2[:], f2)
        ptx, _ = compile_ptx(simple_habs_scalar, args, cc=(5, 3))
        self.assertIn('abs.f16', ptx)

    @skip_unless_cc_53
    def test_fp16_intrinsics_common(self):
        kernels = (simple_hsin, simple_hcos,
                   simple_hlog, simple_hlog2, simple_hlog10,
                   simple_hsqrt, simple_hceil, simple_hfloor,
                   simple_hrcp, simple_htrunc, simple_hrint,
                   simple_hrsqrt)
        exp_kernels = (simple_hexp, simple_hexp2)
        expected_functions = (np.sin, np.cos,
                              np.log, np.log2, np.log10,
                              np.sqrt, np.ceil, np.floor,
                              np.reciprocal, np.trunc, np.rint,
                              numpy_hrsqrt)
        expected_exp_functions = (np.exp, np.exp2)

        # Generate random data
        N = 32
        np.random.seed(1)
        x = np.random.randint(1, 65505, size=N).astype(np.float16)
        r = np.zeros_like(x)

        for kernel, fn in zip(kernels, expected_functions):
            with self.subTest(fn=fn):
                kernel = cuda.jit("void(f2[:], f2[:])")(kernel)
                kernel[1, N](r, x)
                expected = fn(x, dtype=np.float16)
                np.testing.assert_allclose(r, expected)

        x2 = np.random.randint(1, 10, size=N).astype(np.float16)
        for kernel, fn in zip(exp_kernels, expected_exp_functions):
            with self.subTest(fn=fn):
                kernel = cuda.jit("void(f2[:], f2[:])")(kernel)
                kernel[1, N](r, x2)
                expected = fn(x2, dtype=np.float16)
                np.testing.assert_allclose(r, expected)

    @skip_unless_cc_53
    def test_hexp10(self):
        @cuda.jit()
        def hexp10_vectors(r, x):
            i = cuda.grid(1)
            if i < len(r):
                r[i] = cuda.fp16.hexp10(x[i])

        # Generate random data
        N = 32
        np.random.seed(1)
        x = np.random.rand(N).astype(np.float16)
        r = np.zeros_like(x)

        # Run the kernel
        hexp10_vectors[1, N](r, x)

        np.testing.assert_allclose(r, 10 ** x)

    @skip_unless_cc_53
    def test_fp16_comparison(self):
        fns = (simple_heq_scalar, simple_hne_scalar,
               simple_hge_scalar, simple_hgt_scalar,
               simple_hle_scalar, simple_hlt_scalar)
        ops = (operator.eq, operator.ne,
               operator.ge, operator.gt,
               operator.le, operator.lt)

        for fn, op in zip(fns, ops):
            with self.subTest(op=op):
                kernel = cuda.jit("void(b1[:], f2, f2)")(fn)

                expected = np.zeros(1, dtype=np.bool_)
                got = np.zeros(1, dtype=np.bool_)
                arg2 = np.float16(2)
                arg3 = np.float16(3)
                arg4 = np.float16(4)

                # Check with equal arguments
                kernel[1, 1](got, arg3, arg3)
                expected = op(arg3, arg3)
                self.assertEqual(expected, got[0])

                # Check with LHS < RHS
                kernel[1, 1](got, arg3, arg4)
                expected = op(arg3, arg4)
                self.assertEqual(expected, got[0])

                # Check with LHS > RHS
                kernel[1, 1](got, arg3, arg2)
                expected = op(arg3, arg2)
                self.assertEqual(expected, got[0])

    @skip_unless_cc_53
    def test_multiple_float16_comparisons(self):
        functions = (test_multiple_hcmp_1,
                     test_multiple_hcmp_2,
                     test_multiple_hcmp_3,
                     test_multiple_hcmp_4,
                     test_multiple_hcmp_5)
        for fn in functions:
            with self.subTest(fn=fn):
                compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
                ary = np.zeros(1, dtype=np.bool_)
                arg1 = np.float16(2.)
                arg2 = np.float16(3.)
                arg3 = np.float16(4.)
                compiled[1, 1](ary, arg1, arg2, arg3)
                self.assertTrue(ary[0])

    @skip_unless_cc_53
    def test_hmax(self):
        compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmax_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(3.)
        arg2 = np.float16(4.)
        compiled[1, 1](ary, arg1, arg2)
        np.testing.assert_allclose(ary[0], arg2)
        arg1 = np.float16(5.)
        compiled[1, 1](ary, arg1, arg2)
        np.testing.assert_allclose(ary[0], arg1)

    @skip_unless_cc_53
    def test_hmin(self):
        compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmin_scalar)
        ary = np.zeros(1, dtype=np.float16)
        arg1 = np.float16(3.)
        arg2 = np.float16(4.)
        compiled[1, 1](ary, arg1, arg2)
        np.testing.assert_allclose(ary[0], arg1)
        arg1 = np.float16(5.)
        compiled[1, 1](ary, arg1, arg2)
        np.testing.assert_allclose(ary[0], arg2)

    def test_cbrt_f32(self):
        compiled = cuda.jit("void(float32[:], float32)")(simple_cbrt)
        ary = np.zeros(1, dtype=np.float32)
        cbrt_arg = 2.
        compiled[1, 1](ary, cbrt_arg)
        np.testing.assert_allclose(ary[0], cbrt_arg ** (1 / 3))

    def test_cbrt_f64(self):
        compiled = cuda.jit("void(float64[:], float64)")(simple_cbrt)
        ary = np.zeros(1, dtype=np.float64)
        cbrt_arg = 6.
        compiled[1, 1](ary, cbrt_arg)
        np.testing.assert_allclose(ary[0], cbrt_arg ** (1 / 3))

    def test_brev_u4(self):
        compiled = cuda.jit("void(uint32[:], uint32)")(simple_brev)
        ary = np.zeros(1, dtype=np.uint32)
        compiled[1, 1](ary, 0x000030F0)
        self.assertEqual(ary[0], 0x0F0C0000)

    @skip_on_cudasim('only get given a Python "int", assumes 32 bits')
    def test_brev_u8(self):
        compiled = cuda.jit("void(uint64[:], uint64)")(simple_brev)
        ary = np.zeros(1, dtype=np.uint64)
        compiled[1, 1](ary, 0x000030F0000030F0)
        self.assertEqual(ary[0], 0x0F0C00000F0C0000)

    def test_clz_i4(self):
        compiled = cuda.jit("void(int32[:], int32)")(simple_clz)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0x00100000)
        self.assertEqual(ary[0], 11)

    def test_clz_u4(self):
        """
        Although the CUDA Math API
        (http://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html)
        only says int32 & int64 arguments are supported in C code, the LLVM
        IR input supports i8, i16, i32 & i64 (LLVM doesn't have a concept of
        unsigned integers, just unsigned operations on integers).
        http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics
        """
        compiled = cuda.jit("void(int32[:], uint32)")(simple_clz)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0x00100000)
        self.assertEqual(ary[0], 11)

    def test_clz_i4_1s(self):
        compiled = cuda.jit("void(int32[:], int32)")(simple_clz)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0xFFFFFFFF)
        self.assertEqual(ary[0], 0)

    def test_clz_i4_0s(self):
        compiled = cuda.jit("void(int32[:], int32)")(simple_clz)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0x0)
        self.assertEqual(ary[0], 32, "CUDA semantics")

    @skip_on_cudasim('only get given a Python "int", assumes 32 bits')
    def test_clz_i8(self):
        compiled = cuda.jit("void(int32[:], int64)")(simple_clz)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0x000000000010000)
        self.assertEqual(ary[0], 47)

    def test_ffs_i4(self):
        compiled = cuda.jit("void(int32[:], int32)")(simple_ffs)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0x00100000)
        self.assertEqual(ary[0], 21)
        compiled[1, 1](ary, 0x80000000)
        self.assertEqual(ary[0], 32)

    def test_ffs_u4(self):
        compiled = cuda.jit("void(int32[:], uint32)")(simple_ffs)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0x00100000)
        self.assertEqual(ary[0], 21)
        compiled[1, 1](ary, 0x80000000)
        self.assertEqual(ary[0], 32)

    def test_ffs_i4_1s(self):
        compiled = cuda.jit("void(int32[:], int32)")(simple_ffs)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0xFFFFFFFF)
        self.assertEqual(ary[0], 1)

    def test_ffs_i4_0s(self):
        compiled = cuda.jit("void(int32[:], int32)")(simple_ffs)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0x0)
        self.assertEqual(ary[0], 0)

    @skip_on_cudasim('only get given a Python "int", assumes 32 bits')
    def test_ffs_i8(self):
        compiled = cuda.jit("void(int32[:], int64)")(simple_ffs)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary, 0x000000000010000)
        self.assertEqual(ary[0], 17)
        compiled[1, 1](ary, 0x100000000)
        self.assertEqual(ary[0], 33)

    def test_simple_laneid(self):
        compiled = cuda.jit("void(int32[:])")(simple_laneid)
        count = 2
        ary = np.zeros(count * 32, dtype=np.int32)
        exp = np.tile(np.arange(32, dtype=np.int32), count)
        compiled[1, count * 32](ary)
        self.assertTrue(np.all(ary == exp))

    def test_simple_warpsize(self):
        compiled = cuda.jit("void(int32[:])")(simple_warpsize)
        ary = np.zeros(1, dtype=np.int32)
        compiled[1, 1](ary)
        self.assertEqual(ary[0], 32, "CUDA semantics")

    def test_round_f4(self):
        compiled = cuda.jit("void(int64[:], float32)")(simple_round)
        ary = np.zeros(1, dtype=np.int64)

        for i in [-3.0, -2.5, -2.25, -1.5, 1.5, 2.25, 2.5, 2.75]:
            compiled[1, 1](ary, i)
            self.assertEqual(ary[0], round(i))

    def test_round_f8(self):
        compiled = cuda.jit("void(int64[:], float64)")(simple_round)
        ary = np.zeros(1, dtype=np.int64)

        for i in [-3.0, -2.5, -2.25, -1.5, 1.5, 2.25, 2.5, 2.75]:
            compiled[1, 1](ary, i)
            self.assertEqual(ary[0], round(i))

    def test_round_to_f4(self):
        compiled = cuda.jit("void(float32[:], float32, int32)")(
            simple_round_to)
        ary = np.zeros(1, dtype=np.float32)
        np.random.seed(123)
        vals = np.random.random(32).astype(np.float32)
        np.concatenate((vals, np.array([np.inf, -np.inf, np.nan])))
        digits = (
            # Common case branch of round_to_impl
            -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5,
            # The algorithm currently implemented can only round to 13 digits
            # with single precision. Note that this doesn't trigger the
            # "overflow safe" branch of the implementation, which can only be
            # hit when using double precision.
            13,
        )

        for val, ndigits in itertools.product(vals, digits):
            with self.subTest(val=val, ndigits=ndigits):
                compiled[1, 1](ary, val, ndigits)
                self.assertPreciseEqual(ary[0], round(val, ndigits),
                                        prec='single')

    # CPython on most platforms uses rounding based on dtoa.c, whereas the
    # CUDA round-to implementation uses CPython's fallback implementation,
    # which has slightly different behavior at the edges of the domain.
    # Since the CUDA simulator executes using CPython, we need to skip this
    # test when the simulator is active.
    @skip_on_cudasim('Overflow behavior differs on CPython')
    def test_round_to_f4_overflow(self):
        # Test that the input value is returned when y in round_ndigits
        # overflows.
        compiled = cuda.jit("void(float32[:], float32, int32)")(
            simple_round_to)
        ary = np.zeros(1, dtype=np.float32)
        val = np.finfo(np.float32).max
        # An unusually large number of digits is required to hit the "y
        # overflows" branch of the implementation because the typing results
        # in the computation of y as float64.
        ndigits = 300
        compiled[1, 1](ary, val, ndigits)
        self.assertEqual(ary[0], val)

    def test_round_to_f4_halfway(self):
        compiled = cuda.jit("void(float32[:], float32, int32)")(
            simple_round_to)
        ary = np.zeros(1, dtype=np.float32)
        # Value chosen to trigger the "round to even" branch of the
        # implementation
        val = 0.3425
        ndigits = 3
        compiled[1, 1](ary, val, ndigits)
        self.assertPreciseEqual(ary[0], round(val, ndigits), prec='single')

    def test_round_to_f8(self):
        compiled = cuda.jit("void(float64[:], float64, int32)")(
            simple_round_to)
        ary = np.zeros(1, dtype=np.float64)
        np.random.seed(123)
        vals = np.random.random(32)
        np.concatenate((vals, np.array([np.inf, -np.inf, np.nan])))
        digits = (-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5)

        for val, ndigits in itertools.product(vals, digits):
            with self.subTest(val=val, ndigits=ndigits):
                compiled[1, 1](ary, val, ndigits)
                self.assertPreciseEqual(ary[0], round(val, ndigits),
                                        prec='exact')

        # Trigger the "overflow safe" branch of the implementation
        val = 0.12345678987654321 * 10e-15
        ndigits = 23
        with self.subTest(val=val, ndigits=ndigits):
            compiled[1, 1](ary, val, ndigits)
            self.assertPreciseEqual(ary[0], round(val, ndigits),
                                    prec='double')

    # Skipped on cudasim for the same reasons as test_round_to_f4 above.
    @skip_on_cudasim('Overflow behavior differs on CPython')
    def test_round_to_f8_overflow(self):
        # Test that the input value is returned when y in round_ndigits
        # overflows.
        compiled = cuda.jit("void(float64[:], float64, int32)")(
            simple_round_to)
        ary = np.zeros(1, dtype=np.float64)
        val = np.finfo(np.float64).max
        # Unlike test_round_to_f4_overflow, a reasonable number of digits can
        # be used for this test to overflow y in round_ndigits.
        ndigits = 12
        compiled[1, 1](ary, val, ndigits)
        self.assertEqual(ary[0], val)

    def test_round_to_f8_halfway(self):
        compiled = cuda.jit("void(float64[:], float64, int32)")(
            simple_round_to)
        ary = np.zeros(1, dtype=np.float64)
        # Value chosen to trigger the "round to even" branch of the
        # implementation, with a value that is not exactly representable
        # with a float32, but only a float64.
        val = 0.5425
        ndigits = 3
        compiled[1, 1](ary, val, ndigits)
        self.assertPreciseEqual(ary[0], round(val, ndigits), prec='double')


if __name__ == '__main__':
    unittest.main()