import numpy as np from textwrap import dedent from numba import cuda, uint32, uint64, float32, float64 from numba.cuda.testing import unittest, CUDATestCase, cc_X_or_above from numba.core import config @cuda.jit(device=True) def atomic_cast_to_uint64(num): return uint64(num) @cuda.jit(device=True) def atomic_cast_to_int(num): return int(num) @cuda.jit(device=True) def atomic_cast_none(num): return num @cuda.jit(device=True) def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements, binop_func, cast_func, initializer, neg_idx): tid = cuda.threadIdx.x sm = cuda.shared.array(ary_nelements, ary_dtype) sm[tid] = initializer cuda.syncthreads() bin = cast_func(idx[tid] % ary_nelements) if neg_idx: bin = bin - ary_nelements binop_func(sm, bin, op2) cuda.syncthreads() ary[tid] = sm[tid] @cuda.jit(device=True) def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements, binop_func, cast_func): tid = cuda.threadIdx.x sm = cuda.shared.array(ary_nelements, ary_dtype) sm[tid] = ary[tid] cuda.syncthreads() bin = cast_func(idx[tid] % ary_nelements) binop_func(sm, bin, op2) cuda.syncthreads() ary[tid] = sm[tid] @cuda.jit(device=True) def atomic_binary_2dim_shared(ary, op2, ary_dtype, ary_shape, binop_func, y_cast_func, neg_idx): tx = cuda.threadIdx.x ty = cuda.threadIdx.y sm = cuda.shared.array(ary_shape, ary_dtype) sm[tx, ty] = ary[tx, ty] cuda.syncthreads() bin = (tx, y_cast_func(ty)) if neg_idx: bin = (bin[0] - ary_shape[0], bin[1] - ary_shape[1]) binop_func(sm, bin, op2) cuda.syncthreads() ary[tx, ty] = sm[tx, ty] @cuda.jit(device=True) def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx): tx = cuda.threadIdx.x ty = cuda.threadIdx.y bin = (tx, y_cast_func(ty)) if neg_idx: bin = (bin[0] - ary.shape[0], bin[1] - ary.shape[1]) binop_func(ary, bin, op2) @cuda.jit(device=True) def atomic_binary_1dim_global(ary, idx, ary_nelements, op2, binop_func, neg_idx): tid = cuda.threadIdx.x bin = int(idx[tid] % ary_nelements) if neg_idx: bin = bin - ary_nelements binop_func(ary, bin, op2) def atomic_add(ary): atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, False) def atomic_add_wrap(ary): atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, True) def atomic_add2(ary): atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, False) def atomic_add2_wrap(ary): atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, True) def atomic_add3(ary): atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False) def atomic_add_float(ary): atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, cuda.atomic.add, atomic_cast_to_int, 0.0, False) def atomic_add_float_wrap(ary): atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, cuda.atomic.add, atomic_cast_to_int, 0.0, True) def atomic_add_float_2(ary): atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, False) def atomic_add_float_2_wrap(ary): atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, True) def atomic_add_float_3(ary): atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False) def atomic_add_double_global(idx, ary): atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.add, False) def atomic_add_double_global_wrap(idx, ary): atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.add, True) def atomic_add_double_global_2(ary): atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_none, False) def atomic_add_double_global_2_wrap(ary): atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_none, True) def atomic_add_double_global_3(ary): atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_to_uint64, False) def atomic_add_double(idx, ary): atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, cuda.atomic.add, atomic_cast_none, 0.0, False) def atomic_add_double_wrap(idx, ary): atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, cuda.atomic.add, atomic_cast_none, 0.0, True) def atomic_add_double_2(ary): atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, False) def atomic_add_double_2_wrap(ary): atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, True) def atomic_add_double_3(ary): atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False) def atomic_sub(ary): atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, cuda.atomic.sub, atomic_cast_none, 0, False) def atomic_sub2(ary): atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_none, False) def atomic_sub3(ary): atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False) def atomic_sub_float(ary): atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, cuda.atomic.sub, atomic_cast_to_int, 0.0, False) def atomic_sub_float_2(ary): atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_none, False) def atomic_sub_float_3(ary): atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False) def atomic_sub_double(idx, ary): atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, cuda.atomic.sub, atomic_cast_none, 0.0, False) def atomic_sub_double_2(ary): atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_none, False) def atomic_sub_double_3(ary): atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False) def atomic_sub_double_global(idx, ary): atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.sub, False) def atomic_sub_double_global_2(ary): atomic_binary_2dim_global(ary, 1.0, cuda.atomic.sub, atomic_cast_none, False) def atomic_sub_double_global_3(ary): atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False) def atomic_and(ary, op2): atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, cuda.atomic.and_, atomic_cast_none, 1, False) def atomic_and2(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_none, False) def atomic_and3(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_to_uint64, False) def atomic_and_global(idx, ary, op2): atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.and_, False) def atomic_and_global_2(ary, op2): atomic_binary_2dim_global(ary, op2, cuda.atomic.and_, atomic_cast_none, False) def atomic_or(ary, op2): atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, cuda.atomic.or_, atomic_cast_none, 0, False) def atomic_or2(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_none, False) def atomic_or3(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_to_uint64, False) def atomic_or_global(idx, ary, op2): atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.or_, False) def atomic_or_global_2(ary, op2): atomic_binary_2dim_global(ary, op2, cuda.atomic.or_, atomic_cast_none, False) def atomic_xor(ary, op2): atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, cuda.atomic.xor, atomic_cast_none, 0, False) def atomic_xor2(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_none, False) def atomic_xor3(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_to_uint64, False) def atomic_xor_global(idx, ary, op2): atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.xor, False) def atomic_xor_global_2(ary, op2): atomic_binary_2dim_global(ary, op2, cuda.atomic.xor, atomic_cast_none, False) def atomic_inc32(ary, idx, op2): atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32, cuda.atomic.inc, atomic_cast_none) def atomic_inc64(ary, idx, op2): atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32, cuda.atomic.inc, atomic_cast_to_int) def atomic_inc2_32(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_none, False) def atomic_inc2_64(ary, op2): atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), cuda.atomic.inc, atomic_cast_none, False) def atomic_inc3(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_to_uint64, False) def atomic_inc_global(idx, ary, op2): atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.inc, False) def atomic_inc_global_2(ary, op2): atomic_binary_2dim_global(ary, op2, cuda.atomic.inc, atomic_cast_none, False) def atomic_dec32(ary, idx, op2): atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32, cuda.atomic.dec, atomic_cast_none) def atomic_dec64(ary, idx, op2): atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32, cuda.atomic.dec, atomic_cast_to_int) def atomic_dec2_32(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_none, False) def atomic_dec2_64(ary, op2): atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), cuda.atomic.dec, atomic_cast_none, False) def atomic_dec3(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_to_uint64, False) def atomic_dec_global(idx, ary, op2): atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.dec, False) def atomic_dec_global_2(ary, op2): atomic_binary_2dim_global(ary, op2, cuda.atomic.dec, atomic_cast_none, False) def atomic_exch(ary, idx, op2): atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32, cuda.atomic.exch, atomic_cast_none) def atomic_exch2(ary, op2): atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), cuda.atomic.exch, atomic_cast_none, False) def atomic_exch3(ary, op2): atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), cuda.atomic.exch, atomic_cast_none, False) def atomic_exch_global(idx, ary, op2): atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.exch, False) def gen_atomic_extreme_funcs(func): fns = dedent(""" def atomic(res, ary): tx = cuda.threadIdx.x bx = cuda.blockIdx.x {func}(res, 0, ary[tx, bx]) def atomic_double_normalizedindex(res, ary): tx = cuda.threadIdx.x bx = cuda.blockIdx.x {func}(res, 0, ary[tx, uint64(bx)]) def atomic_double_oneindex(res, ary): tx = cuda.threadIdx.x {func}(res, 0, ary[tx]) def atomic_double_shared(res, ary): tid = cuda.threadIdx.x smary = cuda.shared.array(32, float64) smary[tid] = ary[tid] smres = cuda.shared.array(1, float64) if tid == 0: smres[0] = res[0] cuda.syncthreads() {func}(smres, 0, smary[tid]) cuda.syncthreads() if tid == 0: res[0] = smres[0] """).format(func=func) ld = {} exec(fns, {'cuda': cuda, 'float64': float64, 'uint64': uint64}, ld) return (ld['atomic'], ld['atomic_double_normalizedindex'], ld['atomic_double_oneindex'], ld['atomic_double_shared']) (atomic_max, atomic_max_double_normalizedindex, atomic_max_double_oneindex, atomic_max_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.max') (atomic_min, atomic_min_double_normalizedindex, atomic_min_double_oneindex, atomic_min_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.min') (atomic_nanmax, atomic_nanmax_double_normalizedindex, atomic_nanmax_double_oneindex, atomic_nanmax_double_shared) = \ gen_atomic_extreme_funcs('cuda.atomic.nanmax') (atomic_nanmin, atomic_nanmin_double_normalizedindex, atomic_nanmin_double_oneindex, atomic_nanmin_double_shared) = \ gen_atomic_extreme_funcs('cuda.atomic.nanmin') def atomic_compare_and_swap(res, old, ary, fill_val): gid = cuda.grid(1) if gid < res.size: old[gid] = cuda.atomic.compare_and_swap(res[gid:], fill_val, ary[gid]) def atomic_cas_1dim(res, old, ary, fill_val): gid = cuda.grid(1) if gid < res.size: old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid]) def atomic_cas_2dim(res, old, ary, fill_val): gid = cuda.grid(2) if gid[0] < res.shape[0] and gid[1] < res.shape[1]: old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid]) class TestCudaAtomics(CUDATestCase): def setUp(self): super().setUp() np.random.seed(0) def test_atomic_add(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) ary_wrap = ary.copy() orig = ary.copy() cuda_atomic_add = cuda.jit('void(uint32[:])')(atomic_add) cuda_atomic_add[1, 32](ary) cuda_atomic_add_wrap = cuda.jit('void(uint32[:])')(atomic_add_wrap) cuda_atomic_add_wrap[1, 32](ary_wrap) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): gold[orig[i]] += 1 self.assertTrue(np.all(ary == gold)) self.assertTrue(np.all(ary_wrap == gold)) def test_atomic_add2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) ary_wrap = ary.copy() orig = ary.copy() cuda_atomic_add2 = cuda.jit('void(uint32[:,:])')(atomic_add2) cuda_atomic_add2[1, (4, 8)](ary) cuda_atomic_add2_wrap = cuda.jit('void(uint32[:,:])')(atomic_add2_wrap) cuda_atomic_add2_wrap[1, (4, 8)](ary_wrap) self.assertTrue(np.all(ary == orig + 1)) self.assertTrue(np.all(ary_wrap == orig + 1)) def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3) cuda_atomic_add3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig + 1)) def test_atomic_add_float(self): ary = np.random.randint(0, 32, size=32).astype(np.float32) ary_wrap = ary.copy() orig = ary.copy().astype(np.intp) cuda_atomic_add_float = cuda.jit('void(float32[:])')(atomic_add_float) cuda_atomic_add_float[1, 32](ary) add_float_wrap = cuda.jit('void(float32[:])')(atomic_add_float_wrap) add_float_wrap[1, 32](ary_wrap) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): gold[orig[i]] += 1.0 self.assertTrue(np.all(ary == gold)) self.assertTrue(np.all(ary_wrap == gold)) def test_atomic_add_float_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) ary_wrap = ary.copy() orig = ary.copy() cuda_atomic_add2 = cuda.jit('void(float32[:,:])')(atomic_add_float_2) cuda_atomic_add2[1, (4, 8)](ary) cuda_func_wrap = cuda.jit('void(float32[:,:])')(atomic_add_float_2_wrap) cuda_func_wrap[1, (4, 8)](ary_wrap) self.assertTrue(np.all(ary == orig + 1)) self.assertTrue(np.all(ary_wrap == orig + 1)) def test_atomic_add_float_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() cuda_atomic_add3 = cuda.jit('void(float32[:,:])')(atomic_add_float_3) cuda_atomic_add3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig + 1)) def assertCorrectFloat64Atomics(self, kernel, shared=True): if config.ENABLE_CUDASIM: return # Use the first (and only) definition asm = next(iter(kernel.inspect_asm().values())) if cc_X_or_above(6, 0): if cuda.runtime.get_version() > (12, 1): # CUDA 12.2 and above generate a more optimized reduction # instruction, because the result does not need to be # placed in a register. inst = 'red' else: inst = 'atom' if shared: inst = f'{inst}.shared' self.assertIn(f'{inst}.add.f64', asm) else: if shared: self.assertIn('atom.shared.cas.b64', asm) else: self.assertIn('atom.cas.b64', asm) def test_atomic_add_double(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) ary_wrap = ary.copy() cuda_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double) cuda_fn[1, 32](idx, ary) wrap_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_wrap) wrap_fn[1, 32](idx, ary_wrap) gold = np.zeros(32, dtype=np.uint32) for i in range(idx.size): gold[idx[i]] += 1.0 np.testing.assert_equal(ary, gold) np.testing.assert_equal(ary_wrap, gold) self.assertCorrectFloat64Atomics(cuda_fn) self.assertCorrectFloat64Atomics(wrap_fn) def test_atomic_add_double_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) ary_wrap = ary.copy() orig = ary.copy() cuda_fn = cuda.jit('void(float64[:,:])')(atomic_add_double_2) cuda_fn[1, (4, 8)](ary) cuda_fn_wrap = cuda.jit('void(float64[:,:])')(atomic_add_double_2_wrap) cuda_fn_wrap[1, (4, 8)](ary_wrap) np.testing.assert_equal(ary, orig + 1) np.testing.assert_equal(ary_wrap, orig + 1) self.assertCorrectFloat64Atomics(cuda_fn) self.assertCorrectFloat64Atomics(cuda_fn_wrap) def test_atomic_add_double_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig + 1) self.assertCorrectFloat64Atomics(cuda_func) def test_atomic_add_double_global(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) ary_wrap = ary.copy() sig = 'void(int64[:], float64[:])' cuda_func = cuda.jit(sig)(atomic_add_double_global) wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap) cuda_func[1, 32](idx, ary) wrap_cuda_func[1, 32](idx, ary_wrap) gold = np.zeros(32, dtype=np.uint32) for i in range(idx.size): gold[idx[i]] += 1.0 np.testing.assert_equal(ary, gold) np.testing.assert_equal(ary_wrap, gold) self.assertCorrectFloat64Atomics(cuda_func, shared=False) self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False) def test_atomic_add_double_global_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) ary_wrap = ary.copy() orig = ary.copy() sig = 'void(float64[:,:])' cuda_func = cuda.jit(sig)(atomic_add_double_global_2) wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_2_wrap) cuda_func[1, (4, 8)](ary) wrap_cuda_func[1, (4, 8)](ary_wrap) np.testing.assert_equal(ary, orig + 1) np.testing.assert_equal(ary_wrap, orig + 1) self.assertCorrectFloat64Atomics(cuda_func, shared=False) self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False) def test_atomic_add_double_global_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig + 1) self.assertCorrectFloat64Atomics(cuda_func, shared=False) def test_atomic_sub(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() cuda_atomic_sub = cuda.jit('void(uint32[:])')(atomic_sub) cuda_atomic_sub[1, 32](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): gold[orig[i]] -= 1 self.assertTrue(np.all(ary == gold)) def test_atomic_sub2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_sub2 = cuda.jit('void(uint32[:,:])')(atomic_sub2) cuda_atomic_sub2[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_sub3 = cuda.jit('void(uint32[:,:])')(atomic_sub3) cuda_atomic_sub3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_float(self): ary = np.random.randint(0, 32, size=32).astype(np.float32) orig = ary.copy().astype(np.intp) cuda_atomic_sub_float = cuda.jit('void(float32[:])')(atomic_sub_float) cuda_atomic_sub_float[1, 32](ary) gold = np.zeros(32, dtype=np.float32) for i in range(orig.size): gold[orig[i]] -= 1.0 self.assertTrue(np.all(ary == gold)) def test_atomic_sub_float_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() cuda_atomic_sub2 = cuda.jit('void(float32[:,:])')(atomic_sub_float_2) cuda_atomic_sub2[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_float_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() cuda_atomic_sub3 = cuda.jit('void(float32[:,:])')(atomic_sub_float_3) cuda_atomic_sub3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_double(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_sub_double) cuda_func[1, 32](idx, ary) gold = np.zeros(32, dtype=np.float64) for i in range(idx.size): gold[idx[i]] -= 1.0 np.testing.assert_equal(ary, gold) def test_atomic_sub_double_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_2) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_global(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) sig = 'void(int64[:], float64[:])' cuda_func = cuda.jit(sig)(atomic_sub_double_global) cuda_func[1, 32](idx, ary) gold = np.zeros(32, dtype=np.float64) for i in range(idx.size): gold[idx[i]] -= 1.0 np.testing.assert_equal(ary, gold) def test_atomic_sub_double_global_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_2) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_global_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_and(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_and) cuda_func[1, 32](ary, rand_const) gold = ary.copy() for i in range(orig.size): gold[orig[i]] &= rand_const self.assertTrue(np.all(ary == gold)) def test_atomic_and2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and2) cuda_atomic_and2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig & rand_const)) def test_atomic_and3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and3) cuda_atomic_and3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig & rand_const)) def test_atomic_and_global(self): rand_const = np.random.randint(500) idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) sig = 'void(int32[:], int32[:], int32)' cuda_func = cuda.jit(sig)(atomic_and_global) cuda_func[1, 32](idx, ary, rand_const) gold = ary.copy() for i in range(idx.size): gold[idx[i]] &= rand_const np.testing.assert_equal(ary, gold) def test_atomic_and_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_and_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig & rand_const) def test_atomic_or(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_or) cuda_func[1, 32](ary, rand_const) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): gold[orig[i]] |= rand_const self.assertTrue(np.all(ary == gold)) def test_atomic_or2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or2) cuda_atomic_and2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig | rand_const)) def test_atomic_or3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or3) cuda_atomic_and3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig | rand_const)) def test_atomic_or_global(self): rand_const = np.random.randint(500) idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) sig = 'void(int32[:], int32[:], int32)' cuda_func = cuda.jit(sig)(atomic_or_global) cuda_func[1, 32](idx, ary, rand_const) gold = ary.copy() for i in range(idx.size): gold[idx[i]] |= rand_const np.testing.assert_equal(ary, gold) def test_atomic_or_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_or_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig | rand_const) def test_atomic_xor(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_xor) cuda_func[1, 32](ary, rand_const) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): gold[orig[i]] ^= rand_const self.assertTrue(np.all(ary == gold)) def test_atomic_xor2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_xor2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor2) cuda_atomic_xor2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig ^ rand_const)) def test_atomic_xor3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_xor3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor3) cuda_atomic_xor3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig ^ rand_const)) def test_atomic_xor_global(self): rand_const = np.random.randint(500) idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) gold = ary.copy() sig = 'void(int32[:], int32[:], int32)' cuda_func = cuda.jit(sig)(atomic_xor_global) cuda_func[1, 32](idx, ary, rand_const) for i in range(idx.size): gold[idx[i]] ^= rand_const np.testing.assert_equal(ary, gold) def test_atomic_xor_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig ^ rand_const) def inc_dec_1dim_setup(self, dtype): rconst = np.random.randint(32, dtype=dtype) rary = np.random.randint(0, 32, size=32).astype(dtype) ary_idx = np.arange(32, dtype=dtype) return rconst, rary, ary_idx def inc_dec_2dim_setup(self, dtype): rconst = np.random.randint(32, dtype=dtype) rary = np.random.randint(0, 32, size=32).astype(dtype).reshape(4, 8) return rconst, rary def check_inc_index(self, ary, idx, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](ary, idx, rconst) np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1)) def check_inc_index2(self, ary, idx, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](idx, ary, rconst) np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1)) def check_inc(self, ary, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](ary, rconst) np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1)) def test_atomic_inc_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) sig = 'void(uint32[:], uint32[:], uint32)' self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc32) def test_atomic_inc_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) sig = 'void(uint64[:], uint64[:], uint64)' self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc64) def test_atomic_inc2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) sig = 'void(uint32[:,:], uint32)' self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_32) def test_atomic_inc2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) sig = 'void(uint64[:,:], uint64)' self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_64) def test_atomic_inc3(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) sig = 'void(uint32[:,:], uint32)' self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc3) def test_atomic_inc_global_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) sig = 'void(uint32[:], uint32[:], uint32)' self.check_inc_index2(ary, idx, rand_const, sig, 1, 32, atomic_inc_global) def test_atomic_inc_global_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) sig = 'void(uint64[:], uint64[:], uint64)' self.check_inc_index2(ary, idx, rand_const, sig, 1, 32, atomic_inc_global) def test_atomic_inc_global_2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) sig = 'void(uint32[:,:], uint32)' self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2) def test_atomic_inc_global_2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) sig = 'void(uint64[:,:], uint64)' self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2) def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](ary, idx, rconst) np.testing.assert_equal(ary, np.where(orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1))) def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](idx, ary, rconst) np.testing.assert_equal(ary, np.where(orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1))) def check_dec(self, ary, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](ary, rconst) np.testing.assert_equal(ary, np.where(orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1))) def test_atomic_dec_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) sig = 'void(uint32[:], uint32[:], uint32)' self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec32) def test_atomic_dec_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) sig = 'void(uint64[:], uint64[:], uint64)' self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec64) def test_atomic_dec2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) sig = 'void(uint32[:,:], uint32)' self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_32) def test_atomic_dec2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) sig = 'void(uint64[:,:], uint64)' self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_64) def test_atomic_dec3_new(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) sig = 'void(uint32[:,:], uint32)' self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec3) def test_atomic_dec_global_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) sig = 'void(uint32[:], uint32[:], uint32)' self.check_dec_index2(ary, idx, rand_const, sig, 1, 32, atomic_dec_global) def test_atomic_dec_global_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) sig = 'void(uint64[:], uint64[:], uint64)' self.check_dec_index2(ary, idx, rand_const, sig, 1, 32, atomic_dec_global) def test_atomic_dec_global2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) sig = 'void(uint32[:,:], uint32)' self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2) def test_atomic_dec_global2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) sig = 'void(uint64[:,:], uint64)' self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2) def test_atomic_exch(self): rand_const = np.random.randint(50, 100, dtype=np.uint32) ary = np.random.randint(0, 32, size=32).astype(np.uint32) idx = np.arange(32, dtype=np.uint32) cuda_func = cuda.jit('void(uint32[:], uint32[:], uint32)')(atomic_exch) cuda_func[1, 32](ary, idx, rand_const) np.testing.assert_equal(ary, rand_const) def test_atomic_exch2(self): rand_const = np.random.randint(50, 100, dtype=np.uint32) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_exch2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, rand_const) def test_atomic_exch3(self): rand_const = np.random.randint(50, 100, dtype=np.uint64) ary = np.random.randint(0, 32, size=32).astype(np.uint64).reshape(4, 8) cuda_func = cuda.jit('void(uint64[:,:], uint64)')(atomic_exch3) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, rand_const) def test_atomic_exch_global(self): rand_const = np.random.randint(50, 100, dtype=np.uint32) idx = np.arange(32, dtype=np.uint32) ary = np.random.randint(0, 32, size=32, dtype=np.uint32) sig = 'void(uint32[:], uint32[:], uint32)' cuda_func = cuda.jit(sig)(atomic_exch_global) cuda_func[1, 32](idx, ary, rand_const) np.testing.assert_equal(ary, rand_const) def check_atomic_max(self, dtype, lo, hi): vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype) res = np.zeros(1, dtype=vals.dtype) cuda_func = cuda.jit(atomic_max) cuda_func[32, 32](res, vals) gold = np.max(vals) np.testing.assert_equal(res, gold) def test_atomic_max_int32(self): self.check_atomic_max(dtype=np.int32, lo=-65535, hi=65535) def test_atomic_max_uint32(self): self.check_atomic_max(dtype=np.uint32, lo=0, hi=65535) def test_atomic_max_int64(self): self.check_atomic_max(dtype=np.int64, lo=-65535, hi=65535) def test_atomic_max_uint64(self): self.check_atomic_max(dtype=np.uint64, lo=0, hi=65535) def test_atomic_max_float32(self): self.check_atomic_max(dtype=np.float32, lo=-65535, hi=65535) def test_atomic_max_double(self): self.check_atomic_max(dtype=np.float64, lo=-65535, hi=65535) def test_atomic_max_double_normalizedindex(self): vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64) res = np.zeros(1, np.float64) cuda_func = cuda.jit('void(float64[:], float64[:,:])')( atomic_max_double_normalizedindex) cuda_func[32, 32](res, vals) gold = np.max(vals) np.testing.assert_equal(res, gold) def test_atomic_max_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) res = np.zeros(1, np.float64) cuda_func = cuda.jit('void(float64[:], float64[:])')( atomic_max_double_oneindex) cuda_func[1, 32](res, vals) gold = np.max(vals) np.testing.assert_equal(res, gold) def check_atomic_min(self, dtype, lo, hi): vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype) res = np.array([65535], dtype=vals.dtype) cuda_func = cuda.jit(atomic_min) cuda_func[32, 32](res, vals) gold = np.min(vals) np.testing.assert_equal(res, gold) def test_atomic_min_int32(self): self.check_atomic_min(dtype=np.int32, lo=-65535, hi=65535) def test_atomic_min_uint32(self): self.check_atomic_min(dtype=np.uint32, lo=0, hi=65535) def test_atomic_min_int64(self): self.check_atomic_min(dtype=np.int64, lo=-65535, hi=65535) def test_atomic_min_uint64(self): self.check_atomic_min(dtype=np.uint64, lo=0, hi=65535) def test_atomic_min_float(self): self.check_atomic_min(dtype=np.float32, lo=-65535, hi=65535) def test_atomic_min_double(self): self.check_atomic_min(dtype=np.float64, lo=-65535, hi=65535) def test_atomic_min_double_normalizedindex(self): vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64) res = np.ones(1, np.float64) * 65535 cuda_func = cuda.jit('void(float64[:], float64[:,:])')( atomic_min_double_normalizedindex) cuda_func[32, 32](res, vals) gold = np.min(vals) np.testing.assert_equal(res, gold) def test_atomic_min_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) res = np.ones(1, np.float64) * 128 cuda_func = cuda.jit('void(float64[:], float64[:])')( atomic_min_double_oneindex) cuda_func[1, 32](res, vals) gold = np.min(vals) np.testing.assert_equal(res, gold) # Taken together, _test_atomic_minmax_nan_location and # _test_atomic_minmax_nan_val check that NaNs are treated similarly to the # way they are in Python / NumPy - that is, {min,max}(a, b) == a if either # a or b is a NaN. For the atomics, this means that the max is taken as the # value stored in the memory location rather than the value supplied - i.e. # for: # # cuda.atomic.{min,max}(ary, idx, val) # # the result will be ary[idx] for either of ary[idx] or val being NaN. def _test_atomic_minmax_nan_location(self, func): cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func) vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64) res = np.zeros(1, np.float64) + np.nan cuda_func[1, 1](res, vals) np.testing.assert_equal(res, [np.nan]) def _test_atomic_minmax_nan_val(self, func): cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func) res = np.random.randint(0, 128, size=1).astype(np.float64) gold = res.copy() vals = np.zeros((1, 1), np.float64) + np.nan cuda_func[1, 1](res, vals) np.testing.assert_equal(res, gold) def test_atomic_min_nan_location(self): self._test_atomic_minmax_nan_location(atomic_min) def test_atomic_max_nan_location(self): self._test_atomic_minmax_nan_location(atomic_max) def test_atomic_min_nan_val(self): self._test_atomic_minmax_nan_val(atomic_min) def test_atomic_max_nan_val(self): self._test_atomic_minmax_nan_val(atomic_max) def test_atomic_max_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) res = np.zeros(1, np.float64) sig = 'void(float64[:], float64[:])' cuda_func = cuda.jit(sig)(atomic_max_double_shared) cuda_func[1, 32](res, vals) gold = np.max(vals) np.testing.assert_equal(res, gold) def test_atomic_min_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) res = np.ones(1, np.float64) * 32 sig = 'void(float64[:], float64[:])' cuda_func = cuda.jit(sig)(atomic_min_double_shared) cuda_func[1, 32](res, vals) gold = np.min(vals) np.testing.assert_equal(res, gold) def check_cas(self, n, fill, unfill, dtype, cas_func, ndim=1): res = [fill] * (n // 2) + [unfill] * (n // 2) np.random.shuffle(res) res = np.asarray(res, dtype=dtype) if ndim == 2: res.shape = (10, -1) out = np.zeros_like(res) ary = np.random.randint(1, 10, size=res.shape).astype(res.dtype) fill_mask = res == fill unfill_mask = res == unfill expect_res = np.zeros_like(res) expect_res[fill_mask] = ary[fill_mask] expect_res[unfill_mask] = unfill expect_out = res.copy() cuda_func = cuda.jit(cas_func) if ndim == 1: cuda_func[10, 10](res, out, ary, fill) else: cuda_func[(10, 10), (10, 10)](res, out, ary, fill) np.testing.assert_array_equal(expect_res, res) np.testing.assert_array_equal(expect_out, out) def test_atomic_compare_and_swap(self): self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, cas_func=atomic_compare_and_swap) def test_atomic_compare_and_swap2(self): self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, cas_func=atomic_compare_and_swap) def test_atomic_compare_and_swap3(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, cas_func=atomic_compare_and_swap) def test_atomic_compare_and_swap4(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, cas_func=atomic_compare_and_swap) def test_atomic_cas_1dim(self): self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, cas_func=atomic_cas_1dim) def test_atomic_cas_2dim(self): self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, cas_func=atomic_cas_2dim, ndim=2) def test_atomic_cas2_1dim(self): self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, cas_func=atomic_cas_1dim) def test_atomic_cas2_2dim(self): self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, cas_func=atomic_cas_2dim, ndim=2) def test_atomic_cas3_1dim(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, cas_func=atomic_cas_1dim) def test_atomic_cas3_2dim(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, cas_func=atomic_cas_2dim, ndim=2) def test_atomic_cas4_1dim(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, cas_func=atomic_cas_1dim) def test_atomic_cas4_2dim(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, cas_func=atomic_cas_2dim, ndim=2) # Tests that the atomic add, min, and max operations return the old value - # in the simulator, they did not (see Issue #5458). The max and min have # special handling for NaN values, so we explicitly test with a NaN in the # array being modified and the value provided. def _test_atomic_returns_old(self, kernel, initial): x = np.zeros(2, dtype=np.float32) x[0] = initial kernel[1, 1](x) if np.isnan(initial): self.assertTrue(np.isnan(x[1])) else: self.assertEqual(x[1], initial) def test_atomic_add_returns_old(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.add(x, 0, 1) self._test_atomic_returns_old(kernel, 10) def test_atomic_max_returns_no_replace(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.max(x, 0, 1) self._test_atomic_returns_old(kernel, 10) def test_atomic_max_returns_old_replace(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.max(x, 0, 10) self._test_atomic_returns_old(kernel, 1) def test_atomic_max_returns_old_nan_in_array(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.max(x, 0, 1) self._test_atomic_returns_old(kernel, np.nan) def test_atomic_max_returns_old_nan_val(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.max(x, 0, np.nan) self._test_atomic_returns_old(kernel, 10) def test_atomic_min_returns_old_no_replace(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.min(x, 0, 11) self._test_atomic_returns_old(kernel, 10) def test_atomic_min_returns_old_replace(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.min(x, 0, 10) self._test_atomic_returns_old(kernel, 11) def test_atomic_min_returns_old_nan_in_array(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.min(x, 0, 11) self._test_atomic_returns_old(kernel, np.nan) def test_atomic_min_returns_old_nan_val(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.min(x, 0, np.nan) self._test_atomic_returns_old(kernel, 11) # Tests for atomic nanmin/nanmax # nanmax tests def check_atomic_nanmax(self, dtype, lo, hi, init_val): vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype) vals[1::2] = init_val res = np.zeros(1, dtype=vals.dtype) cuda_func = cuda.jit(atomic_nanmax) cuda_func[32, 32](res, vals) gold = np.nanmax(vals) np.testing.assert_equal(res, gold) def test_atomic_nanmax_int32(self): self.check_atomic_nanmax(dtype=np.int32, lo=-65535, hi=65535, init_val=0) def test_atomic_nanmax_uint32(self): self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535, init_val=0) def test_atomic_nanmax_int64(self): self.check_atomic_nanmax(dtype=np.int64, lo=-65535, hi=65535, init_val=0) def test_atomic_nanmax_uint64(self): self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535, init_val=0) def test_atomic_nanmax_float32(self): self.check_atomic_nanmax(dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan) def test_atomic_nanmax_double(self): self.check_atomic_nanmax(dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan) def test_atomic_nanmax_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([0], dtype=vals.dtype) sig = 'void(float64[:], float64[:])' cuda_func = cuda.jit(sig)(atomic_nanmax_double_shared) cuda_func[1, 32](res, vals) gold = np.nanmax(vals) np.testing.assert_equal(res, gold) def test_atomic_nanmax_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) vals[1::2] = np.nan res = np.zeros(1, np.float64) cuda_func = cuda.jit('void(float64[:], float64[:])')( atomic_max_double_oneindex) cuda_func[1, 32](res, vals) gold = np.nanmax(vals) np.testing.assert_equal(res, gold) # nanmin tests def check_atomic_nanmin(self, dtype, lo, hi, init_val): vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype) vals[1::2] = init_val res = np.array([65535], dtype=vals.dtype) cuda_func = cuda.jit(atomic_nanmin) cuda_func[32, 32](res, vals) gold = np.nanmin(vals) np.testing.assert_equal(res, gold) def test_atomic_nanmin_int32(self): self.check_atomic_nanmin(dtype=np.int32, lo=-65535, hi=65535, init_val=0) def test_atomic_nanmin_uint32(self): self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535, init_val=0) def test_atomic_nanmin_int64(self): self.check_atomic_nanmin(dtype=np.int64, lo=-65535, hi=65535, init_val=0) def test_atomic_nanmin_uint64(self): self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535, init_val=0) def test_atomic_nanmin_float(self): self.check_atomic_nanmin(dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan) def test_atomic_nanmin_double(self): self.check_atomic_nanmin(dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan) def test_atomic_nanmin_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([32], dtype=vals.dtype) sig = 'void(float64[:], float64[:])' cuda_func = cuda.jit(sig)(atomic_nanmin_double_shared) cuda_func[1, 32](res, vals) gold = np.nanmin(vals) np.testing.assert_equal(res, gold) def test_atomic_nanmin_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([128], np.float64) cuda_func = cuda.jit('void(float64[:], float64[:])')( atomic_min_double_oneindex) cuda_func[1, 32](res, vals) gold = np.nanmin(vals) np.testing.assert_equal(res, gold) # Returning old value tests def _test_atomic_nan_returns_old(self, kernel, initial): x = np.zeros(2, dtype=np.float32) x[0] = initial x[1] = np.nan kernel[1, 1](x) if np.isnan(initial): self.assertFalse(np.isnan(x[0])) self.assertTrue(np.isnan(x[1])) else: self.assertEqual(x[1], initial) def test_atomic_nanmax_returns_old_no_replace(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.nanmax(x, 0, 1) self._test_atomic_nan_returns_old(kernel, 10) def test_atomic_nanmax_returns_old_replace(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.nanmax(x, 0, 10) self._test_atomic_nan_returns_old(kernel, 1) def test_atomic_nanmax_returns_old_nan_in_array(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.nanmax(x, 0, 1) self._test_atomic_nan_returns_old(kernel, np.nan) def test_atomic_nanmax_returns_old_nan_val(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.nanmax(x, 0, np.nan) self._test_atomic_nan_returns_old(kernel, 10) def test_atomic_nanmin_returns_old_no_replace(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.nanmin(x, 0, 11) self._test_atomic_nan_returns_old(kernel, 10) def test_atomic_nanmin_returns_old_replace(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.nanmin(x, 0, 10) self._test_atomic_nan_returns_old(kernel, 11) def test_atomic_nanmin_returns_old_nan_in_array(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.nanmin(x, 0, 11) self._test_atomic_nan_returns_old(kernel, np.nan) def test_atomic_nanmin_returns_old_nan_val(self): @cuda.jit def kernel(x): x[1] = cuda.atomic.nanmin(x, 0, np.nan) self._test_atomic_nan_returns_old(kernel, 11) if __name__ == '__main__': unittest.main()