ai-content-maker/.venv/Lib/site-packages/numba/cuda/tests/cudapy/test_atomics.py

1621 lines
57 KiB
Python

import numpy as np
from textwrap import dedent
from numba import cuda, uint32, uint64, float32, float64
from numba.cuda.testing import unittest, CUDATestCase, cc_X_or_above
from numba.core import config
@cuda.jit(device=True)
def atomic_cast_to_uint64(num):
return uint64(num)
@cuda.jit(device=True)
def atomic_cast_to_int(num):
return int(num)
@cuda.jit(device=True)
def atomic_cast_none(num):
return num
@cuda.jit(device=True)
def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements,
binop_func, cast_func, initializer,
neg_idx):
tid = cuda.threadIdx.x
sm = cuda.shared.array(ary_nelements, ary_dtype)
sm[tid] = initializer
cuda.syncthreads()
bin = cast_func(idx[tid] % ary_nelements)
if neg_idx:
bin = bin - ary_nelements
binop_func(sm, bin, op2)
cuda.syncthreads()
ary[tid] = sm[tid]
@cuda.jit(device=True)
def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements,
binop_func, cast_func):
tid = cuda.threadIdx.x
sm = cuda.shared.array(ary_nelements, ary_dtype)
sm[tid] = ary[tid]
cuda.syncthreads()
bin = cast_func(idx[tid] % ary_nelements)
binop_func(sm, bin, op2)
cuda.syncthreads()
ary[tid] = sm[tid]
@cuda.jit(device=True)
def atomic_binary_2dim_shared(ary, op2, ary_dtype, ary_shape,
binop_func, y_cast_func, neg_idx):
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
sm = cuda.shared.array(ary_shape, ary_dtype)
sm[tx, ty] = ary[tx, ty]
cuda.syncthreads()
bin = (tx, y_cast_func(ty))
if neg_idx:
bin = (bin[0] - ary_shape[0], bin[1] - ary_shape[1])
binop_func(sm, bin, op2)
cuda.syncthreads()
ary[tx, ty] = sm[tx, ty]
@cuda.jit(device=True)
def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx):
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bin = (tx, y_cast_func(ty))
if neg_idx:
bin = (bin[0] - ary.shape[0], bin[1] - ary.shape[1])
binop_func(ary, bin, op2)
@cuda.jit(device=True)
def atomic_binary_1dim_global(ary, idx, ary_nelements, op2,
binop_func, neg_idx):
tid = cuda.threadIdx.x
bin = int(idx[tid] % ary_nelements)
if neg_idx:
bin = bin - ary_nelements
binop_func(ary, bin, op2)
def atomic_add(ary):
atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
cuda.atomic.add, atomic_cast_none, 0, False)
def atomic_add_wrap(ary):
atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
cuda.atomic.add, atomic_cast_none, 0, True)
def atomic_add2(ary):
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
cuda.atomic.add, atomic_cast_none, False)
def atomic_add2_wrap(ary):
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
cuda.atomic.add, atomic_cast_none, True)
def atomic_add3(ary):
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
cuda.atomic.add, atomic_cast_to_uint64, False)
def atomic_add_float(ary):
atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
cuda.atomic.add, atomic_cast_to_int, 0.0, False)
def atomic_add_float_wrap(ary):
atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
cuda.atomic.add, atomic_cast_to_int, 0.0, True)
def atomic_add_float_2(ary):
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
cuda.atomic.add, atomic_cast_none, False)
def atomic_add_float_2_wrap(ary):
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
cuda.atomic.add, atomic_cast_none, True)
def atomic_add_float_3(ary):
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
cuda.atomic.add, atomic_cast_to_uint64, False)
def atomic_add_double_global(idx, ary):
atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.add, False)
def atomic_add_double_global_wrap(idx, ary):
atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.add, True)
def atomic_add_double_global_2(ary):
atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_none, False)
def atomic_add_double_global_2_wrap(ary):
atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_none, True)
def atomic_add_double_global_3(ary):
atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_to_uint64,
False)
def atomic_add_double(idx, ary):
atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
cuda.atomic.add, atomic_cast_none, 0.0, False)
def atomic_add_double_wrap(idx, ary):
atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
cuda.atomic.add, atomic_cast_none, 0.0, True)
def atomic_add_double_2(ary):
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
cuda.atomic.add, atomic_cast_none, False)
def atomic_add_double_2_wrap(ary):
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
cuda.atomic.add, atomic_cast_none, True)
def atomic_add_double_3(ary):
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
cuda.atomic.add, atomic_cast_to_uint64, False)
def atomic_sub(ary):
atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
cuda.atomic.sub, atomic_cast_none, 0, False)
def atomic_sub2(ary):
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
cuda.atomic.sub, atomic_cast_none, False)
def atomic_sub3(ary):
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
cuda.atomic.sub, atomic_cast_to_uint64, False)
def atomic_sub_float(ary):
atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
cuda.atomic.sub, atomic_cast_to_int, 0.0, False)
def atomic_sub_float_2(ary):
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
cuda.atomic.sub, atomic_cast_none, False)
def atomic_sub_float_3(ary):
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
cuda.atomic.sub, atomic_cast_to_uint64, False)
def atomic_sub_double(idx, ary):
atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
cuda.atomic.sub, atomic_cast_none, 0.0, False)
def atomic_sub_double_2(ary):
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
cuda.atomic.sub, atomic_cast_none, False)
def atomic_sub_double_3(ary):
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
cuda.atomic.sub, atomic_cast_to_uint64, False)
def atomic_sub_double_global(idx, ary):
atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.sub, False)
def atomic_sub_double_global_2(ary):
atomic_binary_2dim_global(ary, 1.0, cuda.atomic.sub, atomic_cast_none,
False)
def atomic_sub_double_global_3(ary):
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
cuda.atomic.sub, atomic_cast_to_uint64, False)
def atomic_and(ary, op2):
atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
cuda.atomic.and_, atomic_cast_none, 1, False)
def atomic_and2(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.and_, atomic_cast_none, False)
def atomic_and3(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.and_, atomic_cast_to_uint64, False)
def atomic_and_global(idx, ary, op2):
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.and_, False)
def atomic_and_global_2(ary, op2):
atomic_binary_2dim_global(ary, op2, cuda.atomic.and_,
atomic_cast_none, False)
def atomic_or(ary, op2):
atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
cuda.atomic.or_, atomic_cast_none, 0, False)
def atomic_or2(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.or_, atomic_cast_none, False)
def atomic_or3(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.or_, atomic_cast_to_uint64, False)
def atomic_or_global(idx, ary, op2):
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.or_, False)
def atomic_or_global_2(ary, op2):
atomic_binary_2dim_global(ary, op2, cuda.atomic.or_,
atomic_cast_none, False)
def atomic_xor(ary, op2):
atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
cuda.atomic.xor, atomic_cast_none, 0, False)
def atomic_xor2(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.xor, atomic_cast_none, False)
def atomic_xor3(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.xor, atomic_cast_to_uint64, False)
def atomic_xor_global(idx, ary, op2):
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.xor, False)
def atomic_xor_global_2(ary, op2):
atomic_binary_2dim_global(ary, op2, cuda.atomic.xor,
atomic_cast_none, False)
def atomic_inc32(ary, idx, op2):
atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
cuda.atomic.inc, atomic_cast_none)
def atomic_inc64(ary, idx, op2):
atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32,
cuda.atomic.inc, atomic_cast_to_int)
def atomic_inc2_32(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.inc, atomic_cast_none, False)
def atomic_inc2_64(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
cuda.atomic.inc, atomic_cast_none, False)
def atomic_inc3(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.inc, atomic_cast_to_uint64, False)
def atomic_inc_global(idx, ary, op2):
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.inc, False)
def atomic_inc_global_2(ary, op2):
atomic_binary_2dim_global(ary, op2, cuda.atomic.inc,
atomic_cast_none, False)
def atomic_dec32(ary, idx, op2):
atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
cuda.atomic.dec, atomic_cast_none)
def atomic_dec64(ary, idx, op2):
atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32,
cuda.atomic.dec, atomic_cast_to_int)
def atomic_dec2_32(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.dec, atomic_cast_none, False)
def atomic_dec2_64(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
cuda.atomic.dec, atomic_cast_none, False)
def atomic_dec3(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.dec, atomic_cast_to_uint64, False)
def atomic_dec_global(idx, ary, op2):
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.dec, False)
def atomic_dec_global_2(ary, op2):
atomic_binary_2dim_global(ary, op2, cuda.atomic.dec,
atomic_cast_none, False)
def atomic_exch(ary, idx, op2):
atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
cuda.atomic.exch, atomic_cast_none)
def atomic_exch2(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
cuda.atomic.exch, atomic_cast_none, False)
def atomic_exch3(ary, op2):
atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
cuda.atomic.exch, atomic_cast_none, False)
def atomic_exch_global(idx, ary, op2):
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.exch, False)
def gen_atomic_extreme_funcs(func):
fns = dedent("""
def atomic(res, ary):
tx = cuda.threadIdx.x
bx = cuda.blockIdx.x
{func}(res, 0, ary[tx, bx])
def atomic_double_normalizedindex(res, ary):
tx = cuda.threadIdx.x
bx = cuda.blockIdx.x
{func}(res, 0, ary[tx, uint64(bx)])
def atomic_double_oneindex(res, ary):
tx = cuda.threadIdx.x
{func}(res, 0, ary[tx])
def atomic_double_shared(res, ary):
tid = cuda.threadIdx.x
smary = cuda.shared.array(32, float64)
smary[tid] = ary[tid]
smres = cuda.shared.array(1, float64)
if tid == 0:
smres[0] = res[0]
cuda.syncthreads()
{func}(smres, 0, smary[tid])
cuda.syncthreads()
if tid == 0:
res[0] = smres[0]
""").format(func=func)
ld = {}
exec(fns, {'cuda': cuda, 'float64': float64, 'uint64': uint64}, ld)
return (ld['atomic'], ld['atomic_double_normalizedindex'],
ld['atomic_double_oneindex'], ld['atomic_double_shared'])
(atomic_max, atomic_max_double_normalizedindex, atomic_max_double_oneindex,
atomic_max_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.max')
(atomic_min, atomic_min_double_normalizedindex, atomic_min_double_oneindex,
atomic_min_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.min')
(atomic_nanmax, atomic_nanmax_double_normalizedindex,
atomic_nanmax_double_oneindex, atomic_nanmax_double_shared) = \
gen_atomic_extreme_funcs('cuda.atomic.nanmax')
(atomic_nanmin, atomic_nanmin_double_normalizedindex,
atomic_nanmin_double_oneindex, atomic_nanmin_double_shared) = \
gen_atomic_extreme_funcs('cuda.atomic.nanmin')
def atomic_compare_and_swap(res, old, ary, fill_val):
gid = cuda.grid(1)
if gid < res.size:
old[gid] = cuda.atomic.compare_and_swap(res[gid:], fill_val, ary[gid])
def atomic_cas_1dim(res, old, ary, fill_val):
gid = cuda.grid(1)
if gid < res.size:
old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
def atomic_cas_2dim(res, old, ary, fill_val):
gid = cuda.grid(2)
if gid[0] < res.shape[0] and gid[1] < res.shape[1]:
old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
class TestCudaAtomics(CUDATestCase):
def setUp(self):
super().setUp()
np.random.seed(0)
def test_atomic_add(self):
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
ary_wrap = ary.copy()
orig = ary.copy()
cuda_atomic_add = cuda.jit('void(uint32[:])')(atomic_add)
cuda_atomic_add[1, 32](ary)
cuda_atomic_add_wrap = cuda.jit('void(uint32[:])')(atomic_add_wrap)
cuda_atomic_add_wrap[1, 32](ary_wrap)
gold = np.zeros(32, dtype=np.uint32)
for i in range(orig.size):
gold[orig[i]] += 1
self.assertTrue(np.all(ary == gold))
self.assertTrue(np.all(ary_wrap == gold))
def test_atomic_add2(self):
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
ary_wrap = ary.copy()
orig = ary.copy()
cuda_atomic_add2 = cuda.jit('void(uint32[:,:])')(atomic_add2)
cuda_atomic_add2[1, (4, 8)](ary)
cuda_atomic_add2_wrap = cuda.jit('void(uint32[:,:])')(atomic_add2_wrap)
cuda_atomic_add2_wrap[1, (4, 8)](ary_wrap)
self.assertTrue(np.all(ary == orig + 1))
self.assertTrue(np.all(ary_wrap == orig + 1))
def test_atomic_add3(self):
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3)
cuda_atomic_add3[1, (4, 8)](ary)
self.assertTrue(np.all(ary == orig + 1))
def test_atomic_add_float(self):
ary = np.random.randint(0, 32, size=32).astype(np.float32)
ary_wrap = ary.copy()
orig = ary.copy().astype(np.intp)
cuda_atomic_add_float = cuda.jit('void(float32[:])')(atomic_add_float)
cuda_atomic_add_float[1, 32](ary)
add_float_wrap = cuda.jit('void(float32[:])')(atomic_add_float_wrap)
add_float_wrap[1, 32](ary_wrap)
gold = np.zeros(32, dtype=np.uint32)
for i in range(orig.size):
gold[orig[i]] += 1.0
self.assertTrue(np.all(ary == gold))
self.assertTrue(np.all(ary_wrap == gold))
def test_atomic_add_float_2(self):
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
ary_wrap = ary.copy()
orig = ary.copy()
cuda_atomic_add2 = cuda.jit('void(float32[:,:])')(atomic_add_float_2)
cuda_atomic_add2[1, (4, 8)](ary)
cuda_func_wrap = cuda.jit('void(float32[:,:])')(atomic_add_float_2_wrap)
cuda_func_wrap[1, (4, 8)](ary_wrap)
self.assertTrue(np.all(ary == orig + 1))
self.assertTrue(np.all(ary_wrap == orig + 1))
def test_atomic_add_float_3(self):
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_add3 = cuda.jit('void(float32[:,:])')(atomic_add_float_3)
cuda_atomic_add3[1, (4, 8)](ary)
self.assertTrue(np.all(ary == orig + 1))
def assertCorrectFloat64Atomics(self, kernel, shared=True):
if config.ENABLE_CUDASIM:
return
# Use the first (and only) definition
asm = next(iter(kernel.inspect_asm().values()))
if cc_X_or_above(6, 0):
if cuda.runtime.get_version() > (12, 1):
# CUDA 12.2 and above generate a more optimized reduction
# instruction, because the result does not need to be
# placed in a register.
inst = 'red'
else:
inst = 'atom'
if shared:
inst = f'{inst}.shared'
self.assertIn(f'{inst}.add.f64', asm)
else:
if shared:
self.assertIn('atom.shared.cas.b64', asm)
else:
self.assertIn('atom.cas.b64', asm)
def test_atomic_add_double(self):
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
ary = np.zeros(32, np.float64)
ary_wrap = ary.copy()
cuda_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double)
cuda_fn[1, 32](idx, ary)
wrap_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_wrap)
wrap_fn[1, 32](idx, ary_wrap)
gold = np.zeros(32, dtype=np.uint32)
for i in range(idx.size):
gold[idx[i]] += 1.0
np.testing.assert_equal(ary, gold)
np.testing.assert_equal(ary_wrap, gold)
self.assertCorrectFloat64Atomics(cuda_fn)
self.assertCorrectFloat64Atomics(wrap_fn)
def test_atomic_add_double_2(self):
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
ary_wrap = ary.copy()
orig = ary.copy()
cuda_fn = cuda.jit('void(float64[:,:])')(atomic_add_double_2)
cuda_fn[1, (4, 8)](ary)
cuda_fn_wrap = cuda.jit('void(float64[:,:])')(atomic_add_double_2_wrap)
cuda_fn_wrap[1, (4, 8)](ary_wrap)
np.testing.assert_equal(ary, orig + 1)
np.testing.assert_equal(ary_wrap, orig + 1)
self.assertCorrectFloat64Atomics(cuda_fn)
self.assertCorrectFloat64Atomics(cuda_fn_wrap)
def test_atomic_add_double_3(self):
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_3)
cuda_func[1, (4, 8)](ary)
np.testing.assert_equal(ary, orig + 1)
self.assertCorrectFloat64Atomics(cuda_func)
def test_atomic_add_double_global(self):
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
ary = np.zeros(32, np.float64)
ary_wrap = ary.copy()
sig = 'void(int64[:], float64[:])'
cuda_func = cuda.jit(sig)(atomic_add_double_global)
wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap)
cuda_func[1, 32](idx, ary)
wrap_cuda_func[1, 32](idx, ary_wrap)
gold = np.zeros(32, dtype=np.uint32)
for i in range(idx.size):
gold[idx[i]] += 1.0
np.testing.assert_equal(ary, gold)
np.testing.assert_equal(ary_wrap, gold)
self.assertCorrectFloat64Atomics(cuda_func, shared=False)
self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False)
def test_atomic_add_double_global_2(self):
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
ary_wrap = ary.copy()
orig = ary.copy()
sig = 'void(float64[:,:])'
cuda_func = cuda.jit(sig)(atomic_add_double_global_2)
wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_2_wrap)
cuda_func[1, (4, 8)](ary)
wrap_cuda_func[1, (4, 8)](ary_wrap)
np.testing.assert_equal(ary, orig + 1)
np.testing.assert_equal(ary_wrap, orig + 1)
self.assertCorrectFloat64Atomics(cuda_func, shared=False)
self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False)
def test_atomic_add_double_global_3(self):
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3)
cuda_func[1, (4, 8)](ary)
np.testing.assert_equal(ary, orig + 1)
self.assertCorrectFloat64Atomics(cuda_func, shared=False)
def test_atomic_sub(self):
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
orig = ary.copy()
cuda_atomic_sub = cuda.jit('void(uint32[:])')(atomic_sub)
cuda_atomic_sub[1, 32](ary)
gold = np.zeros(32, dtype=np.uint32)
for i in range(orig.size):
gold[orig[i]] -= 1
self.assertTrue(np.all(ary == gold))
def test_atomic_sub2(self):
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_sub2 = cuda.jit('void(uint32[:,:])')(atomic_sub2)
cuda_atomic_sub2[1, (4, 8)](ary)
self.assertTrue(np.all(ary == orig - 1))
def test_atomic_sub3(self):
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_sub3 = cuda.jit('void(uint32[:,:])')(atomic_sub3)
cuda_atomic_sub3[1, (4, 8)](ary)
self.assertTrue(np.all(ary == orig - 1))
def test_atomic_sub_float(self):
ary = np.random.randint(0, 32, size=32).astype(np.float32)
orig = ary.copy().astype(np.intp)
cuda_atomic_sub_float = cuda.jit('void(float32[:])')(atomic_sub_float)
cuda_atomic_sub_float[1, 32](ary)
gold = np.zeros(32, dtype=np.float32)
for i in range(orig.size):
gold[orig[i]] -= 1.0
self.assertTrue(np.all(ary == gold))
def test_atomic_sub_float_2(self):
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_sub2 = cuda.jit('void(float32[:,:])')(atomic_sub_float_2)
cuda_atomic_sub2[1, (4, 8)](ary)
self.assertTrue(np.all(ary == orig - 1))
def test_atomic_sub_float_3(self):
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_sub3 = cuda.jit('void(float32[:,:])')(atomic_sub_float_3)
cuda_atomic_sub3[1, (4, 8)](ary)
self.assertTrue(np.all(ary == orig - 1))
def test_atomic_sub_double(self):
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
ary = np.zeros(32, np.float64)
cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_sub_double)
cuda_func[1, 32](idx, ary)
gold = np.zeros(32, dtype=np.float64)
for i in range(idx.size):
gold[idx[i]] -= 1.0
np.testing.assert_equal(ary, gold)
def test_atomic_sub_double_2(self):
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_2)
cuda_func[1, (4, 8)](ary)
np.testing.assert_equal(ary, orig - 1)
def test_atomic_sub_double_3(self):
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_3)
cuda_func[1, (4, 8)](ary)
np.testing.assert_equal(ary, orig - 1)
def test_atomic_sub_double_global(self):
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
ary = np.zeros(32, np.float64)
sig = 'void(int64[:], float64[:])'
cuda_func = cuda.jit(sig)(atomic_sub_double_global)
cuda_func[1, 32](idx, ary)
gold = np.zeros(32, dtype=np.float64)
for i in range(idx.size):
gold[idx[i]] -= 1.0
np.testing.assert_equal(ary, gold)
def test_atomic_sub_double_global_2(self):
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_2)
cuda_func[1, (4, 8)](ary)
np.testing.assert_equal(ary, orig - 1)
def test_atomic_sub_double_global_3(self):
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_3)
cuda_func[1, (4, 8)](ary)
np.testing.assert_equal(ary, orig - 1)
def test_atomic_and(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
orig = ary.copy()
cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_and)
cuda_func[1, 32](ary, rand_const)
gold = ary.copy()
for i in range(orig.size):
gold[orig[i]] &= rand_const
self.assertTrue(np.all(ary == gold))
def test_atomic_and2(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and2)
cuda_atomic_and2[1, (4, 8)](ary, rand_const)
self.assertTrue(np.all(ary == orig & rand_const))
def test_atomic_and3(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and3)
cuda_atomic_and3[1, (4, 8)](ary, rand_const)
self.assertTrue(np.all(ary == orig & rand_const))
def test_atomic_and_global(self):
rand_const = np.random.randint(500)
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
sig = 'void(int32[:], int32[:], int32)'
cuda_func = cuda.jit(sig)(atomic_and_global)
cuda_func[1, 32](idx, ary, rand_const)
gold = ary.copy()
for i in range(idx.size):
gold[idx[i]] &= rand_const
np.testing.assert_equal(ary, gold)
def test_atomic_and_global_2(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_and_global_2)
cuda_func[1, (4, 8)](ary, rand_const)
np.testing.assert_equal(ary, orig & rand_const)
def test_atomic_or(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
orig = ary.copy()
cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_or)
cuda_func[1, 32](ary, rand_const)
gold = np.zeros(32, dtype=np.uint32)
for i in range(orig.size):
gold[orig[i]] |= rand_const
self.assertTrue(np.all(ary == gold))
def test_atomic_or2(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or2)
cuda_atomic_and2[1, (4, 8)](ary, rand_const)
self.assertTrue(np.all(ary == orig | rand_const))
def test_atomic_or3(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or3)
cuda_atomic_and3[1, (4, 8)](ary, rand_const)
self.assertTrue(np.all(ary == orig | rand_const))
def test_atomic_or_global(self):
rand_const = np.random.randint(500)
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
sig = 'void(int32[:], int32[:], int32)'
cuda_func = cuda.jit(sig)(atomic_or_global)
cuda_func[1, 32](idx, ary, rand_const)
gold = ary.copy()
for i in range(idx.size):
gold[idx[i]] |= rand_const
np.testing.assert_equal(ary, gold)
def test_atomic_or_global_2(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_or_global_2)
cuda_func[1, (4, 8)](ary, rand_const)
np.testing.assert_equal(ary, orig | rand_const)
def test_atomic_xor(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
orig = ary.copy()
cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_xor)
cuda_func[1, 32](ary, rand_const)
gold = np.zeros(32, dtype=np.uint32)
for i in range(orig.size):
gold[orig[i]] ^= rand_const
self.assertTrue(np.all(ary == gold))
def test_atomic_xor2(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_xor2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor2)
cuda_atomic_xor2[1, (4, 8)](ary, rand_const)
self.assertTrue(np.all(ary == orig ^ rand_const))
def test_atomic_xor3(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_atomic_xor3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor3)
cuda_atomic_xor3[1, (4, 8)](ary, rand_const)
self.assertTrue(np.all(ary == orig ^ rand_const))
def test_atomic_xor_global(self):
rand_const = np.random.randint(500)
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
gold = ary.copy()
sig = 'void(int32[:], int32[:], int32)'
cuda_func = cuda.jit(sig)(atomic_xor_global)
cuda_func[1, 32](idx, ary, rand_const)
for i in range(idx.size):
gold[idx[i]] ^= rand_const
np.testing.assert_equal(ary, gold)
def test_atomic_xor_global_2(self):
rand_const = np.random.randint(500)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
orig = ary.copy()
cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor_global_2)
cuda_func[1, (4, 8)](ary, rand_const)
np.testing.assert_equal(ary, orig ^ rand_const)
def inc_dec_1dim_setup(self, dtype):
rconst = np.random.randint(32, dtype=dtype)
rary = np.random.randint(0, 32, size=32).astype(dtype)
ary_idx = np.arange(32, dtype=dtype)
return rconst, rary, ary_idx
def inc_dec_2dim_setup(self, dtype):
rconst = np.random.randint(32, dtype=dtype)
rary = np.random.randint(0, 32, size=32).astype(dtype).reshape(4, 8)
return rconst, rary
def check_inc_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
orig = ary.copy()
cuda_func = cuda.jit(sig)(func)
cuda_func[nblocks, blksize](ary, idx, rconst)
np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
def check_inc_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
orig = ary.copy()
cuda_func = cuda.jit(sig)(func)
cuda_func[nblocks, blksize](idx, ary, rconst)
np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
def check_inc(self, ary, rconst, sig, nblocks, blksize, func):
orig = ary.copy()
cuda_func = cuda.jit(sig)(func)
cuda_func[nblocks, blksize](ary, rconst)
np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
def test_atomic_inc_32(self):
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
sig = 'void(uint32[:], uint32[:], uint32)'
self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc32)
def test_atomic_inc_64(self):
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
sig = 'void(uint64[:], uint64[:], uint64)'
self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc64)
def test_atomic_inc2_32(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
sig = 'void(uint32[:,:], uint32)'
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_32)
def test_atomic_inc2_64(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
sig = 'void(uint64[:,:], uint64)'
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_64)
def test_atomic_inc3(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
sig = 'void(uint32[:,:], uint32)'
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc3)
def test_atomic_inc_global_32(self):
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
sig = 'void(uint32[:], uint32[:], uint32)'
self.check_inc_index2(ary, idx, rand_const, sig, 1, 32,
atomic_inc_global)
def test_atomic_inc_global_64(self):
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
sig = 'void(uint64[:], uint64[:], uint64)'
self.check_inc_index2(ary, idx, rand_const, sig, 1, 32,
atomic_inc_global)
def test_atomic_inc_global_2_32(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
sig = 'void(uint32[:,:], uint32)'
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
def test_atomic_inc_global_2_64(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
sig = 'void(uint64[:,:], uint64)'
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
orig = ary.copy()
cuda_func = cuda.jit(sig)(func)
cuda_func[nblocks, blksize](ary, idx, rconst)
np.testing.assert_equal(ary, np.where(orig == 0, rconst,
np.where(orig > rconst,
rconst,
orig - 1)))
def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
orig = ary.copy()
cuda_func = cuda.jit(sig)(func)
cuda_func[nblocks, blksize](idx, ary, rconst)
np.testing.assert_equal(ary, np.where(orig == 0, rconst,
np.where(orig > rconst,
rconst,
orig - 1)))
def check_dec(self, ary, rconst, sig, nblocks, blksize, func):
orig = ary.copy()
cuda_func = cuda.jit(sig)(func)
cuda_func[nblocks, blksize](ary, rconst)
np.testing.assert_equal(ary, np.where(orig == 0, rconst,
np.where(orig > rconst,
rconst,
orig - 1)))
def test_atomic_dec_32(self):
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
sig = 'void(uint32[:], uint32[:], uint32)'
self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec32)
def test_atomic_dec_64(self):
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
sig = 'void(uint64[:], uint64[:], uint64)'
self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec64)
def test_atomic_dec2_32(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
sig = 'void(uint32[:,:], uint32)'
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_32)
def test_atomic_dec2_64(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
sig = 'void(uint64[:,:], uint64)'
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_64)
def test_atomic_dec3_new(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
sig = 'void(uint32[:,:], uint32)'
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec3)
def test_atomic_dec_global_32(self):
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
sig = 'void(uint32[:], uint32[:], uint32)'
self.check_dec_index2(ary, idx, rand_const, sig, 1, 32,
atomic_dec_global)
def test_atomic_dec_global_64(self):
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
sig = 'void(uint64[:], uint64[:], uint64)'
self.check_dec_index2(ary, idx, rand_const, sig, 1, 32,
atomic_dec_global)
def test_atomic_dec_global2_32(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
sig = 'void(uint32[:,:], uint32)'
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
def test_atomic_dec_global2_64(self):
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
sig = 'void(uint64[:,:], uint64)'
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
def test_atomic_exch(self):
rand_const = np.random.randint(50, 100, dtype=np.uint32)
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
idx = np.arange(32, dtype=np.uint32)
cuda_func = cuda.jit('void(uint32[:], uint32[:], uint32)')(atomic_exch)
cuda_func[1, 32](ary, idx, rand_const)
np.testing.assert_equal(ary, rand_const)
def test_atomic_exch2(self):
rand_const = np.random.randint(50, 100, dtype=np.uint32)
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_exch2)
cuda_func[1, (4, 8)](ary, rand_const)
np.testing.assert_equal(ary, rand_const)
def test_atomic_exch3(self):
rand_const = np.random.randint(50, 100, dtype=np.uint64)
ary = np.random.randint(0, 32, size=32).astype(np.uint64).reshape(4, 8)
cuda_func = cuda.jit('void(uint64[:,:], uint64)')(atomic_exch3)
cuda_func[1, (4, 8)](ary, rand_const)
np.testing.assert_equal(ary, rand_const)
def test_atomic_exch_global(self):
rand_const = np.random.randint(50, 100, dtype=np.uint32)
idx = np.arange(32, dtype=np.uint32)
ary = np.random.randint(0, 32, size=32, dtype=np.uint32)
sig = 'void(uint32[:], uint32[:], uint32)'
cuda_func = cuda.jit(sig)(atomic_exch_global)
cuda_func[1, 32](idx, ary, rand_const)
np.testing.assert_equal(ary, rand_const)
def check_atomic_max(self, dtype, lo, hi):
vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
res = np.zeros(1, dtype=vals.dtype)
cuda_func = cuda.jit(atomic_max)
cuda_func[32, 32](res, vals)
gold = np.max(vals)
np.testing.assert_equal(res, gold)
def test_atomic_max_int32(self):
self.check_atomic_max(dtype=np.int32, lo=-65535, hi=65535)
def test_atomic_max_uint32(self):
self.check_atomic_max(dtype=np.uint32, lo=0, hi=65535)
def test_atomic_max_int64(self):
self.check_atomic_max(dtype=np.int64, lo=-65535, hi=65535)
def test_atomic_max_uint64(self):
self.check_atomic_max(dtype=np.uint64, lo=0, hi=65535)
def test_atomic_max_float32(self):
self.check_atomic_max(dtype=np.float32, lo=-65535, hi=65535)
def test_atomic_max_double(self):
self.check_atomic_max(dtype=np.float64, lo=-65535, hi=65535)
def test_atomic_max_double_normalizedindex(self):
vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
res = np.zeros(1, np.float64)
cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
atomic_max_double_normalizedindex)
cuda_func[32, 32](res, vals)
gold = np.max(vals)
np.testing.assert_equal(res, gold)
def test_atomic_max_double_oneindex(self):
vals = np.random.randint(0, 128, size=32).astype(np.float64)
res = np.zeros(1, np.float64)
cuda_func = cuda.jit('void(float64[:], float64[:])')(
atomic_max_double_oneindex)
cuda_func[1, 32](res, vals)
gold = np.max(vals)
np.testing.assert_equal(res, gold)
def check_atomic_min(self, dtype, lo, hi):
vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
res = np.array([65535], dtype=vals.dtype)
cuda_func = cuda.jit(atomic_min)
cuda_func[32, 32](res, vals)
gold = np.min(vals)
np.testing.assert_equal(res, gold)
def test_atomic_min_int32(self):
self.check_atomic_min(dtype=np.int32, lo=-65535, hi=65535)
def test_atomic_min_uint32(self):
self.check_atomic_min(dtype=np.uint32, lo=0, hi=65535)
def test_atomic_min_int64(self):
self.check_atomic_min(dtype=np.int64, lo=-65535, hi=65535)
def test_atomic_min_uint64(self):
self.check_atomic_min(dtype=np.uint64, lo=0, hi=65535)
def test_atomic_min_float(self):
self.check_atomic_min(dtype=np.float32, lo=-65535, hi=65535)
def test_atomic_min_double(self):
self.check_atomic_min(dtype=np.float64, lo=-65535, hi=65535)
def test_atomic_min_double_normalizedindex(self):
vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
res = np.ones(1, np.float64) * 65535
cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
atomic_min_double_normalizedindex)
cuda_func[32, 32](res, vals)
gold = np.min(vals)
np.testing.assert_equal(res, gold)
def test_atomic_min_double_oneindex(self):
vals = np.random.randint(0, 128, size=32).astype(np.float64)
res = np.ones(1, np.float64) * 128
cuda_func = cuda.jit('void(float64[:], float64[:])')(
atomic_min_double_oneindex)
cuda_func[1, 32](res, vals)
gold = np.min(vals)
np.testing.assert_equal(res, gold)
# Taken together, _test_atomic_minmax_nan_location and
# _test_atomic_minmax_nan_val check that NaNs are treated similarly to the
# way they are in Python / NumPy - that is, {min,max}(a, b) == a if either
# a or b is a NaN. For the atomics, this means that the max is taken as the
# value stored in the memory location rather than the value supplied - i.e.
# for:
#
# cuda.atomic.{min,max}(ary, idx, val)
#
# the result will be ary[idx] for either of ary[idx] or val being NaN.
def _test_atomic_minmax_nan_location(self, func):
cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func)
vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64)
res = np.zeros(1, np.float64) + np.nan
cuda_func[1, 1](res, vals)
np.testing.assert_equal(res, [np.nan])
def _test_atomic_minmax_nan_val(self, func):
cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func)
res = np.random.randint(0, 128, size=1).astype(np.float64)
gold = res.copy()
vals = np.zeros((1, 1), np.float64) + np.nan
cuda_func[1, 1](res, vals)
np.testing.assert_equal(res, gold)
def test_atomic_min_nan_location(self):
self._test_atomic_minmax_nan_location(atomic_min)
def test_atomic_max_nan_location(self):
self._test_atomic_minmax_nan_location(atomic_max)
def test_atomic_min_nan_val(self):
self._test_atomic_minmax_nan_val(atomic_min)
def test_atomic_max_nan_val(self):
self._test_atomic_minmax_nan_val(atomic_max)
def test_atomic_max_double_shared(self):
vals = np.random.randint(0, 32, size=32).astype(np.float64)
res = np.zeros(1, np.float64)
sig = 'void(float64[:], float64[:])'
cuda_func = cuda.jit(sig)(atomic_max_double_shared)
cuda_func[1, 32](res, vals)
gold = np.max(vals)
np.testing.assert_equal(res, gold)
def test_atomic_min_double_shared(self):
vals = np.random.randint(0, 32, size=32).astype(np.float64)
res = np.ones(1, np.float64) * 32
sig = 'void(float64[:], float64[:])'
cuda_func = cuda.jit(sig)(atomic_min_double_shared)
cuda_func[1, 32](res, vals)
gold = np.min(vals)
np.testing.assert_equal(res, gold)
def check_cas(self, n, fill, unfill, dtype, cas_func, ndim=1):
res = [fill] * (n // 2) + [unfill] * (n // 2)
np.random.shuffle(res)
res = np.asarray(res, dtype=dtype)
if ndim == 2:
res.shape = (10, -1)
out = np.zeros_like(res)
ary = np.random.randint(1, 10, size=res.shape).astype(res.dtype)
fill_mask = res == fill
unfill_mask = res == unfill
expect_res = np.zeros_like(res)
expect_res[fill_mask] = ary[fill_mask]
expect_res[unfill_mask] = unfill
expect_out = res.copy()
cuda_func = cuda.jit(cas_func)
if ndim == 1:
cuda_func[10, 10](res, out, ary, fill)
else:
cuda_func[(10, 10), (10, 10)](res, out, ary, fill)
np.testing.assert_array_equal(expect_res, res)
np.testing.assert_array_equal(expect_out, out)
def test_atomic_compare_and_swap(self):
self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
cas_func=atomic_compare_and_swap)
def test_atomic_compare_and_swap2(self):
self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
cas_func=atomic_compare_and_swap)
def test_atomic_compare_and_swap3(self):
rfill = np.random.randint(50, 500, dtype=np.uint32)
runfill = np.random.randint(1, 25, dtype=np.uint32)
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
cas_func=atomic_compare_and_swap)
def test_atomic_compare_and_swap4(self):
rfill = np.random.randint(50, 500, dtype=np.uint64)
runfill = np.random.randint(1, 25, dtype=np.uint64)
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
cas_func=atomic_compare_and_swap)
def test_atomic_cas_1dim(self):
self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
cas_func=atomic_cas_1dim)
def test_atomic_cas_2dim(self):
self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
cas_func=atomic_cas_2dim, ndim=2)
def test_atomic_cas2_1dim(self):
self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
cas_func=atomic_cas_1dim)
def test_atomic_cas2_2dim(self):
self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
cas_func=atomic_cas_2dim, ndim=2)
def test_atomic_cas3_1dim(self):
rfill = np.random.randint(50, 500, dtype=np.uint32)
runfill = np.random.randint(1, 25, dtype=np.uint32)
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
cas_func=atomic_cas_1dim)
def test_atomic_cas3_2dim(self):
rfill = np.random.randint(50, 500, dtype=np.uint32)
runfill = np.random.randint(1, 25, dtype=np.uint32)
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
cas_func=atomic_cas_2dim, ndim=2)
def test_atomic_cas4_1dim(self):
rfill = np.random.randint(50, 500, dtype=np.uint64)
runfill = np.random.randint(1, 25, dtype=np.uint64)
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
cas_func=atomic_cas_1dim)
def test_atomic_cas4_2dim(self):
rfill = np.random.randint(50, 500, dtype=np.uint64)
runfill = np.random.randint(1, 25, dtype=np.uint64)
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
cas_func=atomic_cas_2dim, ndim=2)
# Tests that the atomic add, min, and max operations return the old value -
# in the simulator, they did not (see Issue #5458). The max and min have
# special handling for NaN values, so we explicitly test with a NaN in the
# array being modified and the value provided.
def _test_atomic_returns_old(self, kernel, initial):
x = np.zeros(2, dtype=np.float32)
x[0] = initial
kernel[1, 1](x)
if np.isnan(initial):
self.assertTrue(np.isnan(x[1]))
else:
self.assertEqual(x[1], initial)
def test_atomic_add_returns_old(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.add(x, 0, 1)
self._test_atomic_returns_old(kernel, 10)
def test_atomic_max_returns_no_replace(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.max(x, 0, 1)
self._test_atomic_returns_old(kernel, 10)
def test_atomic_max_returns_old_replace(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.max(x, 0, 10)
self._test_atomic_returns_old(kernel, 1)
def test_atomic_max_returns_old_nan_in_array(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.max(x, 0, 1)
self._test_atomic_returns_old(kernel, np.nan)
def test_atomic_max_returns_old_nan_val(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.max(x, 0, np.nan)
self._test_atomic_returns_old(kernel, 10)
def test_atomic_min_returns_old_no_replace(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.min(x, 0, 11)
self._test_atomic_returns_old(kernel, 10)
def test_atomic_min_returns_old_replace(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.min(x, 0, 10)
self._test_atomic_returns_old(kernel, 11)
def test_atomic_min_returns_old_nan_in_array(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.min(x, 0, 11)
self._test_atomic_returns_old(kernel, np.nan)
def test_atomic_min_returns_old_nan_val(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.min(x, 0, np.nan)
self._test_atomic_returns_old(kernel, 11)
# Tests for atomic nanmin/nanmax
# nanmax tests
def check_atomic_nanmax(self, dtype, lo, hi, init_val):
vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
vals[1::2] = init_val
res = np.zeros(1, dtype=vals.dtype)
cuda_func = cuda.jit(atomic_nanmax)
cuda_func[32, 32](res, vals)
gold = np.nanmax(vals)
np.testing.assert_equal(res, gold)
def test_atomic_nanmax_int32(self):
self.check_atomic_nanmax(dtype=np.int32, lo=-65535, hi=65535,
init_val=0)
def test_atomic_nanmax_uint32(self):
self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535,
init_val=0)
def test_atomic_nanmax_int64(self):
self.check_atomic_nanmax(dtype=np.int64, lo=-65535, hi=65535,
init_val=0)
def test_atomic_nanmax_uint64(self):
self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535,
init_val=0)
def test_atomic_nanmax_float32(self):
self.check_atomic_nanmax(dtype=np.float32, lo=-65535, hi=65535,
init_val=np.nan)
def test_atomic_nanmax_double(self):
self.check_atomic_nanmax(dtype=np.float64, lo=-65535, hi=65535,
init_val=np.nan)
def test_atomic_nanmax_double_shared(self):
vals = np.random.randint(0, 32, size=32).astype(np.float64)
vals[1::2] = np.nan
res = np.array([0], dtype=vals.dtype)
sig = 'void(float64[:], float64[:])'
cuda_func = cuda.jit(sig)(atomic_nanmax_double_shared)
cuda_func[1, 32](res, vals)
gold = np.nanmax(vals)
np.testing.assert_equal(res, gold)
def test_atomic_nanmax_double_oneindex(self):
vals = np.random.randint(0, 128, size=32).astype(np.float64)
vals[1::2] = np.nan
res = np.zeros(1, np.float64)
cuda_func = cuda.jit('void(float64[:], float64[:])')(
atomic_max_double_oneindex)
cuda_func[1, 32](res, vals)
gold = np.nanmax(vals)
np.testing.assert_equal(res, gold)
# nanmin tests
def check_atomic_nanmin(self, dtype, lo, hi, init_val):
vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
vals[1::2] = init_val
res = np.array([65535], dtype=vals.dtype)
cuda_func = cuda.jit(atomic_nanmin)
cuda_func[32, 32](res, vals)
gold = np.nanmin(vals)
np.testing.assert_equal(res, gold)
def test_atomic_nanmin_int32(self):
self.check_atomic_nanmin(dtype=np.int32, lo=-65535, hi=65535,
init_val=0)
def test_atomic_nanmin_uint32(self):
self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535,
init_val=0)
def test_atomic_nanmin_int64(self):
self.check_atomic_nanmin(dtype=np.int64, lo=-65535, hi=65535,
init_val=0)
def test_atomic_nanmin_uint64(self):
self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535,
init_val=0)
def test_atomic_nanmin_float(self):
self.check_atomic_nanmin(dtype=np.float32, lo=-65535, hi=65535,
init_val=np.nan)
def test_atomic_nanmin_double(self):
self.check_atomic_nanmin(dtype=np.float64, lo=-65535, hi=65535,
init_val=np.nan)
def test_atomic_nanmin_double_shared(self):
vals = np.random.randint(0, 32, size=32).astype(np.float64)
vals[1::2] = np.nan
res = np.array([32], dtype=vals.dtype)
sig = 'void(float64[:], float64[:])'
cuda_func = cuda.jit(sig)(atomic_nanmin_double_shared)
cuda_func[1, 32](res, vals)
gold = np.nanmin(vals)
np.testing.assert_equal(res, gold)
def test_atomic_nanmin_double_oneindex(self):
vals = np.random.randint(0, 128, size=32).astype(np.float64)
vals[1::2] = np.nan
res = np.array([128], np.float64)
cuda_func = cuda.jit('void(float64[:], float64[:])')(
atomic_min_double_oneindex)
cuda_func[1, 32](res, vals)
gold = np.nanmin(vals)
np.testing.assert_equal(res, gold)
# Returning old value tests
def _test_atomic_nan_returns_old(self, kernel, initial):
x = np.zeros(2, dtype=np.float32)
x[0] = initial
x[1] = np.nan
kernel[1, 1](x)
if np.isnan(initial):
self.assertFalse(np.isnan(x[0]))
self.assertTrue(np.isnan(x[1]))
else:
self.assertEqual(x[1], initial)
def test_atomic_nanmax_returns_old_no_replace(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.nanmax(x, 0, 1)
self._test_atomic_nan_returns_old(kernel, 10)
def test_atomic_nanmax_returns_old_replace(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.nanmax(x, 0, 10)
self._test_atomic_nan_returns_old(kernel, 1)
def test_atomic_nanmax_returns_old_nan_in_array(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.nanmax(x, 0, 1)
self._test_atomic_nan_returns_old(kernel, np.nan)
def test_atomic_nanmax_returns_old_nan_val(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.nanmax(x, 0, np.nan)
self._test_atomic_nan_returns_old(kernel, 10)
def test_atomic_nanmin_returns_old_no_replace(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.nanmin(x, 0, 11)
self._test_atomic_nan_returns_old(kernel, 10)
def test_atomic_nanmin_returns_old_replace(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.nanmin(x, 0, 10)
self._test_atomic_nan_returns_old(kernel, 11)
def test_atomic_nanmin_returns_old_nan_in_array(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.nanmin(x, 0, 11)
self._test_atomic_nan_returns_old(kernel, np.nan)
def test_atomic_nanmin_returns_old_nan_val(self):
@cuda.jit
def kernel(x):
x[1] = cuda.atomic.nanmin(x, 0, np.nan)
self._test_atomic_nan_returns_old(kernel, 11)
if __name__ == '__main__':
unittest.main()