1621 lines
57 KiB
Python
1621 lines
57 KiB
Python
|
import numpy as np
|
||
|
from textwrap import dedent
|
||
|
|
||
|
from numba import cuda, uint32, uint64, float32, float64
|
||
|
from numba.cuda.testing import unittest, CUDATestCase, cc_X_or_above
|
||
|
from numba.core import config
|
||
|
|
||
|
|
||
|
@cuda.jit(device=True)
|
||
|
def atomic_cast_to_uint64(num):
|
||
|
return uint64(num)
|
||
|
|
||
|
|
||
|
@cuda.jit(device=True)
|
||
|
def atomic_cast_to_int(num):
|
||
|
return int(num)
|
||
|
|
||
|
|
||
|
@cuda.jit(device=True)
|
||
|
def atomic_cast_none(num):
|
||
|
return num
|
||
|
|
||
|
|
||
|
@cuda.jit(device=True)
|
||
|
def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements,
|
||
|
binop_func, cast_func, initializer,
|
||
|
neg_idx):
|
||
|
tid = cuda.threadIdx.x
|
||
|
sm = cuda.shared.array(ary_nelements, ary_dtype)
|
||
|
sm[tid] = initializer
|
||
|
cuda.syncthreads()
|
||
|
bin = cast_func(idx[tid] % ary_nelements)
|
||
|
if neg_idx:
|
||
|
bin = bin - ary_nelements
|
||
|
binop_func(sm, bin, op2)
|
||
|
cuda.syncthreads()
|
||
|
ary[tid] = sm[tid]
|
||
|
|
||
|
|
||
|
@cuda.jit(device=True)
|
||
|
def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements,
|
||
|
binop_func, cast_func):
|
||
|
tid = cuda.threadIdx.x
|
||
|
sm = cuda.shared.array(ary_nelements, ary_dtype)
|
||
|
sm[tid] = ary[tid]
|
||
|
cuda.syncthreads()
|
||
|
bin = cast_func(idx[tid] % ary_nelements)
|
||
|
binop_func(sm, bin, op2)
|
||
|
cuda.syncthreads()
|
||
|
ary[tid] = sm[tid]
|
||
|
|
||
|
|
||
|
@cuda.jit(device=True)
|
||
|
def atomic_binary_2dim_shared(ary, op2, ary_dtype, ary_shape,
|
||
|
binop_func, y_cast_func, neg_idx):
|
||
|
tx = cuda.threadIdx.x
|
||
|
ty = cuda.threadIdx.y
|
||
|
sm = cuda.shared.array(ary_shape, ary_dtype)
|
||
|
sm[tx, ty] = ary[tx, ty]
|
||
|
cuda.syncthreads()
|
||
|
bin = (tx, y_cast_func(ty))
|
||
|
if neg_idx:
|
||
|
bin = (bin[0] - ary_shape[0], bin[1] - ary_shape[1])
|
||
|
binop_func(sm, bin, op2)
|
||
|
cuda.syncthreads()
|
||
|
ary[tx, ty] = sm[tx, ty]
|
||
|
|
||
|
|
||
|
@cuda.jit(device=True)
|
||
|
def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx):
|
||
|
tx = cuda.threadIdx.x
|
||
|
ty = cuda.threadIdx.y
|
||
|
bin = (tx, y_cast_func(ty))
|
||
|
if neg_idx:
|
||
|
bin = (bin[0] - ary.shape[0], bin[1] - ary.shape[1])
|
||
|
binop_func(ary, bin, op2)
|
||
|
|
||
|
|
||
|
@cuda.jit(device=True)
|
||
|
def atomic_binary_1dim_global(ary, idx, ary_nelements, op2,
|
||
|
binop_func, neg_idx):
|
||
|
tid = cuda.threadIdx.x
|
||
|
bin = int(idx[tid] % ary_nelements)
|
||
|
if neg_idx:
|
||
|
bin = bin - ary_nelements
|
||
|
binop_func(ary, bin, op2)
|
||
|
|
||
|
|
||
|
def atomic_add(ary):
|
||
|
atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
|
||
|
cuda.atomic.add, atomic_cast_none, 0, False)
|
||
|
|
||
|
|
||
|
def atomic_add_wrap(ary):
|
||
|
atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
|
||
|
cuda.atomic.add, atomic_cast_none, 0, True)
|
||
|
|
||
|
|
||
|
def atomic_add2(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_add2_wrap(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_none, True)
|
||
|
|
||
|
|
||
|
def atomic_add3(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_add_float(ary):
|
||
|
atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
|
||
|
cuda.atomic.add, atomic_cast_to_int, 0.0, False)
|
||
|
|
||
|
|
||
|
def atomic_add_float_wrap(ary):
|
||
|
atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
|
||
|
cuda.atomic.add, atomic_cast_to_int, 0.0, True)
|
||
|
|
||
|
|
||
|
def atomic_add_float_2(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_add_float_2_wrap(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_none, True)
|
||
|
|
||
|
|
||
|
def atomic_add_float_3(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_add_double_global(idx, ary):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.add, False)
|
||
|
|
||
|
|
||
|
def atomic_add_double_global_wrap(idx, ary):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.add, True)
|
||
|
|
||
|
|
||
|
def atomic_add_double_global_2(ary):
|
||
|
atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_add_double_global_2_wrap(ary):
|
||
|
atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_none, True)
|
||
|
|
||
|
|
||
|
def atomic_add_double_global_3(ary):
|
||
|
atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_to_uint64,
|
||
|
False)
|
||
|
|
||
|
|
||
|
def atomic_add_double(idx, ary):
|
||
|
atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
|
||
|
cuda.atomic.add, atomic_cast_none, 0.0, False)
|
||
|
|
||
|
|
||
|
def atomic_add_double_wrap(idx, ary):
|
||
|
atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
|
||
|
cuda.atomic.add, atomic_cast_none, 0.0, True)
|
||
|
|
||
|
|
||
|
def atomic_add_double_2(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_add_double_2_wrap(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_none, True)
|
||
|
|
||
|
|
||
|
def atomic_add_double_3(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
|
||
|
cuda.atomic.add, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_sub(ary):
|
||
|
atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
|
||
|
cuda.atomic.sub, atomic_cast_none, 0, False)
|
||
|
|
||
|
|
||
|
def atomic_sub2(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
|
||
|
cuda.atomic.sub, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_sub3(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
|
||
|
cuda.atomic.sub, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_sub_float(ary):
|
||
|
atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
|
||
|
cuda.atomic.sub, atomic_cast_to_int, 0.0, False)
|
||
|
|
||
|
|
||
|
def atomic_sub_float_2(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
|
||
|
cuda.atomic.sub, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_sub_float_3(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
|
||
|
cuda.atomic.sub, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_sub_double(idx, ary):
|
||
|
atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
|
||
|
cuda.atomic.sub, atomic_cast_none, 0.0, False)
|
||
|
|
||
|
|
||
|
def atomic_sub_double_2(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
|
||
|
cuda.atomic.sub, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_sub_double_3(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
|
||
|
cuda.atomic.sub, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_sub_double_global(idx, ary):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.sub, False)
|
||
|
|
||
|
|
||
|
def atomic_sub_double_global_2(ary):
|
||
|
atomic_binary_2dim_global(ary, 1.0, cuda.atomic.sub, atomic_cast_none,
|
||
|
False)
|
||
|
|
||
|
|
||
|
def atomic_sub_double_global_3(ary):
|
||
|
atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
|
||
|
cuda.atomic.sub, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_and(ary, op2):
|
||
|
atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
|
||
|
cuda.atomic.and_, atomic_cast_none, 1, False)
|
||
|
|
||
|
|
||
|
def atomic_and2(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.and_, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_and3(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.and_, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_and_global(idx, ary, op2):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.and_, False)
|
||
|
|
||
|
|
||
|
def atomic_and_global_2(ary, op2):
|
||
|
atomic_binary_2dim_global(ary, op2, cuda.atomic.and_,
|
||
|
atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_or(ary, op2):
|
||
|
atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
|
||
|
cuda.atomic.or_, atomic_cast_none, 0, False)
|
||
|
|
||
|
|
||
|
def atomic_or2(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.or_, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_or3(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.or_, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_or_global(idx, ary, op2):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.or_, False)
|
||
|
|
||
|
|
||
|
def atomic_or_global_2(ary, op2):
|
||
|
atomic_binary_2dim_global(ary, op2, cuda.atomic.or_,
|
||
|
atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_xor(ary, op2):
|
||
|
atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
|
||
|
cuda.atomic.xor, atomic_cast_none, 0, False)
|
||
|
|
||
|
|
||
|
def atomic_xor2(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.xor, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_xor3(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.xor, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_xor_global(idx, ary, op2):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.xor, False)
|
||
|
|
||
|
|
||
|
def atomic_xor_global_2(ary, op2):
|
||
|
atomic_binary_2dim_global(ary, op2, cuda.atomic.xor,
|
||
|
atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_inc32(ary, idx, op2):
|
||
|
atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
|
||
|
cuda.atomic.inc, atomic_cast_none)
|
||
|
|
||
|
|
||
|
def atomic_inc64(ary, idx, op2):
|
||
|
atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32,
|
||
|
cuda.atomic.inc, atomic_cast_to_int)
|
||
|
|
||
|
|
||
|
def atomic_inc2_32(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.inc, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_inc2_64(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
|
||
|
cuda.atomic.inc, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_inc3(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.inc, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_inc_global(idx, ary, op2):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.inc, False)
|
||
|
|
||
|
|
||
|
def atomic_inc_global_2(ary, op2):
|
||
|
atomic_binary_2dim_global(ary, op2, cuda.atomic.inc,
|
||
|
atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_dec32(ary, idx, op2):
|
||
|
atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
|
||
|
cuda.atomic.dec, atomic_cast_none)
|
||
|
|
||
|
|
||
|
def atomic_dec64(ary, idx, op2):
|
||
|
atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32,
|
||
|
cuda.atomic.dec, atomic_cast_to_int)
|
||
|
|
||
|
|
||
|
def atomic_dec2_32(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.dec, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_dec2_64(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
|
||
|
cuda.atomic.dec, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_dec3(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.dec, atomic_cast_to_uint64, False)
|
||
|
|
||
|
|
||
|
def atomic_dec_global(idx, ary, op2):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.dec, False)
|
||
|
|
||
|
|
||
|
def atomic_dec_global_2(ary, op2):
|
||
|
atomic_binary_2dim_global(ary, op2, cuda.atomic.dec,
|
||
|
atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_exch(ary, idx, op2):
|
||
|
atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
|
||
|
cuda.atomic.exch, atomic_cast_none)
|
||
|
|
||
|
|
||
|
def atomic_exch2(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
|
||
|
cuda.atomic.exch, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_exch3(ary, op2):
|
||
|
atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
|
||
|
cuda.atomic.exch, atomic_cast_none, False)
|
||
|
|
||
|
|
||
|
def atomic_exch_global(idx, ary, op2):
|
||
|
atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.exch, False)
|
||
|
|
||
|
|
||
|
def gen_atomic_extreme_funcs(func):
|
||
|
|
||
|
fns = dedent("""
|
||
|
def atomic(res, ary):
|
||
|
tx = cuda.threadIdx.x
|
||
|
bx = cuda.blockIdx.x
|
||
|
{func}(res, 0, ary[tx, bx])
|
||
|
|
||
|
def atomic_double_normalizedindex(res, ary):
|
||
|
tx = cuda.threadIdx.x
|
||
|
bx = cuda.blockIdx.x
|
||
|
{func}(res, 0, ary[tx, uint64(bx)])
|
||
|
|
||
|
def atomic_double_oneindex(res, ary):
|
||
|
tx = cuda.threadIdx.x
|
||
|
{func}(res, 0, ary[tx])
|
||
|
|
||
|
def atomic_double_shared(res, ary):
|
||
|
tid = cuda.threadIdx.x
|
||
|
smary = cuda.shared.array(32, float64)
|
||
|
smary[tid] = ary[tid]
|
||
|
smres = cuda.shared.array(1, float64)
|
||
|
if tid == 0:
|
||
|
smres[0] = res[0]
|
||
|
cuda.syncthreads()
|
||
|
{func}(smres, 0, smary[tid])
|
||
|
cuda.syncthreads()
|
||
|
if tid == 0:
|
||
|
res[0] = smres[0]
|
||
|
""").format(func=func)
|
||
|
ld = {}
|
||
|
exec(fns, {'cuda': cuda, 'float64': float64, 'uint64': uint64}, ld)
|
||
|
return (ld['atomic'], ld['atomic_double_normalizedindex'],
|
||
|
ld['atomic_double_oneindex'], ld['atomic_double_shared'])
|
||
|
|
||
|
|
||
|
(atomic_max, atomic_max_double_normalizedindex, atomic_max_double_oneindex,
|
||
|
atomic_max_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.max')
|
||
|
(atomic_min, atomic_min_double_normalizedindex, atomic_min_double_oneindex,
|
||
|
atomic_min_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.min')
|
||
|
(atomic_nanmax, atomic_nanmax_double_normalizedindex,
|
||
|
atomic_nanmax_double_oneindex, atomic_nanmax_double_shared) = \
|
||
|
gen_atomic_extreme_funcs('cuda.atomic.nanmax')
|
||
|
(atomic_nanmin, atomic_nanmin_double_normalizedindex,
|
||
|
atomic_nanmin_double_oneindex, atomic_nanmin_double_shared) = \
|
||
|
gen_atomic_extreme_funcs('cuda.atomic.nanmin')
|
||
|
|
||
|
|
||
|
def atomic_compare_and_swap(res, old, ary, fill_val):
|
||
|
gid = cuda.grid(1)
|
||
|
if gid < res.size:
|
||
|
old[gid] = cuda.atomic.compare_and_swap(res[gid:], fill_val, ary[gid])
|
||
|
|
||
|
|
||
|
def atomic_cas_1dim(res, old, ary, fill_val):
|
||
|
gid = cuda.grid(1)
|
||
|
if gid < res.size:
|
||
|
old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
|
||
|
|
||
|
|
||
|
def atomic_cas_2dim(res, old, ary, fill_val):
|
||
|
gid = cuda.grid(2)
|
||
|
if gid[0] < res.shape[0] and gid[1] < res.shape[1]:
|
||
|
old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
|
||
|
|
||
|
|
||
|
class TestCudaAtomics(CUDATestCase):
|
||
|
def setUp(self):
|
||
|
super().setUp()
|
||
|
np.random.seed(0)
|
||
|
|
||
|
def test_atomic_add(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
||
|
ary_wrap = ary.copy()
|
||
|
orig = ary.copy()
|
||
|
|
||
|
cuda_atomic_add = cuda.jit('void(uint32[:])')(atomic_add)
|
||
|
cuda_atomic_add[1, 32](ary)
|
||
|
|
||
|
cuda_atomic_add_wrap = cuda.jit('void(uint32[:])')(atomic_add_wrap)
|
||
|
cuda_atomic_add_wrap[1, 32](ary_wrap)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.uint32)
|
||
|
for i in range(orig.size):
|
||
|
gold[orig[i]] += 1
|
||
|
|
||
|
self.assertTrue(np.all(ary == gold))
|
||
|
self.assertTrue(np.all(ary_wrap == gold))
|
||
|
|
||
|
def test_atomic_add2(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
ary_wrap = ary.copy()
|
||
|
orig = ary.copy()
|
||
|
|
||
|
cuda_atomic_add2 = cuda.jit('void(uint32[:,:])')(atomic_add2)
|
||
|
cuda_atomic_add2[1, (4, 8)](ary)
|
||
|
|
||
|
cuda_atomic_add2_wrap = cuda.jit('void(uint32[:,:])')(atomic_add2_wrap)
|
||
|
cuda_atomic_add2_wrap[1, (4, 8)](ary_wrap)
|
||
|
|
||
|
self.assertTrue(np.all(ary == orig + 1))
|
||
|
self.assertTrue(np.all(ary_wrap == orig + 1))
|
||
|
|
||
|
def test_atomic_add3(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3)
|
||
|
cuda_atomic_add3[1, (4, 8)](ary)
|
||
|
|
||
|
self.assertTrue(np.all(ary == orig + 1))
|
||
|
|
||
|
def test_atomic_add_float(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float32)
|
||
|
ary_wrap = ary.copy()
|
||
|
orig = ary.copy().astype(np.intp)
|
||
|
|
||
|
cuda_atomic_add_float = cuda.jit('void(float32[:])')(atomic_add_float)
|
||
|
cuda_atomic_add_float[1, 32](ary)
|
||
|
|
||
|
add_float_wrap = cuda.jit('void(float32[:])')(atomic_add_float_wrap)
|
||
|
add_float_wrap[1, 32](ary_wrap)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.uint32)
|
||
|
for i in range(orig.size):
|
||
|
gold[orig[i]] += 1.0
|
||
|
|
||
|
self.assertTrue(np.all(ary == gold))
|
||
|
self.assertTrue(np.all(ary_wrap == gold))
|
||
|
|
||
|
def test_atomic_add_float_2(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
|
||
|
ary_wrap = ary.copy()
|
||
|
orig = ary.copy()
|
||
|
|
||
|
cuda_atomic_add2 = cuda.jit('void(float32[:,:])')(atomic_add_float_2)
|
||
|
cuda_atomic_add2[1, (4, 8)](ary)
|
||
|
|
||
|
cuda_func_wrap = cuda.jit('void(float32[:,:])')(atomic_add_float_2_wrap)
|
||
|
cuda_func_wrap[1, (4, 8)](ary_wrap)
|
||
|
|
||
|
self.assertTrue(np.all(ary == orig + 1))
|
||
|
self.assertTrue(np.all(ary_wrap == orig + 1))
|
||
|
|
||
|
def test_atomic_add_float_3(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_add3 = cuda.jit('void(float32[:,:])')(atomic_add_float_3)
|
||
|
cuda_atomic_add3[1, (4, 8)](ary)
|
||
|
|
||
|
self.assertTrue(np.all(ary == orig + 1))
|
||
|
|
||
|
def assertCorrectFloat64Atomics(self, kernel, shared=True):
|
||
|
if config.ENABLE_CUDASIM:
|
||
|
return
|
||
|
|
||
|
# Use the first (and only) definition
|
||
|
asm = next(iter(kernel.inspect_asm().values()))
|
||
|
if cc_X_or_above(6, 0):
|
||
|
if cuda.runtime.get_version() > (12, 1):
|
||
|
# CUDA 12.2 and above generate a more optimized reduction
|
||
|
# instruction, because the result does not need to be
|
||
|
# placed in a register.
|
||
|
inst = 'red'
|
||
|
else:
|
||
|
inst = 'atom'
|
||
|
|
||
|
if shared:
|
||
|
inst = f'{inst}.shared'
|
||
|
|
||
|
self.assertIn(f'{inst}.add.f64', asm)
|
||
|
else:
|
||
|
if shared:
|
||
|
self.assertIn('atom.shared.cas.b64', asm)
|
||
|
else:
|
||
|
self.assertIn('atom.cas.b64', asm)
|
||
|
|
||
|
def test_atomic_add_double(self):
|
||
|
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
|
||
|
ary = np.zeros(32, np.float64)
|
||
|
ary_wrap = ary.copy()
|
||
|
|
||
|
cuda_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double)
|
||
|
cuda_fn[1, 32](idx, ary)
|
||
|
|
||
|
wrap_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_wrap)
|
||
|
wrap_fn[1, 32](idx, ary_wrap)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.uint32)
|
||
|
for i in range(idx.size):
|
||
|
gold[idx[i]] += 1.0
|
||
|
|
||
|
np.testing.assert_equal(ary, gold)
|
||
|
np.testing.assert_equal(ary_wrap, gold)
|
||
|
self.assertCorrectFloat64Atomics(cuda_fn)
|
||
|
self.assertCorrectFloat64Atomics(wrap_fn)
|
||
|
|
||
|
def test_atomic_add_double_2(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
||
|
ary_wrap = ary.copy()
|
||
|
orig = ary.copy()
|
||
|
|
||
|
cuda_fn = cuda.jit('void(float64[:,:])')(atomic_add_double_2)
|
||
|
cuda_fn[1, (4, 8)](ary)
|
||
|
|
||
|
cuda_fn_wrap = cuda.jit('void(float64[:,:])')(atomic_add_double_2_wrap)
|
||
|
cuda_fn_wrap[1, (4, 8)](ary_wrap)
|
||
|
|
||
|
np.testing.assert_equal(ary, orig + 1)
|
||
|
np.testing.assert_equal(ary_wrap, orig + 1)
|
||
|
self.assertCorrectFloat64Atomics(cuda_fn)
|
||
|
self.assertCorrectFloat64Atomics(cuda_fn_wrap)
|
||
|
|
||
|
def test_atomic_add_double_3(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_3)
|
||
|
cuda_func[1, (4, 8)](ary)
|
||
|
|
||
|
np.testing.assert_equal(ary, orig + 1)
|
||
|
self.assertCorrectFloat64Atomics(cuda_func)
|
||
|
|
||
|
def test_atomic_add_double_global(self):
|
||
|
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
|
||
|
ary = np.zeros(32, np.float64)
|
||
|
ary_wrap = ary.copy()
|
||
|
|
||
|
sig = 'void(int64[:], float64[:])'
|
||
|
cuda_func = cuda.jit(sig)(atomic_add_double_global)
|
||
|
wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap)
|
||
|
|
||
|
cuda_func[1, 32](idx, ary)
|
||
|
wrap_cuda_func[1, 32](idx, ary_wrap)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.uint32)
|
||
|
for i in range(idx.size):
|
||
|
gold[idx[i]] += 1.0
|
||
|
|
||
|
np.testing.assert_equal(ary, gold)
|
||
|
np.testing.assert_equal(ary_wrap, gold)
|
||
|
self.assertCorrectFloat64Atomics(cuda_func, shared=False)
|
||
|
self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False)
|
||
|
|
||
|
def test_atomic_add_double_global_2(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
||
|
ary_wrap = ary.copy()
|
||
|
orig = ary.copy()
|
||
|
|
||
|
sig = 'void(float64[:,:])'
|
||
|
cuda_func = cuda.jit(sig)(atomic_add_double_global_2)
|
||
|
wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_2_wrap)
|
||
|
|
||
|
cuda_func[1, (4, 8)](ary)
|
||
|
wrap_cuda_func[1, (4, 8)](ary_wrap)
|
||
|
|
||
|
np.testing.assert_equal(ary, orig + 1)
|
||
|
np.testing.assert_equal(ary_wrap, orig + 1)
|
||
|
self.assertCorrectFloat64Atomics(cuda_func, shared=False)
|
||
|
self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False)
|
||
|
|
||
|
def test_atomic_add_double_global_3(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3)
|
||
|
cuda_func[1, (4, 8)](ary)
|
||
|
|
||
|
np.testing.assert_equal(ary, orig + 1)
|
||
|
self.assertCorrectFloat64Atomics(cuda_func, shared=False)
|
||
|
|
||
|
def test_atomic_sub(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_sub = cuda.jit('void(uint32[:])')(atomic_sub)
|
||
|
cuda_atomic_sub[1, 32](ary)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.uint32)
|
||
|
for i in range(orig.size):
|
||
|
gold[orig[i]] -= 1
|
||
|
|
||
|
self.assertTrue(np.all(ary == gold))
|
||
|
|
||
|
def test_atomic_sub2(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_sub2 = cuda.jit('void(uint32[:,:])')(atomic_sub2)
|
||
|
cuda_atomic_sub2[1, (4, 8)](ary)
|
||
|
self.assertTrue(np.all(ary == orig - 1))
|
||
|
|
||
|
def test_atomic_sub3(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_sub3 = cuda.jit('void(uint32[:,:])')(atomic_sub3)
|
||
|
cuda_atomic_sub3[1, (4, 8)](ary)
|
||
|
self.assertTrue(np.all(ary == orig - 1))
|
||
|
|
||
|
def test_atomic_sub_float(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float32)
|
||
|
orig = ary.copy().astype(np.intp)
|
||
|
cuda_atomic_sub_float = cuda.jit('void(float32[:])')(atomic_sub_float)
|
||
|
cuda_atomic_sub_float[1, 32](ary)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.float32)
|
||
|
for i in range(orig.size):
|
||
|
gold[orig[i]] -= 1.0
|
||
|
|
||
|
self.assertTrue(np.all(ary == gold))
|
||
|
|
||
|
def test_atomic_sub_float_2(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_sub2 = cuda.jit('void(float32[:,:])')(atomic_sub_float_2)
|
||
|
cuda_atomic_sub2[1, (4, 8)](ary)
|
||
|
self.assertTrue(np.all(ary == orig - 1))
|
||
|
|
||
|
def test_atomic_sub_float_3(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_sub3 = cuda.jit('void(float32[:,:])')(atomic_sub_float_3)
|
||
|
cuda_atomic_sub3[1, (4, 8)](ary)
|
||
|
self.assertTrue(np.all(ary == orig - 1))
|
||
|
|
||
|
def test_atomic_sub_double(self):
|
||
|
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
|
||
|
ary = np.zeros(32, np.float64)
|
||
|
cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_sub_double)
|
||
|
cuda_func[1, 32](idx, ary)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.float64)
|
||
|
for i in range(idx.size):
|
||
|
gold[idx[i]] -= 1.0
|
||
|
|
||
|
np.testing.assert_equal(ary, gold)
|
||
|
|
||
|
def test_atomic_sub_double_2(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_2)
|
||
|
cuda_func[1, (4, 8)](ary)
|
||
|
np.testing.assert_equal(ary, orig - 1)
|
||
|
|
||
|
def test_atomic_sub_double_3(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_3)
|
||
|
cuda_func[1, (4, 8)](ary)
|
||
|
np.testing.assert_equal(ary, orig - 1)
|
||
|
|
||
|
def test_atomic_sub_double_global(self):
|
||
|
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
|
||
|
ary = np.zeros(32, np.float64)
|
||
|
sig = 'void(int64[:], float64[:])'
|
||
|
cuda_func = cuda.jit(sig)(atomic_sub_double_global)
|
||
|
cuda_func[1, 32](idx, ary)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.float64)
|
||
|
for i in range(idx.size):
|
||
|
gold[idx[i]] -= 1.0
|
||
|
|
||
|
np.testing.assert_equal(ary, gold)
|
||
|
|
||
|
def test_atomic_sub_double_global_2(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_2)
|
||
|
cuda_func[1, (4, 8)](ary)
|
||
|
np.testing.assert_equal(ary, orig - 1)
|
||
|
|
||
|
def test_atomic_sub_double_global_3(self):
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_3)
|
||
|
cuda_func[1, (4, 8)](ary)
|
||
|
np.testing.assert_equal(ary, orig - 1)
|
||
|
|
||
|
def test_atomic_and(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_and)
|
||
|
cuda_func[1, 32](ary, rand_const)
|
||
|
|
||
|
gold = ary.copy()
|
||
|
for i in range(orig.size):
|
||
|
gold[orig[i]] &= rand_const
|
||
|
|
||
|
self.assertTrue(np.all(ary == gold))
|
||
|
|
||
|
def test_atomic_and2(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and2)
|
||
|
cuda_atomic_and2[1, (4, 8)](ary, rand_const)
|
||
|
self.assertTrue(np.all(ary == orig & rand_const))
|
||
|
|
||
|
def test_atomic_and3(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and3)
|
||
|
cuda_atomic_and3[1, (4, 8)](ary, rand_const)
|
||
|
self.assertTrue(np.all(ary == orig & rand_const))
|
||
|
|
||
|
def test_atomic_and_global(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
|
||
|
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
|
||
|
sig = 'void(int32[:], int32[:], int32)'
|
||
|
cuda_func = cuda.jit(sig)(atomic_and_global)
|
||
|
cuda_func[1, 32](idx, ary, rand_const)
|
||
|
|
||
|
gold = ary.copy()
|
||
|
for i in range(idx.size):
|
||
|
gold[idx[i]] &= rand_const
|
||
|
|
||
|
np.testing.assert_equal(ary, gold)
|
||
|
|
||
|
def test_atomic_and_global_2(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_and_global_2)
|
||
|
cuda_func[1, (4, 8)](ary, rand_const)
|
||
|
np.testing.assert_equal(ary, orig & rand_const)
|
||
|
|
||
|
def test_atomic_or(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_or)
|
||
|
cuda_func[1, 32](ary, rand_const)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.uint32)
|
||
|
for i in range(orig.size):
|
||
|
gold[orig[i]] |= rand_const
|
||
|
|
||
|
self.assertTrue(np.all(ary == gold))
|
||
|
|
||
|
def test_atomic_or2(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or2)
|
||
|
cuda_atomic_and2[1, (4, 8)](ary, rand_const)
|
||
|
self.assertTrue(np.all(ary == orig | rand_const))
|
||
|
|
||
|
def test_atomic_or3(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or3)
|
||
|
cuda_atomic_and3[1, (4, 8)](ary, rand_const)
|
||
|
self.assertTrue(np.all(ary == orig | rand_const))
|
||
|
|
||
|
def test_atomic_or_global(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
|
||
|
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
|
||
|
sig = 'void(int32[:], int32[:], int32)'
|
||
|
cuda_func = cuda.jit(sig)(atomic_or_global)
|
||
|
cuda_func[1, 32](idx, ary, rand_const)
|
||
|
|
||
|
gold = ary.copy()
|
||
|
for i in range(idx.size):
|
||
|
gold[idx[i]] |= rand_const
|
||
|
|
||
|
np.testing.assert_equal(ary, gold)
|
||
|
|
||
|
def test_atomic_or_global_2(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_or_global_2)
|
||
|
cuda_func[1, (4, 8)](ary, rand_const)
|
||
|
np.testing.assert_equal(ary, orig | rand_const)
|
||
|
|
||
|
def test_atomic_xor(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_xor)
|
||
|
cuda_func[1, 32](ary, rand_const)
|
||
|
|
||
|
gold = np.zeros(32, dtype=np.uint32)
|
||
|
for i in range(orig.size):
|
||
|
gold[orig[i]] ^= rand_const
|
||
|
|
||
|
self.assertTrue(np.all(ary == gold))
|
||
|
|
||
|
def test_atomic_xor2(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_xor2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor2)
|
||
|
cuda_atomic_xor2[1, (4, 8)](ary, rand_const)
|
||
|
self.assertTrue(np.all(ary == orig ^ rand_const))
|
||
|
|
||
|
def test_atomic_xor3(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_atomic_xor3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor3)
|
||
|
cuda_atomic_xor3[1, (4, 8)](ary, rand_const)
|
||
|
self.assertTrue(np.all(ary == orig ^ rand_const))
|
||
|
|
||
|
def test_atomic_xor_global(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
|
||
|
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
|
||
|
gold = ary.copy()
|
||
|
sig = 'void(int32[:], int32[:], int32)'
|
||
|
cuda_func = cuda.jit(sig)(atomic_xor_global)
|
||
|
cuda_func[1, 32](idx, ary, rand_const)
|
||
|
|
||
|
for i in range(idx.size):
|
||
|
gold[idx[i]] ^= rand_const
|
||
|
|
||
|
np.testing.assert_equal(ary, gold)
|
||
|
|
||
|
def test_atomic_xor_global_2(self):
|
||
|
rand_const = np.random.randint(500)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor_global_2)
|
||
|
cuda_func[1, (4, 8)](ary, rand_const)
|
||
|
np.testing.assert_equal(ary, orig ^ rand_const)
|
||
|
|
||
|
def inc_dec_1dim_setup(self, dtype):
|
||
|
rconst = np.random.randint(32, dtype=dtype)
|
||
|
rary = np.random.randint(0, 32, size=32).astype(dtype)
|
||
|
ary_idx = np.arange(32, dtype=dtype)
|
||
|
return rconst, rary, ary_idx
|
||
|
|
||
|
def inc_dec_2dim_setup(self, dtype):
|
||
|
rconst = np.random.randint(32, dtype=dtype)
|
||
|
rary = np.random.randint(0, 32, size=32).astype(dtype).reshape(4, 8)
|
||
|
return rconst, rary
|
||
|
|
||
|
def check_inc_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit(sig)(func)
|
||
|
cuda_func[nblocks, blksize](ary, idx, rconst)
|
||
|
np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
|
||
|
|
||
|
def check_inc_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit(sig)(func)
|
||
|
cuda_func[nblocks, blksize](idx, ary, rconst)
|
||
|
np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
|
||
|
|
||
|
def check_inc(self, ary, rconst, sig, nblocks, blksize, func):
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit(sig)(func)
|
||
|
cuda_func[nblocks, blksize](ary, rconst)
|
||
|
np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
|
||
|
|
||
|
def test_atomic_inc_32(self):
|
||
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
|
||
|
sig = 'void(uint32[:], uint32[:], uint32)'
|
||
|
self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc32)
|
||
|
|
||
|
def test_atomic_inc_64(self):
|
||
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
|
||
|
sig = 'void(uint64[:], uint64[:], uint64)'
|
||
|
self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc64)
|
||
|
|
||
|
def test_atomic_inc2_32(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
||
|
sig = 'void(uint32[:,:], uint32)'
|
||
|
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_32)
|
||
|
|
||
|
def test_atomic_inc2_64(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
|
||
|
sig = 'void(uint64[:,:], uint64)'
|
||
|
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_64)
|
||
|
|
||
|
def test_atomic_inc3(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
||
|
sig = 'void(uint32[:,:], uint32)'
|
||
|
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc3)
|
||
|
|
||
|
def test_atomic_inc_global_32(self):
|
||
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
|
||
|
sig = 'void(uint32[:], uint32[:], uint32)'
|
||
|
self.check_inc_index2(ary, idx, rand_const, sig, 1, 32,
|
||
|
atomic_inc_global)
|
||
|
|
||
|
def test_atomic_inc_global_64(self):
|
||
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
|
||
|
sig = 'void(uint64[:], uint64[:], uint64)'
|
||
|
self.check_inc_index2(ary, idx, rand_const, sig, 1, 32,
|
||
|
atomic_inc_global)
|
||
|
|
||
|
def test_atomic_inc_global_2_32(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
||
|
sig = 'void(uint32[:,:], uint32)'
|
||
|
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
|
||
|
|
||
|
def test_atomic_inc_global_2_64(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
|
||
|
sig = 'void(uint64[:,:], uint64)'
|
||
|
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
|
||
|
|
||
|
def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit(sig)(func)
|
||
|
cuda_func[nblocks, blksize](ary, idx, rconst)
|
||
|
np.testing.assert_equal(ary, np.where(orig == 0, rconst,
|
||
|
np.where(orig > rconst,
|
||
|
rconst,
|
||
|
orig - 1)))
|
||
|
|
||
|
def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit(sig)(func)
|
||
|
cuda_func[nblocks, blksize](idx, ary, rconst)
|
||
|
np.testing.assert_equal(ary, np.where(orig == 0, rconst,
|
||
|
np.where(orig > rconst,
|
||
|
rconst,
|
||
|
orig - 1)))
|
||
|
|
||
|
def check_dec(self, ary, rconst, sig, nblocks, blksize, func):
|
||
|
orig = ary.copy()
|
||
|
cuda_func = cuda.jit(sig)(func)
|
||
|
cuda_func[nblocks, blksize](ary, rconst)
|
||
|
np.testing.assert_equal(ary, np.where(orig == 0, rconst,
|
||
|
np.where(orig > rconst,
|
||
|
rconst,
|
||
|
orig - 1)))
|
||
|
|
||
|
def test_atomic_dec_32(self):
|
||
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
|
||
|
sig = 'void(uint32[:], uint32[:], uint32)'
|
||
|
self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec32)
|
||
|
|
||
|
def test_atomic_dec_64(self):
|
||
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
|
||
|
sig = 'void(uint64[:], uint64[:], uint64)'
|
||
|
self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec64)
|
||
|
|
||
|
def test_atomic_dec2_32(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
||
|
sig = 'void(uint32[:,:], uint32)'
|
||
|
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_32)
|
||
|
|
||
|
def test_atomic_dec2_64(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
|
||
|
sig = 'void(uint64[:,:], uint64)'
|
||
|
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_64)
|
||
|
|
||
|
def test_atomic_dec3_new(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
||
|
sig = 'void(uint32[:,:], uint32)'
|
||
|
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec3)
|
||
|
|
||
|
def test_atomic_dec_global_32(self):
|
||
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
|
||
|
sig = 'void(uint32[:], uint32[:], uint32)'
|
||
|
self.check_dec_index2(ary, idx, rand_const, sig, 1, 32,
|
||
|
atomic_dec_global)
|
||
|
|
||
|
def test_atomic_dec_global_64(self):
|
||
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
|
||
|
sig = 'void(uint64[:], uint64[:], uint64)'
|
||
|
self.check_dec_index2(ary, idx, rand_const, sig, 1, 32,
|
||
|
atomic_dec_global)
|
||
|
|
||
|
def test_atomic_dec_global2_32(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
||
|
sig = 'void(uint32[:,:], uint32)'
|
||
|
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
|
||
|
|
||
|
def test_atomic_dec_global2_64(self):
|
||
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
|
||
|
sig = 'void(uint64[:,:], uint64)'
|
||
|
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
|
||
|
|
||
|
def test_atomic_exch(self):
|
||
|
rand_const = np.random.randint(50, 100, dtype=np.uint32)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
||
|
idx = np.arange(32, dtype=np.uint32)
|
||
|
|
||
|
cuda_func = cuda.jit('void(uint32[:], uint32[:], uint32)')(atomic_exch)
|
||
|
cuda_func[1, 32](ary, idx, rand_const)
|
||
|
|
||
|
np.testing.assert_equal(ary, rand_const)
|
||
|
|
||
|
def test_atomic_exch2(self):
|
||
|
rand_const = np.random.randint(50, 100, dtype=np.uint32)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
||
|
|
||
|
cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_exch2)
|
||
|
cuda_func[1, (4, 8)](ary, rand_const)
|
||
|
np.testing.assert_equal(ary, rand_const)
|
||
|
|
||
|
def test_atomic_exch3(self):
|
||
|
rand_const = np.random.randint(50, 100, dtype=np.uint64)
|
||
|
ary = np.random.randint(0, 32, size=32).astype(np.uint64).reshape(4, 8)
|
||
|
|
||
|
cuda_func = cuda.jit('void(uint64[:,:], uint64)')(atomic_exch3)
|
||
|
cuda_func[1, (4, 8)](ary, rand_const)
|
||
|
np.testing.assert_equal(ary, rand_const)
|
||
|
|
||
|
def test_atomic_exch_global(self):
|
||
|
rand_const = np.random.randint(50, 100, dtype=np.uint32)
|
||
|
idx = np.arange(32, dtype=np.uint32)
|
||
|
ary = np.random.randint(0, 32, size=32, dtype=np.uint32)
|
||
|
|
||
|
sig = 'void(uint32[:], uint32[:], uint32)'
|
||
|
cuda_func = cuda.jit(sig)(atomic_exch_global)
|
||
|
cuda_func[1, 32](idx, ary, rand_const)
|
||
|
np.testing.assert_equal(ary, rand_const)
|
||
|
|
||
|
def check_atomic_max(self, dtype, lo, hi):
|
||
|
vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
|
||
|
res = np.zeros(1, dtype=vals.dtype)
|
||
|
cuda_func = cuda.jit(atomic_max)
|
||
|
cuda_func[32, 32](res, vals)
|
||
|
gold = np.max(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_max_int32(self):
|
||
|
self.check_atomic_max(dtype=np.int32, lo=-65535, hi=65535)
|
||
|
|
||
|
def test_atomic_max_uint32(self):
|
||
|
self.check_atomic_max(dtype=np.uint32, lo=0, hi=65535)
|
||
|
|
||
|
def test_atomic_max_int64(self):
|
||
|
self.check_atomic_max(dtype=np.int64, lo=-65535, hi=65535)
|
||
|
|
||
|
def test_atomic_max_uint64(self):
|
||
|
self.check_atomic_max(dtype=np.uint64, lo=0, hi=65535)
|
||
|
|
||
|
def test_atomic_max_float32(self):
|
||
|
self.check_atomic_max(dtype=np.float32, lo=-65535, hi=65535)
|
||
|
|
||
|
def test_atomic_max_double(self):
|
||
|
self.check_atomic_max(dtype=np.float64, lo=-65535, hi=65535)
|
||
|
|
||
|
def test_atomic_max_double_normalizedindex(self):
|
||
|
vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
|
||
|
res = np.zeros(1, np.float64)
|
||
|
cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
|
||
|
atomic_max_double_normalizedindex)
|
||
|
cuda_func[32, 32](res, vals)
|
||
|
|
||
|
gold = np.max(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_max_double_oneindex(self):
|
||
|
vals = np.random.randint(0, 128, size=32).astype(np.float64)
|
||
|
res = np.zeros(1, np.float64)
|
||
|
cuda_func = cuda.jit('void(float64[:], float64[:])')(
|
||
|
atomic_max_double_oneindex)
|
||
|
cuda_func[1, 32](res, vals)
|
||
|
|
||
|
gold = np.max(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def check_atomic_min(self, dtype, lo, hi):
|
||
|
vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
|
||
|
res = np.array([65535], dtype=vals.dtype)
|
||
|
cuda_func = cuda.jit(atomic_min)
|
||
|
cuda_func[32, 32](res, vals)
|
||
|
|
||
|
gold = np.min(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_min_int32(self):
|
||
|
self.check_atomic_min(dtype=np.int32, lo=-65535, hi=65535)
|
||
|
|
||
|
def test_atomic_min_uint32(self):
|
||
|
self.check_atomic_min(dtype=np.uint32, lo=0, hi=65535)
|
||
|
|
||
|
def test_atomic_min_int64(self):
|
||
|
self.check_atomic_min(dtype=np.int64, lo=-65535, hi=65535)
|
||
|
|
||
|
def test_atomic_min_uint64(self):
|
||
|
self.check_atomic_min(dtype=np.uint64, lo=0, hi=65535)
|
||
|
|
||
|
def test_atomic_min_float(self):
|
||
|
self.check_atomic_min(dtype=np.float32, lo=-65535, hi=65535)
|
||
|
|
||
|
def test_atomic_min_double(self):
|
||
|
self.check_atomic_min(dtype=np.float64, lo=-65535, hi=65535)
|
||
|
|
||
|
def test_atomic_min_double_normalizedindex(self):
|
||
|
vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
|
||
|
res = np.ones(1, np.float64) * 65535
|
||
|
cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
|
||
|
atomic_min_double_normalizedindex)
|
||
|
cuda_func[32, 32](res, vals)
|
||
|
|
||
|
gold = np.min(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_min_double_oneindex(self):
|
||
|
vals = np.random.randint(0, 128, size=32).astype(np.float64)
|
||
|
res = np.ones(1, np.float64) * 128
|
||
|
cuda_func = cuda.jit('void(float64[:], float64[:])')(
|
||
|
atomic_min_double_oneindex)
|
||
|
cuda_func[1, 32](res, vals)
|
||
|
|
||
|
gold = np.min(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
# Taken together, _test_atomic_minmax_nan_location and
|
||
|
# _test_atomic_minmax_nan_val check that NaNs are treated similarly to the
|
||
|
# way they are in Python / NumPy - that is, {min,max}(a, b) == a if either
|
||
|
# a or b is a NaN. For the atomics, this means that the max is taken as the
|
||
|
# value stored in the memory location rather than the value supplied - i.e.
|
||
|
# for:
|
||
|
#
|
||
|
# cuda.atomic.{min,max}(ary, idx, val)
|
||
|
#
|
||
|
# the result will be ary[idx] for either of ary[idx] or val being NaN.
|
||
|
|
||
|
def _test_atomic_minmax_nan_location(self, func):
|
||
|
|
||
|
cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func)
|
||
|
|
||
|
vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64)
|
||
|
res = np.zeros(1, np.float64) + np.nan
|
||
|
cuda_func[1, 1](res, vals)
|
||
|
np.testing.assert_equal(res, [np.nan])
|
||
|
|
||
|
def _test_atomic_minmax_nan_val(self, func):
|
||
|
cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func)
|
||
|
|
||
|
res = np.random.randint(0, 128, size=1).astype(np.float64)
|
||
|
gold = res.copy()
|
||
|
vals = np.zeros((1, 1), np.float64) + np.nan
|
||
|
cuda_func[1, 1](res, vals)
|
||
|
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_min_nan_location(self):
|
||
|
self._test_atomic_minmax_nan_location(atomic_min)
|
||
|
|
||
|
def test_atomic_max_nan_location(self):
|
||
|
self._test_atomic_minmax_nan_location(atomic_max)
|
||
|
|
||
|
def test_atomic_min_nan_val(self):
|
||
|
self._test_atomic_minmax_nan_val(atomic_min)
|
||
|
|
||
|
def test_atomic_max_nan_val(self):
|
||
|
self._test_atomic_minmax_nan_val(atomic_max)
|
||
|
|
||
|
def test_atomic_max_double_shared(self):
|
||
|
vals = np.random.randint(0, 32, size=32).astype(np.float64)
|
||
|
res = np.zeros(1, np.float64)
|
||
|
sig = 'void(float64[:], float64[:])'
|
||
|
cuda_func = cuda.jit(sig)(atomic_max_double_shared)
|
||
|
cuda_func[1, 32](res, vals)
|
||
|
|
||
|
gold = np.max(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_min_double_shared(self):
|
||
|
vals = np.random.randint(0, 32, size=32).astype(np.float64)
|
||
|
res = np.ones(1, np.float64) * 32
|
||
|
sig = 'void(float64[:], float64[:])'
|
||
|
cuda_func = cuda.jit(sig)(atomic_min_double_shared)
|
||
|
cuda_func[1, 32](res, vals)
|
||
|
|
||
|
gold = np.min(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def check_cas(self, n, fill, unfill, dtype, cas_func, ndim=1):
|
||
|
res = [fill] * (n // 2) + [unfill] * (n // 2)
|
||
|
np.random.shuffle(res)
|
||
|
res = np.asarray(res, dtype=dtype)
|
||
|
if ndim == 2:
|
||
|
res.shape = (10, -1)
|
||
|
out = np.zeros_like(res)
|
||
|
ary = np.random.randint(1, 10, size=res.shape).astype(res.dtype)
|
||
|
|
||
|
fill_mask = res == fill
|
||
|
unfill_mask = res == unfill
|
||
|
|
||
|
expect_res = np.zeros_like(res)
|
||
|
expect_res[fill_mask] = ary[fill_mask]
|
||
|
expect_res[unfill_mask] = unfill
|
||
|
|
||
|
expect_out = res.copy()
|
||
|
|
||
|
cuda_func = cuda.jit(cas_func)
|
||
|
if ndim == 1:
|
||
|
cuda_func[10, 10](res, out, ary, fill)
|
||
|
else:
|
||
|
cuda_func[(10, 10), (10, 10)](res, out, ary, fill)
|
||
|
|
||
|
np.testing.assert_array_equal(expect_res, res)
|
||
|
np.testing.assert_array_equal(expect_out, out)
|
||
|
|
||
|
def test_atomic_compare_and_swap(self):
|
||
|
self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
|
||
|
cas_func=atomic_compare_and_swap)
|
||
|
|
||
|
def test_atomic_compare_and_swap2(self):
|
||
|
self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
|
||
|
cas_func=atomic_compare_and_swap)
|
||
|
|
||
|
def test_atomic_compare_and_swap3(self):
|
||
|
rfill = np.random.randint(50, 500, dtype=np.uint32)
|
||
|
runfill = np.random.randint(1, 25, dtype=np.uint32)
|
||
|
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
|
||
|
cas_func=atomic_compare_and_swap)
|
||
|
|
||
|
def test_atomic_compare_and_swap4(self):
|
||
|
rfill = np.random.randint(50, 500, dtype=np.uint64)
|
||
|
runfill = np.random.randint(1, 25, dtype=np.uint64)
|
||
|
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
|
||
|
cas_func=atomic_compare_and_swap)
|
||
|
|
||
|
def test_atomic_cas_1dim(self):
|
||
|
self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
|
||
|
cas_func=atomic_cas_1dim)
|
||
|
|
||
|
def test_atomic_cas_2dim(self):
|
||
|
self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
|
||
|
cas_func=atomic_cas_2dim, ndim=2)
|
||
|
|
||
|
def test_atomic_cas2_1dim(self):
|
||
|
self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
|
||
|
cas_func=atomic_cas_1dim)
|
||
|
|
||
|
def test_atomic_cas2_2dim(self):
|
||
|
self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
|
||
|
cas_func=atomic_cas_2dim, ndim=2)
|
||
|
|
||
|
def test_atomic_cas3_1dim(self):
|
||
|
rfill = np.random.randint(50, 500, dtype=np.uint32)
|
||
|
runfill = np.random.randint(1, 25, dtype=np.uint32)
|
||
|
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
|
||
|
cas_func=atomic_cas_1dim)
|
||
|
|
||
|
def test_atomic_cas3_2dim(self):
|
||
|
rfill = np.random.randint(50, 500, dtype=np.uint32)
|
||
|
runfill = np.random.randint(1, 25, dtype=np.uint32)
|
||
|
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
|
||
|
cas_func=atomic_cas_2dim, ndim=2)
|
||
|
|
||
|
def test_atomic_cas4_1dim(self):
|
||
|
rfill = np.random.randint(50, 500, dtype=np.uint64)
|
||
|
runfill = np.random.randint(1, 25, dtype=np.uint64)
|
||
|
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
|
||
|
cas_func=atomic_cas_1dim)
|
||
|
|
||
|
def test_atomic_cas4_2dim(self):
|
||
|
rfill = np.random.randint(50, 500, dtype=np.uint64)
|
||
|
runfill = np.random.randint(1, 25, dtype=np.uint64)
|
||
|
self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
|
||
|
cas_func=atomic_cas_2dim, ndim=2)
|
||
|
|
||
|
# Tests that the atomic add, min, and max operations return the old value -
|
||
|
# in the simulator, they did not (see Issue #5458). The max and min have
|
||
|
# special handling for NaN values, so we explicitly test with a NaN in the
|
||
|
# array being modified and the value provided.
|
||
|
|
||
|
def _test_atomic_returns_old(self, kernel, initial):
|
||
|
x = np.zeros(2, dtype=np.float32)
|
||
|
x[0] = initial
|
||
|
kernel[1, 1](x)
|
||
|
if np.isnan(initial):
|
||
|
self.assertTrue(np.isnan(x[1]))
|
||
|
else:
|
||
|
self.assertEqual(x[1], initial)
|
||
|
|
||
|
def test_atomic_add_returns_old(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.add(x, 0, 1)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, 10)
|
||
|
|
||
|
def test_atomic_max_returns_no_replace(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.max(x, 0, 1)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, 10)
|
||
|
|
||
|
def test_atomic_max_returns_old_replace(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.max(x, 0, 10)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, 1)
|
||
|
|
||
|
def test_atomic_max_returns_old_nan_in_array(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.max(x, 0, 1)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, np.nan)
|
||
|
|
||
|
def test_atomic_max_returns_old_nan_val(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.max(x, 0, np.nan)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, 10)
|
||
|
|
||
|
def test_atomic_min_returns_old_no_replace(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.min(x, 0, 11)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, 10)
|
||
|
|
||
|
def test_atomic_min_returns_old_replace(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.min(x, 0, 10)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, 11)
|
||
|
|
||
|
def test_atomic_min_returns_old_nan_in_array(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.min(x, 0, 11)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, np.nan)
|
||
|
|
||
|
def test_atomic_min_returns_old_nan_val(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.min(x, 0, np.nan)
|
||
|
|
||
|
self._test_atomic_returns_old(kernel, 11)
|
||
|
|
||
|
# Tests for atomic nanmin/nanmax
|
||
|
|
||
|
# nanmax tests
|
||
|
def check_atomic_nanmax(self, dtype, lo, hi, init_val):
|
||
|
vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
|
||
|
vals[1::2] = init_val
|
||
|
res = np.zeros(1, dtype=vals.dtype)
|
||
|
cuda_func = cuda.jit(atomic_nanmax)
|
||
|
cuda_func[32, 32](res, vals)
|
||
|
gold = np.nanmax(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_nanmax_int32(self):
|
||
|
self.check_atomic_nanmax(dtype=np.int32, lo=-65535, hi=65535,
|
||
|
init_val=0)
|
||
|
|
||
|
def test_atomic_nanmax_uint32(self):
|
||
|
self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535,
|
||
|
init_val=0)
|
||
|
|
||
|
def test_atomic_nanmax_int64(self):
|
||
|
self.check_atomic_nanmax(dtype=np.int64, lo=-65535, hi=65535,
|
||
|
init_val=0)
|
||
|
|
||
|
def test_atomic_nanmax_uint64(self):
|
||
|
self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535,
|
||
|
init_val=0)
|
||
|
|
||
|
def test_atomic_nanmax_float32(self):
|
||
|
self.check_atomic_nanmax(dtype=np.float32, lo=-65535, hi=65535,
|
||
|
init_val=np.nan)
|
||
|
|
||
|
def test_atomic_nanmax_double(self):
|
||
|
self.check_atomic_nanmax(dtype=np.float64, lo=-65535, hi=65535,
|
||
|
init_val=np.nan)
|
||
|
|
||
|
def test_atomic_nanmax_double_shared(self):
|
||
|
vals = np.random.randint(0, 32, size=32).astype(np.float64)
|
||
|
vals[1::2] = np.nan
|
||
|
res = np.array([0], dtype=vals.dtype)
|
||
|
sig = 'void(float64[:], float64[:])'
|
||
|
cuda_func = cuda.jit(sig)(atomic_nanmax_double_shared)
|
||
|
cuda_func[1, 32](res, vals)
|
||
|
|
||
|
gold = np.nanmax(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_nanmax_double_oneindex(self):
|
||
|
vals = np.random.randint(0, 128, size=32).astype(np.float64)
|
||
|
vals[1::2] = np.nan
|
||
|
res = np.zeros(1, np.float64)
|
||
|
cuda_func = cuda.jit('void(float64[:], float64[:])')(
|
||
|
atomic_max_double_oneindex)
|
||
|
cuda_func[1, 32](res, vals)
|
||
|
|
||
|
gold = np.nanmax(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
# nanmin tests
|
||
|
def check_atomic_nanmin(self, dtype, lo, hi, init_val):
|
||
|
vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
|
||
|
vals[1::2] = init_val
|
||
|
res = np.array([65535], dtype=vals.dtype)
|
||
|
cuda_func = cuda.jit(atomic_nanmin)
|
||
|
cuda_func[32, 32](res, vals)
|
||
|
|
||
|
gold = np.nanmin(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_nanmin_int32(self):
|
||
|
self.check_atomic_nanmin(dtype=np.int32, lo=-65535, hi=65535,
|
||
|
init_val=0)
|
||
|
|
||
|
def test_atomic_nanmin_uint32(self):
|
||
|
self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535,
|
||
|
init_val=0)
|
||
|
|
||
|
def test_atomic_nanmin_int64(self):
|
||
|
self.check_atomic_nanmin(dtype=np.int64, lo=-65535, hi=65535,
|
||
|
init_val=0)
|
||
|
|
||
|
def test_atomic_nanmin_uint64(self):
|
||
|
self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535,
|
||
|
init_val=0)
|
||
|
|
||
|
def test_atomic_nanmin_float(self):
|
||
|
self.check_atomic_nanmin(dtype=np.float32, lo=-65535, hi=65535,
|
||
|
init_val=np.nan)
|
||
|
|
||
|
def test_atomic_nanmin_double(self):
|
||
|
self.check_atomic_nanmin(dtype=np.float64, lo=-65535, hi=65535,
|
||
|
init_val=np.nan)
|
||
|
|
||
|
def test_atomic_nanmin_double_shared(self):
|
||
|
vals = np.random.randint(0, 32, size=32).astype(np.float64)
|
||
|
vals[1::2] = np.nan
|
||
|
res = np.array([32], dtype=vals.dtype)
|
||
|
sig = 'void(float64[:], float64[:])'
|
||
|
cuda_func = cuda.jit(sig)(atomic_nanmin_double_shared)
|
||
|
cuda_func[1, 32](res, vals)
|
||
|
|
||
|
gold = np.nanmin(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
def test_atomic_nanmin_double_oneindex(self):
|
||
|
vals = np.random.randint(0, 128, size=32).astype(np.float64)
|
||
|
vals[1::2] = np.nan
|
||
|
res = np.array([128], np.float64)
|
||
|
cuda_func = cuda.jit('void(float64[:], float64[:])')(
|
||
|
atomic_min_double_oneindex)
|
||
|
cuda_func[1, 32](res, vals)
|
||
|
|
||
|
gold = np.nanmin(vals)
|
||
|
np.testing.assert_equal(res, gold)
|
||
|
|
||
|
# Returning old value tests
|
||
|
|
||
|
def _test_atomic_nan_returns_old(self, kernel, initial):
|
||
|
x = np.zeros(2, dtype=np.float32)
|
||
|
x[0] = initial
|
||
|
x[1] = np.nan
|
||
|
kernel[1, 1](x)
|
||
|
if np.isnan(initial):
|
||
|
self.assertFalse(np.isnan(x[0]))
|
||
|
self.assertTrue(np.isnan(x[1]))
|
||
|
else:
|
||
|
self.assertEqual(x[1], initial)
|
||
|
|
||
|
def test_atomic_nanmax_returns_old_no_replace(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.nanmax(x, 0, 1)
|
||
|
|
||
|
self._test_atomic_nan_returns_old(kernel, 10)
|
||
|
|
||
|
def test_atomic_nanmax_returns_old_replace(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.nanmax(x, 0, 10)
|
||
|
|
||
|
self._test_atomic_nan_returns_old(kernel, 1)
|
||
|
|
||
|
def test_atomic_nanmax_returns_old_nan_in_array(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.nanmax(x, 0, 1)
|
||
|
|
||
|
self._test_atomic_nan_returns_old(kernel, np.nan)
|
||
|
|
||
|
def test_atomic_nanmax_returns_old_nan_val(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.nanmax(x, 0, np.nan)
|
||
|
|
||
|
self._test_atomic_nan_returns_old(kernel, 10)
|
||
|
|
||
|
def test_atomic_nanmin_returns_old_no_replace(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.nanmin(x, 0, 11)
|
||
|
|
||
|
self._test_atomic_nan_returns_old(kernel, 10)
|
||
|
|
||
|
def test_atomic_nanmin_returns_old_replace(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.nanmin(x, 0, 10)
|
||
|
|
||
|
self._test_atomic_nan_returns_old(kernel, 11)
|
||
|
|
||
|
def test_atomic_nanmin_returns_old_nan_in_array(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.nanmin(x, 0, 11)
|
||
|
|
||
|
self._test_atomic_nan_returns_old(kernel, np.nan)
|
||
|
|
||
|
def test_atomic_nanmin_returns_old_nan_val(self):
|
||
|
@cuda.jit
|
||
|
def kernel(x):
|
||
|
x[1] = cuda.atomic.nanmin(x, 0, np.nan)
|
||
|
|
||
|
self._test_atomic_nan_returns_old(kernel, 11)
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
unittest.main()
|