496 lines
12 KiB
496 lines
12 KiB
"""
Implements the cuda module as called from within an executing kernel
(@cuda.jit-decorated function).
"""
from contextlib import contextmanager
import sys
import threading
import traceback
from numba.core import types
import numpy as np
from numba.np import numpy_support
from .vector_types import vector_types
class Dim3(object):
    """
    Used to implement thread/block indices/dimensions.

    Holds three components ``x``, ``y`` and ``z``. Instances can be unpacked
    or iterated, yielding the components in that order.
    """

    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z

    def __str__(self):
        return '(%s, %s, %s)' % (self.x, self.y, self.z)

    def __repr__(self):
        return 'Dim3(%s, %s, %s)' % (self.x, self.y, self.z)

    def __iter__(self):
        # Supports ``x, y, z = dim`` and ``tuple(dim)``.
        yield self.x
        yield self.y
        yield self.z
class GridGroup:
    """
    Used to implement the grid group.
    """

    def sync(self):
        """
        Synchronize the grid group by calling ``syncthreads()`` on the object
        returned by ``threading.current_thread()``.
        """
        # Synchronization of the grid group is equivalent to synchronization of
        # the thread block, because we only support cooperative grids with one
        # block.
        threading.current_thread().syncthreads()
class FakeCUDACg:
    """
    CUDA Cooperative Groups
    """

    def this_grid(self):
        """Return a new ``GridGroup`` instance."""
        return GridGroup()
class FakeCUDALocal(object):
    """
    CUDA Local arrays
    """

    def array(self, shape, dtype):
        """
        Return a new uninitialized NumPy array of the given shape and dtype.

        A Numba type passed as *dtype* is first converted to the corresponding
        NumPy dtype.
        """
        if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)
        return np.empty(shape, dtype)
class FakeCUDAConst(object):
    """
    CUDA Const arrays
    """

    def array_like(self, ary):
        """Return *ary* itself; no copy is made."""
        return ary
class FakeCUDAShared(object):
    """
    CUDA Shared arrays.

    Limitations: assumes that only one call to cuda.shared.array is on a line,
    and that that line is only executed once per thread. i.e.::

        a = cuda.shared.array(...); b = cuda.shared.array(...)

    will erroneously alias a and b, and::

        for i in range(10):
            sharedarrs[i] = cuda.shared.array(...)

    will alias all arrays created at that point (though it is not certain that
    this would be supported by Numba anyway).
    """

    def __init__(self, dynshared_size):
        # Static allocations, keyed by the (filename, lineno) of the call site.
        self._allocations = {}
        self._dynshared_size = dynshared_size
        # Backing store shared by every dynamic (shape == 0) request.
        self._dynshared = np.zeros(dynshared_size, dtype=np.byte)

    def array(self, shape, dtype):
        """
        Return the shared array for the calling source line.

        A *shape* of 0 returns a view over the dynamic shared memory buffer,
        reinterpreted as *dtype*. Any other shape returns an array that is
        allocated on the first call from a given (file, line) and reused for
        every later call from that same location.
        """
        if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)
        # Accept scalar classes such as np.float32 as well as dtype instances;
        # ``itemsize`` below is only meaningful on a dtype instance.
        dtype = np.dtype(dtype)

        # Dynamic shared memory is requested with size 0 - this all shares the
        # same underlying memory
        if shape == 0:
            # Count must be the maximum number of whole elements that fit in the
            # buffer (Numpy complains if the buffer is not a multiple of the
            # element size)
            count = self._dynshared_size // dtype.itemsize
            return np.frombuffer(self._dynshared.data, dtype=dtype, count=count)

        # Otherwise, identify allocations by source file and line number of
        # the immediate caller. Only that one frame is needed, so inspect it
        # directly instead of extracting the whole stack.
        frame = sys._getframe(1)
        caller = (frame.f_code.co_filename, frame.f_lineno)

        res = self._allocations.get(caller)
        if res is None:
            # Publish with setdefault() so that threads racing on the first
            # call from this site all receive the same array object.
            res = self._allocations.setdefault(caller, np.empty(shape, dtype))
        return res
# One module-level lock per atomic operation; each FakeCUDAAtomic method holds
# the lock for its operation around its read-modify-write.
addlock = threading.Lock()
sublock = threading.Lock()
andlock = threading.Lock()
orlock = threading.Lock()
xorlock = threading.Lock()
maxlock = threading.Lock()
minlock = threading.Lock()
compare_and_swaplock = threading.Lock()
caslock = threading.Lock()
inclock = threading.Lock()
declock = threading.Lock()
exchlock = threading.Lock()


class FakeCUDAAtomic(object):
    """
    Implements the cuda.atomic operations.

    Every method performs its read-modify-write on ``array[index]`` while
    holding the lock for that operation, and returns the value that was stored
    before the update.
    """

    def add(self, array, index, val):
        with addlock:
            old = array[index]
            array[index] += val
        return old

    def sub(self, array, index, val):
        with sublock:
            old = array[index]
            array[index] -= val
        return old

    def and_(self, array, index, val):
        with andlock:
            old = array[index]
            array[index] &= val
        return old

    def or_(self, array, index, val):
        with orlock:
            old = array[index]
            array[index] |= val
        return old

    def xor(self, array, index, val):
        with xorlock:
            old = array[index]
            array[index] ^= val
        return old

    def inc(self, array, index, val):
        """
        Store 0 if the old value is >= *val*, otherwise store old + 1.
        """
        with inclock:
            old = array[index]
            if old >= val:
                array[index] = 0
            else:
                # Only increment when not wrapping; otherwise the wrapped
                # value would become 1 instead of 0.
                array[index] += 1
        return old

    def dec(self, array, index, val):
        """
        Store *val* if the old value is 0 or > *val*, otherwise store old - 1.
        """
        with declock:
            old = array[index]
            if (old == 0) or (old > val):
                array[index] = val
            else:
                # Only decrement when not wrapping; otherwise the wrapped
                # value would become val - 1 instead of val.
                array[index] -= 1
        return old

    def exch(self, array, index, val):
        with exchlock:
            old = array[index]
            array[index] = val
        return old

    def max(self, array, index, val):
        with maxlock:
            old = array[index]
            array[index] = max(old, val)
        return old

    def min(self, array, index, val):
        with minlock:
            old = array[index]
            array[index] = min(old, val)
        return old

    def nanmax(self, array, index, val):
        # Shares maxlock with max().
        with maxlock:
            old = array[index]
            array[index] = np.nanmax([array[index], val])
        return old

    def nanmin(self, array, index, val):
        # Shares minlock with min().
        with minlock:
            old = array[index]
            array[index] = np.nanmin([array[index], val])
        return old

    def compare_and_swap(self, array, old, val):
        """
        Compare-and-swap on the first element of *array*.

        Stores *val* if the first element equals *old*; returns the value that
        was loaded before any store.
        """
        with compare_and_swaplock:
            # Index of the first element, whatever the dimensionality.
            index = (0,) * array.ndim
            loaded = array[index]
            if loaded == old:
                array[index] = val
        return loaded

    def cas(self, array, index, old, val):
        """
        Compare-and-swap on ``array[index]``.

        Stores *val* if the element equals *old*; returns the value that was
        loaded before any store.
        """
        with caslock:
            loaded = array[index]
            if loaded == old:
                array[index] = val
        return loaded
class FakeCUDAFp16(object):
    """
    Implements the cuda.fp16 operations.

    Arithmetic and comparison methods apply the plain Python operator to their
    arguments; the math functions call the matching NumPy ufunc with
    ``dtype=np.float16`` or wrap the result in ``np.float16``.
    """

    def hadd(self, a, b):
        return a + b

    def hsub(self, a, b):
        return a - b

    def hmul(self, a, b):
        return a * b

    def hdiv(self, a, b):
        return a / b

    def hfma(self, a, b, c):
        return a * b + c

    def hneg(self, a):
        return -a

    def habs(self, a):
        return abs(a)

    def hsin(self, x):
        return np.sin(x, dtype=np.float16)

    def hcos(self, x):
        return np.cos(x, dtype=np.float16)

    def hlog(self, x):
        return np.log(x, dtype=np.float16)

    def hlog2(self, x):
        return np.log2(x, dtype=np.float16)

    def hlog10(self, x):
        return np.log10(x, dtype=np.float16)

    def hexp(self, x):
        return np.exp(x, dtype=np.float16)

    def hexp2(self, x):
        return np.exp2(x, dtype=np.float16)

    def hexp10(self, x):
        return np.float16(10 ** x)

    def hsqrt(self, x):
        return np.sqrt(x, dtype=np.float16)

    def hrsqrt(self, x):
        return np.float16(x ** -0.5)

    def hceil(self, x):
        return np.ceil(x, dtype=np.float16)

    def hfloor(self, x):
        # Round towards negative infinity (np.floor, not np.ceil).
        return np.floor(x, dtype=np.float16)

    def hrcp(self, x):
        return np.reciprocal(x, dtype=np.float16)

    def htrunc(self, x):
        return np.trunc(x, dtype=np.float16)

    def hrint(self, x):
        return np.rint(x, dtype=np.float16)

    def heq(self, a, b):
        return a == b

    def hne(self, a, b):
        return a != b

    def hge(self, a, b):
        return a >= b

    def hgt(self, a, b):
        return a > b

    def hle(self, a, b):
        return a <= b

    def hlt(self, a, b):
        return a < b

    def hmax(self, a, b):
        return max(a, b)

    def hmin(self, a, b):
        return min(a, b)
class FakeCUDAModule(object):
    """
    An instance of this class will be injected into the __globals__ for an
    executing function in order to implement calls to cuda.*. This will fail to
    work correctly if the user code does::

        from numba import cuda as something_else

    In other words, the CUDA module must be called cuda.
    """

    def __init__(self, grid_dim, block_dim, dynshared_size):
        self.gridDim = Dim3(*grid_dim)
        self.blockDim = Dim3(*block_dim)
        self._cg = FakeCUDACg()
        self._local = FakeCUDALocal()
        self._shared = FakeCUDAShared(dynshared_size)
        self._const = FakeCUDAConst()
        self._atomic = FakeCUDAAtomic()
        self._fp16 = FakeCUDAFp16()

        # Insert the vector types into the kernel context
        # Note that we need to do this in addition to exposing them as module
        # variables in `simulator.__init__.py`, because the test cases need
        # to access the actual cuda module as well as the fake cuda module
        # for vector types.
        for name, svty in vector_types.items():
            setattr(self, name, svty)
            for alias in svty.aliases:
                setattr(self, alias, svty)

    # The sub-namespaces and per-thread indices are exposed as properties so
    # that they are reached by attribute access (e.g. ``self.blockIdx.x`` in
    # grid() below).

    @property
    def cg(self):
        return self._cg

    @property
    def local(self):
        return self._local

    @property
    def shared(self):
        return self._shared

    @property
    def const(self):
        return self._const

    @property
    def atomic(self):
        return self._atomic

    @property
    def fp16(self):
        return self._fp16

    @property
    def threadIdx(self):
        # Read from the current thread object, so each thread sees its own
        # index.
        return threading.current_thread().threadIdx

    @property
    def blockIdx(self):
        return threading.current_thread().blockIdx

    @property
    def warpsize(self):
        return 32

    @property
    def laneid(self):
        return threading.current_thread().thread_id % 32

    def syncthreads(self):
        threading.current_thread().syncthreads()

    def threadfence(self):
        # No-op
        pass

    def threadfence_block(self):
        # No-op
        pass

    def threadfence_system(self):
        # No-op
        pass

    def syncthreads_count(self, val):
        return threading.current_thread().syncthreads_count(val)

    def syncthreads_and(self, val):
        return threading.current_thread().syncthreads_and(val)

    def syncthreads_or(self, val):
        return threading.current_thread().syncthreads_or(val)

    def popc(self, val):
        """Return the number of '1' digits in ``bin(val)``."""
        return bin(val).count("1")

    def fma(self, a, b, c):
        return a * b + c

    def cbrt(self, a):
        """Return the real cube root of *a*."""
        # np.cbrt handles negative input; ``a ** (1 / 3)`` would produce a
        # complex number (Python floats) or NaN (NumPy floats) there.
        return np.cbrt(a)

    def brev(self, val):
        """Reverse the bits of *val* formatted as a 32-digit binary number."""
        return int('{:032b}'.format(val)[::-1], 2)

    def clz(self, val):
        """Count leading zeros of *val* formatted as 32 binary digits."""
        s = '{:032b}'.format(val)
        return len(s) - len(s.lstrip('0'))

    def ffs(self, val):
        """Return the 1-based position of the lowest set bit, or 0 if none."""
        # The algorithm is:
        # 1. Count the number of trailing zeros.
        # 2. Add 1, because the LSB is numbered 1 rather than 0, and so on.
        # 3. If we've counted 32 zeros (resulting in 33), there were no bits
        #    set so we need to return zero.
        s = '{:032b}'.format(val)
        r = (len(s) - len(s.rstrip('0')) + 1) % 33
        return r

    def selp(self, a, b, c):
        """Return *b* if *a* is truthy, otherwise *c*."""
        return b if a else c

    def grid(self, n):
        """
        Return the global index of the current thread in *n* dimensions.

        Returns an int for ``n == 1`` and a tuple for ``n == 2`` or ``n == 3``.
        Raises RuntimeError for any other *n*.
        """
        bdim = self.blockDim
        bid = self.blockIdx
        tid = self.threadIdx
        x = bid.x * bdim.x + tid.x
        if n == 1:
            return x
        y = bid.y * bdim.y + tid.y
        if n == 2:
            return (x, y)
        z = bid.z * bdim.z + tid.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global ID has 1-3 dimensions. %d requested" % n)

    def gridsize(self, n):
        """
        Return the total number of threads along each of *n* dimensions.

        Returns an int for ``n == 1`` and a tuple for ``n == 2`` or ``n == 3``.
        Raises RuntimeError for any other *n*.
        """
        bdim = self.blockDim
        gdim = self.gridDim
        x = bdim.x * gdim.x
        if n == 1:
            return x
        y = bdim.y * gdim.y
        if n == 2:
            return (x, y)
        z = bdim.z * gdim.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global grid has 1-3 dimensions. %d requested" % n)
@contextmanager
def swapped_cuda_module(fn, fake_cuda_module):
    """
    Context manager that temporarily replaces the cuda module in *fn*'s
    globals with *fake_cuda_module*.

    Every global of *fn* that is bound to the ``numba.cuda`` module object is
    rebound to *fake_cuda_module* on entry, and the original bindings are
    restored on exit, including when the managed body raises.
    """
    from numba import cuda

    fn_globs = fn.__globals__
    # get all globals that is the "cuda" module
    orig = {k: v for k, v in fn_globs.items() if v is cuda}
    # build replacement dict
    repl = dict.fromkeys(orig, fake_cuda_module)
    # replace
    fn_globs.update(repl)
    try:
        yield
    finally:
        # revert
        fn_globs.update(orig)