from numba import cuda
import numpy as np

from numba.np.ufunc import deviceufunc
from numba.np.ufunc.deviceufunc import (UFuncMechanism, GeneralizedUFunc,
                                        GUFuncCallSteps)


class CUDAUFuncDispatcher(object):
    """
    Invoke the CUDA ufunc specialization for the given inputs.
    """

    def __init__(self, types_to_retty_kernels, pyfunc):
        self.functions = types_to_retty_kernels
        self.__name__ = pyfunc.__name__

    def __call__(self, *args, **kws):
        """
        *args: numpy arrays or DeviceArrayBase (created by cuda.to_device).
               Cannot mix the two types in one call.

        **kws:
            stream -- cuda stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments. Type must match the
                      input arguments.
        """
        return CUDAUFuncMechanism.call(self.functions, args, kws)

    def reduce(self, arg, stream=0):
        assert len(list(self.functions.keys())[0]) == 2, \
            "must be a binary ufunc"
        assert arg.ndim == 1, "must use 1d array"

        n = arg.shape[0]
        gpu_mems = []

        if n == 0:
            raise TypeError("Reduction on an empty array.")
        elif n == 1:
            # nothing to do
            return arg[0]

        # always use a stream
        stream = stream or cuda.stream()
        with stream.auto_synchronize():
            # transfer memory to device if necessary
            if cuda.cudadrv.devicearray.is_cuda_ndarray(arg):
                mem = arg
            else:
                mem = cuda.to_device(arg, stream)
            # do reduction
            out = self.__reduce(mem, gpu_mems, stream)
            # use a small buffer to store the result element
            buf = np.empty(1, dtype=arg.dtype)
            out.copy_to_host(buf, stream=stream)

        return buf[0]

    def __reduce(self, mem, gpu_mems, stream):
        n = mem.shape[0]
        if n % 2 != 0:  # odd?
            fatcut, thincut = mem.split(n - 1)
            # prevent freeing during async mode
            gpu_mems.append(fatcut)
            gpu_mems.append(thincut)
            # execute the kernel
            out = self.__reduce(fatcut, gpu_mems, stream)
            gpu_mems.append(out)
            return self(out, thincut, out=out, stream=stream)
        else:  # even?
            left, right = mem.split(n // 2)
            # prevent freeing during async mode
            gpu_mems.append(left)
            gpu_mems.append(right)
            # execute the kernel
            self(left, right, out=left, stream=stream)
            if n // 2 > 1:
                return self.__reduce(left, gpu_mems, stream)
            else:
                return left
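
# Illustrative usage sketch (not part of the original module): how the
# dispatcher above is typically reached via @vectorize(target='cuda'), and
# how its tree-reduction entry point is invoked. `_example_reduce` is a
# hypothetical helper name introduced here for illustration only.
def _example_reduce():
    from numba import vectorize

    @vectorize(['float32(float32, float32)'], target='cuda')
    def add(a, b):
        return a + b

    x = np.arange(16, dtype=np.float32)
    # CUDAUFuncDispatcher.reduce requires a binary ufunc and a 1d array; it
    # recursively splits the array and launches the ufunc on the halves.
    return add.reduce(x)  # same value as x.sum()
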
class _CUDAGUFuncCallSteps(GUFuncCallSteps):
    __slots__ = [
        '_stream',
    ]

    def __init__(self, nin, nout, args, kwargs):
        super().__init__(nin, nout, args, kwargs)
        self._stream = kwargs.get('stream', 0)

    def is_device_array(self, obj):
        return cuda.is_cuda_array(obj)

    def as_device_array(self, obj):
        # We don't want to call as_cuda_array on objects that are already
        # Numba device arrays, because this results in exporting the array as
        # a Producer then importing it as a Consumer, which causes a
        # synchronization on the array's stream (if it has one) by default.
        # When we have a Numba device array, we can simply return it.
        if cuda.cudadrv.devicearray.is_cuda_ndarray(obj):
            return obj
        return cuda.as_cuda_array(obj)

    def to_device(self, hostary):
        return cuda.to_device(hostary, stream=self._stream)

    def to_host(self, devary, hostary):
        out = devary.copy_to_host(hostary, stream=self._stream)
        return out

    def allocate_device_array(self, shape, dtype):
        return cuda.device_array(shape=shape, dtype=dtype,
                                 stream=self._stream)

    def launch_kernel(self, kernel, nelem, args):
        kernel.forall(nelem, stream=self._stream)(*args)


class CUDAGeneralizedUFunc(GeneralizedUFunc):
    def __init__(self, kernelmap, engine, pyfunc):
        self.__name__ = pyfunc.__name__
        super().__init__(kernelmap, engine)

    @property
    def _call_steps(self):
        return _CUDAGUFuncCallSteps

    def _broadcast_scalar_input(self, ary, shape):
        return cuda.cudadrv.devicearray.DeviceNDArray(shape=shape,
                                                      strides=(0,),
                                                      dtype=ary.dtype,
                                                      gpu_data=ary.gpu_data)

    def _broadcast_add_axis(self, ary, newshape):
        newax = len(newshape) - len(ary.shape)
        # Add 0 strides for missing dimension
        newstrides = (0,) * newax + ary.strides
        return cuda.cudadrv.devicearray.DeviceNDArray(shape=newshape,
                                                      strides=newstrides,
                                                      dtype=ary.dtype,
                                                      gpu_data=ary.gpu_data)


class CUDAUFuncMechanism(UFuncMechanism):
    """
    Provide CUDA specialization
    """
    DEFAULT_STREAM = 0

    def launch(self, func, count, stream, args):
        func.forall(count, stream=stream)(*args)

    def is_device_array(self, obj):
        return cuda.is_cuda_array(obj)

    def as_device_array(self, obj):
        # We don't want to call as_cuda_array on objects that are already
        # Numba device arrays, because this results in exporting the array as
        # a Producer then importing it as a Consumer, which causes a
        # synchronization on the array's stream (if it has one) by default.
        # When we have a Numba device array, we can simply return it.
        if cuda.cudadrv.devicearray.is_cuda_ndarray(obj):
            return obj
        return cuda.as_cuda_array(obj)

    def to_device(self, hostary, stream):
        return cuda.to_device(hostary, stream=stream)

    def to_host(self, devary, stream):
        return devary.copy_to_host(stream=stream)

    def allocate_device_array(self, shape, dtype, stream):
        return cuda.device_array(shape=shape, dtype=dtype, stream=stream)

    def broadcast_device(self, ary, shape):
        ax_differs = [ax for ax in range(len(shape))
                      if ax >= ary.ndim
                      or ary.shape[ax] != shape[ax]]

        missingdim = len(shape) - len(ary.shape)
        strides = [0] * missingdim + list(ary.strides)

        for ax in ax_differs:
            strides[ax] = 0

        return cuda.cudadrv.devicearray.DeviceNDArray(shape=shape,
                                                      strides=strides,
                                                      dtype=ary.dtype,
                                                      gpu_data=ary.gpu_data)


vectorizer_stager_source = '''
def __vectorized_{name}({args}, __out__):
    __tid__ = __cuda__.grid(1)
    if __tid__ < __out__.shape[0]:
        __out__[__tid__] = __core__({argitems})
'''


class CUDAVectorize(deviceufunc.DeviceVectorize):
    def _compile_core(self, sig):
        cudevfn = cuda.jit(sig, device=True, inline=True)(self.pyfunc)
        return cudevfn, cudevfn.overloads[sig.args].signature.return_type

    def _get_globals(self, corefn):
        glbl = self.pyfunc.__globals__.copy()
        glbl.update({'__cuda__': cuda,
                     '__core__': corefn})
        return glbl

    def _compile_kernel(self, fnobj, sig):
        return cuda.jit(fnobj)

    def build_ufunc(self):
        return CUDAUFuncDispatcher(self.kernelmap, self.pyfunc)

    @property
    def _kernel_template(self):
        return vectorizer_stager_source
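
# Illustrative usage sketch (not part of the original module): asynchronous
# invocation with an explicit stream and a preallocated device output, per
# the keyword arguments documented on CUDAUFuncDispatcher.__call__.
# `_example_stream_call` is a hypothetical helper name for illustration only.
def _example_stream_call():
    from numba import vectorize

    @vectorize(['float32(float32)'], target='cuda')
    def plus_one(a):
        return a + 1.0

    stream = cuda.stream()
    x = cuda.to_device(np.arange(8, dtype=np.float32), stream=stream)
    out = cuda.device_array(8, dtype=np.float32, stream=stream)
    plus_one(x, out=out, stream=stream)  # launched asynchronously
    stream.synchronize()
    return out.copy_to_host()
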
# ------------------------------------------------------------------------------
# Generalized CUDA ufuncs

_gufunc_stager_source = '''
def __gufunc_{name}({args}):
    __tid__ = __cuda__.grid(1)
    if __tid__ < {checkedarg}:
        __core__({argitems})
'''


class CUDAGUFuncVectorize(deviceufunc.DeviceGUFuncVectorize):
    def build_ufunc(self):
        engine = deviceufunc.GUFuncEngine(self.inputsig, self.outputsig)
        return CUDAGeneralizedUFunc(kernelmap=self.kernelmap,
                                    engine=engine,
                                    pyfunc=self.pyfunc)

    def _compile_kernel(self, fnobj, sig):
        return cuda.jit(sig)(fnobj)

    @property
    def _kernel_template(self):
        return _gufunc_stager_source

    def _get_globals(self, sig):
        corefn = cuda.jit(sig, device=True)(self.pyfunc)
        glbls = self.pyfunc.__globals__.copy()
        glbls.update({'__cuda__': cuda,
                      '__core__': corefn})
        return glbls
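
# Illustrative usage sketch (not part of the original module): the gufunc
# classes above back @guvectorize(..., target='cuda'). `_example_gufunc` is
# a hypothetical helper name introduced for illustration only.
def _example_gufunc():
    from numba import guvectorize

    @guvectorize(['void(float32[:], float32[:], float32[:])'],
                 '(n),(n)->(n)', target='cuda')
    def vec_add(a, b, out):
        for i in range(a.shape[0]):
            out[i] = a[i] + b[i]

    x = np.arange(12, dtype=np.float32).reshape(4, 3)
    y = np.ones_like(x)
    # The engine broadcasts over the leading (loop) dimension; each core
    # call receives 1d slices of length n.
    return vec_add(x, y)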