# ai-content-maker/.venv/Lib/site-packages/numba/cuda/cudadrv/devicearray.py
"""
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object. If it exists and evaluates to True, it must define shape,
strides, dtype and size attributes similar to those of a NumPy ndarray.
"""
import math
import functools
import operator
import copy
from ctypes import c_void_p
import numpy as np
import numba
from numba import _devicearray
from numba.cuda.cudadrv import devices
from numba.cuda.cudadrv import driver as _driver
from numba.core import types, config
from numba.np.unsafe.ndarray import to_fixed_tuple
from numba.misc import dummyarray
from numba.np import numpy_support
from numba.cuda.api_util import prepare_shape_strides_dtype
from numba.core.errors import NumbaPerformanceWarning
from warnings import warn
# Memoizing decorator; with maxsize=None the cache is unbounded, which is
# what we want for caching compiled kernels.
lru_cache = functools.lru_cache(maxsize=None)
def is_cuda_ndarray(obj):
"Check if an object is a CUDA ndarray"
return getattr(obj, '__cuda_ndarray__', False)
def verify_cuda_ndarray_interface(obj):
"Verify the CUDA ndarray interface for an obj"
require_cuda_ndarray(obj)
def requires_attr(attr, typ):
if not hasattr(obj, attr):
raise AttributeError(attr)
if not isinstance(getattr(obj, attr), typ):
raise AttributeError('%s must be of type %s' % (attr, typ))
requires_attr('shape', tuple)
requires_attr('strides', tuple)
requires_attr('dtype', np.dtype)
requires_attr('size', int)
def require_cuda_ndarray(obj):
"Raises ValueError is is_cuda_ndarray(obj) evaluates False"
if not is_cuda_ndarray(obj):
raise ValueError('require an cuda ndarray object')
class DeviceNDArrayBase(_devicearray.DeviceArray):
"""A on GPU NDArray representation
"""
__cuda_memory__ = True
    __cuda_ndarray__ = True  # instances must provide a gpu_data attribute
def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
"""
Args
----
shape
array shape.
strides
array strides.
dtype
data type as np.dtype coercible object.
stream
cuda stream.
gpu_data
user provided device memory for the ndarray data buffer
"""
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
dtype = np.dtype(dtype)
self.ndim = len(shape)
if len(strides) != self.ndim:
            raise ValueError('strides do not match ndim')
self._dummy = dummyarray.Array.from_desc(0, shape, strides,
dtype.itemsize)
self.shape = tuple(shape)
self.strides = tuple(strides)
self.dtype = dtype
self.size = int(functools.reduce(operator.mul, self.shape, 1))
# prepare gpu memory
if self.size > 0:
if gpu_data is None:
self.alloc_size = _driver.memory_size_from_info(
self.shape, self.strides, self.dtype.itemsize)
gpu_data = devices.get_context().memalloc(self.alloc_size)
else:
self.alloc_size = _driver.device_memory_size(gpu_data)
else:
# Make NULL pointer for empty allocation
if _driver.USE_NV_BINDING:
null = _driver.binding.CUdeviceptr(0)
else:
null = c_void_p(0)
gpu_data = _driver.MemoryPointer(context=devices.get_context(),
pointer=null, size=0)
self.alloc_size = 0
self.gpu_data = gpu_data
self.stream = stream
@property
def __cuda_array_interface__(self):
if _driver.USE_NV_BINDING:
if self.device_ctypes_pointer is not None:
ptr = int(self.device_ctypes_pointer)
else:
ptr = 0
else:
if self.device_ctypes_pointer.value is not None:
ptr = self.device_ctypes_pointer.value
else:
ptr = 0
return {
'shape': tuple(self.shape),
'strides': None if is_contiguous(self) else tuple(self.strides),
'data': (ptr, False),
'typestr': self.dtype.str,
'stream': int(self.stream) if self.stream != 0 else None,
'version': 3,
}
def bind(self, stream=0):
"""Bind a CUDA stream to this object so that all subsequent operation
on this array defaults to the given stream.
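
        Example (an illustrative sketch; assumes a CUDA-capable device and
        an existing device array ``d_arr``)::

            stream = cuda.stream()
            d_bound = d_arr.bind(stream)
            # subsequent transfers default to `stream` (asynchronous), so
            # synchronize before using the result on the host
            host_arr = d_bound.copy_to_host()
            stream.synchronize()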
"""
clone = copy.copy(self)
clone.stream = stream
return clone
@property
def T(self):
return self.transpose()
def transpose(self, axes=None):
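        """Transpose the array. Only the trivial permutation (returning
        ``self``) and full 2D transposition (which launches a kernel and
        returns a new device array) are supported.

        Example (a sketch; assumes a CUDA context)::

            d2 = cuda.to_device(np.arange(6).reshape(2, 3))
            d2t = d2.transpose()  # new device array with shape (3, 2)
        """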
if axes and tuple(axes) == tuple(range(self.ndim)):
return self
elif self.ndim != 2:
msg = "transposing a non-2D DeviceNDArray isn't supported"
raise NotImplementedError(msg)
elif axes is not None and set(axes) != set(range(self.ndim)):
raise ValueError("invalid axes list %r" % (axes,))
else:
from numba.cuda.kernels.transpose import transpose
return transpose(self)
def _default_stream(self, stream):
return self.stream if not stream else stream
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
# Typing considerations:
#
# 1. The preference is to use 'C' or 'F' layout since this enables
# hardcoding stride values into compiled kernels, which is more
# efficient than storing a passed-in value in a register.
#
# 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
# the more likely / common case.
#
# 3. If an array is broadcast then it must be typed as 'A' - using 'C'
# or 'F' does not apply for broadcast arrays, because the strides, some
# of which will be 0, will not match those hardcoded in for 'C' or 'F'
# layouts.
broadcast = 0 in self.strides
if self.flags['C_CONTIGUOUS'] and not broadcast:
layout = 'C'
elif self.flags['F_CONTIGUOUS'] and not broadcast:
layout = 'F'
else:
layout = 'A'
dtype = numpy_support.from_dtype(self.dtype)
return types.Array(dtype, self.ndim, layout)
@property
def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer
"""
if self.gpu_data is None:
if _driver.USE_NV_BINDING:
return _driver.binding.CUdeviceptr(0)
else:
return c_void_p(0)
else:
return self.gpu_data.device_ctypes_pointer
@devices.require_context
def copy_to_device(self, ary, stream=0):
"""Copy `ary` to `self`.
        If `ary` is CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a host-to-device transfer.
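
        Example (an illustrative sketch; assumes a live CUDA context)::

            import numpy as np
            from numba import cuda

            d_arr = cuda.device_array(100, dtype=np.float64)
            d_arr.copy_to_device(np.zeros(100))                  # host -> device
            d_arr.copy_to_device(cuda.to_device(np.ones(100)))  # device -> device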
"""
if ary.size == 0:
# Nothing to do
return
sentry_contiguous(self)
stream = self._default_stream(stream)
self_core, ary_core = array_core(self), array_core(ary)
if _driver.is_device_memory(ary):
sentry_contiguous(ary)
check_array_compatibility(self_core, ary_core)
_driver.device_to_device(self, ary, self.alloc_size, stream=stream)
else:
# Ensure same contiguity. Only makes a host-side copy if necessary
# (i.e., in order to materialize a writable strided view)
ary_core = np.array(
ary_core,
order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
subok=True,
copy=not ary_core.flags['WRITEABLE'])
check_array_compatibility(self_core, ary_core)
_driver.host_to_device(self, ary_core, self.alloc_size,
stream=stream)
@devices.require_context
def copy_to_host(self, ary=None, stream=0):
"""Copy ``self`` to ``ary`` or create a new Numpy ndarray
if ``ary`` is ``None``.
If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part of the given stream. Otherwise, the transfer is
synchronous: the function returns after the copy is finished.
Always returns the host array.
Example::
import numpy as np
from numba import cuda
arr = np.arange(1000)
d_arr = cuda.to_device(arr)
my_kernel[100, 100](d_arr)
result_array = d_arr.copy_to_host()
"""
if any(s < 0 for s in self.strides):
msg = 'D->H copy not implemented for negative strides: {}'
raise NotImplementedError(msg.format(self.strides))
assert self.alloc_size >= 0, "Negative memory size"
stream = self._default_stream(stream)
if ary is None:
hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
else:
check_array_compatibility(self, ary)
hostary = ary
if self.alloc_size != 0:
_driver.device_to_host(hostary, self, self.alloc_size,
stream=stream)
if ary is None:
if self.size == 0:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
buffer=hostary)
else:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
strides=self.strides, buffer=hostary)
return hostary
def split(self, section, stream=0):
"""Split the array into equal partition of the `section` size.
If the array cannot be equally divided, the last section will be
smaller.
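
        Example (a sketch; requires a 1D contiguous device array)::

            d_arr = cuda.to_device(np.arange(10))
            chunks = list(d_arr.split(4))  # views of sizes 4, 4 and 2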
"""
stream = self._default_stream(stream)
if self.ndim != 1:
raise ValueError("only support 1d array")
if self.strides[0] != self.dtype.itemsize:
raise ValueError("only support unit stride")
nsect = int(math.ceil(float(self.size) / section))
strides = self.strides
itemsize = self.dtype.itemsize
for i in range(nsect):
begin = i * section
end = min(begin + section, self.size)
shape = (end - begin,)
gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
gpu_data=gpu_data)
def as_cuda_arg(self):
"""Returns a device memory object that is used as the argument.
"""
return self.gpu_data
def get_ipc_handle(self):
"""
        Returns an *IpcArrayHandle* object that is safe to serialize and transfer
to another process to share the local allocation.
Note: this feature is only available on Linux.
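
        Example (a sketch of the producer side; Linux only)::

            handle = d_arr.get_ipc_handle()
            # `handle` can be pickled and sent to another process, which
            # opens it via handle.open() or a `with` block.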
"""
ipch = devices.get_context().get_ipc_handle(self.gpu_data)
desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
def squeeze(self, axis=None, stream=0):
"""
Remove axes of size one from the array shape.
Parameters
----------
axis : None or int or tuple of ints, optional
Subset of dimensions to remove. A `ValueError` is raised if an axis
with size greater than one is selected. If `None`, all axes with
size one are removed.
stream : cuda stream or 0, optional
Default stream for the returned view of the array.
Returns
-------
DeviceNDArray
Squeezed view into the array.
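
        Example (illustrative)::

            d_arr = cuda.device_array((1, 3, 1))
            d_arr.squeeze().shape        # (3,)
            d_arr.squeeze(axis=0).shape  # (3, 1)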
"""
new_dummy, _ = self._dummy.squeeze(axis=axis)
return DeviceNDArray(
shape=new_dummy.shape,
strides=new_dummy.strides,
dtype=self.dtype,
stream=self._default_stream(stream),
gpu_data=self.gpu_data,
)
def view(self, dtype):
"""Returns a new object by reinterpretting the dtype without making a
copy of the data.
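
        Example (a sketch; reinterprets float32 bits as int32)::

            d_f32 = cuda.to_device(np.zeros(4, dtype=np.float32))
            d_i32 = d_f32.view(np.int32)  # same buffer, same shape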
"""
dtype = np.dtype(dtype)
shape = list(self.shape)
strides = list(self.strides)
if self.dtype.itemsize != dtype.itemsize:
if not self.is_c_contiguous():
raise ValueError(
"To change to a dtype of a different size,"
" the array must be C-contiguous"
)
shape[-1], rem = divmod(
shape[-1] * self.dtype.itemsize,
dtype.itemsize
)
if rem != 0:
raise ValueError(
"When changing to a larger dtype,"
" its size must be a divisor of the total size in bytes"
" of the last axis of the array."
)
strides[-1] = dtype.itemsize
return DeviceNDArray(
shape=shape,
strides=strides,
dtype=dtype,
stream=self.stream,
gpu_data=self.gpu_data,
)
@property
def nbytes(self):
# Note: not using `alloc_size`. `alloc_size` reports memory
# consumption of the allocation, not the size of the array
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
return self.dtype.itemsize * self.size
class DeviceRecord(DeviceNDArrayBase):
'''
An on-GPU record type
'''
def __init__(self, dtype, stream=0, gpu_data=None):
shape = ()
strides = ()
super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
gpu_data)
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
return numpy_support.from_dtype(self.dtype)
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
typ, offset = self.dtype.fields[item]
newdata = self.gpu_data.view(offset)
if typ.shape == ():
if typ.names is not None:
return DeviceRecord(dtype=typ, stream=stream,
gpu_data=newdata)
else:
hostary = np.empty(1, dtype=typ)
_driver.device_to_host(dst=hostary, src=newdata,
size=typ.itemsize,
stream=stream)
return hostary[0]
else:
shape, strides, dtype = \
prepare_shape_strides_dtype(typ.shape,
None,
typ.subdtype[0], 'C')
return DeviceNDArray(shape=shape, strides=strides,
dtype=dtype, gpu_data=newdata,
stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the record didn't have a default stream, and the user didn't
# provide a stream, then we will use the default stream for the
# assignment kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
typ, offset = self.dtype.fields[key]
newdata = self.gpu_data.view(offset)
lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
# (2) prepare RHS
rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
# (3) do the copy
_driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
if synchronous:
stream.synchronize()
@lru_cache
def _assign_kernel(ndim):
"""
    A separate function so we don't need to compile code for every
    assignment (!).

    :param ndim: We need static array sizes for cuda.local.array, so we
        bake the number of dimensions into the kernel.
"""
    from numba import cuda  # imported here to avoid a circular import
if ndim == 0:
# the (2, ndim) allocation below is not yet supported, so avoid it
@cuda.jit
def kernel(lhs, rhs):
lhs[()] = rhs[()]
return kernel
@cuda.jit
def kernel(lhs, rhs):
location = cuda.grid(1)
n_elements = 1
for i in range(lhs.ndim):
n_elements *= lhs.shape[i]
if location >= n_elements:
# bake n_elements into the kernel, better than passing it in
# as another argument.
return
# [0, :] is the to-index (into `lhs`)
# [1, :] is the from-index (into `rhs`)
idx = cuda.local.array(
shape=(2, ndim),
dtype=types.int64)
        for i in range(ndim - 1, -1, -1):
            idx[0, i] = location % lhs.shape[i]
            # for a broadcast (size-1) dimension of `rhs`, always index 0
            idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
            location //= lhs.shape[i]
lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
return kernel
class DeviceNDArray(DeviceNDArrayBase):
'''
An on-GPU array type
'''
def is_f_contiguous(self):
'''
Return true if the array is Fortran-contiguous.
'''
return self._dummy.is_f_contig
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
def is_c_contiguous(self):
'''
Return true if the array is C-contiguous.
'''
return self._dummy.is_c_contig
def __array__(self, dtype=None):
"""
:return: an `numpy.ndarray`, so copies to the host.
"""
if dtype:
return self.copy_to_host().__array__(dtype)
else:
return self.copy_to_host().__array__()
def __len__(self):
return self.shape[0]
def reshape(self, *newshape, **kws):
"""
Reshape the array without changing its contents, similarly to
:meth:`numpy.ndarray.reshape`. Example::
d_arr = d_arr.reshape(20, 50, order='F')
"""
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
newshape = newshape[0]
cls = type(self)
if newshape == self.shape:
# nothing to do
return cls(shape=self.shape, strides=self.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
newarr, extents = self._dummy.reshape(*newshape, **kws)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
else:
raise NotImplementedError("operation requires copying")
def ravel(self, order='C', stream=0):
'''
Flattens a contiguous array without changing its contents, similar to
:meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
exception.
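
        Example (illustrative; the array must be contiguous)::

            d2 = cuda.to_device(np.arange(6).reshape(2, 3))
            d1 = d2.ravel()  # view with shape (6,)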
'''
stream = self._default_stream(stream)
cls = type(self)
newarr, extents = self._dummy.ravel(order=order)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data,
stream=stream)
else:
raise NotImplementedError("operation requires copying")
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
arr = self._dummy.__getitem__(item)
extents = list(arr.iter_contiguous_extent())
cls = type(self)
if len(extents) == 1:
newdata = self.gpu_data.view(*extents[0])
if not arr.is_array:
# Check for structured array type (record)
if self.dtype.names is not None:
return DeviceRecord(dtype=self.dtype, stream=stream,
gpu_data=newdata)
else:
# Element indexing
hostary = np.empty(1, dtype=self.dtype)
_driver.device_to_host(dst=hostary, src=newdata,
size=self._dummy.itemsize,
stream=stream)
return hostary[0]
else:
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
else:
newdata = self.gpu_data.view(*arr.extent)
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the array didn't have a default stream, and the user didn't provide
# a stream, then we will use the default stream for the assignment
# kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
arr = self._dummy.__getitem__(key)
newdata = self.gpu_data.view(*arr.extent)
if isinstance(arr, dummyarray.Element):
# convert to a 0d array
shape = ()
strides = ()
else:
shape = arr.shape
strides = arr.strides
lhs = type(self)(
shape=shape,
strides=strides,
dtype=self.dtype,
gpu_data=newdata,
stream=stream)
# (2) prepare RHS
rhs, _ = auto_device(value, stream=stream, user_explicit=True)
if rhs.ndim > lhs.ndim:
raise ValueError("Can't assign %s-D array to %s-D self" % (
rhs.ndim,
lhs.ndim))
rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
# negative indices would not work if rhs.ndim == 0
rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
rhs = rhs.reshape(*rhs_shape)
for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
if r != 1 and l != r:
raise ValueError("Can't copy sequence with size %d to array "
"axis %d with dimension %d" % ( r, i, l))
# (3) do the copy
n_elements = functools.reduce(operator.mul, lhs.shape, 1)
_assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
if synchronous:
stream.synchronize()
class IpcArrayHandle(object):
"""
    An IPC array handle that can be serialized and transferred to another
    process on the same machine to share a GPU allocation.

    On the destination process, use the *.open()* method to create a new
    *DeviceNDArray* object that shares the allocation from the original
    process. To release the resources, call the *.close()* method. After
    that, the destination can no longer use the shared array object. (Note:
    the underlying weakref to the resource is now dead.)

    This object implements the context-manager interface, which calls the
    *.open()* and *.close()* methods automatically::
with the_ipc_array_handle as ipc_array:
# use ipc_array here as a normal gpu array object
some_code(ipc_array)
# ipc_array is dead at this point
"""
def __init__(self, ipc_handle, array_desc):
self._array_desc = array_desc
self._ipc_handle = ipc_handle
def open(self):
"""
Returns a new *DeviceNDArray* that shares the allocation from the
original process. Must not be used on the original process.
"""
dptr = self._ipc_handle.open(devices.get_context())
return DeviceNDArray(gpu_data=dptr, **self._array_desc)
def close(self):
"""
Closes the IPC handle to the array.
"""
self._ipc_handle.close()
def __enter__(self):
return self.open()
def __exit__(self, type, value, traceback):
self.close()
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA mapped memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA managed memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
def from_array_like(ary, stream=0, gpu_data=None):
"Create a DeviceNDArray object that is like ary."
return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
gpu_data=gpu_data)
def from_record_like(rec, stream=0, gpu_data=None):
"Create a DeviceRecord object that is like rec."
return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
def array_core(ary):
"""
Extract the repeated core of a broadcast array.
Broadcast arrays are by definition non-contiguous due to repeated
dimensions, i.e., dimensions with stride 0. In order to ascertain memory
contiguity and copy the underlying data from such arrays, we must create
a view without the repeated dimensions.
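
    Example (illustrative, using a host array)::

        a = np.broadcast_to(np.arange(3), (4, 3))  # stride 0 on the first axis
        array_core(a).shape                        # (3,)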
"""
if not ary.strides or not ary.size:
return ary
core_index = []
for stride in ary.strides:
core_index.append(0 if stride == 0 else slice(None))
return ary[tuple(core_index)]
def is_contiguous(ary):
"""
Returns True iff `ary` is C-style contiguous while ignoring
broadcasted and 1-sized dimensions.
As opposed to array_core(), it does not call require_context(),
which can be quite expensive.
"""
size = ary.dtype.itemsize
for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
if shape > 1 and stride != 0:
if size != stride:
return False
size *= shape
return True
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def sentry_contiguous(ary):
core = array_core(ary)
if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
raise ValueError(errmsg_contiguous_buffer)
def auto_device(obj, stream=0, copy=True, user_explicit=False):
"""
    Create a DeviceRecord or DeviceNDArray like obj and optionally copy data
    from host to device. If obj already represents device memory, it is
    returned and no copy is made.
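
    Example (an illustrative sketch)::

        d_arr, copied = auto_device(np.arange(10))  # host -> new device copy
        same, copied = auto_device(d_arr)           # already device memory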
"""
if _driver.is_device_memory(obj):
return obj, False
elif hasattr(obj, '__cuda_array_interface__'):
return numba.cuda.as_cuda_array(obj), False
else:
if isinstance(obj, np.void):
devobj = from_record_like(obj, stream=stream)
else:
# This allows you to pass non-array objects like constants and
# objects implementing the array interface
# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
            # into this function (with no overhead, i.e. no copies, for
            # `obj`s that are already `ndarray`s).
obj = np.array(
obj,
copy=False,
subok=True)
sentry_contiguous(obj)
devobj = from_array_like(obj, stream=stream)
if copy:
if config.CUDA_WARN_ON_IMPLICIT_COPY:
if (
not user_explicit and
(not isinstance(obj, DeviceNDArray)
and isinstance(obj, np.ndarray))
):
msg = ("Host array used in CUDA kernel will incur "
"copy overhead to/from device.")
warn(NumbaPerformanceWarning(msg))
devobj.copy_to_device(obj, stream=stream)
return devobj, True
def check_array_compatibility(ary1, ary2):
ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
if ary1.dtype != ary2.dtype:
raise TypeError('incompatible dtype: %s vs. %s' %
(ary1.dtype, ary2.dtype))
if ary1sq.shape != ary2sq.shape:
raise ValueError('incompatible shape: %s vs. %s' %
(ary1.shape, ary2.shape))
# We check strides only if the size is nonzero, because strides are
# irrelevant (and can differ) for zero-length copies.
if ary1.size and ary1sq.strides != ary2sq.strides:
raise ValueError('incompatible strides: %s vs. %s' %
(ary1.strides, ary2.strides))