395 lines
17 KiB
Python
395 lines
17 KiB
Python
from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint,
|
|
c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER)
|
|
|
|
from numba.cuda.cudadrv import _extras
|
|
|
|
cu_device = c_int
|
|
cu_device_attribute = c_int # enum
|
|
cu_context = c_void_p # an opaque handle
|
|
cu_module = c_void_p # an opaque handle
|
|
cu_jit_option = c_int # enum
|
|
cu_jit_input_type = c_int # enum
|
|
cu_function = c_void_p # an opaque handle
|
|
cu_device_ptr = c_size_t # defined as unsigned long long
|
|
cu_stream = c_void_p # an opaque handle
|
|
cu_event = c_void_p
|
|
cu_link_state = c_void_p
|
|
cu_function_attribute = c_int
|
|
cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide
|
|
cu_uuid = (c_byte * 16) # Device UUID
|
|
|
|
cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object)
|
|
|
|
cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int)
|
|
|
|
# See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
|
|
CU_STREAM_DEFAULT = 0
|
|
CU_STREAM_LEGACY = 1
|
|
CU_STREAM_PER_THREAD = 2
|
|
|
|
API_PROTOTYPES = {
|
|
# CUresult cuInit(unsigned int Flags);
|
|
'cuInit' : (c_int, c_uint),
|
|
|
|
# CUresult cuDriverGetVersion (int* driverVersion )
|
|
'cuDriverGetVersion': (c_int, POINTER(c_int)),
|
|
|
|
# CUresult cuDeviceGetCount(int *count);
|
|
'cuDeviceGetCount': (c_int, POINTER(c_int)),
|
|
|
|
# CUresult cuDeviceGet(CUdevice *device, int ordinal);
|
|
'cuDeviceGet': (c_int, POINTER(cu_device), c_int),
|
|
|
|
# CUresult cuDeviceGetName ( char* name, int len, CUdevice dev )
|
|
'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device),
|
|
|
|
# CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
|
|
# CUdevice dev);
|
|
'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute,
|
|
cu_device),
|
|
|
|
# CUresult cuDeviceComputeCapability(int *major, int *minor,
|
|
# CUdevice dev);
|
|
'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int),
|
|
cu_device),
|
|
|
|
# CUresult cuDevicePrimaryCtxGetState(
|
|
# CUdevice dev,
|
|
# unsigned int* flags,
|
|
# int* active)
|
|
'cuDevicePrimaryCtxGetState': (c_int,
|
|
cu_device, POINTER(c_uint), POINTER(c_int)),
|
|
|
|
# CUresult cuDevicePrimaryCtxRelease ( CUdevice dev )
|
|
'cuDevicePrimaryCtxRelease': (c_int, cu_device),
|
|
|
|
# CUresult cuDevicePrimaryCtxReset ( CUdevice dev )
|
|
'cuDevicePrimaryCtxReset': (c_int, cu_device),
|
|
|
|
# CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev )
|
|
'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device),
|
|
|
|
# CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags )
|
|
'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint),
|
|
|
|
# CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags,
|
|
# CUdevice dev);
|
|
'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device),
|
|
|
|
# CUresult cuCtxGetDevice ( CUdevice * device )
|
|
'cuCtxGetDevice': (c_int, POINTER(cu_device)),
|
|
|
|
# CUresult cuCtxGetCurrent (CUcontext *pctx);
|
|
'cuCtxGetCurrent': (c_int, POINTER(cu_context)),
|
|
|
|
# CUresult cuCtxPushCurrent (CUcontext pctx);
|
|
'cuCtxPushCurrent': (c_int, cu_context),
|
|
|
|
# CUresult cuCtxPopCurrent (CUcontext *pctx);
|
|
'cuCtxPopCurrent': (c_int, POINTER(cu_context)),
|
|
|
|
# CUresult cuCtxDestroy(CUcontext pctx);
|
|
'cuCtxDestroy': (c_int, cu_context),
|
|
|
|
# CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
|
|
# unsigned int numOptions,
|
|
# CUjit_option *options,
|
|
# void **optionValues);
|
|
'cuModuleLoadDataEx': (c_int, cu_module, c_void_p, c_uint,
|
|
POINTER(cu_jit_option), POINTER(c_void_p)),
|
|
|
|
# CUresult cuModuleUnload(CUmodule hmod);
|
|
'cuModuleUnload': (c_int, cu_module),
|
|
|
|
# CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
|
|
# const char *name);
|
|
'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p),
|
|
|
|
# CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule
|
|
# hmod, const char* name )
|
|
'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t),
|
|
cu_module, c_char_p),
|
|
|
|
# CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc,
|
|
# CUfunc_cache config);
|
|
'cuFuncSetCacheConfig': (c_int, cu_function, c_uint),
|
|
|
|
# CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
|
|
'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t),
|
|
|
|
# CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
|
|
# unsigned int flags);
|
|
'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint),
|
|
|
|
# CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N)
|
|
'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t),
|
|
|
|
# CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
|
|
# size_t N, CUstream hStream);
|
|
'cuMemsetD8Async': (c_int,
|
|
cu_device_ptr, c_uint8, c_size_t, cu_stream),
|
|
|
|
# CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
|
|
# size_t ByteCount);
|
|
'cuMemcpyHtoD': (c_int, cu_device_ptr, c_void_p, c_size_t),
|
|
|
|
# CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
|
|
# size_t ByteCount, CUstream hStream);
|
|
'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t,
|
|
cu_stream),
|
|
|
|
# CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice,
|
|
# size_t ByteCount);
|
|
'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t),
|
|
|
|
# CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice,
|
|
# size_t ByteCount, CUstream hStream);
|
|
'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t,
|
|
cu_stream),
|
|
|
|
|
|
# CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
|
|
# size_t ByteCount);
|
|
'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t),
|
|
|
|
# CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
|
|
# size_t ByteCount, CUstream hStream);
|
|
'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t,
|
|
cu_stream),
|
|
|
|
# CUresult cuMemFree(CUdeviceptr dptr);
|
|
'cuMemFree': (c_int, cu_device_ptr),
|
|
|
|
# CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
|
|
'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint),
|
|
|
|
# CUresult cuStreamDestroy(CUstream hStream);
|
|
'cuStreamDestroy': (c_int, cu_stream),
|
|
|
|
# CUresult cuStreamSynchronize(CUstream hStream);
|
|
'cuStreamSynchronize': (c_int, cu_stream),
|
|
|
|
# CUresult cuStreamAddCallback(
|
|
# CUstream hStream,
|
|
# CUstreamCallback callback,
|
|
# void* userData,
|
|
# unsigned int flags)
|
|
'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj,
|
|
py_object, c_uint),
|
|
|
|
# CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
|
|
# unsigned int gridDimY,
|
|
# unsigned int gridDimZ,
|
|
# unsigned int blockDimX,
|
|
# unsigned int blockDimY,
|
|
# unsigned int blockDimZ,
|
|
# unsigned int sharedMemBytes,
|
|
# CUstream hStream, void **kernelParams,
|
|
# void ** extra)
|
|
'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
|
|
c_uint, c_uint, c_uint, c_uint, cu_stream,
|
|
POINTER(c_void_p), POINTER(c_void_p)),
|
|
|
|
# CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX,
|
|
# unsigned int gridDimY,
|
|
# unsigned int gridDimZ,
|
|
# unsigned int blockDimX,
|
|
# unsigned int blockDimY,
|
|
# unsigned int blockDimZ,
|
|
# unsigned int sharedMemBytes,
|
|
# CUstream hStream, void **kernelParams)
|
|
'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
|
|
c_uint, c_uint, c_uint, c_uint, cu_stream,
|
|
POINTER(c_void_p)),
|
|
|
|
# CUresult cuMemHostAlloc ( void ** pp,
|
|
# size_t bytesize,
|
|
# unsigned int Flags
|
|
# )
|
|
'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint),
|
|
|
|
# CUresult cuMemFreeHost ( void * p )
|
|
'cuMemFreeHost': (c_int, c_void_p),
|
|
|
|
# CUresult cuMemHostRegister(void * p,
|
|
# size_t bytesize,
|
|
# unsigned int Flags)
|
|
'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint),
|
|
|
|
# CUresult cuMemHostUnregister(void * p)
|
|
'cuMemHostUnregister': (c_int, c_void_p),
|
|
|
|
# CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr,
|
|
# void * p,
|
|
# unsigned int Flags)
|
|
'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr),
|
|
c_void_p, c_uint),
|
|
|
|
# CUresult cuMemGetInfo(size_t * free, size_t * total)
|
|
'cuMemGetInfo' : (c_int, POINTER(c_size_t), POINTER(c_size_t)),
|
|
|
|
# CUresult cuEventCreate ( CUevent * phEvent,
|
|
# unsigned int Flags )
|
|
'cuEventCreate': (c_int, POINTER(cu_event), c_uint),
|
|
|
|
# CUresult cuEventDestroy ( CUevent hEvent )
|
|
'cuEventDestroy': (c_int, cu_event),
|
|
|
|
# CUresult cuEventElapsedTime ( float * pMilliseconds,
|
|
# CUevent hStart,
|
|
# CUevent hEnd )
|
|
'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event),
|
|
|
|
# CUresult cuEventQuery ( CUevent hEvent )
|
|
'cuEventQuery': (c_int, cu_event),
|
|
|
|
# CUresult cuEventRecord ( CUevent hEvent,
|
|
# CUstream hStream )
|
|
'cuEventRecord': (c_int, cu_event, cu_stream),
|
|
|
|
# CUresult cuEventSynchronize ( CUevent hEvent )
|
|
'cuEventSynchronize': (c_int, cu_event),
|
|
|
|
|
|
# CUresult cuStreamWaitEvent ( CUstream hStream,
|
|
# CUevent hEvent,
|
|
# unsigned int Flags )
|
|
'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint),
|
|
|
|
# CUresult cuPointerGetAttribute (
|
|
# void *data,
|
|
# CUpointer_attribute attribute,
|
|
# CUdeviceptr ptr)
|
|
'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr),
|
|
|
|
# CUresult cuMemGetAddressRange ( CUdeviceptr * pbase,
|
|
# size_t * psize,
|
|
# CUdeviceptr dptr
|
|
# )
|
|
'cuMemGetAddressRange': (c_int,
|
|
POINTER(cu_device_ptr),
|
|
POINTER(c_size_t),
|
|
cu_device_ptr),
|
|
|
|
# CUresult cuMemHostGetFlags ( unsigned int * pFlags,
|
|
# void * p )
|
|
'cuMemHostGetFlags': (c_int,
|
|
POINTER(c_uint),
|
|
c_void_p),
|
|
|
|
# CUresult cuCtxSynchronize ( void )
|
|
'cuCtxSynchronize' : (c_int,),
|
|
|
|
# CUresult
|
|
# cuLinkCreate(unsigned int numOptions, CUjit_option *options,
|
|
# void **optionValues, CUlinkState *stateOut);
|
|
'cuLinkCreate': (c_int,
|
|
c_uint, POINTER(cu_jit_option),
|
|
POINTER(c_void_p), POINTER(cu_link_state)),
|
|
|
|
# CUresult
|
|
# cuLinkAddData(CUlinkState state, CUjitInputType type, void *data,
|
|
# size_t size, const char *name, unsigned
|
|
# int numOptions, CUjit_option *options,
|
|
# void **optionValues);
|
|
'cuLinkAddData': (c_int,
|
|
cu_link_state, cu_jit_input_type, c_void_p,
|
|
c_size_t, c_char_p, c_uint, POINTER(cu_jit_option),
|
|
POINTER(c_void_p)),
|
|
|
|
# CUresult
|
|
# cuLinkAddFile(CUlinkState state, CUjitInputType type,
|
|
# const char *path, unsigned int numOptions,
|
|
# CUjit_option *options, void **optionValues);
|
|
|
|
'cuLinkAddFile': (c_int,
|
|
cu_link_state, cu_jit_input_type, c_char_p, c_uint,
|
|
POINTER(cu_jit_option), POINTER(c_void_p)),
|
|
|
|
# CUresult CUDAAPI
|
|
# cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut)
|
|
'cuLinkComplete': (c_int,
|
|
cu_link_state, POINTER(c_void_p), POINTER(c_size_t)),
|
|
|
|
# CUresult CUDAAPI
|
|
# cuLinkDestroy(CUlinkState state)
|
|
'cuLinkDestroy': (c_int, cu_link_state),
|
|
|
|
# cuProfilerStart ( void )
|
|
'cuProfilerStart': (c_int,),
|
|
|
|
# cuProfilerStop ( void )
|
|
'cuProfilerStop': (c_int,),
|
|
|
|
# CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib,
|
|
# CUfunction hfunc )
|
|
'cuFuncGetAttribute': (c_int,
|
|
POINTER(c_int), cu_function_attribute, cu_function),
|
|
|
|
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
|
|
# int *numBlocks,
|
|
# CUfunction func,
|
|
# int blockSize,
|
|
# size_t dynamicSMemSize);
|
|
'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int),
|
|
cu_function, c_size_t,
|
|
c_uint),
|
|
|
|
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
|
# int *numBlocks,
|
|
# CUfunction func,
|
|
# int blockSize,
|
|
# size_t dynamicSMemSize,
|
|
# unsigned int flags);
|
|
'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int,
|
|
POINTER(c_int),
|
|
cu_function,
|
|
c_size_t, c_uint),
|
|
|
|
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
|
|
# int *minGridSize, int *blockSize,
|
|
# CUfunction func,
|
|
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
|
|
# size_t dynamicSMemSize, int blockSizeLimit);
|
|
'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int),
|
|
cu_function, cu_occupancy_b2d_size,
|
|
c_size_t, c_int),
|
|
|
|
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
|
|
# int *minGridSize, int *blockSize,
|
|
# CUfunction func,
|
|
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
|
|
# size_t dynamicSMemSize, int blockSizeLimit,
|
|
# unsigned int flags);
|
|
'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int),
|
|
POINTER(c_int), cu_function,
|
|
cu_occupancy_b2d_size,
|
|
c_size_t, c_int, c_uint),
|
|
|
|
# CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr )
|
|
'cuIpcGetMemHandle': (c_int,
|
|
POINTER(cu_ipc_mem_handle), cu_device_ptr),
|
|
|
|
# CUresult cuIpcOpenMemHandle(
|
|
# CUdeviceptr* pdptr,
|
|
# CUipcMemHandle handle,
|
|
# unsigned int Flags)
|
|
'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle,
|
|
c_uint),
|
|
|
|
# CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr )
|
|
|
|
'cuIpcCloseMemHandle': (c_int, cu_device_ptr),
|
|
|
|
# CUresult cuCtxEnablePeerAccess (CUcontext peerContext, unsigned int Flags)
|
|
'cuCtxEnablePeerAccess': (c_int, cu_context, c_int),
|
|
|
|
# CUresult cuDeviceCanAccessPeer ( int* canAccessPeer,
|
|
# CUdevice dev, CUdevice peerDev )
|
|
'cuDeviceCanAccessPeer': (c_int,
|
|
POINTER(c_int), cu_device, cu_device),
|
|
|
|
# CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev )
|
|
'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device),
|
|
}
|