from numba import cuda
import numpy as np
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
import threading
import unittest


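# Tests for Numba's multi-GPU support: `cuda.gpus[n]` is a context manager
# that makes device n current for the duration of the enclosed block.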
class TestMultiGPUContext(CUDATestCase):
    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 GPU")
    def test_multigpu_context(self):
        @cuda.jit("void(float64[:], float64[:])")
        def copy_plus_1(inp, out):
            # One thread per element; guard against threads past the end.
            i = cuda.grid(1)
            if i < out.size:
                out[i] = inp[i] + 1

        def check(inp, out):
            np.testing.assert_equal(inp + 1, out)

        N = 32
        A = np.arange(N, dtype=np.float64)
        B = np.arange(N, dtype=np.float64)

        # Launch with GPU 0 explicitly selected.
        with cuda.gpus[0]:
            copy_plus_1[1, N](A, B)

        check(A, B)

        # Launching outside any `with` block uses the current default device.
        copy_plus_1[1, N](A, B)
        check(A, B)

        # Device selections nest: GPU 1 is current inside the inner block,
        # and GPU 0 becomes current again when it exits.
        with cuda.gpus[0]:
            A0 = np.arange(N, dtype=np.float64)
            B0 = np.arange(N, dtype=np.float64)
            copy_plus_1[1, N](A0, B0)

            with cuda.gpus[1]:
                A1 = np.arange(N, dtype=np.float64)
                B1 = np.arange(N, dtype=np.float64)
                copy_plus_1[1, N](A1, B1)

            check(A0, B0)
            check(A1, B1)

            A = np.arange(N, dtype=np.float64)
            B = np.arange(N, dtype=np.float64)
            copy_plus_1[1, N](A, B)
            check(A, B)

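    # `cuda.gpus.current` is the currently active device object; it is handed
    # to each worker thread and used as a context manager there, checking
    # that a single device context can be shared across host threads.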
    @skip_on_cudasim('Simulator does not support multiple threads')
    def test_multithreaded(self):
        def work(gpu, dA, results, ridx):
            # Copy the device array back to the host from a worker thread,
            # recording either the comparison result or any raised exception.
            try:
                with gpu:
                    arr = dA.copy_to_host()
            except Exception as e:
                results[ridx] = e
            else:
                results[ridx] = np.all(arr == np.arange(10))

        dA = cuda.to_device(np.arange(10))

        nthreads = 10
        results = [None] * nthreads
        threads = [threading.Thread(target=work,
                                    args=(cuda.gpus.current, dA, results, i))
                   for i in range(nthreads)]
        for th in threads:
            th.start()

        for th in threads:
            th.join()

        # Re-raise any exception captured in a worker; otherwise every
        # thread must have seen the expected array contents.
        for r in results:
            if isinstance(r, BaseException):
                raise r
            else:
                self.assertTrue(r)

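    # Each device gets its own copy of the data, so the same kernel can be
    # launched on each device independently by re-entering its context.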
    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 GPU")
    def test_with_context(self):
        @cuda.jit
        def vector_add_scalar(arr, val):
            i = cuda.grid(1)
            if i < arr.size:
                arr[i] += val

        hostarr = np.arange(10, dtype=np.float32)
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        with cuda.gpus[1]:
            arr2 = cuda.to_device(hostarr)

        with cuda.gpus[0]:
            vector_add_scalar[1, 10](arr1, 1)

        with cuda.gpus[1]:
            vector_add_scalar[1, 10](arr2, 2)

        # Each array is read back with its owning device's context active.
        with cuda.gpus[0]:
            np.testing.assert_equal(arr1.copy_to_host(), hostarr + 1)

        with cuda.gpus[1]:
            np.testing.assert_equal(arr2.copy_to_host(), hostarr + 2)

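    # Device-to-device copies between GPUs require peer access, so the test
    # is skipped when the devices cannot access each other's memory.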
    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 GPU")
    def test_with_context_peer_copy(self):
        # Peer access is not always possible - for example, with one GPU in
        # TCC mode and one in WDDM. This test would fail in that case, so we
        # need to skip it.
        with cuda.gpus[0]:
            ctx = cuda.current_context()
            if not ctx.can_access_peer(1):
                self.skipTest('Peer access between GPUs disabled')

        # 1. Create a range in an array
        hostarr = np.arange(10, dtype=np.float32)

        # 2. Copy the range array from host -> GPU 0
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        # 3. Initialize a zero-filled array on GPU 1
        with cuda.gpus[1]:
            arr2 = cuda.to_device(np.zeros_like(hostarr))

        with cuda.gpus[0]:
            # 4. Copy the range from GPU 0 -> GPU 1
            arr2.copy_to_device(arr1)

            # 5. Copy the range from GPU 1 -> host and check contents
            np.testing.assert_equal(arr2.copy_to_host(), hostarr)


if __name__ == '__main__':
    unittest.main()