# numba/cuda/tests/cudapy/test_multigpu.py
from numba import cuda
import numpy as np
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
import threading
import unittest


class TestMultiGPUContext(CUDATestCase):
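    """Tests for multi-GPU usage via the cuda.gpus context managers."""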

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 GPU")
    def test_multigpu_context(self):
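        """Launch the same kernel on GPU 0 and GPU 1, switching devices with
        nested `with cuda.gpus[n]` blocks, and check the results.
        """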
        @cuda.jit("void(float64[:], float64[:])")
        def copy_plus_1(inp, out):
            i = cuda.grid(1)
            if i < out.size:
                out[i] = inp[i] + 1

        def check(inp, out):
            np.testing.assert_equal(inp + 1, out)

        N = 32
        A = np.arange(N, dtype=np.float64)
        B = np.arange(N, dtype=np.float64)

        with cuda.gpus[0]:
            copy_plus_1[1, N](A, B)

        check(A, B)

        copy_plus_1[1, N](A, B)
        check(A, B)

        with cuda.gpus[0]:
            A0 = np.arange(N, dtype=np.float64)
            B0 = np.arange(N, dtype=np.float64)
            copy_plus_1[1, N](A0, B0)

            with cuda.gpus[1]:
                A1 = np.arange(N, dtype=np.float64)
                B1 = np.arange(N, dtype=np.float64)
                copy_plus_1[1, N](A1, B1)

            check(A0, B0)
            check(A1, B1)

            A = np.arange(N, dtype=np.float64)
            B = np.arange(N, dtype=np.float64)
            copy_plus_1[1, N](A, B)
            check(A, B)

    @skip_on_cudasim('Simulator does not support multiple threads')
    def test_multithreaded(self):
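        """Share one device array across several host threads; each thread
        activates the current GPU context and copies the array back to the
        host, and any exception raised in a worker is re-raised here.
        """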
        def work(gpu, dA, results, ridx):
            try:
                with gpu:
                    arr = dA.copy_to_host()
            except Exception as e:
                results[ridx] = e
            else:
                results[ridx] = np.all(arr == np.arange(10))

        dA = cuda.to_device(np.arange(10))

        nthreads = 10
        results = [None] * nthreads
        threads = [threading.Thread(target=work,
                                    args=(cuda.gpus.current, dA, results, i))
                   for i in range(nthreads)]
        for th in threads:
            th.start()
        for th in threads:
            th.join()

        for r in results:
            if isinstance(r, BaseException):
                raise r
            else:
                self.assertTrue(r)

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 GPU")
    def test_with_context(self):
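        """Allocate a separate device array on each of two GPUs and operate
        on each one, re-entering the owning device's context every time.
        """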
        @cuda.jit
        def vector_add_scalar(arr, val):
            i = cuda.grid(1)
            if i < arr.size:
                arr[i] += val

        hostarr = np.arange(10, dtype=np.float32)

        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        with cuda.gpus[1]:
            arr2 = cuda.to_device(hostarr)

        with cuda.gpus[0]:
            vector_add_scalar[1, 10](arr1, 1)

        with cuda.gpus[1]:
            vector_add_scalar[1, 10](arr2, 2)

        with cuda.gpus[0]:
            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))

        with cuda.gpus[1]:
            np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 GPU")
    def test_with_context_peer_copy(self):
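        """Copy a device array directly from GPU 0 to GPU 1 using peer
        access, then copy it back to the host from GPU 1 to verify it.
        """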
        # Peer access is not always possible - for example, with one GPU in
        # TCC mode and one in WDDM - if that is the case, this test would
        # fail so we need to skip it.
        with cuda.gpus[0]:
            ctx = cuda.current_context()
            if not ctx.can_access_peer(1):
                self.skipTest('Peer access between GPUs disabled')

        # 1. Create a range in an array
        hostarr = np.arange(10, dtype=np.float32)

        # 2. Copy range array from host -> GPU 0
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        # 3. Initialize a zero-filled array on GPU 1
        with cuda.gpus[1]:
            arr2 = cuda.to_device(np.zeros_like(hostarr))

        with cuda.gpus[0]:
            # 4. Copy range from GPU 0 -> GPU 1
            arr2.copy_to_device(arr1)

            # 5. Copy range from GPU 1 -> host and check contents
            np.testing.assert_equal(arr2.copy_to_host(), hostarr)


if __name__ == '__main__':
    unittest.main()