from numba import cuda import numpy as np from numba.cuda.testing import skip_on_cudasim, CUDATestCase import threading import unittest class TestMultiGPUContext(CUDATestCase): @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") def test_multigpu_context(self): @cuda.jit("void(float64[:], float64[:])") def copy_plus_1(inp, out): i = cuda.grid(1) if i < out.size: out[i] = inp[i] + 1 def check(inp, out): np.testing.assert_equal(inp + 1, out) N = 32 A = np.arange(N, dtype=np.float64) B = np.arange(N, dtype=np.float64) with cuda.gpus[0]: copy_plus_1[1, N](A, B) check(A, B) copy_plus_1[1, N](A, B) check(A, B) with cuda.gpus[0]: A0 = np.arange(N, dtype=np.float64) B0 = np.arange(N, dtype=np.float64) copy_plus_1[1, N](A0, B0) with cuda.gpus[1]: A1 = np.arange(N, dtype=np.float64) B1 = np.arange(N, dtype=np.float64) copy_plus_1[1, N](A1, B1) check(A0, B0) check(A1, B1) A = np.arange(N, dtype=np.float64) B = np.arange(N, dtype=np.float64) copy_plus_1[1, N](A, B) check(A, B) @skip_on_cudasim('Simulator does not support multiple threads') def test_multithreaded(self): def work(gpu, dA, results, ridx): try: with gpu: arr = dA.copy_to_host() except Exception as e: results[ridx] = e else: results[ridx] = np.all(arr == np.arange(10)) dA = cuda.to_device(np.arange(10)) nthreads = 10 results = [None] * nthreads threads = [threading.Thread(target=work, args=(cuda.gpus.current, dA, results, i)) for i in range(nthreads)] for th in threads: th.start() for th in threads: th.join() for r in results: if isinstance(r, BaseException): raise r else: self.assertTrue(r) @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") def test_with_context(self): @cuda.jit def vector_add_scalar(arr, val): i = cuda.grid(1) if i < arr.size: arr[i] += val hostarr = np.arange(10, dtype=np.float32) with cuda.gpus[0]: arr1 = cuda.to_device(hostarr) with cuda.gpus[1]: arr2 = cuda.to_device(hostarr) with cuda.gpus[0]: vector_add_scalar[1, 10](arr1, 1) with cuda.gpus[1]: vector_add_scalar[1, 10](arr2, 2) with cuda.gpus[0]: np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1)) with cuda.gpus[1]: np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2)) @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") def test_with_context_peer_copy(self): # Peer access is not always possible - for example, with one GPU in TCC # mode and one in WDDM - if that is the case, this test would fail so # we need to skip it. with cuda.gpus[0]: ctx = cuda.current_context() if not ctx.can_access_peer(1): self.skipTest('Peer access between GPUs disabled') # 1. Create a range in an array hostarr = np.arange(10, dtype=np.float32) # 2. Copy range array from host -> GPU 0 with cuda.gpus[0]: arr1 = cuda.to_device(hostarr) # 3. Initialize a zero-filled array on GPU 1 with cuda.gpus[1]: arr2 = cuda.to_device(np.zeros_like(hostarr)) with cuda.gpus[0]: # 4. Copy range from GPU 0 -> GPU 1 arr2.copy_to_device(arr1) # 5. Copy range from GPU 1 -> host and check contents np.testing.assert_equal(arr2.copy_to_host(), hostarr) if __name__ == '__main__': unittest.main()