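"""Tests for using multiple GPUs from numba.cuda.

Exercises the cuda.gpus device list: switching devices with its context
managers, launching kernels and copying data on each device, activating a
device context from multiple threads, and peer-to-peer copies between GPUs.
"""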
from numba import cuda
import numpy as np
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
import threading
import unittest


class TestMultiGPUContext(CUDATestCase):
    @unittest.skipIf(len(cuda.gpus) < 2, "needs more than one GPU")
    def test_multigpu_context(self):
        @cuda.jit("void(float64[:], float64[:])")
        def copy_plus_1(inp, out):
            i = cuda.grid(1)
            if i < out.size:
                out[i] = inp[i] + 1

        def check(inp, out):
            np.testing.assert_equal(inp + 1, out)

        N = 32
        A = np.arange(N, dtype=np.float64)
        B = np.arange(N, dtype=np.float64)

        # Launch under an explicit device context.
        with cuda.gpus[0]:
            copy_plus_1[1, N](A, B)

        check(A, B)

        # Launching with no explicit context uses the current device.
        copy_plus_1[1, N](A, B)
        check(A, B)

        # Device contexts nest: GPU 1 is activated inside the GPU 0 context,
        # and GPU 0 becomes current again when the inner context exits.
        with cuda.gpus[0]:
            A0 = np.arange(N, dtype=np.float64)
            B0 = np.arange(N, dtype=np.float64)
            copy_plus_1[1, N](A0, B0)

            with cuda.gpus[1]:
                A1 = np.arange(N, dtype=np.float64)
                B1 = np.arange(N, dtype=np.float64)
                copy_plus_1[1, N](A1, B1)

            check(A0, B0)
            check(A1, B1)

            A = np.arange(N, dtype=np.float64)
            B = np.arange(N, dtype=np.float64)
            copy_plus_1[1, N](A, B)
            check(A, B)
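
    # Threads share devices: the device current in the main thread can have
    # its context activated concurrently from several worker threads.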
    @skip_on_cudasim('Simulator does not support multiple threads')
    def test_multithreaded(self):
        def work(gpu, dA, results, ridx):
            # Activate the device context in this thread, then read the
            # device array back; record either the outcome or the exception.
            try:
                with gpu:
                    arr = dA.copy_to_host()
            except Exception as e:
                results[ridx] = e
            else:
                results[ridx] = np.all(arr == np.arange(10))

        dA = cuda.to_device(np.arange(10))

        nthreads = 10
        results = [None] * nthreads
        # cuda.gpus.current is the device active in this (the main) thread.
        threads = [threading.Thread(target=work, args=(cuda.gpus.current,
                                                       dA, results, i))
                   for i in range(nthreads)]
        for th in threads:
            th.start()

        for th in threads:
            th.join()

        for r in results:
            if isinstance(r, BaseException):
                raise r
            else:
                self.assertTrue(r)
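
    # Each device gets its own copy of the data; a kernel launched under a
    # device's context updates only that device's array.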
    @unittest.skipIf(len(cuda.gpus) < 2, "needs more than one GPU")
    def test_with_context(self):

        @cuda.jit
        def vector_add_scalar(arr, val):
            i = cuda.grid(1)
            if i < arr.size:
                arr[i] += val

        hostarr = np.arange(10, dtype=np.float32)
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        with cuda.gpus[1]:
            arr2 = cuda.to_device(hostarr)

        with cuda.gpus[0]:
            vector_add_scalar[1, 10](arr1, 1)

        with cuda.gpus[1]:
            vector_add_scalar[1, 10](arr2, 2)

        with cuda.gpus[0]:
            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))

        with cuda.gpus[1]:
            np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))

    @unittest.skipIf(len(cuda.gpus) < 2, "needs more than one GPU")
    def test_with_context_peer_copy(self):
        # Peer access is not always possible - for example, with one GPU in
        # TCC mode and one in WDDM. If that is the case, this test would
        # fail, so skip it.
        with cuda.gpus[0]:
            ctx = cuda.current_context()
            if not ctx.can_access_peer(1):
                self.skipTest('Peer access between GPUs disabled')

        # 1. Create a range in an array
        hostarr = np.arange(10, dtype=np.float32)

        # 2. Copy the range array from the host to GPU 0
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        # 3. Initialize a zero-filled array on GPU 1
        with cuda.gpus[1]:
            arr2 = cuda.to_device(np.zeros_like(hostarr))

        with cuda.gpus[0]:
            # 4. Copy the range from GPU 0 to GPU 1 (peer-to-peer)
            arr2.copy_to_device(arr1)

        # 5. Copy the range from GPU 1 back to the host and check contents
        np.testing.assert_equal(arr2.copy_to_host(), hostarr)


if __name__ == '__main__':
    unittest.main()