ai-content-maker/.venv/Lib/site-packages/numba/cuda/tests/cudapy/test_matmul.py

import numpy as np

from numba import cuda, float32, void
from numba.cuda.testing import unittest, CUDATestCase
from numba.core import config

# The simulator is much slower than real hardware, so it gets a far smaller
# launch configuration to keep the test's running time reasonable.
bpg, tpb = (2, 8) if config.ENABLE_CUDASIM else (50, 32)

# Edge length of the square matrices used by the test.
n = tpb * bpg
# Shape of one square tile: tpb elements along each side.
SM_SIZE = tpb, tpb


class TestCudaMatMul(CUDATestCase):
    """Check a tiled, shared-memory matrix multiply kernel against NumPy."""

    def test_func(self):
        """Multiply two random ``n`` x ``n`` float32 matrices on the device.

        The kernel is launched on a ``(bpg, bpg)`` grid of ``(tpb, tpb)``
        blocks, and its output is compared with ``np.dot`` computed on the
        host using a relative tolerance of ``1e-5``.
        """

        @cuda.jit(void(float32[:, ::1], float32[:, ::1], float32[:, ::1]))
        def cu_square_matrix_mul(A, B, C):
            # Shared-memory tiles.  The two shapes are spelled differently
            # (module-level tuple vs. inline tuple); that is kept as-is.
            tile_a = cuda.shared.array(shape=SM_SIZE, dtype=float32)
            tile_b = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

            tcol = cuda.threadIdx.x
            trow = cuda.threadIdx.y

            # Coordinates of the element of C that this thread writes.
            col = tcol + cuda.blockIdx.x * cuda.blockDim.x
            row = trow + cuda.blockIdx.y * cuda.blockDim.y

            # The same bounds test guards every load, accumulate and store.
            inside = col < n and row < n

            total = float32(0)  # float32 accumulator keeps the math in f32
            for step in range(bpg):
                shift = step * tpb

                # Stage one tile of A and one tile of B.
                if inside:
                    tile_a[trow, tcol] = A[row, tcol + shift]
                    tile_b[trow, tcol] = B[trow + shift, col]

                # The barrier sits outside the bounds test, so every thread
                # reaches it before any staged value is read.
                cuda.syncthreads()

                if inside:
                    for k in range(tpb):
                        total += tile_a[trow, k] * tile_b[k, tcol]

                # Second barrier: nobody overwrites the tiles on the next
                # iteration until all threads are done reading them.
                cuda.syncthreads()

            if inside:
                C[row, col] = total

        # Fixed seed so the inputs are reproducible from run to run.
        np.random.seed(42)
        host_a = np.random.random((n, n)).astype(np.float32)
        host_b = np.random.random((n, n)).astype(np.float32)
        host_c = np.empty_like(host_a)

        grid_dim = (bpg, bpg)
        block_dim = (tpb, tpb)

        # Transfers and the launch are all issued on one stream; the context
        # manager synchronizes that stream on exit.
        stream = cuda.stream()
        with stream.auto_synchronize():
            dev_a = cuda.to_device(host_a, stream)
            dev_b = cuda.to_device(host_b, stream)
            dev_c = cuda.to_device(host_c, stream)
            launch = cu_square_matrix_mul[grid_dim, block_dim, stream]
            launch(dev_a, dev_b, dev_c)
            dev_c.copy_to_host(host_c, stream)

        # Reference result computed on the host.
        expected = np.dot(host_a, host_b)

        # Device and host results must agree to within the tolerance.
        np.testing.assert_allclose(host_c, expected, rtol=1e-5)


# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
first commit 2024-05-03 04:18:51 +03:00			`import numpy as np`

			`from numba import cuda, float32, void`
			`from numba.cuda.testing import unittest, CUDATestCase`
			`from numba.core import config`

			`# Ensure the test takes a reasonable amount of time in the simulator`
			`if config.ENABLE_CUDASIM:`
			`bpg, tpb = 2, 8`
			`else:`
			`bpg, tpb = 50, 32`

			`n = bpg * tpb`
			`SM_SIZE = (tpb, tpb)`


			`class TestCudaMatMul(CUDATestCase):`

			`def test_func(self):`

			`@cuda.jit(void(float32[:, ::1], float32[:, ::1], float32[:, ::1]))`
			`def cu_square_matrix_mul(A, B, C):`
			`sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)`
			`sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)`

			`tx = cuda.threadIdx.x`
			`ty = cuda.threadIdx.y`
			`bx = cuda.blockIdx.x`
			`by = cuda.blockIdx.y`
			`bw = cuda.blockDim.x`
			`bh = cuda.blockDim.y`

			`x = tx + bx * bw`
			`y = ty + by * bh`

			`acc = float32(0) # forces all the math to be f32`
			`for i in range(bpg):`
			`if x < n and y < n:`
			`sA[ty, tx] = A[y, tx + i * tpb]`
			`sB[ty, tx] = B[ty + i * tpb, x]`

			`cuda.syncthreads()`

			`if x < n and y < n:`
			`for j in range(tpb):`
			`acc += sA[ty, j] * sB[j, tx]`

			`cuda.syncthreads()`

			`if x < n and y < n:`
			`C[y, x] = acc`

			`np.random.seed(42)`
			`A = np.array(np.random.random((n, n)), dtype=np.float32)`
			`B = np.array(np.random.random((n, n)), dtype=np.float32)`
			`C = np.empty_like(A)`

			`stream = cuda.stream()`
			`with stream.auto_synchronize():`
			`dA = cuda.to_device(A, stream)`
			`dB = cuda.to_device(B, stream)`
			`dC = cuda.to_device(C, stream)`
			`cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)`
			`dC.copy_to_host(C, stream)`

			`# Host compute`
			`Cans = np.dot(A, B)`

			`# Check result`
			`np.testing.assert_allclose(C, Cans, rtol=1e-5)`


			`if __name__ == '__main__':`
			`unittest.main()`