ai-content-maker/.venv/Lib/site-packages/numba/cuda/tests/cudapy/test_laplace.py

120 lines
3.1 KiB
Python

import numpy as np
from numba import cuda, float64, void
from numba.cuda.testing import unittest, CUDATestCase
from numba.core import config
# NOTE: CUDA kernel does not return any value
if config.ENABLE_CUDASIM:
tpb = 4
else:
tpb = 16
SM_SIZE = tpb, tpb
class TestCudaLaplace(CUDATestCase):
def test_laplace_small(self):
@cuda.jit(float64(float64, float64), device=True, inline=True)
def get_max(a, b):
if a > b:
return a
else:
return b
@cuda.jit(void(float64[:, :], float64[:, :], float64[:, :]))
def jocabi_relax_core(A, Anew, error):
err_sm = cuda.shared.array(SM_SIZE, dtype=float64)
ty = cuda.threadIdx.x
tx = cuda.threadIdx.y
bx = cuda.blockIdx.x
by = cuda.blockIdx.y
n = A.shape[0]
m = A.shape[1]
i, j = cuda.grid(2)
err_sm[ty, tx] = 0
if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
Anew[j, i] = 0.25 * ( A[j, i + 1] + A[j, i - 1]
+ A[j - 1, i] + A[j + 1, i])
err_sm[ty, tx] = Anew[j, i] - A[j, i]
cuda.syncthreads()
# max-reduce err_sm vertically
t = tpb // 2
while t > 0:
if ty < t:
err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty + t, tx])
t //= 2
cuda.syncthreads()
# max-reduce err_sm horizontally
t = tpb // 2
while t > 0:
if tx < t and ty == 0:
err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty, tx + t])
t //= 2
cuda.syncthreads()
if tx == 0 and ty == 0:
error[by, bx] = err_sm[0, 0]
if config.ENABLE_CUDASIM:
NN, NM = 4, 4
iter_max = 20
else:
NN, NM = 256, 256
iter_max = 1000
A = np.zeros((NN, NM), dtype=np.float64)
Anew = np.zeros((NN, NM), dtype=np.float64)
n = NN
tol = 1.0e-6
error = 1.0
for j in range(n):
A[j, 0] = 1.0
Anew[j, 0] = 1.0
iter = 0
blockdim = (tpb, tpb)
griddim = (NN // blockdim[0], NM // blockdim[1])
error_grid = np.zeros(griddim)
stream = cuda.stream()
dA = cuda.to_device(A, stream) # to device and don't come back
dAnew = cuda.to_device(Anew, stream) # to device and don't come back
derror_grid = cuda.to_device(error_grid, stream)
while error > tol and iter < iter_max:
self.assertTrue(error_grid.dtype == np.float64)
jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
derror_grid.copy_to_host(error_grid, stream=stream)
# error_grid is available on host
stream.synchronize()
error = np.abs(error_grid).max()
# swap dA and dAnew
tmp = dA
dA = dAnew
dAnew = tmp
iter += 1
if __name__ == '__main__':
unittest.main()