import numpy as np
from numba import cuda, float64, void
from numba.cuda.testing import unittest, CUDATestCase
from numba.core import config


# NOTE: CUDA kernel does not return any value
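
# Use a smaller thread block under the CUDA simulator so the pure-Python
# simulation stays fast; SM_SIZE is the shape of the per-block
# shared-memory tile used for the error reduction.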
if config.ENABLE_CUDASIM:
    tpb = 4
else:
    tpb = 16
SM_SIZE = tpb, tpb


class TestCudaLaplace(CUDATestCase):
    def test_laplace_small(self):

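        # Inlined device function returning the larger of two floats,
        # used by the shared-memory max-reductions in the kernel below.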
        @cuda.jit(float64(float64, float64), device=True, inline=True)
        def get_max(a, b):
            if a > b:
                return a
            else:
                return b

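        # One Jacobi relaxation sweep: each block computes its tile of
        # Anew and writes the block's maximum residual to error[by, bx].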
        @cuda.jit(void(float64[:, :], float64[:, :], float64[:, :]))
        def jacobi_relax_core(A, Anew, error):
            err_sm = cuda.shared.array(SM_SIZE, dtype=float64)

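            # Note: the x thread index is stored in ty and the y index
            # in tx; err_sm is indexed as [ty, tx] throughout.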
            ty = cuda.threadIdx.x
            tx = cuda.threadIdx.y
            bx = cuda.blockIdx.x
            by = cuda.blockIdx.y

            n = A.shape[0]
            m = A.shape[1]

            i, j = cuda.grid(2)

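            # Interior points get the 4-point Jacobi average of their
            # neighbours; boundary threads leave a zero residual so they
            # do not affect the max-reduction.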
            err_sm[ty, tx] = 0
            if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
                Anew[j, i] = 0.25 * (A[j, i + 1] + A[j, i - 1]
                                     + A[j - 1, i] + A[j + 1, i])
                err_sm[ty, tx] = Anew[j, i] - A[j, i]

            cuda.syncthreads()

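            # Tree reduction: the active stride t halves every pass, and
            # cuda.syncthreads() keeps the whole block in step.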
            # max-reduce err_sm vertically
            t = tpb // 2
            while t > 0:
                if ty < t:
                    err_sm[ty, tx] = get_max(err_sm[ty, tx],
                                             err_sm[ty + t, tx])
                t //= 2
                cuda.syncthreads()

            # max-reduce err_sm horizontally
            t = tpb // 2
            while t > 0:
                if tx < t and ty == 0:
                    err_sm[ty, tx] = get_max(err_sm[ty, tx],
                                             err_sm[ty, tx + t])
                t //= 2
                cuda.syncthreads()

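            # err_sm[0, 0] now holds the block-wide maximum residual;
            # one thread publishes it to the per-block error grid.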
            if tx == 0 and ty == 0:
                error[by, bx] = err_sm[0, 0]

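        # Keep the problem small under the simulator; use the full size
        # on real hardware.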
        if config.ENABLE_CUDASIM:
            NN, NM = 4, 4
            iter_max = 20
        else:
            NN, NM = 256, 256
            iter_max = 1000

        A = np.zeros((NN, NM), dtype=np.float64)
        Anew = np.zeros((NN, NM), dtype=np.float64)

        n = NN

        tol = 1.0e-6
        error = 1.0

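        # Dirichlet boundary condition: the left column of the grid is
        # held at 1.0.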
        for j in range(n):
            A[j, 0] = 1.0
            Anew[j, 0] = 1.0

        iter = 0

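        # Launch configuration: one thread per grid point, assuming NN
        # and NM are exact multiples of tpb.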
        blockdim = (tpb, tpb)
        griddim = (NN // blockdim[0], NM // blockdim[1])

        error_grid = np.zeros(griddim)

        stream = cuda.stream()

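        # Queue the host-to-device transfers asynchronously on the
        # stream; the arrays stay resident on the device between sweeps.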
        dA = cuda.to_device(A, stream)        # to device and don't come back
        dAnew = cuda.to_device(Anew, stream)  # to device and don't come back
        derror_grid = cuda.to_device(error_grid, stream)

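        # Iterate until the maximum residual falls below tol or the
        # iteration budget is exhausted.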
        while error > tol and iter < iter_max:
            self.assertTrue(error_grid.dtype == np.float64)

            jacobi_relax_core[griddim, blockdim, stream](dA, dAnew,
                                                         derror_grid)

            derror_grid.copy_to_host(error_grid, stream=stream)

            # error_grid is only valid on the host once the stream has
            # synchronized
            stream.synchronize()

            error = np.abs(error_grid).max()

            # swap dA and dAnew so the next sweep reads the values just
            # written (ping-pong buffering avoids a device-side copy)
            dA, dAnew = dAnew, dA

            iter += 1


if __name__ == '__main__':
    unittest.main()