ai-content-maker/.venv/Lib/site-packages/numba/cuda/tests/cudapy/test_sm.py

from numba import cuda, int32, float64, void
from numba.core.errors import TypingError
from numba.core import types
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim

import numpy as np
from numba.np import numpy_support as nps

from .extensions_usecases import test_struct_model_type, TestStruct

recordwith2darray = np.dtype([('i', np.int32),
                              ('j', np.float32, (3, 2))])


class TestSharedMemoryIssue(CUDATestCase):
    def test_issue_953_sm_linkage_conflict(self):
        @cuda.jit(device=True)
        def inner():
            inner_arr = cuda.shared.array(1, dtype=int32)  # noqa: F841

        @cuda.jit
        def outer():
            outer_arr = cuda.shared.array(1, dtype=int32)  # noqa: F841
            inner()

        outer[1, 1]()

    def _check_shared_array_size(self, shape, expected):
        @cuda.jit
        def s(a):
            arr = cuda.shared.array(shape, dtype=int32)
            a[0] = arr.size

        result = np.zeros(1, dtype=np.int32)
        s[1, 1](result)
        self.assertEqual(result[0], expected)

    def test_issue_1051_shared_size_broken_1d(self):
        self._check_shared_array_size(2, 2)

    def test_issue_1051_shared_size_broken_2d(self):
        self._check_shared_array_size((2, 3), 6)

    def test_issue_1051_shared_size_broken_3d(self):

        self._check_shared_array_size((2, 3, 4), 24)

    def _check_shared_array_size_fp16(self, shape, expected, ty):
        @cuda.jit
        def s(a):
            arr = cuda.shared.array(shape, dtype=ty)
            a[0] = arr.size

        result = np.zeros(1, dtype=np.float16)
        s[1, 1](result)
        self.assertEqual(result[0], expected)

    def test_issue_fp16_support(self):
        self._check_shared_array_size_fp16(2, 2, types.float16)
        self._check_shared_array_size_fp16(2, 2, np.float16)

    def test_issue_2393(self):
        """
        Test issue of warp misalign address due to nvvm not knowing the
        alignment(? but it should have taken the natural alignment of the type)
        """
        num_weights = 2
        num_blocks = 48
        examples_per_block = 4
        threads_per_block = 1

        @cuda.jit
        def costs_func(d_block_costs):
            s_features = cuda.shared.array((examples_per_block, num_weights),
                                           float64)
            s_initialcost = cuda.shared.array(7, float64)  # Bug

            threadIdx = cuda.threadIdx.x

            prediction = 0
            for j in range(num_weights):
                prediction += s_features[threadIdx, j]

            d_block_costs[0] = s_initialcost[0] + prediction

        block_costs = np.zeros(num_blocks, dtype=np.float64)
        d_block_costs = cuda.to_device(block_costs)

        costs_func[num_blocks, threads_per_block](d_block_costs)

        cuda.synchronize()


class TestSharedMemory(CUDATestCase):
    def _test_shared(self, arr):
        # Use a kernel that copies via shared memory to check loading and
        # storing different dtypes with shared memory. All threads in a block
        # collaborate to load in values, then the output values are written
        # only by the first thread in the block after synchronization.

        nelem = len(arr)
        nthreads = 16
        nblocks = int(nelem / nthreads)
        dt = nps.from_dtype(arr.dtype)

        @cuda.jit
        def use_sm_chunk_copy(x, y):
            sm = cuda.shared.array(nthreads, dtype=dt)

            tx = cuda.threadIdx.x
            bx = cuda.blockIdx.x
            bd = cuda.blockDim.x

            # Load this block's chunk into shared
            i = bx * bd + tx
            if i < len(x):
                sm[tx] = x[i]

            cuda.syncthreads()

            # One thread per block writes this block's chunk
            if tx == 0:
                for j in range(nthreads):
                    y[bd * bx + j] = sm[j]

        d_result = cuda.device_array_like(arr)
        use_sm_chunk_copy[nblocks, nthreads](arr, d_result)
        host_result = d_result.copy_to_host()
        np.testing.assert_array_equal(arr, host_result)

    def test_shared_recarray(self):
        arr = np.recarray(128, dtype=recordwith2darray)
        for x in range(len(arr)):
            arr[x].i = x
            j = np.arange(3 * 2, dtype=np.float32)
            arr[x].j = j.reshape(3, 2) * x

        self._test_shared(arr)

    def test_shared_bool(self):
        arr = np.random.randint(2, size=(1024,), dtype=np.bool_)
        self._test_shared(arr)

    def _test_dynshared_slice(self, func, arr, expected):
        # Check that slices of shared memory are correct
        # (See Bug #5073 - prior to the addition of these tests and
        # corresponding fix, slices of dynamic shared arrays all aliased each
        # other)
        nshared = arr.size * arr.dtype.itemsize
        func[1, 1, 0, nshared](arr)
        np.testing.assert_array_equal(expected, arr)

    def test_dynshared_slice_write(self):
        # Test writing values into disjoint slices of dynamic shared memory
        @cuda.jit
        def slice_write(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[0:1]
            sm2 = dynsmem[1:2]

            sm1[0] = 1
            sm2[0] = 2
            x[0] = dynsmem[0]
            x[1] = dynsmem[1]

        arr = np.zeros(2, dtype=np.int32)
        expected = np.array([1, 2], dtype=np.int32)
        self._test_dynshared_slice(slice_write, arr, expected)

    def test_dynshared_slice_read(self):
        # Test reading values from disjoint slices of dynamic shared memory
        @cuda.jit
        def slice_read(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[0:1]
            sm2 = dynsmem[1:2]

            dynsmem[0] = 1
            dynsmem[1] = 2
            x[0] = sm1[0]
            x[1] = sm2[0]

        arr = np.zeros(2, dtype=np.int32)
        expected = np.array([1, 2], dtype=np.int32)
        self._test_dynshared_slice(slice_read, arr, expected)

    def test_dynshared_slice_diff_sizes(self):
        # Test reading values from disjoint slices of dynamic shared memory
        # with different sizes
        @cuda.jit
        def slice_diff_sizes(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[0:1]
            sm2 = dynsmem[1:3]

            dynsmem[0] = 1
            dynsmem[1] = 2
            dynsmem[2] = 3
            x[0] = sm1[0]
            x[1] = sm2[0]
            x[2] = sm2[1]

        arr = np.zeros(3, dtype=np.int32)
        expected = np.array([1, 2, 3], dtype=np.int32)
        self._test_dynshared_slice(slice_diff_sizes, arr, expected)

    def test_dynshared_slice_overlap(self):
        # Test reading values from overlapping slices of dynamic shared memory
        @cuda.jit
        def slice_overlap(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[0:2]
            sm2 = dynsmem[1:4]

            dynsmem[0] = 1
            dynsmem[1] = 2
            dynsmem[2] = 3
            dynsmem[3] = 4
            x[0] = sm1[0]
            x[1] = sm1[1]
            x[2] = sm2[0]
            x[3] = sm2[1]
            x[4] = sm2[2]

        arr = np.zeros(5, dtype=np.int32)
        expected = np.array([1, 2, 2, 3, 4], dtype=np.int32)
        self._test_dynshared_slice(slice_overlap, arr, expected)

    def test_dynshared_slice_gaps(self):
        # Test writing values to slices of dynamic shared memory doesn't write
        # outside the slice
        @cuda.jit
        def slice_gaps(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[1:3]
            sm2 = dynsmem[4:6]

            # Initial values for dynamic shared memory, some to be overwritten
            dynsmem[0] = 99
            dynsmem[1] = 99
            dynsmem[2] = 99
            dynsmem[3] = 99
            dynsmem[4] = 99
            dynsmem[5] = 99
            dynsmem[6] = 99

            sm1[0] = 1
            sm1[1] = 2
            sm2[0] = 3
            sm2[1] = 4

            x[0] = dynsmem[0]
            x[1] = dynsmem[1]
            x[2] = dynsmem[2]
            x[3] = dynsmem[3]
            x[4] = dynsmem[4]
            x[5] = dynsmem[5]
            x[6] = dynsmem[6]

        arr = np.zeros(7, dtype=np.int32)
        expected = np.array([99, 1, 2, 99, 3, 4, 99], dtype=np.int32)
        self._test_dynshared_slice(slice_gaps, arr, expected)

    def test_dynshared_slice_write_backwards(self):
        # Test writing values into disjoint slices of dynamic shared memory
        # with negative steps
        @cuda.jit
        def slice_write_backwards(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[1::-1]
            sm2 = dynsmem[3:1:-1]

            sm1[0] = 1
            sm1[1] = 2
            sm2[0] = 3
            sm2[1] = 4
            x[0] = dynsmem[0]
            x[1] = dynsmem[1]
            x[2] = dynsmem[2]
            x[3] = dynsmem[3]

        arr = np.zeros(4, dtype=np.int32)
        expected = np.array([2, 1, 4, 3], dtype=np.int32)
        self._test_dynshared_slice(slice_write_backwards, arr, expected)

    def test_dynshared_slice_nonunit_stride(self):
        # Test writing values into slice of dynamic shared memory with
        # non-unit stride
        @cuda.jit
        def slice_nonunit_stride(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[::2]

            # Initial values for dynamic shared memory, some to be overwritten
            dynsmem[0] = 99
            dynsmem[1] = 99
            dynsmem[2] = 99
            dynsmem[3] = 99
            dynsmem[4] = 99
            dynsmem[5] = 99

            sm1[0] = 1
            sm1[1] = 2
            sm1[2] = 3

            x[0] = dynsmem[0]
            x[1] = dynsmem[1]
            x[2] = dynsmem[2]
            x[3] = dynsmem[3]
            x[4] = dynsmem[4]
            x[5] = dynsmem[5]

        arr = np.zeros(6, dtype=np.int32)
        expected = np.array([1, 99, 2, 99, 3, 99], dtype=np.int32)
        self._test_dynshared_slice(slice_nonunit_stride, arr, expected)

    def test_dynshared_slice_nonunit_reverse_stride(self):
        # Test writing values into slice of dynamic shared memory with
        # reverse non-unit stride
        @cuda.jit
        def slice_nonunit_reverse_stride(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[-1::-2]

            # Initial values for dynamic shared memory, some to be overwritten
            dynsmem[0] = 99
            dynsmem[1] = 99
            dynsmem[2] = 99
            dynsmem[3] = 99
            dynsmem[4] = 99
            dynsmem[5] = 99

            sm1[0] = 1
            sm1[1] = 2
            sm1[2] = 3

            x[0] = dynsmem[0]
            x[1] = dynsmem[1]
            x[2] = dynsmem[2]
            x[3] = dynsmem[3]
            x[4] = dynsmem[4]
            x[5] = dynsmem[5]

        arr = np.zeros(6, dtype=np.int32)
        expected = np.array([99, 3, 99, 2, 99, 1], dtype=np.int32)
        self._test_dynshared_slice(slice_nonunit_reverse_stride, arr, expected)

    def test_issue_5073(self):
        # An example with which Bug #5073 (slices of dynamic shared memory all
        # alias) was discovered. The kernel uses all threads in the block to
        # load values into slices of dynamic shared memory. One thread per
        # block then writes the loaded values back to a global array after
        # syncthreads().

        arr = np.arange(1024)
        nelem = len(arr)
        nthreads = 16
        nblocks = int(nelem / nthreads)
        dt = nps.from_dtype(arr.dtype)
        nshared = nthreads * arr.dtype.itemsize
        chunksize = int(nthreads / 2)

        @cuda.jit
        def sm_slice_copy(x, y, chunksize):
            dynsmem = cuda.shared.array(0, dtype=dt)
            sm1 = dynsmem[0:chunksize]
            sm2 = dynsmem[chunksize:chunksize * 2]

            tx = cuda.threadIdx.x
            bx = cuda.blockIdx.x
            bd = cuda.blockDim.x

            # load this block's chunk into shared
            i = bx * bd + tx
            if i < len(x):
                if tx < chunksize:
                    sm1[tx] = x[i]
                else:
                    sm2[tx - chunksize] = x[i]

            cuda.syncthreads()

            # one thread per block writes this block's chunk
            if tx == 0:
                for j in range(chunksize):
                    y[bd * bx + j] = sm1[j]
                    y[bd * bx + j + chunksize] = sm2[j]

        d_result = cuda.device_array_like(arr)
        sm_slice_copy[nblocks, nthreads, 0, nshared](arr, d_result, chunksize)
        host_result = d_result.copy_to_host()
        np.testing.assert_array_equal(arr, host_result)

    @skip_on_cudasim("Can't check typing in simulator")
    def test_invalid_array_type(self):
        rgx = ".*Cannot infer the type of variable 'arr'.*"

        def unsupported_type():
            arr = cuda.shared.array(10, dtype=np.dtype('O')) # noqa: F841
        with self.assertRaisesRegex(TypingError, rgx):
            cuda.jit(void())(unsupported_type)

        rgx = ".*Invalid NumPy dtype specified: 'int33'.*"

        def invalid_string_type():
            arr = cuda.shared.array(10, dtype='int33') # noqa: F841
        with self.assertRaisesRegex(TypingError, rgx):
            cuda.jit(void())(invalid_string_type)

    @skip_on_cudasim("Struct model array unsupported in simulator")
    def test_struct_model_type_static(self):
        nthreads = 64

        @cuda.jit(void(int32[::1], int32[::1]))
        def write_then_reverse_read_static(outx, outy):
            # Test creation
            arr = cuda.shared.array(nthreads, dtype=test_struct_model_type)

            i = cuda.grid(1)
            ri = nthreads - i - 1

            if i < len(outx) and i < len(outy):
                # Test set to arr
                obj = TestStruct(int32(i), int32(i * 2))
                arr[i] = obj

                cuda.syncthreads()
                # Test get from arr
                outx[i] = arr[ri].x
                outy[i] = arr[ri].y

        arrx = np.zeros((nthreads,), dtype="int32")
        arry = np.zeros((nthreads,), dtype="int32")

        write_then_reverse_read_static[1, nthreads](arrx, arry)

        for i, x in enumerate(arrx):
            self.assertEqual(x, nthreads - i - 1)
        for i, y in enumerate(arry):
            self.assertEqual(y, (nthreads - i - 1) * 2)


if __name__ == '__main__':
    unittest.main()
first commit 2024-05-03 04:18:51 +03:00			`from numba import cuda, int32, float64, void`
			`from numba.core.errors import TypingError`
			`from numba.core import types`
			`from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim`

			`import numpy as np`
			`from numba.np import numpy_support as nps`

			`from .extensions_usecases import test_struct_model_type, TestStruct`

			`recordwith2darray = np.dtype([('i', np.int32),`
			`('j', np.float32, (3, 2))])`


			`class TestSharedMemoryIssue(CUDATestCase):`
			`def test_issue_953_sm_linkage_conflict(self):`
			`@cuda.jit(device=True)`
			`def inner():`
			`inner_arr = cuda.shared.array(1, dtype=int32) # noqa: F841`

			`@cuda.jit`
			`def outer():`
			`outer_arr = cuda.shared.array(1, dtype=int32) # noqa: F841`
			`inner()`

			`outer[1, 1]()`

			`def _check_shared_array_size(self, shape, expected):`
			`@cuda.jit`
			`def s(a):`
			`arr = cuda.shared.array(shape, dtype=int32)`
			`a[0] = arr.size`

			`result = np.zeros(1, dtype=np.int32)`
			`s[1, 1](result)`
			`self.assertEqual(result[0], expected)`

			`def test_issue_1051_shared_size_broken_1d(self):`
			`self._check_shared_array_size(2, 2)`

			`def test_issue_1051_shared_size_broken_2d(self):`
			`self._check_shared_array_size((2, 3), 6)`

			`def test_issue_1051_shared_size_broken_3d(self):`

			`self._check_shared_array_size((2, 3, 4), 24)`

			`def _check_shared_array_size_fp16(self, shape, expected, ty):`
			`@cuda.jit`
			`def s(a):`
			`arr = cuda.shared.array(shape, dtype=ty)`
			`a[0] = arr.size`

			`result = np.zeros(1, dtype=np.float16)`
			`s[1, 1](result)`
			`self.assertEqual(result[0], expected)`

			`def test_issue_fp16_support(self):`
			`self._check_shared_array_size_fp16(2, 2, types.float16)`
			`self._check_shared_array_size_fp16(2, 2, np.float16)`

			`def test_issue_2393(self):`
			`"""`
			`Test issue of warp misalign address due to nvvm not knowing the`
			`alignment(? but it should have taken the natural alignment of the type)`
			`"""`
			`num_weights = 2`
			`num_blocks = 48`
			`examples_per_block = 4`
			`threads_per_block = 1`

			`@cuda.jit`
			`def costs_func(d_block_costs):`
			`s_features = cuda.shared.array((examples_per_block, num_weights),`
			`float64)`
			`s_initialcost = cuda.shared.array(7, float64) # Bug`

			`threadIdx = cuda.threadIdx.x`

			`prediction = 0`
			`for j in range(num_weights):`
			`prediction += s_features[threadIdx, j]`

			`d_block_costs[0] = s_initialcost[0] + prediction`

			`block_costs = np.zeros(num_blocks, dtype=np.float64)`
			`d_block_costs = cuda.to_device(block_costs)`

			`costs_func[num_blocks, threads_per_block](d_block_costs)`

			`cuda.synchronize()`


			`class TestSharedMemory(CUDATestCase):`
			`def _test_shared(self, arr):`
			`# Use a kernel that copies via shared memory to check loading and`
			`# storing different dtypes with shared memory. All threads in a block`
			`# collaborate to load in values, then the output values are written`
			`# only by the first thread in the block after synchronization.`

			`nelem = len(arr)`
			`nthreads = 16`
			`nblocks = int(nelem / nthreads)`
			`dt = nps.from_dtype(arr.dtype)`

			`@cuda.jit`
			`def use_sm_chunk_copy(x, y):`
			`sm = cuda.shared.array(nthreads, dtype=dt)`

			`tx = cuda.threadIdx.x`
			`bx = cuda.blockIdx.x`
			`bd = cuda.blockDim.x`

			`# Load this block's chunk into shared`
			`i = bx * bd + tx`
			`if i < len(x):`
			`sm[tx] = x[i]`

			`cuda.syncthreads()`

			`# One thread per block writes this block's chunk`
			`if tx == 0:`
			`for j in range(nthreads):`
			`y[bd * bx + j] = sm[j]`

			`d_result = cuda.device_array_like(arr)`
			`use_sm_chunk_copy[nblocks, nthreads](arr, d_result)`
			`host_result = d_result.copy_to_host()`
			`np.testing.assert_array_equal(arr, host_result)`

			`def test_shared_recarray(self):`
			`arr = np.recarray(128, dtype=recordwith2darray)`
			`for x in range(len(arr)):`
			`arr[x].i = x`
			`j = np.arange(3 * 2, dtype=np.float32)`
			`arr[x].j = j.reshape(3, 2) * x`

			`self._test_shared(arr)`

			`def test_shared_bool(self):`
			`arr = np.random.randint(2, size=(1024,), dtype=np.bool_)`
			`self._test_shared(arr)`

			`def _test_dynshared_slice(self, func, arr, expected):`
			`# Check that slices of shared memory are correct`
			`# (See Bug #5073 - prior to the addition of these tests and`
			`# corresponding fix, slices of dynamic shared arrays all aliased each`
			`# other)`
			`nshared = arr.size * arr.dtype.itemsize`
			`func[1, 1, 0, nshared](arr)`
			`np.testing.assert_array_equal(expected, arr)`

			`def test_dynshared_slice_write(self):`
			`# Test writing values into disjoint slices of dynamic shared memory`
			`@cuda.jit`
			`def slice_write(x):`
			`dynsmem = cuda.shared.array(0, dtype=int32)`
			`sm1 = dynsmem[0:1]`
			`sm2 = dynsmem[1:2]`

			`sm1[0] = 1`
			`sm2[0] = 2`
			`x[0] = dynsmem[0]`
			`x[1] = dynsmem[1]`

			`arr = np.zeros(2, dtype=np.int32)`
			`expected = np.array([1, 2], dtype=np.int32)`
			`self._test_dynshared_slice(slice_write, arr, expected)`

			`def test_dynshared_slice_read(self):`
			`# Test reading values from disjoint slices of dynamic shared memory`
			`@cuda.jit`
			`def slice_read(x):`
			`dynsmem = cuda.shared.array(0, dtype=int32)`
			`sm1 = dynsmem[0:1]`
			`sm2 = dynsmem[1:2]`

			`dynsmem[0] = 1`
			`dynsmem[1] = 2`
			`x[0] = sm1[0]`
			`x[1] = sm2[0]`

			`arr = np.zeros(2, dtype=np.int32)`
			`expected = np.array([1, 2], dtype=np.int32)`
			`self._test_dynshared_slice(slice_read, arr, expected)`

			`def test_dynshared_slice_diff_sizes(self):`
			`# Test reading values from disjoint slices of dynamic shared memory`
			`# with different sizes`
			`@cuda.jit`
			`def slice_diff_sizes(x):`
			`dynsmem = cuda.shared.array(0, dtype=int32)`
			`sm1 = dynsmem[0:1]`
			`sm2 = dynsmem[1:3]`

			`dynsmem[0] = 1`
			`dynsmem[1] = 2`
			`dynsmem[2] = 3`
			`x[0] = sm1[0]`
			`x[1] = sm2[0]`
			`x[2] = sm2[1]`

			`arr = np.zeros(3, dtype=np.int32)`
			`expected = np.array([1, 2, 3], dtype=np.int32)`
			`self._test_dynshared_slice(slice_diff_sizes, arr, expected)`

			`def test_dynshared_slice_overlap(self):`
			`# Test reading values from overlapping slices of dynamic shared memory`
			`@cuda.jit`
			`def slice_overlap(x):`
			`dynsmem = cuda.shared.array(0, dtype=int32)`
			`sm1 = dynsmem[0:2]`
			`sm2 = dynsmem[1:4]`

			`dynsmem[0] = 1`
			`dynsmem[1] = 2`
			`dynsmem[2] = 3`
			`dynsmem[3] = 4`
			`x[0] = sm1[0]`
			`x[1] = sm1[1]`
			`x[2] = sm2[0]`
			`x[3] = sm2[1]`
			`x[4] = sm2[2]`

			`arr = np.zeros(5, dtype=np.int32)`
			`expected = np.array([1, 2, 2, 3, 4], dtype=np.int32)`
			`self._test_dynshared_slice(slice_overlap, arr, expected)`

			`def test_dynshared_slice_gaps(self):`
			`# Test writing values to slices of dynamic shared memory doesn't write`
			`# outside the slice`
			`@cuda.jit`
			`def slice_gaps(x):`
			`dynsmem = cuda.shared.array(0, dtype=int32)`
			`sm1 = dynsmem[1:3]`
			`sm2 = dynsmem[4:6]`

			`# Initial values for dynamic shared memory, some to be overwritten`
			`dynsmem[0] = 99`
			`dynsmem[1] = 99`
			`dynsmem[2] = 99`
			`dynsmem[3] = 99`
			`dynsmem[4] = 99`
			`dynsmem[5] = 99`
			`dynsmem[6] = 99`

			`sm1[0] = 1`
			`sm1[1] = 2`
			`sm2[0] = 3`
			`sm2[1] = 4`

			`x[0] = dynsmem[0]`
			`x[1] = dynsmem[1]`
			`x[2] = dynsmem[2]`
			`x[3] = dynsmem[3]`
			`x[4] = dynsmem[4]`
			`x[5] = dynsmem[5]`
			`x[6] = dynsmem[6]`

			`arr = np.zeros(7, dtype=np.int32)`
			`expected = np.array([99, 1, 2, 99, 3, 4, 99], dtype=np.int32)`
			`self._test_dynshared_slice(slice_gaps, arr, expected)`

			`def test_dynshared_slice_write_backwards(self):`
			`# Test writing values into disjoint slices of dynamic shared memory`
			`# with negative steps`
			`@cuda.jit`
			`def slice_write_backwards(x):`
			`dynsmem = cuda.shared.array(0, dtype=int32)`
			`sm1 = dynsmem[1::-1]`
			`sm2 = dynsmem[3:1:-1]`

			`sm1[0] = 1`
			`sm1[1] = 2`
			`sm2[0] = 3`
			`sm2[1] = 4`
			`x[0] = dynsmem[0]`
			`x[1] = dynsmem[1]`
			`x[2] = dynsmem[2]`
			`x[3] = dynsmem[3]`

			`arr = np.zeros(4, dtype=np.int32)`
			`expected = np.array([2, 1, 4, 3], dtype=np.int32)`
			`self._test_dynshared_slice(slice_write_backwards, arr, expected)`

			`def test_dynshared_slice_nonunit_stride(self):`
			`# Test writing values into slice of dynamic shared memory with`
			`# non-unit stride`
			`@cuda.jit`
			`def slice_nonunit_stride(x):`
			`dynsmem = cuda.shared.array(0, dtype=int32)`
			`sm1 = dynsmem[::2]`

			`# Initial values for dynamic shared memory, some to be overwritten`
			`dynsmem[0] = 99`
			`dynsmem[1] = 99`
			`dynsmem[2] = 99`
			`dynsmem[3] = 99`
			`dynsmem[4] = 99`
			`dynsmem[5] = 99`

			`sm1[0] = 1`
			`sm1[1] = 2`
			`sm1[2] = 3`

			`x[0] = dynsmem[0]`
			`x[1] = dynsmem[1]`
			`x[2] = dynsmem[2]`
			`x[3] = dynsmem[3]`
			`x[4] = dynsmem[4]`
			`x[5] = dynsmem[5]`

			`arr = np.zeros(6, dtype=np.int32)`
			`expected = np.array([1, 99, 2, 99, 3, 99], dtype=np.int32)`
			`self._test_dynshared_slice(slice_nonunit_stride, arr, expected)`

			`def test_dynshared_slice_nonunit_reverse_stride(self):`
			`# Test writing values into slice of dynamic shared memory with`
			`# reverse non-unit stride`
			`@cuda.jit`
			`def slice_nonunit_reverse_stride(x):`
			`dynsmem = cuda.shared.array(0, dtype=int32)`
			`sm1 = dynsmem[-1::-2]`

			`# Initial values for dynamic shared memory, some to be overwritten`
			`dynsmem[0] = 99`
			`dynsmem[1] = 99`
			`dynsmem[2] = 99`
			`dynsmem[3] = 99`
			`dynsmem[4] = 99`
			`dynsmem[5] = 99`

			`sm1[0] = 1`
			`sm1[1] = 2`
			`sm1[2] = 3`

			`x[0] = dynsmem[0]`
			`x[1] = dynsmem[1]`
			`x[2] = dynsmem[2]`
			`x[3] = dynsmem[3]`
			`x[4] = dynsmem[4]`
			`x[5] = dynsmem[5]`

			`arr = np.zeros(6, dtype=np.int32)`
			`expected = np.array([99, 3, 99, 2, 99, 1], dtype=np.int32)`
			`self._test_dynshared_slice(slice_nonunit_reverse_stride, arr, expected)`

			`def test_issue_5073(self):`
			`# An example with which Bug #5073 (slices of dynamic shared memory all`
			`# alias) was discovered. The kernel uses all threads in the block to`
			`# load values into slices of dynamic shared memory. One thread per`
			`# block then writes the loaded values back to a global array after`
			`# syncthreads().`

			`arr = np.arange(1024)`
			`nelem = len(arr)`
			`nthreads = 16`
			`nblocks = int(nelem / nthreads)`
			`dt = nps.from_dtype(arr.dtype)`
			`nshared = nthreads * arr.dtype.itemsize`
			`chunksize = int(nthreads / 2)`

			`@cuda.jit`
			`def sm_slice_copy(x, y, chunksize):`
			`dynsmem = cuda.shared.array(0, dtype=dt)`
			`sm1 = dynsmem[0:chunksize]`
			`sm2 = dynsmem[chunksize:chunksize * 2]`

			`tx = cuda.threadIdx.x`
			`bx = cuda.blockIdx.x`
			`bd = cuda.blockDim.x`

			`# load this block's chunk into shared`
			`i = bx * bd + tx`
			`if i < len(x):`
			`if tx < chunksize:`
			`sm1[tx] = x[i]`
			`else:`
			`sm2[tx - chunksize] = x[i]`

			`cuda.syncthreads()`

			`# one thread per block writes this block's chunk`
			`if tx == 0:`
			`for j in range(chunksize):`
			`y[bd * bx + j] = sm1[j]`
			`y[bd * bx + j + chunksize] = sm2[j]`

			`d_result = cuda.device_array_like(arr)`
			`sm_slice_copy[nblocks, nthreads, 0, nshared](arr, d_result, chunksize)`
			`host_result = d_result.copy_to_host()`
			`np.testing.assert_array_equal(arr, host_result)`

			`@skip_on_cudasim("Can't check typing in simulator")`
			`def test_invalid_array_type(self):`
			`rgx = ".Cannot infer the type of variable 'arr'."`

			`def unsupported_type():`
			`arr = cuda.shared.array(10, dtype=np.dtype('O')) # noqa: F841`
			`with self.assertRaisesRegex(TypingError, rgx):`
			`cuda.jit(void())(unsupported_type)`

			`rgx = ".Invalid NumPy dtype specified: 'int33'."`

			`def invalid_string_type():`
			`arr = cuda.shared.array(10, dtype='int33') # noqa: F841`
			`with self.assertRaisesRegex(TypingError, rgx):`
			`cuda.jit(void())(invalid_string_type)`

			`@skip_on_cudasim("Struct model array unsupported in simulator")`
			`def test_struct_model_type_static(self):`
			`nthreads = 64`

			`@cuda.jit(void(int32[::1], int32[::1]))`
			`def write_then_reverse_read_static(outx, outy):`
			`# Test creation`
			`arr = cuda.shared.array(nthreads, dtype=test_struct_model_type)`

			`i = cuda.grid(1)`
			`ri = nthreads - i - 1`

			`if i < len(outx) and i < len(outy):`
			`# Test set to arr`
			`obj = TestStruct(int32(i), int32(i * 2))`
			`arr[i] = obj`

			`cuda.syncthreads()`
			`# Test get from arr`
			`outx[i] = arr[ri].x`
			`outy[i] = arr[ri].y`

			`arrx = np.zeros((nthreads,), dtype="int32")`
			`arry = np.zeros((nthreads,), dtype="int32")`

			`write_then_reverse_read_static[1, nthreads](arrx, arry)`

			`for i, x in enumerate(arrx):`
			`self.assertEqual(x, nthreads - i - 1)`
			`for i, y in enumerate(arry):`
			`self.assertEqual(y, (nthreads - i - 1) * 2)`


			`if __name__ == '__main__':`
			`unittest.main()`