import numpy as np
import warnings
from numba.cuda.testing import unittest
from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing)
from numba.cuda.testing import CUDATestCase, test_data_dir
from numba.cuda.cudadrv.driver import (CudaAPIError, Linker,
                                       LinkerError)
from numba.cuda.cudadrv.error import NvrtcError
from numba.cuda import require_context
from numba.tests.support import ignore_internal_warnings
from numba import cuda, void, float64, int64, int32, typeof, float32

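# Helper data and kernel bodies used by TestLinker below. They are kept as
# plain Python functions here and jitted inside the individual tests, so each
# test controls the signature and compilation options.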
CONST1D = np.arange(10, dtype=np.float64)


def simple_const_mem(A):
    C = cuda.const.array_like(CONST1D)
    i = cuda.grid(1)

    A[i] = C[i] + 1.0

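# Deliberately declares and updates many local variables so that the compiled
# kernel needs a large number of registers; the test_set_registers_* tests
# below rely on this to check that max_registers reduces register usage.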
def func_with_lots_of_registers(x, a, b, c, d, e, f):
    a1 = 1.0
    a2 = 1.0
    a3 = 1.0
    a4 = 1.0
    a5 = 1.0
    b1 = 1.0
    b2 = 1.0
    b3 = 1.0
    b4 = 1.0
    b5 = 1.0
    c1 = 1.0
    c2 = 1.0
    c3 = 1.0
    c4 = 1.0
    c5 = 1.0
    d1 = 10
    d2 = 10
    d3 = 10
    d4 = 10
    d5 = 10
    for i in range(a):
        a1 += b
        a2 += c
        a3 += d
        a4 += e
        a5 += f
        b1 *= b
        b2 *= c
        b3 *= d
        b4 *= e
        b5 *= f
        c1 /= b
        c2 /= c
        c3 /= d
        c4 /= e
        c5 /= f
        d1 <<= b
        d2 <<= c
        d3 <<= d
        d4 <<= e
        d5 <<= f
    x[cuda.grid(1)] = a1 + a2 + a3 + a4 + a5
    x[cuda.grid(1)] += b1 + b2 + b3 + b4 + b5
    x[cuda.grid(1)] += c1 + c2 + c3 + c4 + c5
    x[cuda.grid(1)] += d1 + d2 + d3 + d4 + d5

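# Statically allocates a 100-element shared memory array; the element type is
# passed in as dty, so the shared memory footprint reported by
# get_shared_mem_per_block() depends on the dtype the kernel is compiled for.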
def simple_smem(ary, dty):
    sm = cuda.shared.array(100, dty)
    i = cuda.grid(1)
    if i == 0:
        for j in range(100):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]

def coop_smem2d(ary):
    i, j = cuda.grid(2)
    sm = cuda.shared.array((10, 20), float32)
    sm[i, j] = (i + 1) / (j + 1)
    cuda.syncthreads()
    ary[i, j] = sm[i, j]


def simple_maxthreads(ary):
    i = cuda.grid(1)
    ary[i] = i

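# simple_lmem below round-trips data through a local array of LMEM_SIZE
# elements, so the per-thread local memory reported by
# get_local_mem_per_thread() should be at least LMEM_SIZE * itemsize.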
LMEM_SIZE = 1000


def simple_lmem(A, B, dty):
    C = cuda.local.array(LMEM_SIZE, dty)
    for i in range(C.shape[0]):
        C[i] = A[i]
    for i in range(C.shape[0]):
        B[i] = C[i]

@skip_on_cudasim('Linking unsupported in the simulator')
class TestLinker(CUDATestCase):
    _NUMBA_NVIDIA_BINDING_0_ENV = {'NUMBA_CUDA_USE_NVIDIA_BINDING': '0'}

    @require_context
    def test_linker_basic(self):
        '''Simply go through the constructor and destructor
        '''
        linker = Linker.new(cc=(5, 3))
        del linker

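    # jitlink.ptx provides the externally-linked device function bar(); the
    # assertion below assumes bar() returns twice its argument, matching the
    # behaviour checked for jitlink.cu in test_linking_cu.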
    def _test_linking(self, eager):
        global bar  # must be a global; otherwise it is recognized as a freevar
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.ptx')

        if eager:
            args = ['void(int32[:], int32[:])']
        else:
            args = []

        @cuda.jit(*args, link=[link])
        def foo(x, y):
            i = cuda.grid(1)
            x[i] += bar(y[i])

        A = np.array([123], dtype=np.int32)
        B = np.array([321], dtype=np.int32)

        foo[1, 1](A, B)

        self.assertTrue(A[0] == 123 + 2 * 321)

    def test_linking_lazy_compile(self):
        self._test_linking(eager=False)

    def test_linking_eager_compile(self):
        self._test_linking(eager=True)

    def test_linking_cu(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.cu')

        @cuda.jit(link=[link])
        def kernel(r, x):
            i = cuda.grid(1)

            if i < len(r):
                r[i] = bar(x[i])

        x = np.arange(10, dtype=np.int32)
        r = np.zeros_like(x)

        kernel[1, 32](r, x)

        # Matches the operation of bar() in jitlink.cu
        expected = x * 2
        np.testing.assert_array_equal(r, expected)

    def test_linking_cu_log_warning(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'warn.cu')

        with warnings.catch_warnings(record=True) as w:
            ignore_internal_warnings()

            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        self.assertEqual(len(w), 1, 'Expected warnings from NVRTC')
        # Check the warning refers to the log messages
        self.assertIn('NVRTC log messages', str(w[0].message))
        # Check the message pertaining to the unused variable is provided
        self.assertIn('declared but never referenced', str(w[0].message))

    def test_linking_cu_error(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'error.cu')

        with self.assertRaises(NvrtcError) as e:
            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        msg = e.exception.args[0]
        # Check the error message refers to the NVRTC compile
        self.assertIn('NVRTC Compilation failure', msg)
        # Check the expected error in the CUDA source is reported
        self.assertIn('identifier "SYNTAX" is undefined', msg)
        # Check the filename is reported correctly
        self.assertIn('in the compilation of "error.cu"', msg)

    def test_linking_unknown_filetype_error(self):
        expected_err = "Don't know how to link file with extension .cuh"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['header.cuh'])
            def kernel():
                pass

    def test_linking_file_with_no_extension_error(self):
        expected_err = "Don't know how to link file with no extension"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['data'])
            def kernel():
                pass

    @skip_if_cuda_includes_missing
    def test_linking_cu_cuda_include(self):
        link = str(test_data_dir / 'cuda_include.cu')

        # An exception will be raised when linking this kernel due to the
        # compile failure if CUDA includes cannot be found by NVRTC.
        @cuda.jit('void()', link=[link])
        def kernel():
            pass

    def test_try_to_link_nonexistent(self):
        with self.assertRaises(LinkerError) as e:
            @cuda.jit('void(int32[::1])', link=['nonexistent.a'])
            def f(x):
                x[0] = 0
        self.assertIn('nonexistent.a not found', e.exception.args)

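    # The remaining tests query per-kernel resource and launch attributes
    # (registers, constant/shared/local memory sizes, max threads per block)
    # via accessor methods on the compiled kernel.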
    def test_set_registers_no_max(self):
        """Ensure that the jitted kernel used in the test_set_registers_* tests
        uses more than 57 registers - this ensures that test_set_registers_*
        are really checking that they reduced the number of registers used from
        something greater than the maximum."""
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertGreater(compiled.get_regs_per_thread(), 57)

    def test_set_registers_57(self):
        compiled = cuda.jit(max_registers=57)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 57)

    def test_set_registers_38(self):
        compiled = cuda.jit(max_registers=38)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_set_registers_eager(self):
        sig = void(float64[::1], int64, int64, int64, int64, int64, int64)
        compiled = cuda.jit(sig, max_registers=38)(func_with_lots_of_registers)
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_get_const_mem_size(self):
        sig = void(float64[::1])
        compiled = cuda.jit(sig)(simple_const_mem)
        const_mem_size = compiled.get_const_mem_size()
        self.assertGreaterEqual(const_mem_size, CONST1D.nbytes)

    def test_get_no_shared_memory(self):
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        shared_mem_size = compiled.get_shared_mem_per_block()
        self.assertEqual(shared_mem_size, 0)

    def test_get_shared_mem_per_block(self):
        sig = void(int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_smem)
        shared_mem_size = compiled.get_shared_mem_per_block()
        # simple_smem allocates 100 int32 elements: 100 * 4 bytes
        self.assertEqual(shared_mem_size, 400)

    def test_get_shared_mem_per_specialized(self):
        compiled = cuda.jit(simple_smem)
        compiled_specialized = compiled.specialize(
            np.zeros(100, dtype=np.int32), np.float64)
        shared_mem_size = compiled_specialized.get_shared_mem_per_block()
        # Specialized for float64 elements: 100 * 8 bytes
        self.assertEqual(shared_mem_size, 800)

    def test_get_max_threads_per_block(self):
        compiled = cuda.jit("void(float32[:,::1])")(coop_smem2d)
        max_threads = compiled.get_max_threads_per_block()
        self.assertGreater(max_threads, 0)

    def test_max_threads_exceeded(self):
        compiled = cuda.jit("void(int32[::1])")(simple_maxthreads)
        max_threads = compiled.get_max_threads_per_block()
        nelem = max_threads + 1
        ary = np.empty(nelem, dtype=np.int32)
        try:
            compiled[1, nelem](ary)
        except CudaAPIError as e:
            self.assertIn("cuLaunchKernel", e.msg)

    def test_get_local_mem_per_thread(self):
        sig = void(int32[::1], int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_lmem)
        local_mem_size = compiled.get_local_mem_per_thread()
        calc_size = np.dtype(np.int32).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)

    def test_get_local_mem_per_specialized(self):
        compiled = cuda.jit(simple_lmem)
        compiled_specialized = compiled.specialize(
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.float64)
        local_mem_size = compiled_specialized.get_local_mem_per_thread()
        calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)

if __name__ == '__main__':
    unittest.main()