ai-content-maker/.venv/Lib/site-packages/numba/cuda/tests/cudapy/test_gufunc.py

import numpy as np

from collections import namedtuple
from numba import void, int32, float32, float64
from numba import guvectorize
from numba import cuda
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
import unittest
import warnings
from numba.core.errors import NumbaPerformanceWarning
from numba.tests.support import override_config


def _get_matmulcore_gufunc(dtype=float32):
    """Build a CUDA gufunc that multiplies two matrices.

    The returned gufunc has the layout ``(m,n),(n,p)->(m,p)`` and a single
    signature in which both inputs and the output are 2-D arrays of
    ``dtype``.

    Parameters
    ----------
    dtype : Numba scalar type, optional
        Element type of all three arrays (``float32`` by default).

    Returns
    -------
    The compiled gufunc; its kernel function is named ``matmulcore``.
    """
    matrix = dtype[:, :]
    signature = void(matrix, matrix, matrix)

    @guvectorize([signature], '(m,n),(n,p)->(m,p)', target='cuda')
    def matmulcore(A, B, C):
        rows, inner = A.shape
        inner, cols = B.shape
        for row in range(rows):
            for col in range(cols):
                # Clear the output element, then accumulate the dot product
                # of A's row with B's column directly into it.
                C[row, col] = 0
                for k in range(inner):
                    C[row, col] += A[row, k] * B[k, col]

    return matmulcore


@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestCUDAGufunc(CUDATestCase):
    """Tests for generalized ufuncs compiled with ``target='cuda'``.

    The matrix-multiply tests compare the gufunc returned by
    ``_get_matmulcore_gufunc`` against ``np.matmul``. The remaining tests
    cover copy kernels, launch-configuration warnings, target-option
    validation, positional-argument counting and tuple inputs.
    """

    def _assert_allclose(self, first, second):
        # Element-wise closeness check with np.allclose's default tolerances
        # (rtol=1e-5, atol=1e-8). Unlike assertTrue(np.allclose(...)), a
        # failure reports which elements differ and by how much.
        np.testing.assert_allclose(first, second, rtol=1e-5, atol=1e-8)

    def test_gufunc_small(self):
        # Two stacked (2, 4) x (4, 5) products with host arrays.
        gufunc = _get_matmulcore_gufunc()

        matrix_ct = 2
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        C = gufunc(A, B)
        Gold = np.matmul(A, B)
        self._assert_allclose(C, Gold)

    def test_gufunc_auto_transfer(self):
        # Mix a host array (A) with a device array (dB) in one call; the
        # result is copied back to the host before comparing.
        gufunc = _get_matmulcore_gufunc()

        matrix_ct = 2
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        dB = cuda.to_device(B)

        C = gufunc(A, dB).copy_to_host()
        Gold = np.matmul(A, B)
        self._assert_allclose(C, Gold)

    def test_gufunc(self):

        gufunc = _get_matmulcore_gufunc()

        matrix_ct = 1001 # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        C = gufunc(A, B)
        Gold = np.matmul(A, B)
        self._assert_allclose(C, Gold)

    def test_gufunc_hidim(self):

        gufunc = _get_matmulcore_gufunc()

        # 100 matrices laid out as a 4 x 25 batch, so the inputs carry two
        # leading (loop) dimensions in front of the core matrix dimensions.
        matrix_ct = 100
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(4, 25, 2, 4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(4, 25, 4, 5)

        C = gufunc(A, B)
        Gold = np.matmul(A, B)
        self._assert_allclose(C, Gold)

    def test_gufunc_new_axis(self):
        # A single (3, 3) matrix Y combined with a stack of ten must give the
        # same result as passing Y explicitly tiled ten times.
        gufunc = _get_matmulcore_gufunc(dtype=float64)

        X = np.random.randn(10, 3, 3)
        Y = np.random.randn(3, 3)

        gold = np.matmul(X, Y)

        res1 = gufunc(X, Y)
        np.testing.assert_allclose(gold, res1)

        res2 = gufunc(X, np.tile(Y, (10, 1, 1)))
        np.testing.assert_allclose(gold, res2)

    def test_gufunc_stream(self):
        # Transfers, the gufunc call and the copy back all use one stream;
        # the result is only inspected after synchronizing that stream.
        gufunc = _get_matmulcore_gufunc()

        matrix_ct = 1001 # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        # Output shape is derived from matrix_ct so it cannot drift from the
        # input shapes above.
        dC = cuda.device_array(shape=(matrix_ct, 2, 5), dtype=A.dtype,
                               stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        Gold = np.matmul(A, B)

        self._assert_allclose(C, Gold)

    def test_copy(self):
        # Output array supplied through the 'out' keyword.
        @guvectorize([void(float32[:], float32[:])],
                     '(x)->(x)',
                     target='cuda')
        def copy(A, B):
            for i in range(B.size):
                B[i] = A[i]

        A = np.arange(10, dtype=np.float32) + 1
        B = np.zeros_like(A)
        copy(A, out=B)
        np.testing.assert_allclose(A, B)

    def test_copy_unspecified_return(self):
        # Ensure that behaviour is correct when the return type is not
        # specified in the signature.
        @guvectorize([(float32[:], float32[:])],
                     '(x)->(x)',
                     target='cuda')
        def copy(A, B):
            for i in range(B.size):
                B[i] = A[i]

        A = np.arange(10, dtype=np.float32) + 1
        B = np.zeros_like(A)
        copy(A, out=B)
        self._assert_allclose(A, B)

    def test_copy_odd(self):
        # Same as test_copy, with an odd number of elements (11).
        @guvectorize([void(float32[:], float32[:])],
                     '(x)->(x)',
                     target='cuda')
        def copy(A, B):
            for i in range(B.size):
                B[i] = A[i]

        A = np.arange(11, dtype=np.float32) + 1
        B = np.zeros_like(A)
        copy(A, out=B)
        self._assert_allclose(A, B)

    def test_copy2d(self):
        # Copy kernel whose core dimensions are two-dimensional.
        @guvectorize([void(float32[:, :], float32[:, :])],
                     '(x, y)->(x, y)',
                     target='cuda')
        def copy2d(A, B):
            for x in range(B.shape[0]):
                for y in range(B.shape[1]):
                    B[x, y] = A[x, y]

        A = np.arange(30, dtype=np.float32).reshape(5, 6) + 1
        B = np.zeros_like(A)
        copy2d(A, out=B)
        self._assert_allclose(A, B)

    # Test inefficient use of the GPU where the inputs are all mapped onto a
    # single thread in a single block.
    def test_inefficient_launch_configuration(self):
        @guvectorize(['void(float32[:], float32[:], float32[:])'],
                     '(n),(n)->(n)', target='cuda')
        def numba_dist_cuda(a, b, dist):
            n = a.shape[0]
            for i in range(n):
                dist[i] = a[i] * b[i]

        a = np.random.rand(1024 * 32).astype('float32')
        b = np.random.rand(1024 * 32).astype('float32')
        dist = np.zeros(a.shape[0]).astype('float32')

        with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1):
            with warnings.catch_warnings(record=True) as w:
                # Record the warning regardless of which warning filters are
                # active in the process running the tests.
                warnings.simplefilter('always', NumbaPerformanceWarning)
                numba_dist_cuda(a, b, dist)

        # Look only at performance warnings, so that an unrelated warning
        # recorded first cannot mask the one under test, and so that a
        # missing warning is a test failure rather than an IndexError.
        messages = [str(x.message) for x in w
                    if issubclass(x.category, NumbaPerformanceWarning)]
        self.assertTrue(messages, 'no NumbaPerformanceWarning was recorded')
        self.assertTrue(
            any('Grid size' in m and 'low occupancy' in m for m in messages),
            messages)

    def test_efficient_launch_configuration(self):
        # With 524288 independent rows to process, no warning is expected.
        @guvectorize(['void(float32[:], float32[:], float32[:])'],
                     '(n),(n)->(n)', nopython=True, target='cuda')
        def numba_dist_cuda2(a, b, dist):
            n = a.shape[0]
            for i in range(n):
                dist[i] = a[i] * b[i]

        a = np.random.rand(524288 * 2).astype('float32').\
            reshape((524288, 2))
        b = np.random.rand(524288 * 2).astype('float32').\
            reshape((524288, 2))
        dist = np.zeros_like(a)

        with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1):
            with warnings.catch_warnings(record=True) as w:
                numba_dist_cuda2(a, b, dist)
                self.assertEqual(len(w), 0)

    def test_nopython_flag(self):

        def foo(A, B):
            pass

        # nopython = True is fine
        guvectorize([void(float32[:], float32[:])], '(x)->(x)', target='cuda',
                    nopython=True)(foo)

        # nopython = False is bad
        with self.assertRaises(TypeError) as raises:
            guvectorize([void(float32[:], float32[:])], '(x)->(x)',
                        target='cuda', nopython=False)(foo)
        self.assertEqual("nopython flag must be True", str(raises.exception))

    def test_invalid_flags(self):
        # Check invalid flags
        def foo(A, B):
            pass

        with self.assertRaises(TypeError) as raises:
            guvectorize([void(float32[:], float32[:])], '(x)->(x)',
                        target='cuda', what1=True, ever2=False)(foo)
        head = "The following target options are not supported:"
        msg = str(raises.exception)
        self.assertEqual(msg[:len(head)], head)
        # The rest of the message is a comma-separated list of the rejected
        # option names, possibly quoted; compare as a set because the order
        # is not checked.
        items = msg[len(head):].strip().split(',')
        items = [i.strip("'\" ") for i in items]
        self.assertEqual({'what1', 'ever2'}, set(items))

    def test_duplicated_output(self):
        @guvectorize([void(float32[:], float32[:])], '(x)->(x)', target='cuda')
        def foo(inp, out):
            pass  # intentionally empty; never executed

        inp = out = np.zeros(10, dtype=np.float32)
        with self.assertRaises(ValueError) as raises:
            foo(inp, out, out=out)

        msg = "cannot specify argument 'out' as both positional and keyword"
        self.assertEqual(str(raises.exception), msg)

    def check_tuple_arg(self, a, b):
        # Shared body of the tuple-argument tests: 'a' and 'b' are nested
        # sequences that np.asarray turns into 2-D arrays; the gufunc reduces
        # each pair of rows to their dot product.
        @guvectorize([(float64[:], float64[:], float64[:])], '(n),(n)->()',
                     target='cuda')
        def gu_reduce(x, y, r):
            s = 0
            for i in range(len(x)):
                s += x[i] * y[i]
            r[0] = s

        r = gu_reduce(a, b)
        expected = np.sum(np.asarray(a) * np.asarray(b), axis=1)
        np.testing.assert_equal(expected, r)

    def test_tuple_of_tuple_arg(self):
        a = ((1.0, 2.0, 3.0),
             (4.0, 5.0, 6.0))
        b = ((1.5, 2.5, 3.5),
             (4.5, 5.5, 6.5))
        self.check_tuple_arg(a, b)

    def test_tuple_of_namedtuple_arg(self):
        Point = namedtuple('Point', ('x', 'y', 'z'))
        a = (Point(x=1.0, y=2.0, z=3.0),
             Point(x=4.0, y=5.0, z=6.0))
        b = (Point(x=1.5, y=2.5, z=3.5),
             Point(x=4.5, y=5.5, z=6.5))
        self.check_tuple_arg(a, b)

    def test_tuple_of_array_arg(self):
        a = (np.asarray((1.0, 2.0, 3.0)),
             np.asarray((4.0, 5.0, 6.0)))
        b = (np.asarray((1.5, 2.5, 3.5)),
             np.asarray((4.5, 5.5, 6.5)))
        self.check_tuple_arg(a, b)

    def test_gufunc_name(self):
        # The gufunc must expose the name of the decorated Python function.
        gufunc = _get_matmulcore_gufunc()
        self.assertEqual(gufunc.__name__, 'matmulcore')

    def test_bad_return_type(self):
        # A signature with a non-void return type must be rejected when the
        # decorator is applied.
        with self.assertRaises(TypeError) as te:
            @guvectorize([int32(int32[:], int32[:])], '(m)->(m)', target='cuda')
            def f(x, y):
                pass

        msg = str(te.exception)
        self.assertIn('guvectorized functions cannot return values', msg)
        self.assertIn('specifies int32 return type', msg)

    def test_incorrect_number_of_pos_args(self):
        # Two inputs and one output: calls must pass either 2 or 3
        # positional arguments.
        @guvectorize([(int32[:], int32[:], int32[:])],
                     '(m),(m)->(m)', target='cuda')
        def f(x, y, z):
            pass

        arr = np.arange(5)

        # Inputs only, too few
        with self.assertRaises(TypeError) as te:
            f(arr)

        msg = str(te.exception)
        self.assertIn('gufunc accepts 2 positional arguments', msg)
        self.assertIn('or 3 positional arguments', msg)
        self.assertIn('Got 1 positional argument.', msg)

        # Inputs and outputs, too many
        with self.assertRaises(TypeError) as te:
            f(arr, arr, arr, arr)

        msg = str(te.exception)
        self.assertIn('gufunc accepts 2 positional arguments', msg)
        self.assertIn('or 3 positional arguments', msg)
        self.assertIn('Got 4 positional arguments.', msg)


@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestMultipleOutputs(CUDATestCase):
    """CUDA gufuncs whose layout declares two output arrays."""

    def test_multiple_outputs_same_type_passed_in(self):
        # Both outputs are allocated by the caller and passed positionally;
        # each one must end up equal to the input.
        @guvectorize([void(float32[:], float32[:], float32[:])],
                     '(x)->(x),(x)',
                     target='cuda')
        def copy(src, first, second):
            for idx in range(first.size):
                first[idx] = src[idx]
                second[idx] = src[idx]

        source = np.arange(1, 11, dtype=np.float32)
        out_a = np.zeros(source.shape, dtype=source.dtype)
        out_b = np.zeros(source.shape, dtype=source.dtype)
        copy(source, out_a, out_b)
        np.testing.assert_allclose(source, out_a)
        np.testing.assert_allclose(source, out_b)

    def test_multiple_outputs_distinct_values(self):
        # The two outputs receive different values, so a mix-up between
        # them would be detected.
        @guvectorize([void(float32[:], float32[:], float32[:])],
                     '(x)->(x),(x)',
                     target='cuda')
        def copy_and_double(src, same, doubled):
            for idx in range(same.size):
                same[idx] = src[idx]
                doubled[idx] = src[idx] * 2

        source = np.arange(1, 11, dtype=np.float32)
        out_same = np.zeros(source.shape, dtype=source.dtype)
        out_doubled = np.zeros(source.shape, dtype=source.dtype)
        copy_and_double(source, out_same, out_doubled)
        np.testing.assert_allclose(source, out_same)
        np.testing.assert_allclose(source * 2, out_doubled)

    def test_multiple_output_allocation(self):
        # Only the input is passed; the call is expected to hand back both
        # outputs as a pair.
        @guvectorize([void(float32[:], float32[:], float32[:])],
                     '(x)->(x),(x)',
                     target='cuda')
        def copy_and_double(src, same, doubled):
            for idx in range(same.size):
                same[idx] = src[idx]
                doubled[idx] = src[idx] * 2

        source = np.arange(1, 11, dtype=np.float32)
        results = copy_and_double(source)
        out_same, out_doubled = results
        np.testing.assert_allclose(source, out_same)
        np.testing.assert_allclose(source * 2, out_doubled)

    def test_multiple_output_dtypes(self):
        # The outputs have different element types: int32 and float64.
        @guvectorize([void(int32[:], int32[:], float64[:])],
                     '(x)->(x),(x)',
                     target='cuda')
        def copy_and_multiply(src, same, scaled):
            for idx in range(same.size):
                same[idx] = src[idx]
                scaled[idx] = src[idx] * 1.5

        source = np.arange(1, 11, dtype=np.int32)
        out_same = np.zeros(source.shape, dtype=source.dtype)
        out_scaled = np.zeros(source.shape, dtype=np.float64)
        copy_and_multiply(source, out_same, out_scaled)
        expected_scaled = source * np.float64(1.5)
        np.testing.assert_allclose(source, out_same)
        np.testing.assert_allclose(expected_scaled, out_scaled)

    def test_incorrect_number_of_pos_args(self):
        # Two inputs and two outputs: calls must pass either 2 or 4
        # positional arguments.
        @guvectorize([(int32[:], int32[:], int32[:], int32[:])],
                     '(m),(m)->(m),(m)', target='cuda')
        def f(x, y, z, w):
            pass

        arr = np.arange(5)

        def error_message(*args):
            # Call f with the given arguments and return the text of the
            # TypeError it is required to raise.
            with self.assertRaises(TypeError) as caught:
                f(*args)
            return str(caught.exception)

        # Inputs only, too few
        too_few = error_message(arr)
        for fragment in ('gufunc accepts 2 positional arguments',
                         'or 4 positional arguments',
                         'Got 1 positional argument.'):
            self.assertIn(fragment, too_few)

        # Inputs and outputs, too many
        too_many = error_message(arr, arr, arr, arr, arr)
        for fragment in ('gufunc accepts 2 positional arguments',
                         'or 4 positional arguments',
                         'Got 5 positional arguments.'):
            self.assertIn(fragment, too_many)


# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
first commit 2024-05-03 04:18:51 +03:00			`import numpy as np`

			`from collections import namedtuple`
			`from numba import void, int32, float32, float64`
			`from numba import guvectorize`
			`from numba import cuda`
			`from numba.cuda.testing import skip_on_cudasim, CUDATestCase`
			`import unittest`
			`import warnings`
			`from numba.core.errors import NumbaPerformanceWarning`
			`from numba.tests.support import override_config`


			`def _get_matmulcore_gufunc(dtype=float32):`
			`@guvectorize([void(dtype[:, :], dtype[:, :], dtype[:, :])],`
			`'(m,n),(n,p)->(m,p)',`
			`target='cuda')`
			`def matmulcore(A, B, C):`
			`m, n = A.shape`
			`n, p = B.shape`
			`for i in range(m):`
			`for j in range(p):`
			`C[i, j] = 0`
			`for k in range(n):`
			`C[i, j] += A[i, k] * B[k, j]`

			`return matmulcore`


			`@skip_on_cudasim('ufunc API unsupported in the simulator')`
			`class TestCUDAGufunc(CUDATestCase):`

			`def test_gufunc_small(self):`

			`gufunc = _get_matmulcore_gufunc()`

			`matrix_ct = 2`
			`A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,`
			`4)`
			`B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,`
			`5)`

			`C = gufunc(A, B)`
			`Gold = np.matmul(A, B)`
			`self.assertTrue(np.allclose(C, Gold))`

			`def test_gufunc_auto_transfer(self):`

			`gufunc = _get_matmulcore_gufunc()`

			`matrix_ct = 2`
			`A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,`
			`4)`
			`B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,`
			`5)`

			`dB = cuda.to_device(B)`

			`C = gufunc(A, dB).copy_to_host()`
			`Gold = np.matmul(A, B)`
			`self.assertTrue(np.allclose(C, Gold))`

			`def test_gufunc(self):`

			`gufunc = _get_matmulcore_gufunc()`

			`matrix_ct = 1001 # an odd number to test thread/block division in CUDA`
			`A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,`
			`4)`
			`B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,`
			`5)`

			`C = gufunc(A, B)`
			`Gold = np.matmul(A, B)`
			`self.assertTrue(np.allclose(C, Gold))`

			`def test_gufunc_hidim(self):`

			`gufunc = _get_matmulcore_gufunc()`

			`matrix_ct = 100 # an odd number to test thread/block division in CUDA`
			`A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(4, 25, 2, 4)`
			`B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(4, 25, 4, 5)`

			`C = gufunc(A, B)`
			`Gold = np.matmul(A, B)`
			`self.assertTrue(np.allclose(C, Gold))`

			`def test_gufunc_new_axis(self):`

			`gufunc = _get_matmulcore_gufunc(dtype=float64)`

			`X = np.random.randn(10, 3, 3)`
			`Y = np.random.randn(3, 3)`

			`gold = np.matmul(X, Y)`

			`res1 = gufunc(X, Y)`
			`np.testing.assert_allclose(gold, res1)`

			`res2 = gufunc(X, np.tile(Y, (10, 1, 1)))`
			`np.testing.assert_allclose(gold, res2)`

			`def test_gufunc_stream(self):`

			`gufunc = _get_matmulcore_gufunc()`

			`#cuda.driver.flush_pending_free()`
			`matrix_ct = 1001 # an odd number to test thread/block division in CUDA`
			`A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,`
			`4)`
			`B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,`
			`5)`

			`stream = cuda.stream()`
			`dA = cuda.to_device(A, stream)`
			`dB = cuda.to_device(B, stream)`

			`dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)`
			`dC = gufunc(dA, dB, out=dC, stream=stream)`
			`C = dC.copy_to_host(stream=stream)`
			`stream.synchronize()`

			`Gold = np.matmul(A, B)`

			`self.assertTrue(np.allclose(C, Gold))`

			`def test_copy(self):`

			`@guvectorize([void(float32[:], float32[:])],`
			`'(x)->(x)',`
			`target='cuda')`
			`def copy(A, B):`
			`for i in range(B.size):`
			`B[i] = A[i]`

			`A = np.arange(10, dtype=np.float32) + 1`
			`B = np.zeros_like(A)`
			`copy(A, out=B)`
			`np.testing.assert_allclose(A, B)`

			`def test_copy_unspecified_return(self):`
			`# Ensure that behaviour is correct when the return type is not`
			`# specified in the signature.`
			`@guvectorize([(float32[:], float32[:])],`
			`'(x)->(x)',`
			`target='cuda')`
			`def copy(A, B):`
			`for i in range(B.size):`
			`B[i] = A[i]`

			`A = np.arange(10, dtype=np.float32) + 1`
			`B = np.zeros_like(A)`
			`copy(A, out=B)`
			`self.assertTrue(np.allclose(A, B))`

			`def test_copy_odd(self):`

			`@guvectorize([void(float32[:], float32[:])],`
			`'(x)->(x)',`
			`target='cuda')`
			`def copy(A, B):`
			`for i in range(B.size):`
			`B[i] = A[i]`

			`A = np.arange(11, dtype=np.float32) + 1`
			`B = np.zeros_like(A)`
			`copy(A, out=B)`
			`self.assertTrue(np.allclose(A, B))`

			`def test_copy2d(self):`

			`@guvectorize([void(float32[:, :], float32[:, :])],`
			`'(x, y)->(x, y)',`
			`target='cuda')`
			`def copy2d(A, B):`
			`for x in range(B.shape[0]):`
			`for y in range(B.shape[1]):`
			`B[x, y] = A[x, y]`

			`A = np.arange(30, dtype=np.float32).reshape(5, 6) + 1`
			`B = np.zeros_like(A)`
			`copy2d(A, out=B)`
			`self.assertTrue(np.allclose(A, B))`

			`# Test inefficient use of the GPU where the inputs are all mapped onto a`
			`# single thread in a single block.`
			`def test_inefficient_launch_configuration(self):`
			`@guvectorize(['void(float32[:], float32[:], float32[:])'],`
			`'(n),(n)->(n)', target='cuda')`
			`def numba_dist_cuda(a, b, dist):`
			`len = a.shape[0]`
			`for i in range(len):`
			`dist[i] = a[i] * b[i]`

			`a = np.random.rand(1024 * 32).astype('float32')`
			`b = np.random.rand(1024 * 32).astype('float32')`
			`dist = np.zeros(a.shape[0]).astype('float32')`

			`with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1):`
			`with warnings.catch_warnings(record=True) as w:`
			`numba_dist_cuda(a, b, dist)`
			`self.assertEqual(w[0].category, NumbaPerformanceWarning)`
			`self.assertIn('Grid size', str(w[0].message))`
			`self.assertIn('low occupancy', str(w[0].message))`

			`def test_efficient_launch_configuration(self):`
			`@guvectorize(['void(float32[:], float32[:], float32[:])'],`
			`'(n),(n)->(n)', nopython=True, target='cuda')`
			`def numba_dist_cuda2(a, b, dist):`
			`len = a.shape[0]`
			`for i in range(len):`
			`dist[i] = a[i] * b[i]`

			`a = np.random.rand(524288 * 2).astype('float32').\`
			`reshape((524288, 2))`
			`b = np.random.rand(524288 * 2).astype('float32').\`
			`reshape((524288, 2))`
			`dist = np.zeros_like(a)`

			`with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1):`
			`with warnings.catch_warnings(record=True) as w:`
			`numba_dist_cuda2(a, b, dist)`
			`self.assertEqual(len(w), 0)`

			`def test_nopython_flag(self):`

			`def foo(A, B):`
			`pass`

			`# nopython = True is fine`
			`guvectorize([void(float32[:], float32[:])], '(x)->(x)', target='cuda',`
			`nopython=True)(foo)`

			`# nopython = False is bad`
			`with self.assertRaises(TypeError) as raises:`
			`guvectorize([void(float32[:], float32[:])], '(x)->(x)',`
			`target='cuda', nopython=False)(foo)`
			`self.assertEqual("nopython flag must be True", str(raises.exception))`

			`def test_invalid_flags(self):`
			`# Check invalid flags`
			`def foo(A, B):`
			`pass`

			`with self.assertRaises(TypeError) as raises:`
			`guvectorize([void(float32[:], float32[:])], '(x)->(x)',`
			`target='cuda', what1=True, ever2=False)(foo)`
			`head = "The following target options are not supported:"`
			`msg = str(raises.exception)`
			`self.assertEqual(msg[:len(head)], head)`
			`items = msg[len(head):].strip().split(',')`
			`items = [i.strip("'\" ") for i in items]`
			`self.assertEqual(set(['what1', 'ever2']), set(items))`

			`def test_duplicated_output(self):`
			`@guvectorize([void(float32[:], float32[:])], '(x)->(x)', target='cuda')`
			`def foo(inp, out):`
			`pass # intentionally empty; never executed`

			`inp = out = np.zeros(10, dtype=np.float32)`
			`with self.assertRaises(ValueError) as raises:`
			`foo(inp, out, out=out)`

			`msg = "cannot specify argument 'out' as both positional and keyword"`
			`self.assertEqual(str(raises.exception), msg)`

			`def check_tuple_arg(self, a, b):`
			`@guvectorize([(float64[:], float64[:], float64[:])], '(n),(n)->()',`
			`target='cuda')`
			`def gu_reduce(x, y, r):`
			`s = 0`
			`for i in range(len(x)):`
			`s += x[i] * y[i]`
			`r[0] = s`

			`r = gu_reduce(a, b)`
			`expected = np.sum(np.asarray(a) * np.asarray(b), axis=1)`
			`np.testing.assert_equal(expected, r)`

			`def test_tuple_of_tuple_arg(self):`
			`a = ((1.0, 2.0, 3.0),`
			`(4.0, 5.0, 6.0))`
			`b = ((1.5, 2.5, 3.5),`
			`(4.5, 5.5, 6.5))`
			`self.check_tuple_arg(a, b)`

			`def test_tuple_of_namedtuple_arg(self):`
			`Point = namedtuple('Point', ('x', 'y', 'z'))`
			`a = (Point(x=1.0, y=2.0, z=3.0),`
			`Point(x=4.0, y=5.0, z=6.0))`
			`b = (Point(x=1.5, y=2.5, z=3.5),`
			`Point(x=4.5, y=5.5, z=6.5))`
			`self.check_tuple_arg(a, b)`

			`def test_tuple_of_array_arg(self):`
			`a = (np.asarray((1.0, 2.0, 3.0)),`
			`np.asarray((4.0, 5.0, 6.0)))`
			`b = (np.asarray((1.5, 2.5, 3.5)),`
			`np.asarray((4.5, 5.5, 6.5)))`
			`self.check_tuple_arg(a, b)`

			`def test_gufunc_name(self):`
			`gufunc = _get_matmulcore_gufunc()`
			`self.assertEqual(gufunc.__name__, 'matmulcore')`

			`def test_bad_return_type(self):`
			`with self.assertRaises(TypeError) as te:`
			`@guvectorize([int32(int32[:], int32[:])], '(m)->(m)', target='cuda')`
			`def f(x, y):`
			`pass`

			`msg = str(te.exception)`
			`self.assertIn('guvectorized functions cannot return values', msg)`
			`self.assertIn('specifies int32 return type', msg)`

			`def test_incorrect_number_of_pos_args(self):`
			`@guvectorize([(int32[:], int32[:], int32[:])],`
			`'(m),(m)->(m)', target='cuda')`
			`def f(x, y, z):`
			`pass`

			`arr = np.arange(5)`

			`# Inputs only, too few`
			`with self.assertRaises(TypeError) as te:`
			`f(arr)`

			`msg = str(te.exception)`
			`self.assertIn('gufunc accepts 2 positional arguments', msg)`
			`self.assertIn('or 3 positional arguments', msg)`
			`self.assertIn('Got 1 positional argument.', msg)`

			`# Inputs and outputs, too many`
			`with self.assertRaises(TypeError) as te:`
			`f(arr, arr, arr, arr)`

			`msg = str(te.exception)`
			`self.assertIn('gufunc accepts 2 positional arguments', msg)`
			`self.assertIn('or 3 positional arguments', msg)`
			`self.assertIn('Got 4 positional arguments.', msg)`


			`@skip_on_cudasim('ufunc API unsupported in the simulator')`
			`class TestMultipleOutputs(CUDATestCase):`
			`def test_multiple_outputs_same_type_passed_in(self):`
			`@guvectorize([void(float32[:], float32[:], float32[:])],`
			`'(x)->(x),(x)',`
			`target='cuda')`
			`def copy(A, B, C):`
			`for i in range(B.size):`
			`B[i] = A[i]`
			`C[i] = A[i]`

			`A = np.arange(10, dtype=np.float32) + 1`
			`B = np.zeros_like(A)`
			`C = np.zeros_like(A)`
			`copy(A, B, C)`
			`np.testing.assert_allclose(A, B)`
			`np.testing.assert_allclose(A, C)`

			`def test_multiple_outputs_distinct_values(self):`

			`@guvectorize([void(float32[:], float32[:], float32[:])],`
			`'(x)->(x),(x)',`
			`target='cuda')`
			`def copy_and_double(A, B, C):`
			`for i in range(B.size):`
			`B[i] = A[i]`
			`C[i] = A[i] * 2`

			`A = np.arange(10, dtype=np.float32) + 1`
			`B = np.zeros_like(A)`
			`C = np.zeros_like(A)`
			`copy_and_double(A, B, C)`
			`np.testing.assert_allclose(A, B)`
			`np.testing.assert_allclose(A * 2, C)`

			`def test_multiple_output_allocation(self):`
			`@guvectorize([void(float32[:], float32[:], float32[:])],`
			`'(x)->(x),(x)',`
			`target='cuda')`
			`def copy_and_double(A, B, C):`
			`for i in range(B.size):`
			`B[i] = A[i]`
			`C[i] = A[i] * 2`

			`A = np.arange(10, dtype=np.float32) + 1`
			`B, C = copy_and_double(A)`
			`np.testing.assert_allclose(A, B)`
			`np.testing.assert_allclose(A * 2, C)`

			`def test_multiple_output_dtypes(self):`

			`@guvectorize([void(int32[:], int32[:], float64[:])],`
			`'(x)->(x),(x)',`
			`target='cuda')`
			`def copy_and_multiply(A, B, C):`
			`for i in range(B.size):`
			`B[i] = A[i]`
			`C[i] = A[i] * 1.5`

			`A = np.arange(10, dtype=np.int32) + 1`
			`B = np.zeros_like(A)`
			`C = np.zeros_like(A, dtype=np.float64)`
			`copy_and_multiply(A, B, C)`
			`np.testing.assert_allclose(A, B)`
			`np.testing.assert_allclose(A * np.float64(1.5), C)`

			`def test_incorrect_number_of_pos_args(self):`
			`@guvectorize([(int32[:], int32[:], int32[:], int32[:])],`
			`'(m),(m)->(m),(m)', target='cuda')`
			`def f(x, y, z, w):`
			`pass`

			`arr = np.arange(5)`

			`# Inputs only, too few`
			`with self.assertRaises(TypeError) as te:`
			`f(arr)`

			`msg = str(te.exception)`
			`self.assertIn('gufunc accepts 2 positional arguments', msg)`
			`self.assertIn('or 4 positional arguments', msg)`
			`self.assertIn('Got 1 positional argument.', msg)`

			`# Inputs and outputs, too many`
			`with self.assertRaises(TypeError) as te:`
			`f(arr, arr, arr, arr, arr)`

			`msg = str(te.exception)`
			`self.assertIn('gufunc accepts 2 positional arguments', msg)`
			`self.assertIn('or 4 positional arguments', msg)`
			`self.assertIn('Got 5 positional arguments.', msg)`


			`if __name__ == '__main__':`
			`unittest.main()`