# numba/cuda/tests/cudapy/test_casting.py


import numpy as np
from numba.cuda import compile_ptx
from numba.core.types import f2, i1, i2, i4, i8, u1, u2, u4, u8
from numba import cuda
from numba.core import types
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
skip_unless_cc_53)
from numba.types import float16, float32
import itertools
import unittest


def native_cast(x):
    return float(x)


def to_int8(x):
    return np.int8(x)


def to_int16(x):
    return np.int16(x)


def to_int32(x):
    return np.int32(x)


def to_int64(x):
    return np.int64(x)


def to_uint8(x):
    return np.uint8(x)


def to_uint16(x):
    return np.uint16(x)


def to_uint32(x):
    return np.uint32(x)


def to_uint64(x):
    return np.uint64(x)


def to_float16(x):
    # When division and operators on float16 types are supported, this should
    # be changed to match the implementation in to_float32.
    return (np.float16(x) * np.float16(0.5))


def to_float32(x):
    return np.float32(x) / np.float32(2)


def to_float64(x):
    return np.float64(x) / np.float64(2)


def to_complex64(x):
    return np.complex64(x)


def to_complex128(x):
    return np.complex128(x)


# Since multiplication of float16 is not supported via the operator * on
# float16s yet, and the host does not implement cuda.fp16.*, we need two
# versions of the following functions:
#
# - The device version uses cuda.fp16.hmul
# - The host version uses the * operator


def cuda_int_literal_to_float16(x):
    # Note that we need to use `2` and not `np.float16(2)` to ensure that this
    # types as a literal int and not a const float16.
    return cuda.fp16.hmul(np.float16(x), 2)


def reference_int_literal_to_float16(x):
    return np.float16(x) * np.float16(2)


def cuda_float_literal_to_float16(x):
    # Note that `2.5` types as a const float64 and not a literal float, but
    # this case is provided in case that changes in future.
    return cuda.fp16.hmul(np.float16(x), 2.5)


def reference_float_literal_to_float16(x):
    return np.float16(x) * np.float16(2.5)
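

# Illustrative sketch (not part of the original test suite): one way the
# device and host float16 helpers above could be compared outside unittest,
# using the same wrap-and-launch pattern as TestCasting._create_wrapped below.
# The function name is hypothetical; it assumes a GPU with compute capability
# >= 5.3 (required for float16 arithmetic) and is never called at import time.
def _example_compare_fp16_literal_mul(value=3.0):
    device_mul = cuda.jit(device=True)(cuda_int_literal_to_float16)

    @cuda.jit
    def kernel(arg, res):
        res[0] = device_mul(arg[0])

    arg = np.array([value], dtype=np.float16)
    res = np.zeros(1, dtype=np.float16)
    kernel[1, 1](arg, res)
    # The host reference uses the * operator, which is why two versions of
    # each helper exist.
    return res[0], reference_int_literal_to_float16(np.float16(value))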


class TestCasting(CUDATestCase):
    def _create_wrapped(self, pyfunc, intype, outtype):
        wrapped_func = cuda.jit(device=True)(pyfunc)

        @cuda.jit
        def cuda_wrapper_fn(arg, res):
            res[0] = wrapped_func(arg[0])

        def wrapper_fn(arg):
            argarray = np.zeros(1, dtype=intype)
            argarray[0] = arg
            resarray = np.zeros(1, dtype=outtype)
            cuda_wrapper_fn[1, 1](argarray, resarray)
            return resarray[0]

        return wrapper_fn

    @skip_unless_cc_53
def test_float_to_int(self):
pyfuncs = (to_int8, to_int16, to_int32, to_int64)
totys = (np.int8, np.int16, np.int32, np.int64)
fromtys = (np.float16, np.float32, np.float64)
for pyfunc, toty in zip(pyfuncs, totys):
for fromty in fromtys:
with self.subTest(fromty=fromty, toty=toty):
cfunc = self._create_wrapped(pyfunc, fromty, toty)
self.assertEqual(cfunc(12.3), pyfunc(12.3))
self.assertEqual(cfunc(12.3), int(12.3))
self.assertEqual(cfunc(-12.3), pyfunc(-12.3))
self.assertEqual(cfunc(-12.3), int(-12.3))

    @skip_on_cudasim('Compilation unsupported in the simulator')
def test_float16_to_int_ptx(self):
pyfuncs = (to_int8, to_int16, to_int32, to_int64)
sizes = (8, 16, 32, 64)
for pyfunc, size in zip(pyfuncs, sizes):
ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
self.assertIn(f"cvt.rni.s{size}.f16", ptx)

    @skip_unless_cc_53
def test_float_to_uint(self):
        pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
totys = (np.uint8, np.uint16, np.uint32, np.uint64)
fromtys = (np.float16, np.float32, np.float64)
for pyfunc, toty in zip(pyfuncs, totys):
for fromty in fromtys:
with self.subTest(fromty=fromty, toty=toty):
cfunc = self._create_wrapped(pyfunc, fromty, toty)
self.assertEqual(cfunc(12.3), pyfunc(12.3))
self.assertEqual(cfunc(12.3), int(12.3))

    @skip_on_cudasim('Compilation unsupported in the simulator')
def test_float16_to_uint_ptx(self):
pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
sizes = (8, 16, 32, 64)
for pyfunc, size in zip(pyfuncs, sizes):
ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
self.assertIn(f"cvt.rni.u{size}.f16", ptx)

    @skip_unless_cc_53
def test_int_to_float(self):
pyfuncs = (to_float16, to_float32, to_float64)
totys = (np.float16, np.float32, np.float64)
for pyfunc, toty in zip(pyfuncs, totys):
with self.subTest(toty=toty):
cfunc = self._create_wrapped(pyfunc, np.int64, toty)
self.assertEqual(cfunc(321), pyfunc(321))

    @skip_unless_cc_53
def test_literal_to_float16(self):
cudafuncs = (cuda_int_literal_to_float16,
cuda_float_literal_to_float16)
hostfuncs = (reference_int_literal_to_float16,
reference_float_literal_to_float16)
for cudafunc, hostfunc in zip(cudafuncs, hostfuncs):
with self.subTest(func=cudafunc):
cfunc = self._create_wrapped(cudafunc, np.float16, np.float16)
self.assertEqual(cfunc(321), hostfunc(321))

    @skip_on_cudasim('Compilation unsupported in the simulator')
def test_int_to_float16_ptx(self):
fromtys = (i1, i2, i4, i8)
sizes = (8, 16, 32, 64)
for ty, size in zip(fromtys, sizes):
ptx, _ = compile_ptx(to_float16, (ty,), device=True)
self.assertIn(f"cvt.rn.f16.s{size}", ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
def test_uint_to_float16_ptx(self):
fromtys = (u1, u2, u4, u8)
sizes = (8, 16, 32, 64)
for ty, size in zip(fromtys, sizes):
ptx, _ = compile_ptx(to_float16, (ty,), device=True)
self.assertIn(f"cvt.rn.f16.u{size}", ptx)

    @skip_unless_cc_53
def test_float_to_float(self):
pyfuncs = (to_float16, to_float32, to_float64)
tys = (np.float16, np.float32, np.float64)
for (pyfunc, fromty), toty in itertools.product(zip(pyfuncs, tys), tys):
with self.subTest(fromty=fromty, toty=toty):
cfunc = self._create_wrapped(pyfunc, fromty, toty)
# For this test we cannot use the pyfunc for comparison because
# the CUDA target doesn't yet implement division (or operators)
# for float16 values, so we test by comparing with the computed
# expression instead.
np.testing.assert_allclose(cfunc(12.3),
toty(12.3) / toty(2), rtol=0.0003)
np.testing.assert_allclose(cfunc(-12.3),
toty(-12.3) / toty(2), rtol=0.0003)

    @skip_on_cudasim('Compilation unsupported in the simulator')
def test_float16_to_float_ptx(self):
pyfuncs = (to_float32, to_float64)
postfixes = ("f32", "f64")
for pyfunc, postfix in zip(pyfuncs, postfixes):
ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
self.assertIn(f"cvt.{postfix}.f16", ptx)

    @skip_unless_cc_53
def test_float_to_complex(self):
pyfuncs = (to_complex64, to_complex128)
totys = (np.complex64, np.complex128)
fromtys = (np.float16, np.float32, np.float64)
for pyfunc, toty in zip(pyfuncs, totys):
for fromty in fromtys:
with self.subTest(fromty=fromty, toty=toty):
cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    # Here we need to explicitly cast the input to the pyfunc
                    # to match the casting that is automatically applied when
                    # passing the input to the cfunc as part of wrapping it in
                    # an array of type fromty.
np.testing.assert_allclose(cfunc(3.21),
pyfunc(fromty(3.21)))
np.testing.assert_allclose(cfunc(-3.21),
pyfunc(fromty(-3.21)) + 0j)

    @skip_on_cudasim('Compilation unsupported in the simulator')
def test_native_cast(self):
float32_ptx, _ = cuda.compile_ptx(native_cast, (float32,), device=True)
self.assertIn("st.f32", float32_ptx)
float16_ptx, _ = cuda.compile_ptx(native_cast, (float16,), device=True)
self.assertIn("st.u16", float16_ptx)


if __name__ == '__main__':
unittest.main()