# numba/cuda/tests/cudapy/test_caching.py

import multiprocessing
import os
import shutil
import subprocess
import sys
import unittest
import warnings

from numba import cuda
from numba.core.errors import NumbaWarning
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
                                skip_unless_cc_60, skip_if_cudadevrt_missing,
                                skip_if_mvc_enabled, test_data_dir)
from numba.tests.support import SerialMixin
from numba.tests.test_caching import (DispatcherCacheUsecasesTest,
                                      skip_bad_access)


@skip_on_cudasim('Simulator does not implement caching')
class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_caching_test_fodder"
    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_caching(self):
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)
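
        # The on-disk cache consists of one index file per function plus one
        # data file per compiled signature (the .nbi and .nbc files in
        # __pycache__); the counts in the comments below follow that
        # breakdown.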
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(2)  # 1 index, 1 data
        self.assertPreciseEqual(f(2.5, 3), 6.5)
        self.check_pycache(3)  # 1 index, 2 data
        self.check_hits(f.func, 0, 2)

        f = mod.record_return_aligned
        rec = f(mod.aligned_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))

        f = mod.record_return_packed
        rec = f(mod.packed_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))
        self.check_pycache(6)  # 2 index, 4 data
        self.check_hits(f.func, 0, 2)

        # Check the code runs ok from another process
        self.run_in_separate_process()

    def test_no_caching(self):
        mod = self.import_module()

        f = mod.add_nocache_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(0)

    def test_many_locals(self):
        # Declaring many local arrays creates a very large LLVM IR, which
        # cannot be pickled due to the level of recursion it requires to
        # pickle. This test ensures that kernels with many locals (and
        # therefore large IR) can be cached. See Issue #8373:
        # https://github.com/numba/numba/issues/8373
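        # (For illustration only: the usecase kernel is assumed to look
        # something like
        #
        #     @cuda.jit(cache=True)
        #     def many_locals():
        #         aa = cuda.local.array((1, 1), float64)
        #         ab = cuda.local.array((1, 1), float64)
        #         ...  # repeated for many more locals
        #
        # so that the generated IR is large enough to trigger the issue.)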
        self.check_pycache(0)
        mod = self.import_module()
        f = mod.many_locals
        f[1, 1]()
        self.check_pycache(2)  # 1 index, 1 data

    def test_closure(self):
        mod = self.import_module()

        with warnings.catch_warnings():
            warnings.simplefilter('error', NumbaWarning)

            f = mod.closure1
            self.assertPreciseEqual(f(3), 6)  # 3 + 3 = 6
            f = mod.closure2
            self.assertPreciseEqual(f(3), 8)  # 3 + 5 = 8
            f = mod.closure3
            self.assertPreciseEqual(f(3), 10)  # 3 + 7 = 10
            f = mod.closure4
            self.assertPreciseEqual(f(3), 12)  # 3 + 9 = 12
            self.check_pycache(5)  # 1 nbi, 4 nbc

    def test_cache_reuse(self):
        mod = self.import_module()
        mod.add_usecase(2, 3)
        mod.add_usecase(2.5, 3.5)
        mod.outer_uncached(2, 3)
        mod.outer(2, 3)
        mod.record_return_packed(mod.packed_arr, 0)
        mod.record_return_aligned(mod.aligned_arr, 1)
        mod.simple_usecase_caller(2)
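
        # Snapshot the cache files' modification times: if later runs really
        # hit the cache, no file is rewritten and these mtimes must stay
        # identical.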
        mtimes = self.get_cache_mtimes()
        # Two signatures compiled
        self.check_hits(mod.add_usecase.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f = mod2.add_usecase
        f(2, 3)
        self.check_hits(f.func, 1, 0)
        f(2.5, 3.5)
        self.check_hits(f.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)

    def test_cache_invalidate(self):
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        # This should change the functions' results
        with open(self.modfile, "a") as f:
            f.write("\nZ = 10\n")
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_recompile(self):
        # Explicit call to recompile() should overwrite the cache
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        mod = self.import_module()
        f = mod.add_usecase
        mod.Z = 10
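        # The global has changed in memory, but the cached version was
        # compiled against the old value, so the result stays at 6 until
        # recompile() overwrites the cache entry.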
        self.assertPreciseEqual(f(2, 3), 6)
        f.func.recompile()
        self.assertPreciseEqual(f(2, 3), 15)

        # Freshly recompiled version is re-used from other imports
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_same_names(self):
        # Functions with the same name should still be disambiguated in
        # the cache
        mod = self.import_module()
        f = mod.renamed_function1
        self.assertPreciseEqual(f(2), 4)
        f = mod.renamed_function2
        self.assertPreciseEqual(f(2), 8)

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg(self):
        # Functions using cooperative groups should be cacheable. See Issue
        # #8888: https://github.com/numba/numba/issues/8888
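        # (cg_usecase is assumed to perform a grid-wide sync, e.g. via
        # cuda.cg.this_grid().sync(), which is why cooperative launch
        # support - CC 6.0+ and cudadevrt - is required here.)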
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        mod.cg_usecase(0)
        self.check_pycache(2)  # 1 index, 1 data

        # Check the code runs ok from another process
        self.run_in_separate_process()

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg_clean_run(self):
        # See Issue #9432: https://github.com/numba/numba/issues/9432
        # If a cached function using CG sync was the first thing to compile,
        # the compile would fail.
        self.check_pycache(0)

        # This logic is modelled on run_in_separate_process(), but executes
        # the CG usecase directly in the subprocess.
        code = """if 1:
            import sys

            sys.path.insert(0, %(tempdir)r)
            mod = __import__(%(modname)r)
            mod.cg_usecase(0)
            """ % dict(tempdir=self.tempdir, modname=self.modname)

        popen = subprocess.Popen([sys.executable, "-c", code],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        out, err = popen.communicate(timeout=60)
        if popen.returncode != 0:
            raise AssertionError(
                "process failed with code %s: \n"
                "stdout follows\n%s\n"
                "stderr follows\n%s\n"
                % (popen.returncode, out.decode(), err.decode()),
            )

    def _test_pycache_fallback(self):
        """
        With a disabled __pycache__, test there is a working fallback
        (e.g. on the user-wide cache dir)
        """
        mod = self.import_module()
        f = mod.add_usecase
        # Remove this function's cache files at the end, to avoid accumulation
        # across test calls.
        self.addCleanup(shutil.rmtree, f.func.stats.cache_path,
                        ignore_errors=True)

        self.assertPreciseEqual(f(2, 3), 6)
        # It's a cache miss since the file was copied to a new temp location
        self.check_hits(f.func, 0, 1)

        # Test re-use
        mod2 = self.import_module()
        f = mod2.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_hits(f.func, 1, 0)

        # The __pycache__ is empty (otherwise the test's preconditions
        # wouldn't be met)
        self.check_pycache(0)

    @skip_bad_access
    @unittest.skipIf(os.name == "nt",
                     "cannot easily make a directory read-only on Windows")
    def test_non_creatable_pycache(self):
        # Make it impossible to create the __pycache__ directory
        old_perms = os.stat(self.tempdir).st_mode
        os.chmod(self.tempdir, 0o500)
        self.addCleanup(os.chmod, self.tempdir, old_perms)

        self._test_pycache_fallback()

    @skip_bad_access
    @unittest.skipIf(os.name == "nt",
                     "cannot easily make a directory read-only on Windows")
    def test_non_writable_pycache(self):
        # Make it impossible to write to the __pycache__ directory
        pycache = os.path.join(self.tempdir, '__pycache__')
        os.mkdir(pycache)
        old_perms = os.stat(pycache).st_mode
        os.chmod(pycache, 0o500)
        self.addCleanup(os.chmod, pycache, old_perms)

        self._test_pycache_fallback()

    def test_cannot_cache_linking_libraries(self):
        link = str(test_data_dir / 'jitlink.ptx')
        msg = 'Cannot pickle CUDACodeLibrary with linking files'
        with self.assertRaisesRegex(RuntimeError, msg):
            @cuda.jit('void()', cache=True, link=[link])
            def f():
                pass


@skip_on_cudasim('Simulator does not implement caching')
class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_with_cpu_usecases.py")
    modname = "cuda_and_cpu_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cpu_and_cuda_targets(self):
        # The same function jitted for CPU and CUDA targets should maintain
        # separate caches for each target.
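        # (assign_cpu and assign_cuda are assumed to wrap the same underlying
        # Python function, jitted once per target; this is why the counts
        # below show a single shared index file while each target's compiled
        # signature contributes its own data file.)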
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)
        f_cpu = mod.assign_cpu
        f_cuda = mod.assign_cuda

        self.assertPreciseEqual(f_cpu(5), 5)
        self.check_pycache(2)  # 1 index, 1 data
        self.assertPreciseEqual(f_cuda(5), 5)
        self.check_pycache(3)  # 1 index, 2 data
        self.check_hits(f_cpu.func, 0, 1)
        self.check_hits(f_cuda.func, 0, 1)

        self.assertPreciseEqual(f_cpu(5.5), 5.5)
        self.check_pycache(4)  # 1 index, 3 data
        self.assertPreciseEqual(f_cuda(5.5), 5.5)
        self.check_pycache(5)  # 1 index, 4 data
        self.check_hits(f_cpu.func, 0, 2)
        self.check_hits(f_cuda.func, 0, 2)

    def test_cpu_and_cuda_reuse(self):
        # Existing cache files for the CPU and CUDA targets are reused.
        mod = self.import_module()
        mod.assign_cpu(5)
        mod.assign_cpu(5.5)
        mod.assign_cuda(5)
        mod.assign_cuda(5.5)

        mtimes = self.get_cache_mtimes()

        # Two signatures compiled
        self.check_hits(mod.assign_cpu.func, 0, 2)
        self.check_hits(mod.assign_cuda.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f_cpu = mod2.assign_cpu
        f_cuda = mod2.assign_cuda

        f_cpu(2)
        self.check_hits(f_cpu.func, 1, 0)
        f_cpu(2.5)
        self.check_hits(f_cpu.func, 2, 0)
        f_cuda(2)
        self.check_hits(f_cuda.func, 1, 0)
        f_cuda(2.5)
        self.check_hits(f_cuda.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)


def get_different_cc_gpus():
    # Find two GPUs with different Compute Capabilities and return them as a
    # tuple. If two GPUs with distinct Compute Capabilities cannot be found,
    # then None is returned.
    first_gpu = cuda.gpus[0]
    with first_gpu:
        first_cc = cuda.current_context().device.compute_capability

    for gpu in cuda.gpus[1:]:
        with gpu:
            cc = cuda.current_context().device.compute_capability
            if cc != first_cc:
                return (first_gpu, gpu)

    return None
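

# The GPU objects returned by get_different_cc_gpus() are context managers:
# TestMultiCCCaching below enters one with `with gpus[i]:` so that kernels
# compile for (and run on) that device's compute capability.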


@skip_on_cudasim('Simulator does not implement caching')
class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_multi_cc_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cache(self):
        gpus = get_different_cc_gpus()
        if not gpus:
            self.skipTest('Need two different CCs for multi-CC cache test')

        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        # Step 1. Populate the cache with the first GPU
        with gpus[0]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(2)  # 1 index, 1 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(3)  # 1 index, 2 data
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # 2 index, 4 data
            self.check_hits(f.func, 0, 2)

        # Step 2. Run with the second GPU - under present behaviour this
        # doesn't further populate the cache.
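        # (This is presumably because mod's dispatchers already hold
        # in-memory compilation results for these signatures, so no new
        # cache files are written; the unchanged counts below verify this.)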
        with gpus[1]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(6)  # cache unchanged
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

        # Step 3. Run in a separate module with the second GPU - this
        # populates the cache for the second CC.
        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)

        with gpus[1]:
            f = mod2.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(7)  # 2 index, 5 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(8)  # 2 index, 6 data
            self.check_hits(f.func, 0, 2)

            f = mod2.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod2.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(10)  # 2 index, 8 data
            self.check_hits(f.func, 0, 2)

        # The following steps check that we can use the NVVM IR loaded from
        # the cache to generate PTX for a different compute capability to the
        # cached cubin's CC. To check this, we create another module that
        # loads the cached version containing a cubin for GPU 1. There will
        # be no cubin for GPU 0, so when we try to use it the PTX must be
        # generated.
        mod3 = self.import_module()
        self.assertIsNot(mod, mod3)

        # Step 4. Run with GPU 1 and get a cache hit, loading the cache
        # created during Step 3.
        with gpus[1]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

        # Step 5. Run with GPU 0 using the module from Step 4, to force PTX
        # generation from cached NVVM IR.
        with gpus[0]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))


def child_initializer():
    # Disable occupancy and implicit copy warnings in processes in a
    # multiprocessing pool.
    from numba.core import config
    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
    config.CUDA_WARN_ON_IMPLICIT_COPY = 0
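

# A Pool initializer runs once at the start of each worker process, so the
# config flags above are set in every child; setting them in the parent
# alone would not affect workers started with the 'spawn' method.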


@skip_on_cudasim('Simulator does not implement caching')
class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):

    # Nested multiprocessing.Pool raises AssertionError:
    # "daemonic processes are not allowed to have children"
    _numba_parallel_test_ = False

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_mp_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_multiprocessing(self):
        # Check caching works from multiple processes at once (#2028)
        mod = self.import_module()
        # Calling a pure Python caller of the JIT-compiled function is
        # necessary to reproduce the issue.
        f = mod.simple_usecase_caller
        n = 3
        try:
            ctx = multiprocessing.get_context('spawn')
        except AttributeError:
            ctx = multiprocessing
        pool = ctx.Pool(n, child_initializer)
        try:
            res = sum(pool.imap(f, range(n)))
        finally:
            pool.close()
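        # (Assuming simple_usecase_caller(i) returns i, the results sum to
        # the arithmetic series 0 + 1 + ... + (n - 1) == n * (n - 1) // 2,
        # i.e. 3 for n == 3.)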
        self.assertEqual(res, n * (n - 1) // 2)


@skip_on_cudasim('Simulator does not implement the CUDACodeLibrary')
class TestCUDACodeLibrary(CUDATestCase):
    # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to
    # explicitly check

    def test_cannot_serialize_unfinalized(self):
        # The CUDA codegen fails to import under the simulator, so we cannot
        # import it at the top level
        from numba.cuda.codegen import CUDACodeLibrary
        # Usually a CodeLibrary requires a real CodeGen, but since we don't
        # interact with it, anything will do
        codegen = object()
        name = 'library'
        cl = CUDACodeLibrary(codegen, name)
        with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'):
            cl._reduce_states()