import multiprocessing
import os
import shutil
import subprocess
import sys
import unittest
import warnings

from numba import cuda
from numba.core.errors import NumbaWarning
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
                                skip_unless_cc_60, skip_if_cudadevrt_missing,
                                skip_if_mvc_enabled, test_data_dir)
from numba.tests.support import SerialMixin
from numba.tests.test_caching import (DispatcherCacheUsecasesTest,
                                      skip_bad_access)


@skip_on_cudasim('Simulator does not implement caching')
class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_caching(self):
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(2)  # 1 index, 1 data
        self.assertPreciseEqual(f(2.5, 3), 6.5)
        self.check_pycache(3)  # 1 index, 2 data
        self.check_hits(f.func, 0, 2)

        f = mod.record_return_aligned
        rec = f(mod.aligned_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))

        f = mod.record_return_packed
        rec = f(mod.packed_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))
        self.check_pycache(6)  # 2 index, 4 data
        self.check_hits(f.func, 0, 2)

        # Check the code runs ok from another process
        self.run_in_separate_process()

    def test_no_caching(self):
        mod = self.import_module()

        f = mod.add_nocache_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(0)

    def test_many_locals(self):
        # Declaring many local arrays creates a very large LLVM IR, which
        # cannot be pickled due to the level of recursion it requires to
        # pickle. This test ensures that kernels with many locals (and
        # therefore large IR) can be cached.
        # See Issue #8373:
        # https://github.com/numba/numba/issues/8373
        self.check_pycache(0)
        mod = self.import_module()
        f = mod.many_locals
        f[1, 1]()
        self.check_pycache(2)  # 1 index, 1 data

    def test_closure(self):
        mod = self.import_module()

        with warnings.catch_warnings():
            warnings.simplefilter('error', NumbaWarning)

            f = mod.closure1
            self.assertPreciseEqual(f(3), 6)  # 3 + 3 = 6
            f = mod.closure2
            self.assertPreciseEqual(f(3), 8)  # 3 + 5 = 8
            f = mod.closure3
            self.assertPreciseEqual(f(3), 10)  # 3 + 7 = 10
            f = mod.closure4
            self.assertPreciseEqual(f(3), 12)  # 3 + 9 = 12
            self.check_pycache(5)  # 1 nbi, 4 nbc

    def test_cache_reuse(self):
        mod = self.import_module()
        mod.add_usecase(2, 3)
        mod.add_usecase(2.5, 3.5)
        mod.outer_uncached(2, 3)
        mod.outer(2, 3)
        mod.record_return_packed(mod.packed_arr, 0)
        mod.record_return_aligned(mod.aligned_arr, 1)
        mod.simple_usecase_caller(2)
        mtimes = self.get_cache_mtimes()
        # Two signatures compiled
        self.check_hits(mod.add_usecase.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f = mod2.add_usecase
        f(2, 3)
        self.check_hits(f.func, 1, 0)
        f(2.5, 3.5)
        self.check_hits(f.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)

    def test_cache_invalidate(self):
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        # This should change the functions' results
        with open(self.modfile, "a") as f:
            f.write("\nZ = 10\n")

        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_recompile(self):
        # Explicit call to recompile() should overwrite the cache
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        mod = self.import_module()
        f = mod.add_usecase
        mod.Z = 10
        self.assertPreciseEqual(f(2, 3), 6)
        f.func.recompile()
        self.assertPreciseEqual(f(2, 3), 15)

        # Freshly recompiled version is re-used from other imports
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_same_names(self):
        # Functions with the same name should still disambiguate
        mod = self.import_module()
        f = mod.renamed_function1
        self.assertPreciseEqual(f(2), 4)
        f = mod.renamed_function2
        self.assertPreciseEqual(f(2), 8)

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg(self):
        # Functions using cooperative groups should be cacheable. See Issue
        # #8888: https://github.com/numba/numba/issues/8888
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        mod.cg_usecase(0)
        self.check_pycache(2)  # 1 index, 1 data

        # Check the code runs ok from another process
        self.run_in_separate_process()

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg_clean_run(self):
        # See Issue #9432: https://github.com/numba/numba/issues/9432
        # If a cached function using CG sync was the first thing to compile,
        # the compile would fail.
        self.check_pycache(0)

        # This logic is modelled on run_in_separate_process(), but executes
        # the CG usecase directly in the subprocess.
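        # (The "if 1:" header lets the uniformly indented body of the string
        # remain valid top-level code when passed to "python -c".)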
code = """if 1: import sys sys.path.insert(0, %(tempdir)r) mod = __import__(%(modname)r) mod.cg_usecase(0) """ % dict(tempdir=self.tempdir, modname=self.modname) popen = subprocess.Popen([sys.executable, "-c", code], stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = popen.communicate(timeout=60) if popen.returncode != 0: raise AssertionError( "process failed with code %s: \n" "stdout follows\n%s\n" "stderr follows\n%s\n" % (popen.returncode, out.decode(), err.decode()), ) def _test_pycache_fallback(self): """ With a disabled __pycache__, test there is a working fallback (e.g. on the user-wide cache dir) """ mod = self.import_module() f = mod.add_usecase # Remove this function's cache files at the end, to avoid accumulation # across test calls. self.addCleanup(shutil.rmtree, f.func.stats.cache_path, ignore_errors=True) self.assertPreciseEqual(f(2, 3), 6) # It's a cache miss since the file was copied to a new temp location self.check_hits(f.func, 0, 1) # Test re-use mod2 = self.import_module() f = mod2.add_usecase self.assertPreciseEqual(f(2, 3), 6) self.check_hits(f.func, 1, 0) # The __pycache__ is empty (otherwise the test's preconditions # wouldn't be met) self.check_pycache(0) @skip_bad_access @unittest.skipIf(os.name == "nt", "cannot easily make a directory read-only on Windows") def test_non_creatable_pycache(self): # Make it impossible to create the __pycache__ directory old_perms = os.stat(self.tempdir).st_mode os.chmod(self.tempdir, 0o500) self.addCleanup(os.chmod, self.tempdir, old_perms) self._test_pycache_fallback() @skip_bad_access @unittest.skipIf(os.name == "nt", "cannot easily make a directory read-only on Windows") def test_non_writable_pycache(self): # Make it impossible to write to the __pycache__ directory pycache = os.path.join(self.tempdir, '__pycache__') os.mkdir(pycache) old_perms = os.stat(pycache).st_mode os.chmod(pycache, 0o500) self.addCleanup(os.chmod, pycache, old_perms) self._test_pycache_fallback() def test_cannot_cache_linking_libraries(self): link = str(test_data_dir / 'jitlink.ptx') msg = 'Cannot pickle CUDACodeLibrary with linking files' with self.assertRaisesRegex(RuntimeError, msg): @cuda.jit('void()', cache=True, link=[link]) def f(): pass @skip_on_cudasim('Simulator does not implement caching') class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest): here = os.path.dirname(__file__) usecases_file = os.path.join(here, "cache_with_cpu_usecases.py") modname = "cuda_and_cpu_caching_test_fodder" def setUp(self): DispatcherCacheUsecasesTest.setUp(self) CUDATestCase.setUp(self) def tearDown(self): CUDATestCase.tearDown(self) DispatcherCacheUsecasesTest.tearDown(self) def test_cpu_and_cuda_targets(self): # The same function jitted for CPU and CUDA targets should maintain # separate caches for each target. self.check_pycache(0) mod = self.import_module() self.check_pycache(0) f_cpu = mod.assign_cpu f_cuda = mod.assign_cuda self.assertPreciseEqual(f_cpu(5), 5) self.check_pycache(2) # 1 index, 1 data self.assertPreciseEqual(f_cuda(5), 5) self.check_pycache(3) # 1 index, 2 data self.check_hits(f_cpu.func, 0, 1) self.check_hits(f_cuda.func, 0, 1) self.assertPreciseEqual(f_cpu(5.5), 5.5) self.check_pycache(4) # 1 index, 3 data self.assertPreciseEqual(f_cuda(5.5), 5.5) self.check_pycache(5) # 1 index, 4 data self.check_hits(f_cpu.func, 0, 2) self.check_hits(f_cuda.func, 0, 2) def test_cpu_and_cuda_reuse(self): # Existing cache files for the CPU and CUDA targets are reused. 
        mod = self.import_module()
        mod.assign_cpu(5)
        mod.assign_cpu(5.5)
        mod.assign_cuda(5)
        mod.assign_cuda(5.5)

        mtimes = self.get_cache_mtimes()

        # Two signatures compiled
        self.check_hits(mod.assign_cpu.func, 0, 2)
        self.check_hits(mod.assign_cuda.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f_cpu = mod2.assign_cpu
        f_cuda = mod2.assign_cuda

        f_cpu(2)
        self.check_hits(f_cpu.func, 1, 0)
        f_cpu(2.5)
        self.check_hits(f_cpu.func, 2, 0)
        f_cuda(2)
        self.check_hits(f_cuda.func, 1, 0)
        f_cuda(2.5)
        self.check_hits(f_cuda.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)


def get_different_cc_gpus():
    # Find two GPUs with different Compute Capabilities and return them as a
    # tuple. If two GPUs with distinct Compute Capabilities cannot be found,
    # then None is returned.
    first_gpu = cuda.gpus[0]
    with first_gpu:
        first_cc = cuda.current_context().device.compute_capability

    for gpu in cuda.gpus[1:]:
        with gpu:
            cc = cuda.current_context().device.compute_capability
            if cc != first_cc:
                return (first_gpu, gpu)

    return None


@skip_on_cudasim('Simulator does not implement caching')
class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_multi_cc_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cache(self):
        gpus = get_different_cc_gpus()
        if not gpus:
            self.skipTest('Need two different CCs for multi-CC cache test')

        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        # Step 1. Populate the cache with the first GPU
        with gpus[0]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(2)  # 1 index, 1 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(3)  # 1 index, 2 data
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # 2 index, 4 data
            self.check_hits(f.func, 0, 2)

        # Step 2. Run with the second GPU - under present behaviour this
        # doesn't further populate the cache.
        with gpus[1]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(6)  # cache unchanged
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

        # Step 3. Run in a separate module with the second GPU - this
        # populates the cache for the second CC.
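        # (A fresh module gives dispatchers with no in-memory kernels, so
        # compilation occurs for the second CC and its result is saved.)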
        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)

        with gpus[1]:
            f = mod2.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(7)  # 2 index, 5 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(8)  # 2 index, 6 data
            self.check_hits(f.func, 0, 2)

            f = mod2.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod2.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(10)  # 2 index, 8 data
            self.check_hits(f.func, 0, 2)

        # The following steps check that we can use the NVVM IR loaded from
        # the cache to generate PTX for a different compute capability to the
        # cached cubin's CC. To check this, we create another module that
        # loads the cached version containing a cubin for GPU 1. There will
        # be no cubin for GPU 0, so when we try to use it the PTX must be
        # generated.
        mod3 = self.import_module()
        self.assertIsNot(mod, mod3)

        # Step 4. Run with GPU 1 and get a cache hit, loading the cache
        # created during Step 3.
        with gpus[1]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

        # Step 5. Run with GPU 0 using the module from Step 4, to force PTX
        # generation from cached NVVM IR.
        with gpus[0]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))


def child_initializer():
    # Disable occupancy and implicit copy warnings in processes in a
    # multiprocessing pool.
    from numba.core import config
    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
    config.CUDA_WARN_ON_IMPLICIT_COPY = 0


@skip_on_cudasim('Simulator does not implement caching')
class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):

    # Nested multiprocessing.Pool raises AssertionError:
    # "daemonic processes are not allowed to have children"
    _numba_parallel_test_ = False

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_mp_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_multiprocessing(self):
        # Check caching works from multiple processes at once (#2028)
        mod = self.import_module()
        # Calling a pure Python caller of the JIT-compiled function is
        # necessary to reproduce the issue.
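        # (simple_usecase_caller is defined at module level in
        # cache_usecases.py, so it can be pickled and sent to pool workers.)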
        f = mod.simple_usecase_caller
        n = 3
        try:
            ctx = multiprocessing.get_context('spawn')
        except AttributeError:
            ctx = multiprocessing
        pool = ctx.Pool(n, child_initializer)
        try:
            res = sum(pool.imap(f, range(n)))
        finally:
            pool.close()
        self.assertEqual(res, n * (n - 1) // 2)


@skip_on_cudasim('Simulator does not implement the CUDACodeLibrary')
class TestCUDACodeLibrary(CUDATestCase):
    # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to
    # explicitly check

    def test_cannot_serialize_unfinalized(self):
        # The CUDA codegen fails to import under the simulator, so we cannot
        # import it at the top level
        from numba.cuda.codegen import CUDACodeLibrary
        # Usually a CodeLibrary requires a real CodeGen, but since we don't
        # interact with it, anything will do
        codegen = object()
        name = 'library'
        cl = CUDACodeLibrary(codegen, name)
        with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'):
            cl._reduce_states()
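# Entry point, assumed here following the usual convention of the other test
# modules in this suite, so the file can be run directly.
if __name__ == '__main__':
    unittest.main()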