# numba/cuda/tests/cudapy/test_caching.py
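"""Tests for on-disk caching of CUDA dispatchers.

These tests cover cache population, reuse, and invalidation, plus fallback
cache locations, mixed CPU/CUDA targets, multiple compute capabilities, and
use of the cache from multiple processes.
"""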

import multiprocessing
import os
import shutil
import subprocess
import sys
import unittest
import warnings
from numba import cuda
from numba.core.errors import NumbaWarning
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
                                skip_unless_cc_60, skip_if_cudadevrt_missing,
                                skip_if_mvc_enabled, test_data_dir)
from numba.tests.support import SerialMixin
from numba.tests.test_caching import (DispatcherCacheUsecasesTest,
                                      skip_bad_access)


@skip_on_cudasim('Simulator does not implement caching')
class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_caching(self):
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(2)  # 1 index, 1 data
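        # Note: for each cached function, Numba writes one index file (.nbi)
        # plus one data file (.nbc) per compiled signature into __pycache__;
        # the "index"/"data" counts in this test refer to those files.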
        self.assertPreciseEqual(f(2.5, 3), 6.5)
        self.check_pycache(3)  # 1 index, 2 data
        self.check_hits(f.func, 0, 2)

        f = mod.record_return_aligned
        rec = f(mod.aligned_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))

        f = mod.record_return_packed
        rec = f(mod.packed_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))
        self.check_pycache(6)  # 2 index, 4 data
        self.check_hits(f.func, 0, 2)

        # Check the code runs ok from another process
        self.run_in_separate_process()

    def test_no_caching(self):
        mod = self.import_module()

        f = mod.add_nocache_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(0)

    def test_many_locals(self):
        # Declaring many local arrays creates a very large LLVM IR, which
        # cannot be pickled due to the level of recursion required to
        # pickle it. This test ensures that kernels with many locals (and
        # therefore large IR) can be cached. See Issue #8373:
        # https://github.com/numba/numba/issues/8373
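        # For reference, the usecase is assumed to resemble a kernel that
        # declares a large number of local arrays (hypothetical sketch of
        # the cache_usecases.py contents):
        #
        #     @cuda.jit(cache=True)
        #     def many_locals():
        #         a0 = cuda.local.array((1, 1), numba.float64)
        #         a1 = cuda.local.array((1, 1), numba.float64)
        #         # ... dozens more declarations ...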
        self.check_pycache(0)
        mod = self.import_module()
        f = mod.many_locals
        f[1, 1]()
        self.check_pycache(2)  # 1 index, 1 data

    def test_closure(self):
        mod = self.import_module()

        with warnings.catch_warnings():
            warnings.simplefilter('error', NumbaWarning)

            f = mod.closure1
            self.assertPreciseEqual(f(3), 6)  # 3 + 3 = 6
            f = mod.closure2
            self.assertPreciseEqual(f(3), 8)  # 3 + 5 = 8
            f = mod.closure3
            self.assertPreciseEqual(f(3), 10)  # 3 + 7 = 10
            f = mod.closure4
            self.assertPreciseEqual(f(3), 12)  # 3 + 9 = 12
            self.check_pycache(5)  # 1 nbi, 4 nbc

    def test_cache_reuse(self):
        mod = self.import_module()
        mod.add_usecase(2, 3)
        mod.add_usecase(2.5, 3.5)
        mod.outer_uncached(2, 3)
        mod.outer(2, 3)
        mod.record_return_packed(mod.packed_arr, 0)
        mod.record_return_aligned(mod.aligned_arr, 1)
        mod.simple_usecase_caller(2)
        mtimes = self.get_cache_mtimes()
        # Two signatures compiled
        self.check_hits(mod.add_usecase.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f = mod2.add_usecase
        f(2, 3)
        self.check_hits(f.func, 1, 0)
        f(2.5, 3.5)
        self.check_hits(f.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)

    def test_cache_invalidate(self):
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        # This should change the functions' results
        with open(self.modfile, "a") as f:
            f.write("\nZ = 10\n")
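        # Appending to the source file updates its modification stamp, which
        # is expected to invalidate the on-disk cache, so the re-import below
        # recompiles add_usecase against the new value of Z.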
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_recompile(self):
        # Explicit call to recompile() should overwrite the cache
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        mod = self.import_module()
        f = mod.add_usecase
        mod.Z = 10
        self.assertPreciseEqual(f(2, 3), 6)
        f.func.recompile()
        self.assertPreciseEqual(f(2, 3), 15)

        # Freshly recompiled version is re-used from other imports
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_same_names(self):
        # Functions with the same name should still be disambiguated
        mod = self.import_module()
        f = mod.renamed_function1
        self.assertPreciseEqual(f(2), 4)
        f = mod.renamed_function2
        self.assertPreciseEqual(f(2), 8)

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg(self):
        # Functions using cooperative groups should be cacheable. See Issue
        # #8888: https://github.com/numba/numba/issues/8888
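        # The usecase is assumed to wrap a kernel that performs a grid sync
        # (hypothetical sketch):
        #
        #     @cuda.jit(cache=True)
        #     def cg_usecase_kernel(r, x):
        #         grid = cuda.cg.this_grid()
        #         grid.sync()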
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        mod.cg_usecase(0)
        self.check_pycache(2)  # 1 index, 1 data

        # Check the code runs ok from another process
        self.run_in_separate_process()

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg_clean_run(self):
        # See Issue #9432: https://github.com/numba/numba/issues/9432
        # If a cached function using CG sync was the first thing to compile,
        # the compile would fail.
        self.check_pycache(0)

        # This logic is modelled on run_in_separate_process(), but executes
        # the CG usecase directly in the subprocess.
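        # The "if 1:" header allows the indented block inside the string to
        # compile as a standalone script when passed to `python -c`.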
        code = """if 1:
            import sys
            sys.path.insert(0, %(tempdir)r)
            mod = __import__(%(modname)r)
            mod.cg_usecase(0)
            """ % dict(tempdir=self.tempdir, modname=self.modname)

        popen = subprocess.Popen([sys.executable, "-c", code],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        out, err = popen.communicate(timeout=60)
        if popen.returncode != 0:
            raise AssertionError(
                "process failed with code %s: \n"
                "stdout follows\n%s\n"
                "stderr follows\n%s\n"
                % (popen.returncode, out.decode(), err.decode()),
            )

    def _test_pycache_fallback(self):
        """
        With a disabled __pycache__, test there is a working fallback
        (e.g. on the user-wide cache dir)
        """
        mod = self.import_module()
        f = mod.add_usecase
        # Remove this function's cache files at the end, to avoid accumulation
        # across test calls.
        self.addCleanup(shutil.rmtree, f.func.stats.cache_path,
                        ignore_errors=True)
        self.assertPreciseEqual(f(2, 3), 6)
        # It's a cache miss since the file was copied to a new temp location
        self.check_hits(f.func, 0, 1)

        # Test re-use
        mod2 = self.import_module()
        f = mod2.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_hits(f.func, 1, 0)

        # The __pycache__ is empty (otherwise the test's preconditions
        # wouldn't be met)
        self.check_pycache(0)

    @skip_bad_access
    @unittest.skipIf(os.name == "nt",
                     "cannot easily make a directory read-only on Windows")
    def test_non_creatable_pycache(self):
        # Make it impossible to create the __pycache__ directory
        old_perms = os.stat(self.tempdir).st_mode
        os.chmod(self.tempdir, 0o500)
        self.addCleanup(os.chmod, self.tempdir, old_perms)

        self._test_pycache_fallback()

    @skip_bad_access
    @unittest.skipIf(os.name == "nt",
                     "cannot easily make a directory read-only on Windows")
    def test_non_writable_pycache(self):
        # Make it impossible to write to the __pycache__ directory
        pycache = os.path.join(self.tempdir, '__pycache__')
        os.mkdir(pycache)

        old_perms = os.stat(pycache).st_mode
        os.chmod(pycache, 0o500)
        self.addCleanup(os.chmod, pycache, old_perms)

        self._test_pycache_fallback()

    def test_cannot_cache_linking_libraries(self):
        link = str(test_data_dir / 'jitlink.ptx')
        msg = 'Cannot pickle CUDACodeLibrary with linking files'
        with self.assertRaisesRegex(RuntimeError, msg):
            @cuda.jit('void()', cache=True, link=[link])
            def f():
                pass


@skip_on_cudasim('Simulator does not implement caching')
class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_with_cpu_usecases.py")
    modname = "cuda_and_cpu_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cpu_and_cuda_targets(self):
        # The same function jitted for CPU and CUDA targets should maintain
        # separate caches for each target.
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        f_cpu = mod.assign_cpu
        f_cuda = mod.assign_cuda
        self.assertPreciseEqual(f_cpu(5), 5)
        self.check_pycache(2)  # 1 index, 1 data
        self.assertPreciseEqual(f_cuda(5), 5)
        self.check_pycache(3)  # 1 index, 2 data

        self.check_hits(f_cpu.func, 0, 1)
        self.check_hits(f_cuda.func, 0, 1)

        self.assertPreciseEqual(f_cpu(5.5), 5.5)
        self.check_pycache(4)  # 1 index, 3 data
        self.assertPreciseEqual(f_cuda(5.5), 5.5)
        self.check_pycache(5)  # 1 index, 4 data

        self.check_hits(f_cpu.func, 0, 2)
        self.check_hits(f_cuda.func, 0, 2)

    def test_cpu_and_cuda_reuse(self):
        # Existing cache files for the CPU and CUDA targets are reused.
        mod = self.import_module()
        mod.assign_cpu(5)
        mod.assign_cpu(5.5)
        mod.assign_cuda(5)
        mod.assign_cuda(5.5)

        mtimes = self.get_cache_mtimes()

        # Two signatures compiled
        self.check_hits(mod.assign_cpu.func, 0, 2)
        self.check_hits(mod.assign_cuda.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f_cpu = mod2.assign_cpu
        f_cuda = mod2.assign_cuda

        f_cpu(2)
        self.check_hits(f_cpu.func, 1, 0)
        f_cpu(2.5)
        self.check_hits(f_cpu.func, 2, 0)

        f_cuda(2)
        self.check_hits(f_cuda.func, 1, 0)
        f_cuda(2.5)
        self.check_hits(f_cuda.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)


def get_different_cc_gpus():
    # Find two GPUs with different Compute Capabilities and return them as a
    # tuple. If two GPUs with distinct Compute Capabilities cannot be found,
    # then None is returned.
    first_gpu = cuda.gpus[0]
    with first_gpu:
        first_cc = cuda.current_context().device.compute_capability

    for gpu in cuda.gpus[1:]:
        with gpu:
            cc = cuda.current_context().device.compute_capability
            if cc != first_cc:
                return (first_gpu, gpu)

    return None


@skip_on_cudasim('Simulator does not implement caching')
class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_multi_cc_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cache(self):
        gpus = get_different_cc_gpus()
        if not gpus:
            self.skipTest('Need two different CCs for multi-CC cache test')

        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        # Step 1. Populate the cache with the first GPU
        with gpus[0]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(2)  # 1 index, 1 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(3)  # 1 index, 2 data
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # 2 index, 4 data
            self.check_hits(f.func, 0, 2)

        # Step 2. Run with the second GPU - under present behaviour this
        # doesn't further populate the cache.
        with gpus[1]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(6)  # cache unchanged
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

        # Step 3. Run in a separate module with the second GPU - this
        # populates the cache for the second CC.
        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)

        with gpus[1]:
            f = mod2.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(7)  # 2 index, 5 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(8)  # 2 index, 6 data
            self.check_hits(f.func, 0, 2)

            f = mod2.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod2.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(10)  # 2 index, 8 data
            self.check_hits(f.func, 0, 2)

        # The following steps check that we can use the NVVM IR loaded from
        # the cache to generate PTX for a different compute capability to the
        # cached cubin's CC. To check this, we create another module that
        # loads the cached version containing a cubin for GPU 1. There will
        # be no cubin for GPU 0, so when we try to use it the PTX must be
        # generated.
        mod3 = self.import_module()
        self.assertIsNot(mod, mod3)

        # Step 4. Run with GPU 1 and get a cache hit, loading the cache
        # created during Step 3.
        with gpus[1]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

        # Step 5. Run with GPU 0 using the module from Step 4, to force PTX
        # generation from cached NVVM IR.
        with gpus[0]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))


def child_initializer():
    # Disable occupancy and implicit copy warnings in processes in a
    # multiprocessing pool.
    from numba.core import config
    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
    config.CUDA_WARN_ON_IMPLICIT_COPY = 0


@skip_on_cudasim('Simulator does not implement caching')
class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):
    # Nested multiprocessing.Pool raises AssertionError:
    # "daemonic processes are not allowed to have children"
    _numba_parallel_test_ = False

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_mp_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_multiprocessing(self):
        # Check caching works from multiple processes at once (#2028)
        mod = self.import_module()
        # Calling a pure Python caller of the JIT-compiled function is
        # necessary to reproduce the issue.
        f = mod.simple_usecase_caller
        n = 3
        try:
            ctx = multiprocessing.get_context('spawn')
        except AttributeError:
            ctx = multiprocessing
        pool = ctx.Pool(n, child_initializer)
        try:
            res = sum(pool.imap(f, range(n)))
        finally:
            pool.close()
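        # simple_usecase_caller(i) is expected to return i, so the pooled
        # results should sum to 0 + 1 + ... + (n - 1) = n * (n - 1) // 2.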
        self.assertEqual(res, n * (n - 1) // 2)


@skip_on_cudasim('Simulator does not implement the CUDACodeLibrary')
class TestCUDACodeLibrary(CUDATestCase):
    # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to
    # check explicitly.

    def test_cannot_serialize_unfinalized(self):
        # The CUDA codegen fails to import under the simulator, so we cannot
        # import it at the top level
        from numba.cuda.codegen import CUDACodeLibrary

        # Usually a CodeLibrary requires a real CodeGen, but since we don't
        # interact with it, anything will do
        codegen = object()
        name = 'library'
        cl = CUDACodeLibrary(codegen, name)

        with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'):
            cl._reduce_states()