# numba/cuda/tests/cudapy/test_caching.py

import multiprocessing
import os
import shutil
import subprocess
import sys
import unittest
import warnings

from numba import cuda
from numba.core.errors import NumbaWarning
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
                                skip_unless_cc_60, skip_if_cudadevrt_missing,
                                skip_if_mvc_enabled, test_data_dir)
from numba.tests.support import SerialMixin
from numba.tests.test_caching import (DispatcherCacheUsecasesTest,
                                      skip_bad_access)


@skip_on_cudasim('Simulator does not implement caching')
class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_caching_test_fodder"
    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_caching(self):
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)
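
        # The on-disk cache consists of one index file per function plus one
        # data file per compiled signature (the .nbi and .nbc files in
        # __pycache__); the counts in the comments below follow that
        # breakdown.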
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(2)  # 1 index, 1 data
        self.assertPreciseEqual(f(2.5, 3), 6.5)
        self.check_pycache(3)  # 1 index, 2 data
        self.check_hits(f.func, 0, 2)

        f = mod.record_return_aligned
        rec = f(mod.aligned_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))

        f = mod.record_return_packed
        rec = f(mod.packed_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))
        self.check_pycache(6)  # 2 index, 4 data
        self.check_hits(f.func, 0, 2)

        # Check the code runs ok from another process
        self.run_in_separate_process()

    def test_no_caching(self):
        mod = self.import_module()

        f = mod.add_nocache_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(0)

    def test_many_locals(self):
        # Declaring many local arrays creates a very large LLVM IR, which
        # cannot be pickled due to the level of recursion it requires to
        # pickle. This test ensures that kernels with many locals (and
        # therefore large IR) can be cached. See Issue #8373:
        # https://github.com/numba/numba/issues/8373
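        # (For illustration only: the usecase kernel is assumed to look
        # something like
        #
        #     @cuda.jit(cache=True)
        #     def many_locals():
        #         aa = cuda.local.array((1, 1), float64)
        #         ab = cuda.local.array((1, 1), float64)
        #         ...  # repeated for many more locals
        #
        # so that the generated IR is large enough to trigger the issue.)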
        self.check_pycache(0)
        mod = self.import_module()
        f = mod.many_locals
        f[1, 1]()
        self.check_pycache(2)  # 1 index, 1 data

    def test_closure(self):
        mod = self.import_module()

        with warnings.catch_warnings():
            warnings.simplefilter('error', NumbaWarning)

            f = mod.closure1
            self.assertPreciseEqual(f(3), 6)  # 3 + 3 = 6
            f = mod.closure2
            self.assertPreciseEqual(f(3), 8)  # 3 + 5 = 8
            f = mod.closure3
            self.assertPreciseEqual(f(3), 10)  # 3 + 7 = 10
            f = mod.closure4
            self.assertPreciseEqual(f(3), 12)  # 3 + 9 = 12
            self.check_pycache(5)  # 1 nbi, 4 nbc

    def test_cache_reuse(self):
        mod = self.import_module()
        mod.add_usecase(2, 3)
        mod.add_usecase(2.5, 3.5)
        mod.outer_uncached(2, 3)
        mod.outer(2, 3)
        mod.record_return_packed(mod.packed_arr, 0)
        mod.record_return_aligned(mod.aligned_arr, 1)
        mod.simple_usecase_caller(2)
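
        # Snapshot the cache files' modification times: if later runs really
        # hit the cache, no file is rewritten and these mtimes must stay
        # identical.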
        mtimes = self.get_cache_mtimes()
        # Two signatures compiled
        self.check_hits(mod.add_usecase.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f = mod2.add_usecase
        f(2, 3)
        self.check_hits(f.func, 1, 0)
        f(2.5, 3.5)
        self.check_hits(f.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)

    def test_cache_invalidate(self):
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        # This should change the functions' results
        with open(self.modfile, "a") as f:
            f.write("\nZ = 10\n")
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_recompile(self):
        # Explicit call to recompile() should overwrite the cache
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        mod = self.import_module()
        f = mod.add_usecase
        mod.Z = 10
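        # The global has changed in memory, but the cached version was
        # compiled against the old value, so the result stays at 6 until
        # recompile() overwrites the cache entry.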
        self.assertPreciseEqual(f(2, 3), 6)
        f.func.recompile()
        self.assertPreciseEqual(f(2, 3), 15)

        # Freshly recompiled version is re-used from other imports
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_same_names(self):
        # Functions with the same name should still be disambiguated in
        # the cache
        mod = self.import_module()
        f = mod.renamed_function1
        self.assertPreciseEqual(f(2), 4)
        f = mod.renamed_function2
        self.assertPreciseEqual(f(2), 8)

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg(self):
        # Functions using cooperative groups should be cacheable. See Issue
        # #8888: https://github.com/numba/numba/issues/8888
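        # (cg_usecase is assumed to perform a grid-wide sync, e.g. via
        # cuda.cg.this_grid().sync(), which is why cooperative launch
        # support - CC 6.0+ and cudadevrt - is required here.)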
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        mod.cg_usecase(0)
        self.check_pycache(2)  # 1 index, 1 data

        # Check the code runs ok from another process
        self.run_in_separate_process()

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg_clean_run(self):
        # See Issue #9432: https://github.com/numba/numba/issues/9432
        # If a cached function using CG sync was the first thing to compile,
        # the compile would fail.
        self.check_pycache(0)

        # This logic is modelled on run_in_separate_process(), but executes
        # the CG usecase directly in the subprocess.
        code = """if 1:
            import sys

            sys.path.insert(0, %(tempdir)r)
            mod = __import__(%(modname)r)
            mod.cg_usecase(0)
            """ % dict(tempdir=self.tempdir, modname=self.modname)

        popen = subprocess.Popen([sys.executable, "-c", code],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        out, err = popen.communicate(timeout=60)
        if popen.returncode != 0:
            raise AssertionError(
                "process failed with code %s: \n"
                "stdout follows\n%s\n"
                "stderr follows\n%s\n"
                % (popen.returncode, out.decode(), err.decode()),
            )

    def _test_pycache_fallback(self):
        """
        With a disabled __pycache__, test there is a working fallback
        (e.g. on the user-wide cache dir)
        """
        mod = self.import_module()
        f = mod.add_usecase
        # Remove this function's cache files at the end, to avoid accumulation
        # across test calls.
        self.addCleanup(shutil.rmtree, f.func.stats.cache_path,
                        ignore_errors=True)

        self.assertPreciseEqual(f(2, 3), 6)
        # It's a cache miss since the file was copied to a new temp location
        self.check_hits(f.func, 0, 1)

        # Test re-use
        mod2 = self.import_module()
        f = mod2.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_hits(f.func, 1, 0)

        # The __pycache__ is empty (otherwise the test's preconditions
        # wouldn't be met)
        self.check_pycache(0)

    @skip_bad_access
    @unittest.skipIf(os.name == "nt",
                     "cannot easily make a directory read-only on Windows")
    def test_non_creatable_pycache(self):
        # Make it impossible to create the __pycache__ directory
        old_perms = os.stat(self.tempdir).st_mode
        os.chmod(self.tempdir, 0o500)
        self.addCleanup(os.chmod, self.tempdir, old_perms)

        self._test_pycache_fallback()

    @skip_bad_access
    @unittest.skipIf(os.name == "nt",
                     "cannot easily make a directory read-only on Windows")
    def test_non_writable_pycache(self):
        # Make it impossible to write to the __pycache__ directory
        pycache = os.path.join(self.tempdir, '__pycache__')
        os.mkdir(pycache)
        old_perms = os.stat(pycache).st_mode
        os.chmod(pycache, 0o500)
        self.addCleanup(os.chmod, pycache, old_perms)

        self._test_pycache_fallback()

    def test_cannot_cache_linking_libraries(self):
        link = str(test_data_dir / 'jitlink.ptx')
        msg = 'Cannot pickle CUDACodeLibrary with linking files'
        with self.assertRaisesRegex(RuntimeError, msg):
            @cuda.jit('void()', cache=True, link=[link])
            def f():
                pass


@skip_on_cudasim('Simulator does not implement caching')
class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_with_cpu_usecases.py")
    modname = "cuda_and_cpu_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cpu_and_cuda_targets(self):
        # The same function jitted for CPU and CUDA targets should maintain
        # separate caches for each target.
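        # (assign_cpu and assign_cuda are assumed to wrap the same underlying
        # Python function, jitted once per target; this is why the counts
        # below show a single shared index file while each target's compiled
        # signature contributes its own data file.)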
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)
        f_cpu = mod.assign_cpu
        f_cuda = mod.assign_cuda

        self.assertPreciseEqual(f_cpu(5), 5)
        self.check_pycache(2)  # 1 index, 1 data
        self.assertPreciseEqual(f_cuda(5), 5)
        self.check_pycache(3)  # 1 index, 2 data
        self.check_hits(f_cpu.func, 0, 1)
        self.check_hits(f_cuda.func, 0, 1)

        self.assertPreciseEqual(f_cpu(5.5), 5.5)
        self.check_pycache(4)  # 1 index, 3 data
        self.assertPreciseEqual(f_cuda(5.5), 5.5)
        self.check_pycache(5)  # 1 index, 4 data
        self.check_hits(f_cpu.func, 0, 2)
        self.check_hits(f_cuda.func, 0, 2)

    def test_cpu_and_cuda_reuse(self):
        # Existing cache files for the CPU and CUDA targets are reused.
        mod = self.import_module()
        mod.assign_cpu(5)
        mod.assign_cpu(5.5)
        mod.assign_cuda(5)
        mod.assign_cuda(5.5)

        mtimes = self.get_cache_mtimes()

        # Two signatures compiled
        self.check_hits(mod.assign_cpu.func, 0, 2)
        self.check_hits(mod.assign_cuda.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f_cpu = mod2.assign_cpu
        f_cuda = mod2.assign_cuda

        f_cpu(2)
        self.check_hits(f_cpu.func, 1, 0)
        f_cpu(2.5)
        self.check_hits(f_cpu.func, 2, 0)
        f_cuda(2)
        self.check_hits(f_cuda.func, 1, 0)
        f_cuda(2.5)
        self.check_hits(f_cuda.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)


def get_different_cc_gpus():
    # Find two GPUs with different Compute Capabilities and return them as a
    # tuple. If two GPUs with distinct Compute Capabilities cannot be found,
    # then None is returned.
    first_gpu = cuda.gpus[0]
    with first_gpu:
        first_cc = cuda.current_context().device.compute_capability

    for gpu in cuda.gpus[1:]:
        with gpu:
            cc = cuda.current_context().device.compute_capability
            if cc != first_cc:
                return (first_gpu, gpu)

    return None
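

# The GPU objects returned by get_different_cc_gpus() are context managers:
# TestMultiCCCaching below enters one with `with gpus[i]:` so that kernels
# compile for (and run on) that device's compute capability.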


@skip_on_cudasim('Simulator does not implement caching')
class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest):
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_multi_cc_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cache(self):
        gpus = get_different_cc_gpus()
        if not gpus:
            self.skipTest('Need two different CCs for multi-CC cache test')

        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        # Step 1. Populate the cache with the first GPU
        with gpus[0]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(2)  # 1 index, 1 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(3)  # 1 index, 2 data
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # 2 index, 4 data
            self.check_hits(f.func, 0, 2)

        # Step 2. Run with the second GPU - under present behaviour this
        # doesn't further populate the cache.
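        # (This is presumably because mod's dispatchers already hold
        # in-memory compilation results for these signatures, so no new
        # cache files are written; the unchanged counts below verify this.)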
        with gpus[1]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(6)  # cache unchanged
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

        # Step 3. Run in a separate module with the second GPU - this
        # populates the cache for the second CC.
        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)

        with gpus[1]:
            f = mod2.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(7)  # 2 index, 5 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(8)  # 2 index, 6 data
            self.check_hits(f.func, 0, 2)

            f = mod2.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod2.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(10)  # 2 index, 8 data
            self.check_hits(f.func, 0, 2)

        # The following steps check that we can use the NVVM IR loaded from
        # the cache to generate PTX for a different compute capability to the
        # cached cubin's CC. To check this, we create another module that
        # loads the cached version containing a cubin for GPU 1. There will
        # be no cubin for GPU 0, so when we try to use it the PTX must be
        # generated.
        mod3 = self.import_module()
        self.assertIsNot(mod, mod3)

        # Step 4. Run with GPU 1 and get a cache hit, loading the cache
        # created during Step 3.
        with gpus[1]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

        # Step 5. Run with GPU 0 using the module from Step 4, to force PTX
        # generation from cached NVVM IR.
        with gpus[0]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))


def child_initializer():
    # Disable occupancy and implicit copy warnings in processes in a
    # multiprocessing pool.
    from numba.core import config
    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
    config.CUDA_WARN_ON_IMPLICIT_COPY = 0
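

# A Pool initializer runs once at the start of each worker process, so the
# config flags above are set in every child; setting them in the parent
# alone would not affect workers started with the 'spawn' method.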


@skip_on_cudasim('Simulator does not implement caching')
class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):

    # Nested multiprocessing.Pool raises AssertionError:
    # "daemonic processes are not allowed to have children"
    _numba_parallel_test_ = False

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_mp_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_multiprocessing(self):
        # Check caching works from multiple processes at once (#2028)
        mod = self.import_module()
        # Calling a pure Python caller of the JIT-compiled function is
        # necessary to reproduce the issue.
        f = mod.simple_usecase_caller
        n = 3
        try:
            ctx = multiprocessing.get_context('spawn')
        except AttributeError:
            ctx = multiprocessing
        pool = ctx.Pool(n, child_initializer)
        try:
            res = sum(pool.imap(f, range(n)))
        finally:
            pool.close()
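        # (Assuming simple_usecase_caller(i) returns i, the results sum to
        # the arithmetic series 0 + 1 + ... + (n - 1) == n * (n - 1) // 2,
        # i.e. 3 for n == 3.)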
        self.assertEqual(res, n * (n - 1) // 2)


@skip_on_cudasim('Simulator does not implement the CUDACodeLibrary')
class TestCUDACodeLibrary(CUDATestCase):
    # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to
    # explicitly check

    def test_cannot_serialize_unfinalized(self):
        # The CUDA codegen fails to import under the simulator, so we cannot
        # import it at the top level
        from numba.cuda.codegen import CUDACodeLibrary
        # Usually a CodeLibrary requires a real CodeGen, but since we don't
        # interact with it, anything will do
        codegen = object()
        name = 'library'
        cl = CUDACodeLibrary(codegen, name)
        with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'):
            cl._reduce_states()