import os  # noqa: C101
import sys
from typing import Any, Callable, Dict, Optional, TYPE_CHECKING

import torch


def is_fbcode():
    return not hasattr(torch.version, "git_version")


# add some debug printouts
debug = False

# add inf and NaN checkers
debug_check_inf_and_nan = False

# Whether to disable a progress bar for autotuning
disable_progress = True

# Whether to enable printing the source code for each future
verbose_progress = False

# use fx aot graph codegen cache
fx_graph_cache = os.environ.get("TORCHINDUCTOR_FX_GRAPH_CACHE") == "1"

# use cpp wrapper instead of python wrapper
cpp_wrapper = os.environ.get("TORCHINDUCTOR_CPP_WRAPPER", "0") == "1"

# codegen cpp wrapper code in an ABI compatible mode
abi_compatible = (
    os.environ.get("TORCHINDUCTOR_ABI_COMPATIBLE", "1" if is_fbcode() else "0") == "1"
)

c_shim_version = os.environ.get(
    "TORCHINDUCTOR_C_SHIM_VERSION", "1" if is_fbcode() else "2"
)

# dead code elimination
dce = False

# assume weight tensors are fixed size
static_weight_shapes = True

# put correctness assertions in generated code
size_asserts = os.environ.get("TORCHINDUCTOR_SIZE_ASSERTS", "1") == "1"
nan_asserts = os.environ.get("TORCHINDUCTOR_NAN_ASSERTS") == "1"

# enable loop reordering based on input orders
pick_loop_orders = True

# reuse a kernel input as the output
inplace_buffers = True

# reuse a buffer for an unrelated purpose
allow_buffer_reuse = True

# Enable pooled allocations for non-output tensors
memory_planning = os.environ.get("TORCHINDUCTOR_MEMORY_PLANNING", "0") == "1"

# How to organize memory under memory_planning=True:
# - "none": do not try to pool storage, just reuse
# - "intermediates": all non-outputs share storage, outputs each get unique storage
# - "outputs": two pools, one for intermediates (freed on return) and one for outputs
# - "combined": a single pool for both intermediates and outputs
memory_pool = os.environ.get("TORCHINDUCTOR_MEMORY_POOL", "intermediates")

# codegen benchmark harness
benchmark_harness = True

# fuse pointwise into templates
epilogue_fusion = True

# do epilogue fusions before other fusions
epilogue_fusion_first = False

# enable pattern match+replace optimizations
pattern_matcher = True

# register custom graph optimization pass hook. so far, pre/post passes are
# only applied before/after pattern_matcher in post_grad_passes.
#
# def my_custom_pre_pass(graph: torch.fx.graph.Graph):
#     # my custom graph optimization pass
#     ...
#
# def my_custom_post_pass(graph: torch.fx.graph.Graph):
#     # my custom graph optimization pass
#     ...
#
# torch._inductor.config.post_grad_custom_pre_pass = my_custom_pre_pass
# torch._inductor.config.post_grad_custom_post_pass = my_custom_post_pass
post_grad_custom_pre_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
post_grad_custom_post_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None

# Registers a custom pregrad pass. Note that the pre-grad IR is 1.
# non-functional, 2. non-normalized, and 3. prone to change. Ideally we should
# use post-grad passes.
pre_grad_custom_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None

# Optimize away split cat patterns (Experimental)
split_cat_fx_passes = True

# Optimize conv-batchnorm if batchnorm is in eval mode. Slightly reduces numerical stability.
efficient_conv_bn_eval_fx_passes = False

# Enable predispatch aten IR for export
is_predispatch = False

# Deprecated
group_fusion = False

# Deprecated
batch_fusion = True
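# A pre-grad pass (pre_grad_custom_pass above) follows the same calling
# convention as the post-grad hooks: a callable that mutates the FX graph in
# place. Illustrative sketch only; the pass below merely logs nodes and is not
# a built-in Inductor pass:
#
# def my_pre_grad_debug_pass(graph: torch.fx.graph.Graph):
#     for node in graph.nodes:
#         print(node.op, node.target)
#
# torch._inductor.config.pre_grad_custom_pass = my_pre_grad_debug_pass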
# Pre grad group/batch fusion and options in order, set to empty dict to disable fusion.
# Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions()` to see available fusions.
pre_grad_fusion_options: Dict[str, Dict[str, Any]] = {
    "batch_linear": {},
    "batch_linear_lhs": {},
    "batch_layernorm": {},
    "batch_tanh": {},
    "batch_relu": {},
    "batch_sigmoid": {},
}

# Post grad group/batch fusion and options, set to empty dict to disable fusion.
# Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions(False)` to see available fusions.
post_grad_fusion_options: Dict[str, Dict[str, Any]] = {}

# enable reordering pass for improving memory locality
reorder_for_locality = True

# Scale down RBLOCK for better occupancy
dynamic_scale_rblock = os.environ.get("TORCHINDUCTOR_DYNAMIC_SCALE_RBLOCK", "1") == "1"

# this forces fusion for int_mm with mul. Needed when you want to avoid realizing the int32
# but the mul gets fused with other pointwise ops instead.
force_fuse_int_mm_with_mul = False

# for pattern torch.mm(a, b.to(dtype)) with cuda tensors,
# enable torch._inductor.kernel.mm.tuned_mixed_mm fused kernel.
# Autotune will compare perf with normal cast->then->mm option
use_mixed_mm = False

# enable runtime numeric check for pre/post grad fx passes
# floating point provides limited accuracy (about 7 decimal digits for single precision
# floating point numbers, about 16 decimal digits for double precision floating point numbers)
# according to PyTorch documentation.
# https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations
fx_passes_numeric_check: Dict[str, Any] = {
    "pre_grad": False,
    "precision": 1e-4,
    "num_iterations": 1,
    "requires_optimizer": True,
}
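# e.g. turning the pre-grad numeric check on with the desired tolerance and
# iteration count (illustrative; whole-dict assignment shown to keep all keys
# explicit):
#
# torch._inductor.config.fx_passes_numeric_check = {
#     "pre_grad": True,
#     "precision": 1e-4,
#     "num_iterations": 1,
#     "requires_optimizer": True,
# }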
# for pattern torch.mm(a, b.to(dtype)) with cuda tensors, always use
# torch._inductor.kernel.mm.tuned_mixed_mm's fused kernel.
# Autotune will not compare with normal cast->then->mm option.
# (if force_mixed_mm is true, the use_mixed_mm flag will be ignored)
force_mixed_mm = False

# enable reordering pass for increasing overlap between compute and communication
reorder_for_compute_comm_overlap = False

# passes (in execution order) for increasing overlap between compute and communication
# for built-in passes, use string name; for user-defined passes, pass in the function handle
reorder_for_compute_comm_overlap_passes = [
    "reorder_compute_for_overlap",
    "sink_waits",
    "raise_comms",
]

# runtime estimation function for ops
# for built-in estimation function, pass in "default"; for user-defined estimation function, pass in the function handle
estimate_op_runtime = "default"

# unit: GB/s, uni-directional P2P bandwidth per card
# default value is NVLink
intra_node_bw = 300

# unit: GB/s, uni-directional P2P bandwidth per node
# default value is InfiniBand
inter_node_bw = 25

# enable slow autotuning passes to select algorithms
max_autotune = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE") == "1"

# enable slow autotuning passes to select pointwise/reductions algorithms
max_autotune_pointwise = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE") == "1"

# enable slow autotuning passes to select gemm algorithms
max_autotune_gemm = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_GEMM") == "1"

# enable autotune local cache
use_autotune_local_cache = True

# enable autotune remote cache
use_autotune_remote_cache = (
    os.environ.get("TORCH_INDUCTOR_AUTOTUNE_REMOTE_CACHE") == "1"
)

# force cublas and triton to use the same precision; cublas supports TF32 for matmul operations
# when m, n, k are multiples of 16, 16, 8, whereas triton supports TF32 for matmul operations
# for any combinations of m, n, k, regardless of their alignment. setting this flag will ensure
# that triton does not use TF32 wherever cublas would not use TF32
force_same_precision = (
    True if is_fbcode() else os.environ.get("TORCHINDUCTOR_FORCE_SAME_PRECISION") == "1"
)
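# The autotuning flags above are usually toggled indirectly rather than edited
# here. Illustrative sketch (`model` and `x` are placeholders for a user's
# module and example input):
#
# import torch
# compiled = torch.compile(model, mode="max-autotune")
# # or, equivalently for the inductor backend, via the options dict:
# compiled = torch.compile(model, options={"max_autotune": True})
# compiled(x)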
# Specify candidate backends for gemm autotune.
# Possible choices are combinations of: ATen, Triton, CUTLASS.
# ATen: default PyTorch ATen kernels.
# Triton: Triton templates defined in torch inductor.
# CUTLASS: CUTLASS templates and kernels.
max_autotune_gemm_backends = os.environ.get(
    "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS", "ATEN,TRITON"
).upper()

# the value used as a fallback for the unbacked SymInts
# that can appear in the input shapes (e.g., in autotuning)
unbacked_symint_fallback = 8192

# enable searching global and local cache regardless of `max_autotune`
search_autotune_cache = os.environ.get("TORCHINDUCTOR_SEARCH_AUTOTUNE_CACHE") == "1"

save_args = os.environ.get("TORCHINDUCTOR_SAVE_ARGS") == "1"

# We will disable creating subprocess for autotuning if this is False
autotune_in_subproc = os.environ.get("TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC") == "1"

# If autotuning in subprocess, whether to use multiple devices
autotune_multi_device = os.environ.get("TORCHINDUCTOR_AUTOTUNE_MULTI_DEVICE") == "1"

coordinate_descent_tuning = (
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_TUNING") == "1"
)
coordinate_descent_check_all_directions = (
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_CHECK_ALL_DIRECTIONS") == "1"
)
coordinate_descent_search_radius = int(
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_RADIUS", "1")
)

# Disabled by default on ROCm, opt-in if model utilises NHWC convolutions
layout_opt_default = "1" if not torch.version.hip else "0"
layout_optimization = (
    os.environ.get("TORCHINDUCTOR_LAYOUT_OPTIMIZATION", layout_opt_default) == "1"
)

force_layout_optimization = os.environ.get("TORCHINDUCTOR_FORCE_LAYOUT_OPT", "0") == "1"

# Whether to keep the output strides the same as eager after layout optimization.
keep_output_stride = os.environ.get("TORCHINDUCTOR_KEEP_OUTPUT_STRIDE", "1") == "1"

# Enabling this will let compiler print warning messages if a generated triton
# kernel has inputs with mixed layouts. This is helpful for perf debugging
# since a kernel with mixed layout inputs may run much slower than one whose inputs
# have uniform layouts.
warn_mix_layout = os.environ.get("TORCHINDUCTOR_WARN_MIX_LAYOUT") == "1"

# control store vs recompute heuristic
# For fanouts, rematerialization can lead to exponential blowup. So, have
# smaller threshold
realize_reads_threshold = 4
realize_opcount_threshold = 30

# Threshold to prevent excessive accumulation of ops in one buffer during lowering
realize_acc_reads_threshold = 8

# fallback to eager for random/dropout, this is slow but useful for debugging
fallback_random = False

# automatically create fallbacks when encountering an unhandled op
implicit_fallbacks = True

# fuse even in cases without common reads
aggressive_fusion = False
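# e.g. when bisecting an accuracy gap against eager mode, `fallback_random`
# above makes random/dropout ops use the eager kernels so that results match
# seed-for-seed. Illustrative sketch (`model` and `x` are placeholders):
#
# torch._inductor.config.fallback_random = True
# torch.manual_seed(0)
# compiled = torch.compile(model)
# compiled(x)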
# For each fused kernel in the wrapper, comment with the nodes that get fused.
# Useful for debugging fusion.
debug_fusion = os.environ.get("TORCHINDUCTOR_DEBUG_FUSION") == "1"
benchmark_fusion = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1"
enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "")

# how many nodes to allow into a single fusion
max_fusion_size = 64

# max number of inputs to generate cat as a pointwise op with masked loads
max_pointwise_cat_inputs = 8

# replace small reductions with pointwise, disable with `= 1`
unroll_reductions_threshold = 8

# Add extra comments to output code (causes compile cache misses)
comment_origin = False

# Convert 1x1 convs into matmuls
conv_1x1_as_mm = False

# Enable split reductions for better utilization when the dimension
# being reduced over is large (by splitting it)
split_reductions = True

benchmark_kernel = os.environ.get("TORCHINDUCTOR_BENCHMARK_KERNEL", "0") == "1"

# Enable constant and index_expr folding
constant_and_index_propagation = True

# we always add constants into graph.constants without
# performing any constant-inlining optimization
always_keep_tensor_constants = False

# assert that indirect indexing does not read / write out of bounds
assert_indirect_indexing = True

# constant folding on the joint graph
joint_graph_constant_folding = True

# Enable indirect_indexing asserts for decompositions and lowerings
debug_index_asserts = False

# warnings intended for PyTorch developers, disable for point releases
is_nightly_or_source = "dev" in torch.__version__ or "git" in torch.__version__
developer_warnings = is_fbcode() or is_nightly_or_source

# The multiprocessing start method to use for inductor workers in the codecache.
# TODO: fork is not safe in a multithreaded environment, we should evaluate changing
# the default to spawn.
worker_start_method = "fork"
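# `decide_compile_threads` below picks the default worker count. A common
# override when stepping through compilation with pdb is to force synchronous
# compilation. Illustrative shell usage (`my_script.py` is a placeholder):
#
#   TORCHINDUCTOR_COMPILE_THREADS=1 python my_script.py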
def decide_compile_threads():
    """
    Here is the precedence used to decide compile_threads:
    1. User can override it by TORCHINDUCTOR_COMPILE_THREADS. One may want to disable
       async compiling by setting this to 1 to make pdb happy.
    2. Set to 1 if it's a win32 platform or an fbcode build.
    3. Decide by the number of CPU cores.
    """
    if "TORCHINDUCTOR_COMPILE_THREADS" in os.environ:
        return int(os.environ["TORCHINDUCTOR_COMPILE_THREADS"])
    elif sys.platform == "win32" or is_fbcode():
        return 1
    else:
        cpu_count = (
            len(os.sched_getaffinity(0))
            if hasattr(os, "sched_getaffinity")
            else os.cpu_count()
        )
        assert cpu_count
        return min(32, cpu_count)


compile_threads = decide_compile_threads()

# gemm autotuning global cache dir
if is_fbcode():
    from libfb.py import parutil

    try:
        if __package__:
            global_cache_dir = parutil.get_dir_path(
                os.path.join(__package__.replace(".", os.sep), "fb/cache")
            )
        else:
            global_cache_dir = parutil.get_dir_path("fb/cache")
    except ValueError:
        global_cache_dir = None
else:
    global_cache_dir = None

# If kernel is fused, the name is generated from the origin node op names
# for larger kernels limit this
kernel_name_max_ops = 10

# Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs
shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "1") == "1"

# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"

# Mark the wrapper call in PyTorch profiler
profiler_mark_wrapper_call = False

# Generate hook calls to torch._inductor.hooks.run_intermediate_hooks for
# every intermediate for which we can correlate it with an intermediate
# from the original FX graph
generate_intermediate_hooks = False

# Populate traceback field on IRNode; good for debugging why origin_node is
# not populated, or finding out where an IRNode was constructed
debug_ir_traceback = False

# used for debugging to make sure config is properly set
_raise_error_for_testing = False

_profile_var = os.environ.get("TORCHINDUCTOR_PROFILE", "")
profile_bandwidth = _profile_var != ""
profile_bandwidth_regex = "" if _profile_var == "1" else _profile_var
# Specify a file where we print out the profiling results.
# None means we do not dump results to a file.
profile_bandwidth_output = os.environ.get("TORCHINDUCTOR_PROFILE_OUTPUT", None)

# TODO: remove later
disable_cpp_codegen = False


# Freezing will attempt to inline weights as constants in optimization
# and run constant folding and other optimizations on them. After freezing, weights
# can no longer be updated.
freezing: bool = os.environ.get("TORCHINDUCTOR_FREEZING", "0") == "1"

# Make freezing invalidate the eager Parameters of nn modules, to avoid memory overhead
# of potentially keeping multiple copies of weights.
freezing_discard_parameters: bool = False

# Kill switch for allowing temporary tensors to be allocated as stack arrays. Tests
# should be run with this flag both on and off to make sure we have coverage.
allow_stack_allocation: bool = (
    os.environ.get("TORCHINDUCTOR_STACK_ALLOCATION", "1") == "1"
)
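# e.g. a typical inference setup using the `freezing` flag above (illustrative;
# `model` and `x` are placeholders, and freezing assumes the weights will not
# be updated afterwards):
#
# torch._inductor.config.freezing = True
# compiled = torch.compile(model.eval())
# with torch.no_grad():
#     compiled(x)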
# Enables an alternate DSO interface (the "minimal ArrayRef interface") intended
# to maximize performance for use cases that it can accommodate at the expense of
# generality. In brief:
# - inputs and outputs are ArrayRefTensor (note that strides are required, but the
#   tensor must be contiguous)
# - constant handling is unchanged because it is not a per-inference-iteration bottleneck
#
# When the DSO is generated in this mode, the usual interface will also be supported,
# but performance for that interface may be degraded.
use_minimal_arrayref_interface: bool = False

# decompose some memory bound matmul/bmm to mul
decompose_mem_bound_mm: bool = False


# config specific to codegen/cpp.py
class cpp:
    # set to torch.get_num_threads()
    threads = -1

    # Do not generate loops when the condition doesn't hold, like:
    # for(long i0=4096; i0<4096; i0+=1)
    no_redundant_loops = True

    # Assume number of threads is dynamic, don't specialize thread number.
    # Kernels don't recompile on thread number changes with this flag on.
    # For single-threaded workload, turning it on would incur a slight
    # performance degradation.
    dynamic_threads = False

    simdlen: Optional[int] = None
    min_chunk_size = 4096
    cxx = (
        None,  # download gcc12 from conda-forge if conda is installed
        # "g++-12",
        # "g++-11",
        # "g++-10",
        # "clang++",
        os.environ.get("CXX", "clang++" if sys.platform == "darwin" else "g++"),
        # "g++.par",
    )

    # Allow kernel performance profiling via PyTorch profiler
    enable_kernel_profile = False

    # enable weight prepacking to get a better performance; may lead to large memory footprint
    weight_prepack = True

    # Inject a bug into our relu implementation; useful for testing our repro
    # extraction and minification functionality.
    # Valid values: "compile_error", "runtime_error", "accuracy"
    inject_relu_bug_TESTING_ONLY: Optional[str] = None
    inject_log1p_bug_TESTING_ONLY: Optional[str] = None

    # If None, autodetect whether or not AVX512/AVX2 can be used. Otherwise,
    # force usage as specified, without testing.
    vec_isa_ok: Optional[bool] = None

    # similar to config.triton.descriptive_names
    descriptive_names = "original_aten"

    # how many nodes to allow into a single horizontal fusion
    max_horizontal_fusion_size = 16

    # Make scatter_reduce fallback when reduce is sum to avoid performance regression
    # using atomic_add.
    fallback_scatter_reduce_sum = True

    # Use -funsafe-math-optimizations when compiling
    enable_unsafe_math_opt_flag = False

    # Use -ffp-contract when compiling
    enable_floating_point_contract_flag = False


# config specific to codegen/triton.py
class triton:
    # Use cudagraphs on output code
    cudagraphs = False

    # Use cudagraph trees for memory pooling if `cudagraphs` is True
    cudagraph_trees = True

    # assertions not on the fast path, steady state
    slow_path_cudagraph_asserts = True

    # TODO - need to debug why this prevents cleanup
    cudagraph_trees_history_recording = False

    # assertions on the fast path
    fast_path_cudagraph_asserts = False

    # skip warmup for cudagraph trees
    skip_cudagraph_warmup = False

    # Synchronize before and after every compiled graph.
    debug_sync_graph = False

    # Synchronize after every kernel launch, to help pinpoint bugs
    debug_sync_kernel = False

    # Always load full blocks (rather than broadcasting inside the block)
    dense_indexing = False

    # limit tiling dimensions
    max_tiles = 2

    # use triton.autotune for pointwise ops with complex layouts
    # this should only be disabled for debugging/testing
    autotune_pointwise = True

    # max autotune gemm with cublasLt
    autotune_cublasLt = True

    # should we stop a fusion to allow better tiling?
    tiling_prevents_pointwise_fusion = True
    tiling_prevents_reduction_fusion = True

    # should we give different names to kernels
    # Note: This is orthogonal to descriptive_names - this is deciding whether
    # our triton kernel names should all be `triton_` (to maximize caching) or
    # whether they should be unique.
    unique_kernel_names = os.environ.get("TORCHINDUCTOR_UNIQUE_KERNEL_NAMES") == "1"
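    # e.g. making kernel names unique so profiler traces map back to specific
    # kernels, at the cost of more cache misses (illustrative shell usage;
    # `my_script.py` is a placeholder):
    #
    #   TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 python my_script.py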
    # should we put op names in kernel names
    # False: No special names (just triton__1, triton__2, etc.)
    # "torch": Maps to the fx op in the Dynamo graph (module name, method name, etc.)
    # "original_aten": Maps to the highest-level aten op (i.e. pre-decompositions)
    # "inductor_node": Maps to the node name in the FX graph passed to Inductor
    descriptive_names = "original_aten"

    # use alternate codegen for smaller reductions
    persistent_reductions = (
        os.environ.get("TORCHINDUCTOR_PERSISTENT_REDUCTIONS", "1") == "1"
    )

    # 0/False: disable
    # 1/True: enable, use tuning to pick between different subkernels
    # 2: enable, force using persistent reduction (for debugging)
    # 3: enable, force using non-persistent reduction (for debugging)
    multi_kernel = int(os.environ.get("TORCHINDUCTOR_MULTI_KERNEL", "0"))

    # hint to Triton when arguments are divisible by 16
    divisible_by_16 = True

    # these are not enforced, but they are used by asserts in triton_heuristics.py
    # NOTE: mobilevit_s in timm_models required X to be set to the higher value 2048
    # Max RBLOCK will be large for multi-kernel since we do more aggressive
    # persistent reduction.
    max_block = {
        "X": 2048,
        "Y": 1024,
        "Z": 1024,
        "R": 4096 * (16 if multi_kernel else 1),
    }

    # Minimum RBLOCK to be used for a TritonSplitScanKernel
    # NOTE: This also indirectly controls the size of workspace buffer required
    min_split_scan_rblock = 256

    # Store the generated cubin files for cpp wrapper code to load
    store_cubin = False

    # the max number of spills we allow for the configs we benchmark.
    # Setting this to 0 means we skip a config if it spills even a single
    # register.
    # Setting it to a larger value allows a config spilling a small amount
    # of registers being benchmarked.
    #
    # NOTE: triton will always report >0 register spills for kernels using sin/cos.
    # (check this issue https://github.com/openai/triton/issues/1756 )
    # So far we see a fixed 8 spilled registers for kernels using sin/cos.
    # Raise the threshold to 16 to be safe.
    # We should revisit this once we understand more of the source of register spills.
    spill_threshold: int = 16

    # Generate code containing the newer tl.make_block_ptr() API for loads/store
    use_block_ptr = False

    # Inject a bug into our relu implementation; useful for testing our repro
    # extraction and minification functionality.
    # Valid values: "compile_error", "runtime_error", "accuracy"
    inject_relu_bug_TESTING_ONLY: Optional[str] = None


class aot_inductor:
    # AOTInductor output path
    # If an absolute path is specified, the generated lib files will be stored under the directory;
    # If a relative path is specified, it will be used as a subdirectory under the default caching path;
    # If not specified, a temp directory will be created under the default caching path.
    # If the specified path contains something like "model.so", the sub-string will be used
    # to name the generated library.
    output_path = ""

    debug_compile = os.environ.get("AOT_INDUCTOR_DEBUG_COMPILE", "0") == "1"

    # Serialized tree spec for flattening inputs
    serialized_in_spec = ""

    # Serialized tree spec for flattening outputs
    serialized_out_spec = ""

    # flag to decide whether to create a submodule for constant graph.
    use_runtime_constant_folding: bool = False


class cuda:
    # CUDA arch to use for CUDA template kernel compilation.
    # e.g. "70", "75", "80", "90", etc.
    # When arch is None, Inductor uses torch.cuda.get_device_capability(0).
    arch: Optional[str] = None

    # CUDA version to use for CUDA template kernel compilation.
    # e.g. "11.4", "12.1", etc.
    # When version is None, Inductor uses torch.version.cuda.
    version: Optional[str] = None
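    # e.g. a typical combination of settings to try the CUTLASS GEMM backend in
    # max-autotune mode (illustrative; "/path/to/cutlass" is a placeholder for
    # a local CUTLASS checkout when the default cutlass_dir does not apply):
    #
    #   torch._inductor.config.max_autotune_gemm = True
    #   torch._inductor.config.max_autotune_gemm_backends = "ATEN,TRITON,CUTLASS"
    #   torch._inductor.config.cuda.cutlass_dir = "/path/to/cutlass"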
    # Optimization level for the host compiler.
    compile_opt_level = "-O1"

    # Whether to enable device LTO (link-time-optimization).
    enable_cuda_lto = False

    # Whether to keep intermediate files during compilation.
    enable_ptxas_info = False

    # Whether to enable debug info, e.g. line number, cutlass debug info.
    enable_debug_info = False

    # Whether to use fast math.
    use_fast_math = False

    # Path to the CUTLASS repo root directory.
    # The default path only works under PyTorch local development environment.
    cutlass_dir = os.environ.get(
        "TORCHINDUCTOR_CUTLASS_DIR",
        os.path.abspath(
            os.path.join(os.path.dirname(torch.__file__), "../third_party/cutlass/")
        ),
    )

    # Configures the maximum number of CUTLASS configs to profile in max_autotune.
    # By default it's None, so that all CUTLASS configs are tuned.
    # This is mainly used to reduce test time in CI.
    cutlass_max_profiling_configs: Optional[int] = None

    # Path to CUDA NVCC.
    # NVCC search order:
    # 1) cuda_cxx set in this config
    # 2) CUDACXX environment variable
    # 3) CUDA_HOME environment variable
    # 4) default system search PATH
    cuda_cxx: Optional[str] = None

    # If set to True, it will ensure that only GEMM ops capable of
    # epilogue fusion via CUTLASS Epilogue Visitor Trees (EVT)
    # are enabled for the CUTLASS backend.
    cutlass_only_evt_capable_ops: bool = False


# create a directory containing lots of debug information
class trace:
    # master switch for all debugging flags below
    enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"

    # Save debug information to a temporary directory
    # If not specified, a temp directory will be created by system
    debug_dir: Optional[str] = None

    # Save python logger call >= logging.DEBUG
    debug_log = False

    # Save python logger call >= logging.INFO
    info_log = False

    # Save input FX graph (post decomps, pre optimization)
    fx_graph = True

    # Save FX graph after transformations
    fx_graph_transformed = True

    # Save TorchInductor IR before fusion pass
    ir_pre_fusion = True

    # Save TorchInductor IR after fusion pass
    ir_post_fusion = True

    # Copy generated code to trace dir
    output_code = True

    # SVG figure showing post-fusion graph
    graph_diagram = os.environ.get("INDUCTOR_POST_FUSION_SVG", "0") == "1"

    # SVG figure showing fx with fusion
    draw_orig_fx_graph = os.environ.get("INDUCTOR_ORIG_FX_SVG", "0") == "1"

    # We draw our fx graphs with the "record" shape attribute by default.
    # Sometimes, when the graph is very complex, we may hit dot errors like below:
    #   "flat edge between adjacent nodes one of which has a record shape -
    #    replace records with HTML-like labels"
    # and thus fail to generate a graph. So, let's give the user an option
    # to specify the shape attribute for the dot graph. For example, passing
    # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like labels
    # to work around the above failure.
    dot_graph_shape = os.environ.get("INDUCTOR_DOT_GRAPH_SHAPE_SVG", None)

    # Store cProfile (see snakeviz to view)
    compile_profile = False

    # Upload the .tar.gz file
    # Needs to be overridden based on specific environment needs
    upload_tar: Optional[Callable[[str], None]] = None

    log_autotuning_results: bool = False


_save_config_ignore = {
    # workaround: "Can't pickle ..."
    "trace.upload_tar",
}

if TYPE_CHECKING:
    from torch.utils._config_typing import *  # noqa: F401, F403

from torch.utils._config_module import install_config_module

# adds patch, save_config, etc
install_config_module(sys.modules[__name__])
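# `install_config_module` above turns this module into a config module with the
# usual helpers such as `patch`, which applies settings temporarily as a context
# manager or decorator. Illustrative sketch (`fn` and `x` are placeholders):
#
# from torch._inductor import config
#
# with config.patch({"max_autotune": True, "triton.unique_kernel_names": True}):
#     compiled = torch.compile(fn)
#     compiled(x)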