ai-content-maker/.venv/Lib/site-packages/torch/_inductor/decomposition.py

679 lines
21 KiB
Python

import functools
import logging
import math
import sys
import typing
from typing import Optional
import torch
import torch._decomp as decomp
import torch._prims_common as utils
import torch.ao.quantization.fx._decomposed
from torch._decomp import (
core_aten_decompositions,
get_decompositions,
remove_decompositions,
)
from torch._decomp.decompositions import (
_grid_sampler_2d as decomp_grid_sampler_2d,
pw_cast_for_opmath,
)
from torch._decomp.decompositions_for_rng import extra_random_decomps
from torch._higher_order_ops.out_dtype import out_dtype
from torch._prims_common import (
elementwise_dtypes,
ELEMENTWISE_TYPE_PROMOTION_KIND,
type_to_dtype,
)
from . import config, inductor_prims
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Short aliases for the operator namespaces referenced throughout this file.
aten = torch.ops.aten
prims = torch.ops.prims
quantized_decomposed = torch.ops.quantized_decomposed
# Decompositions that Inductor requests explicitly, looked up through
# get_decompositions() for the operators listed below.
inductor_decompositions = get_decompositions(
[
aten._adaptive_avg_pool2d_backward,
aten.arange,
aten.bitwise_and_,
aten.bitwise_or_,
aten.clamp_min_,
aten.dist,
aten.empty_like,
aten.flip,
aten.gelu,
aten.hardtanh,
aten.index_select,
aten.lcm,
aten.leaky_relu,
aten.linalg_vector_norm,
aten._log_softmax,
aten.max_pool2d_with_indices_backward,
aten._native_batch_norm_legit,
aten._native_batch_norm_legit_functional,
aten._native_batch_norm_legit_no_training,
aten.native_batch_norm,
aten.native_group_norm,
aten.native_layer_norm,
aten.nll_loss2d_backward,
aten._softmax,
aten.sin_,
aten.sqrt_,
out_dtype,
aten._to_copy,
aten.tril_indices,
aten.triu_indices,
aten.upsample_bilinear2d.vec,
]
)
# The table used by the rest of this file: the core ATen decompositions with
# the Inductor-specific entries merged on top. When a key appears in both, the
# Inductor entry wins because it is unpacked last.
decompositions = {**core_aten_decompositions(), **inductor_decompositions}
# Remove unwanted decompositions that were pulled in through the core ATen
# decompositions from the Inductor decomp table.
decomps_to_exclude = [
aten._unsafe_index,
aten._scaled_dot_product_flash_attention_for_cpu.default, # See comments in torch/_decomp/decompositions.py
aten.clamp_max,
aten.clamp_min,
aten.glu, # inductor lowers this directly
aten.split.Tensor, # inductor lowers this directly
aten.squeeze, # inductor lowers this directly
aten.sum, # inductor lowers this directly
aten.unbind, # inductor lowers this directly
]
# The return value is not used, so this call is relied on to edit
# `decompositions` in place.
remove_decompositions(decompositions, decomps_to_exclude)
def register_decomposition(ops):
    """Register a decomposition for ``ops`` in the Inductor ``decompositions`` table.

    Args:
        ops: Either a single callable operator or an iterable of operators.

    Returns:
        The result of ``decomp.register_decomposition(ops, decompositions)``;
        throughout this file that result is applied as a function decorator.

    A warning is logged for every operator that already has an entry in
    ``decompositions``. The registration call is made regardless.
    """
    for op in [ops] if callable(ops) else ops:
        if op in decompositions:
            # Name the specific operator that collides rather than the whole
            # ``ops`` argument, so the offending entry is identifiable when a
            # list of operators is being registered.
            log.warning("duplicate decomp: %s", op)
    return decomp.register_decomposition(ops, decompositions)
# TODO: Inductor does not handle asserts yet, because the condition appears
# in the graph as a symbool -> tensor conversion.
@register_decomposition([aten._assert_async.msg])
def assert_async_msg_decomp(tensor, msg):
    """Decompose ``aten._assert_async.msg`` into nothing; both arguments are ignored."""
    return None
# Mirrors `assert_async_msg_decomp`: implemented as a no-op.
@register_decomposition([aten._functional_assert_async.msg])
def functional_assert_async_msg_decomp(tensor, msg):
    """Decompose ``aten._functional_assert_async.msg`` into nothing; arguments are ignored."""
    return None
@register_decomposition([aten.sym_constrain_range_for_size.default])
def sym_constrain_range_for_size(symbol, *, min=None, max=None):
    """Decompose ``aten.sym_constrain_range_for_size`` into nothing.

    ``symbol``, ``min`` and ``max`` are accepted but not used.
    """
    return None
@register_decomposition([aten.clamp])
@pw_cast_for_opmath
def clamp(x, min=None, max=None):
    """Express ``clamp`` as ``clamp_min`` followed by ``clamp_max``.

    A bound that is ``None`` is skipped. When both are given, the lower bound
    is applied before the upper bound.
    """
    lower, upper = min, max
    result = x
    if lower is not None:
        result = result.clamp_min(lower)
    if upper is not None:
        result = result.clamp_max(upper)
    return result
@register_decomposition([aten.full])
def full(size, fill_value, **kwargs):
    """Fill in a missing ``dtype`` for ``aten.full`` from the fill value's Python type.

    When ``dtype`` is absent or ``None``, it is derived with
    ``type_to_dtype(type(fill_value))`` and the call is re-issued to
    ``aten.full``. When a dtype is already present, ``NotImplemented`` is
    returned.
    """
    if kwargs.get("dtype") is not None:
        return NotImplemented
    kwargs["dtype"] = type_to_dtype(type(fill_value))
    return aten.full(size, fill_value, **kwargs)
# It is unclear how to put this into the main library. PrimTorch wants
# empty_permuted to go to the prim, and users typically do not want to
# decompose to empty_strided (Inductor is fine with it, though, because it is
# comfortable with strides and everything ends up as empty_strided anyway).
@register_decomposition([aten.empty_permuted.default])
def empty_permuted(size, physical_layout, **kwargs):
    """Build ``empty_permuted`` from ``torch.empty`` plus a ``permute``.

    The buffer is allocated with the sizes reordered according to
    ``physical_layout`` and then permuted with the inverse of that ordering.
    ``kwargs`` are forwarded to ``torch.empty``.
    """
    # inverse_perm[d] is the position of dimension d within physical_layout.
    inverse_perm = [0] * len(size)
    for physical_pos, logical_dim in enumerate(physical_layout):
        inverse_perm[logical_dim] = physical_pos
    physical_sizes = [size[logical_dim] for logical_dim in physical_layout]
    buffer = torch.empty(physical_sizes, **kwargs)
    return buffer.permute(inverse_perm)
@register_decomposition([aten.convolution_backward])
def convolution_backward(
    grad_output,
    input,
    weight,
    bias_sizes,
    stride,
    padding,
    dilation,
    transposed,
    output_padding,
    groups,
    output_mask,
):
    """Split the bias gradient out of ``aten.convolution_backward``.

    Only applies when ``output_mask[2]`` is truthy and ``grad_output`` lives on
    a ``"cuda"`` device; otherwise ``NotImplemented`` is returned. The bias
    gradient is computed as a sum of ``grad_output`` over dimension 0 and all
    dimensions from 2 upward, and the remaining gradients come from
    ``aten.convolution_backward`` called with the third mask entry forced to
    ``False``.

    Returns:
        ``(grad_inp, grad_weight, grad_bias)``.
    """
    if not (output_mask[2] and grad_output.device.type == "cuda"):
        return NotImplemented

    # Reduce over every dimension except dimension 1.
    reduce_dims = [0, *range(2, grad_output.dim())]
    grad_bias = aten.sum(grad_output, reduce_dims)

    mask_without_bias = [output_mask[0], output_mask[1], False]
    grad_inp, grad_weight, _ = aten.convolution_backward(
        grad_output,
        input,
        weight,
        bias_sizes,
        stride,
        padding,
        dilation,
        transposed,
        output_padding,
        groups,
        mask_without_bias,
    )
    return (grad_inp, grad_weight, grad_bias)
@register_decomposition([aten.log2])
def log2(x):
    """Compute ``log2(x)`` as ``log(x)`` multiplied by the constant ``1 / ln(2)``."""
    inv_ln2 = 1.0 / math.log(2.0)
    return torch.log(x) * inv_ln2
@register_decomposition([aten.round.decimals])
def round_dec(x, decimals=0):
    """Round ``x`` to ``decimals`` places by scaling, rounding, and scaling back.

    ``x`` is multiplied by ``10.0**decimals``, passed through ``aten.round``,
    and multiplied by the reciprocal of that factor.
    """
    factor = 10.0**decimals
    scaled = x * factor
    return aten.round(scaled) * (1.0 / factor)
@register_decomposition([aten.bmm])
@pw_cast_for_opmath
def bmm(self, batch2):
    """Replace degenerate ``bmm`` calls with a broadcast multiply and a sum.

    Two cases are handled; anything else returns ``NotImplemented``:

    * ``config.coordinate_descent_tuning`` is on and either ``self.shape[1]``
      or ``batch2.shape[2]`` equals 1.
    * ``self`` is on a ``"cpu"`` device and both ``self.size(1)`` and
      ``batch2.size(-1)`` equal 1.
    """
    if config.coordinate_descent_tuning and (
        self.shape[1] == 1 or batch2.shape[2] == 1
    ):
        product = self.unsqueeze(-1) * batch2.unsqueeze(1)
        return product.sum(dim=2)

    if self.device.type == "cpu" and self.size(1) == 1 and batch2.size(-1) == 1:
        dot = torch.sum(self.squeeze(1) * batch2.squeeze(-1), dim=1, keepdim=True)
        return dot.unsqueeze(1)

    return NotImplemented
@register_decomposition([aten.addmm])
@pw_cast_for_opmath
def addmm(self, mat1, mat2, beta=1, alpha=1):
    """Replace small CPU ``addmm`` calls with elementwise multiply and sum.

    Only applies when ``self`` is on a ``"cpu"`` device and ``mat1.size(0)`` is
    1, in one of two situations checked in this order:

    * ``mat2.size(-1)`` is also 1;
    * both ``mat2.size(0)`` and ``mat2.size(1)`` are at most 16.

    In both cases the result is ``alpha * out + beta * self``. Every other
    input returns ``NotImplemented``.
    """
    if self.device.type != "cpu":
        return NotImplemented

    if mat1.size(0) == 1 and mat2.size(-1) == 1:
        dot = torch.sum(mat1.squeeze(0) * mat2.squeeze(-1), dim=0, keepdim=True)
        out = dot.unsqueeze(0)
        return alpha * out + beta * self

    if mat1.size(0) == 1 and mat2.size(0) <= 16 and mat2.size(1) <= 16:
        out = (mat1.T * mat2).sum(dim=0, keepdim=True)
        return alpha * out + beta * self

    return NotImplemented
@register_decomposition([aten.mm])
@pw_cast_for_opmath
def mm(self, input2):
    """Replace degenerate ``mm`` calls with multiply/sum or a small ``cat``.

    Returns ``NotImplemented`` unless one of the special cases below matches.
    The size checks in the CPU branch go through ``guard_size_oblivious`` /
    ``definitely_true`` from ``torch.fx.experimental.symbolic_shapes``.
    """
    from torch.fx.experimental.symbolic_shapes import (
        definitely_true,
        guard_size_oblivious,
    )

    # Our matrix-vector multiplies only reach peak bandwidth with coordinate
    # descent tuning.
    # TODO: look into why and fix it (hopefully).
    if config.coordinate_descent_tuning and (
        self.shape[0] == 1 or input2.shape[1] == 1
    ):
        return (self.unsqueeze(2) * input2.unsqueeze(0)).sum(dim=1)

    if self.device.type != "cpu":
        return NotImplemented

    # self has a last dimension of 1, input2 has a first dimension of 1, the
    # dtypes match and the combined element count is at most 32: build the
    # result row by row and concatenate.
    if (
        guard_size_oblivious(self.size(-1) == 1)
        and guard_size_oblivious(self.size(0) > 0)
        and guard_size_oblivious(input2.size(0) == 1)
        and (self.dtype == input2.dtype)
        and definitely_true((torch.numel(self) + torch.numel(input2)) <= 32)
    ):
        rows = [self[i, :] * input2 for i in range(self.size(0))]
        return torch.cat(rows)

    # self has a first dimension of 1 and input2 has a last dimension of 1:
    # multiply elementwise and sum.
    if guard_size_oblivious(self.size(0) == 1) and guard_size_oblivious(
        input2.size(-1) == 1
    ):
        dot = torch.sum(self.squeeze(0) * input2.squeeze(-1), dim=0, keepdim=True)
        return dot.unsqueeze(0)

    return NotImplemented
# This pass does two things:
# - Eliminates cat when only one tensor input remains
# - Normalizes cat calls so that legacy empty 1-D tensors are removed (NB: not
#   ALL empty tensors are removed, only the naughty ones)
@register_decomposition([aten.cat.default])
def cat(tensors, dim=0):
    """Drop legacy empty 1-D inputs from ``cat`` and collapse single-input cats.

    Returns a clone of the sole remaining tensor when exactly one survives the
    filter, re-dispatches to ``aten.cat.default`` when some (but not all but
    one) inputs were dropped, and returns ``NotImplemented`` otherwise.
    """
    from torch.fx.experimental.symbolic_shapes import guard_size_oblivious

    def keep_input(t):
        # For better or worse, this is a valid cat:
        #
        #   torch.cat([torch.randn(2, 2, 4), torch.randn(0), torch.randn(3, 2, 4)])
        #
        # Downstream passes such as split_cat should not have to deal with
        # that, and the easiest fix is to drop such inputs (guarding that they
        # are non-zero).
        #
        # Is it permissible for this filtering to be size-oblivious? A case
        # where it could matter is cat([(2, 2), (u0,)], dim=0): if u0 happened
        # to be zero, it should have been filtered out. However, the ONLY way
        # that call could have been accepted is with u0 == 0, so by the time
        # this code runs a deferred runtime assert forcing u0 to be zero has
        # already been installed. If that has not happened, the unbacked
        # SymInt has an appropriate size and there is no problem.
        return len(t.shape) != 1 or guard_size_oblivious(t.shape[0] > 0)

    kept = [t for t in tensors if keep_input(t)]
    kept_count = len(kept)

    if kept_count == 1:
        return kept[0].clone()
    if 1 < kept_count < len(tensors):
        # Something was filtered out on this call, so redispatch with the
        # reduced list.
        return aten.cat.default(kept, dim)
    # Nothing was filtered: return NotImplemented so no further decomposition
    # is attempted, which prevents infinite recursion.
    return NotImplemented
@register_decomposition([aten.angle])
def angle(x):
    """Decompose ``aten.angle`` into ``where`` / ``atan2`` / ``isnan``.

    Complex input: ``atan2(imag, real)``, replaced by NaN wherever the real
    part is NaN.

    Real input: 0 where ``x >= 0``, pi where ``x < 0``, NaN where ``x`` is NaN.
    The pi constant uses the dtype obtained from ``elementwise_dtypes`` with
    ``INT_TO_FLOAT`` promotion.
    """
    nan = float("nan")

    if x.is_complex():
        real_is_nan = torch.isnan(x.real)
        phase = torch.atan2(x.imag, x.real)
        return torch.where(real_is_nan, nan, phase)

    _, result_dtype = elementwise_dtypes(
        x,
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
    )
    pi = torch.scalar_tensor(math.pi, dtype=result_dtype, device=x.device)
    sign_based = torch.where(x < 0, pi, 0.0)
    return torch.where(torch.isnan(x), nan, sign_based)
@register_decomposition([aten.add])
def add(x, y, *, alpha=None):
    """Add two complex tensors by viewing them through their real dtype.

    Applies only when both ``x`` and ``y`` are complex tensors; any other
    combination returns ``NotImplemented``. When ``alpha`` is given, ``y`` is
    scaled by it first. The sum is computed on the real-dtype views and viewed
    back as the promoted complex dtype.
    """
    x_is_complex_tensor = torch.is_tensor(x) and x.is_complex()
    y_is_complex_tensor = torch.is_tensor(y) and y.is_complex()
    if not (x_is_complex_tensor and y_is_complex_tensor):
        return NotImplemented

    addend = y if alpha is None else alpha * y
    complex_type = torch.promote_types(x.dtype, y.dtype)
    x_as_real = x.view(x.real.dtype)
    addend_as_real = addend.view(y.real.dtype)
    return (x_as_real + addend_as_real).view(complex_type)
@register_decomposition([aten.conj_physical])
def conj_physical(self):
    """Return ``self`` unchanged for non-complex input.

    Complex input is not implemented and trips the assertion below.
    """
    assert not self.is_complex(), "TODO: implement this"
    return self
@register_decomposition([aten.lift, aten.detach_])
def lift(self):
    """Decompose ``aten.lift`` and ``aten.detach_`` to the identity: return ``self`` as is."""
    return self
@register_decomposition([aten.bernoulli.default])
def bernoulli(self, *, generator=None):
    """Sample Bernoulli values by comparing float32 uniform noise against ``self``.

    The boolean comparison result is converted back to ``self.dtype``. A
    non-``None`` ``generator`` is not supported and trips the assertion.
    """
    assert generator is None
    uniform = torch.rand_like(self, dtype=torch.float32)
    return (uniform < self).to(self.dtype)
@register_decomposition([aten.fmin, prims.fmin])
def fmin(self, other):
    """Elementwise minimum that picks ``self`` wherever ``other`` is NaN.

    ``self`` is selected where ``other`` is NaN or ``other > self``; ``other``
    is selected everywhere else.
    """
    take_self = torch.isnan(other) | (other > self)
    return torch.where(take_self, self, other)
@register_decomposition([aten.fmax, prims.fmax])
def fmax(self, other):
    """Elementwise maximum that picks ``self`` wherever ``other`` is NaN.

    ``self`` is selected where ``other`` is NaN or ``other < self``; ``other``
    is selected everywhere else.
    """
    take_self = torch.isnan(other) | (other < self)
    return torch.where(take_self, self, other)
@register_decomposition(aten.amax)
def amax(self, dim=None, keepdim=False):
    """For bool tensors, compute ``amax`` as ``torch.any``; otherwise ``NotImplemented``."""
    if self.dtype != torch.bool:
        return NotImplemented
    return torch.any(self, dim=dim, keepdim=keepdim)
@register_decomposition(aten.amin)
def amin(self, dim=None, keepdim=False):
    """For bool tensors, compute ``amin`` as ``torch.all``; otherwise ``NotImplemented``."""
    if self.dtype != torch.bool:
        return NotImplemented
    return torch.all(self, dim=dim, keepdim=keepdim)
@register_decomposition([aten.narrow_copy])
def narrow_copy(self, dim, start, length):
    """Decompose ``narrow_copy`` into ``torch.narrow`` followed by ``clone``."""
    narrowed = torch.narrow(self, dim, start, length)
    return narrowed.clone()
@register_decomposition([aten.expand_copy])
def expand_copy(self, size, *, implicit=False):
    """Decompose ``expand_copy`` into ``aten.expand`` followed by ``clone``."""
    expanded = aten.expand(self, size, implicit=implicit)
    return expanded.clone()
@register_decomposition([aten.view_copy.default])
def view_copy_default(self, size):
    """Decompose ``view_copy.default`` into ``aten.view`` followed by ``clone``."""
    viewed = aten.view(self, size)
    return viewed.clone()
@register_decomposition([aten.view_copy.dtype])
def view_copy_dtype(self, dtype):
    """Decompose ``view_copy.dtype`` into ``self.to(dtype)`` followed by ``clone``."""
    converted = self.to(dtype)
    return converted.clone()
def get_like_layout(
    tensor: torch.Tensor, memory_format: Optional[torch.memory_format]
) -> torch.memory_format:
    """Resolve the memory format to use for a ``*_like`` result.

    An explicit ``memory_format`` is returned as is. ``None`` and
    ``torch.preserve_format`` are resolved by asking
    ``utils.suggest_memory_format`` about ``tensor``.
    """
    # TODO: _to_copy tensor to stride permutation
    if memory_format is None or memory_format is torch.preserve_format:
        return utils.suggest_memory_format(tensor)
    return memory_format
@register_decomposition(aten.rand_like)
def rand_like(self, *, dtype=None, device=None, memory_format=None, **kwargs):
    """Decompose ``rand_like`` into ``torch.rand`` with ``self``'s size.

    ``dtype`` and ``device`` fall back to those of ``self`` when falsy. The
    result is converted to the memory format chosen by ``get_like_layout``.
    Remaining ``kwargs`` are forwarded to ``torch.rand``.
    """
    sample = torch.rand(
        list(self.size()),
        dtype=dtype or self.dtype,
        device=device or self.device,
        **kwargs,
    )
    return sample.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randn_like)
def randn_like(self, *, dtype=None, device=None, memory_format=None, **kwargs):
    """Decompose ``randn_like`` into ``torch.randn`` with ``self``'s size.

    ``dtype`` and ``device`` fall back to those of ``self`` when falsy. The
    result is converted to the memory format chosen by ``get_like_layout``.
    Remaining ``kwargs`` are forwarded to ``torch.randn``.
    """
    sample = torch.randn(
        list(self.size()),
        dtype=dtype or self.dtype,
        device=device or self.device,
        **kwargs,
    )
    return sample.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.full_like)
def full_like(
    self,
    fill_value,
    *,
    dtype=None,
    layout=None,
    device=None,
    pin_memory=False,
    requires_grad=False,
    memory_format=torch.preserve_format,
):
    """Decompose ``full_like`` into ``torch.full`` with ``self``'s size.

    ``dtype``, ``layout`` and ``device`` fall back to those of ``self`` when
    falsy. The result is converted to the memory format chosen by
    ``get_like_layout``. ``pin_memory`` is accepted but not forwarded.
    """
    filled = torch.full(
        list(self.size()),
        fill_value,
        dtype=dtype or self.dtype,
        layout=layout or self.layout,
        device=device or self.device,
        requires_grad=requires_grad,
    )
    return filled.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint_like.default)
def randint_like(self, high, *, dtype=None, device=None, memory_format=None, **kwargs):
    """Decompose ``randint_like.default`` into ``aten.randint.low`` with a lower bound of 0.

    ``dtype`` and ``device`` fall back to those of ``self`` when falsy. The
    result is converted to the memory format chosen by ``get_like_layout``.
    """
    sample = aten.randint.low(
        0,
        high,
        list(self.size()),
        dtype=dtype or self.dtype,
        device=device or self.device,
        **kwargs,
    )
    return sample.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint_like.low_dtype)
def randint_like_low(
    self, low, high, *, dtype=None, device=None, memory_format=None, **kwargs
):
    """Decompose ``randint_like.low_dtype`` into ``aten.randint.low``.

    ``dtype`` and ``device`` fall back to those of ``self`` when falsy. The
    result is converted to the memory format chosen by ``get_like_layout``.
    """
    sample = aten.randint.low(
        low,
        high,
        list(self.size()),
        dtype=dtype or self.dtype,
        device=device or self.device,
        **kwargs,
    )
    return sample.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint.default)
def randint(high, size, **kwargs):
    """Decompose ``randint.default`` into ``aten.randint.low`` with a lower bound of 0."""
    low = 0
    return aten.randint.low(low, high, size, **kwargs)
# quantize_per_tensor.default and quantize_per_tensor.tensor differ only in
# how scale and zero_point are passed: as Python scalars (.default) or as
# scalar tensors (.tensor).
@register_decomposition(quantized_decomposed.quantize_per_tensor.default)
def quantize_per_tensor_default_decomp_impl(
    input: torch.Tensor,
    scale: float,
    zero_point: int,
    quant_min: int,
    quant_max: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Quantize ``input`` with a scalar ``scale`` and ``zero_point``.

    Computes ``clamp(round(input / scale) + zero_point, quant_min, quant_max)``
    (the division is done as a multiply by ``1.0 / scale``) and converts the
    result to ``dtype``. bfloat16 input is upcast to float32 first.
    """
    if input.dtype == torch.bfloat16:
        input = input.to(torch.float32)
    inv_scale = 1.0 / scale
    shifted = torch.round(input * inv_scale) + zero_point
    clamped = torch.clamp(shifted, quant_min, quant_max)
    return clamped.to(dtype)
# dequantize_per_tensor.default and dequantize_per_tensor.tensor differ only
# in how scale and zero_point are passed: as Python scalars (.default) or as
# scalar tensors (.tensor).
@register_decomposition(quantized_decomposed.dequantize_per_tensor.default)
def dequantize_per_tensor_default_decomp_impl(
    input: torch.Tensor,
    scale: float,
    zero_point: int,
    quant_min: int,
    quant_max: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Dequantize ``input`` with a scalar ``scale`` and ``zero_point``.

    Computes ``(float32(input) - zero_point) * scale``. ``quant_min``,
    ``quant_max`` and ``dtype`` are accepted but not used.
    """
    as_float = input.to(torch.float32)
    return (as_float - zero_point) * scale
@register_decomposition(quantized_decomposed.quantize_per_tensor.tensor)
def quantize_per_tensor_tensor_decomp_impl(
    input: torch.Tensor,
    scale: torch.Tensor,
    zero_point: torch.Tensor,
    quant_min: int,
    quant_max: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Quantize ``input`` with tensor-valued ``scale`` and ``zero_point``.

    Computes ``clamp(round(input / scale) + zero_point, quant_min, quant_max)``
    (the division is done as a multiply by ``1.0 / scale``) and converts the
    result to ``dtype``. bfloat16 input is upcast to float32 first.
    """
    if input.dtype == torch.bfloat16:
        input = input.to(torch.float32)
    inv_scale = 1.0 / scale
    shifted = torch.round(input * inv_scale) + zero_point
    clamped = torch.clamp(shifted, quant_min, quant_max)
    return clamped.to(dtype)
@register_decomposition(quantized_decomposed.dequantize_per_tensor.tensor)
def dequantize_per_tensor_tensor_decomp_impl(
    input: torch.Tensor,
    scale: torch.Tensor,
    zero_point: torch.Tensor,
    quant_min: int,
    quant_max: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Dequantize ``input`` with tensor-valued ``scale`` and ``zero_point``.

    Computes ``(float32(input) - int32(zero_point)) * float32(scale)``.
    ``quant_min``, ``quant_max`` and ``dtype`` are accepted but not used.
    """
    centered = input.to(torch.float32) - zero_point.to(torch.int32)
    return centered * scale.to(torch.float32)
@register_decomposition(torch.ops.quantized.embedding_bag_byte_unpack)
def q_embedding_bag_byte_unpack_decomp(packed):
    """Unpack a byte-packed embedding tensor into float32 values.

    Along the last dimension, the final 8 entries are read as two groups of
    four that are each reassembled into a float32: a scale (entries -8..-5)
    and an offset (entries -4..-1). Everything before them is converted to
    float32 and returned as ``values * scale + offset``.

    NOTE(review): presumably ``packed`` is a uint8 tensor — confirm against
    callers.
    """

    def bitcast_u8_to_f32(u8):
        # Combine four byte-valued entries into one 32-bit word, honouring the
        # host byte order, then reinterpret that word as float32. A trailing
        # axis of length 1 is added to the result.
        b0, b1, b2, b3 = (u8[..., n].to(torch.int32) for n in (0, 1, 2, 3))
        if sys.byteorder == "little":
            word = b0 + (b1 << 8) + (b2 << 16) + (b3 << 24)
        else:
            word = (b0 << 24) + (b1 << 16) + (b2 << 8) + b3
        return word.view(torch.float32)[..., None]

    scales = bitcast_u8_to_f32(packed[..., -8:-4])
    offsets = bitcast_u8_to_f32(packed[..., -4:])
    values = packed[..., :-8].to(torch.float32)
    return values * scales + offsets
@register_decomposition([aten.grid_sampler_2d])
@pw_cast_for_opmath
def grid_sampler_2d(
    a: torch.Tensor,
    grid: torch.Tensor,
    interpolation_mode: int = 0,
    padding_mode: int = 0,
    align_corners: bool = False,
) -> torch.Tensor:
    """Forward to the shared ``_grid_sampler_2d`` decomposition, choosing ``_expand_grid``.

    ``_expand_grid`` is ``False`` only when ``a`` is on the CPU device,
    ``interpolation_mode`` is 0 and ``a`` is contiguous in
    ``torch.contiguous_format``; it is ``True`` in every other case.
    """
    # The grid is not expanded (_expand_grid=False) on CPU for performance
    # reasons. Local experiments found that expanding the grid from
    # (N, H, W, 2) to (N, C, H, W, 2) speeds up compiled CUDA code by ~5x and
    # CPU code by ~2x in bicubic mode, but slows CPU bilinear mode with
    # channels-first input down to about 0.8x. Hence this hack: skip the
    # expansion for exactly that case.
    skip_expansion = (
        a.device == torch.device("cpu")
        and interpolation_mode == 0
        and a.is_contiguous(memory_format=torch.contiguous_format)
    )
    return decomp_grid_sampler_2d(
        a,
        grid=grid,
        interpolation_mode=interpolation_mode,
        padding_mode=padding_mode,
        align_corners=align_corners,
        _expand_grid=not skip_expansion,
    )
@register_decomposition(aten._foreach_addcmul.Scalar)
def _foreach_addcmul_scalar(self, left_tensors, right_tensors, scalar=1):
    """Decompose ``_foreach_addcmul.Scalar`` into ``_foreach_mul`` then ``_foreach_add``.

    The per-list products of ``left_tensors`` and ``right_tensors`` are added
    to ``self`` with ``alpha=scalar``.
    """
    products = aten._foreach_mul.List(left_tensors, right_tensors)
    return aten._foreach_add.List(self, products, alpha=scalar)
@register_decomposition(aten._foreach_addcdiv.Scalar)
def _foreach_addcdiv_scalar(self, left_tensors, right_tensors, scalar=1):
    """Decompose ``_foreach_addcdiv.Scalar`` into ``_foreach_div`` then ``_foreach_add``.

    The per-list quotients of ``left_tensors`` by ``right_tensors`` are added
    to ``self`` with ``alpha=scalar``.
    """
    quotients = aten._foreach_div.List(left_tensors, right_tensors)
    return aten._foreach_add.List(self, quotients, alpha=scalar)
@register_decomposition(aten._foreach_lerp.Scalar)
def _foreach_lerp_scalar(start_tensors, end_tensors, weight):
    """Decompose ``_foreach_lerp.Scalar`` as ``start + (end - start) * weight``.

    Built from ``_foreach_sub.List``, ``_foreach_mul.Scalar`` and
    ``_foreach_add.List``, in that order.
    """
    deltas = aten._foreach_sub.List(end_tensors, start_tensors)
    weighted = aten._foreach_mul.Scalar(deltas, weight)
    return aten._foreach_add.List(start_tensors, weighted)
@aten.miopen_batch_norm.default.py_impl(torch._C.DispatchKey.Autograd)
@register_decomposition(aten.miopen_batch_norm)
def miopen_batch_norm(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: typing.Optional[torch.Tensor],
    running_mean: typing.Optional[torch.Tensor],
    running_var: typing.Optional[torch.Tensor],
    training: bool,
    exponential_average_factor: float,
    epsilon: float,
):
    """Express ``miopen_batch_norm`` through ``aten.native_batch_norm``.

    All arguments are forwarded positionally. In training mode the three
    results of ``native_batch_norm`` are returned unchanged. Otherwise only
    the first result is kept and the other two are replaced by empty
    (size-0) tensors created with ``weight.new_zeros``.
    """
    out, second, third = aten.native_batch_norm(
        input,
        weight,
        bias,
        running_mean,
        running_var,
        training,
        exponential_average_factor,
        epsilon,
    )

    if not training:
        return (
            out,
            weight.new_zeros((0,)),
            weight.new_zeros((0,)),
        )
    return (out, second, third)
@functools.lru_cache(None)
def fast_random_decomps():
    """Return ``decompositions`` merged with ``extra_random_decomps``.

    Entries from ``extra_random_decomps`` win on key collisions. The merged
    dict is built on the first call and cached, so later calls return that
    same object.
    """
    merged = dict(decompositions)
    merged.update(extra_random_decomps)
    return merged
def select_decomp_table():
    """Pick the decomposition table according to the current config.

    Returns ``decompositions`` when ``config.fallback_random`` is set, and the
    table from ``fast_random_decomps()`` otherwise.
    """
    return decompositions if config.fallback_random else fast_random_decomps()
@register_decomposition(aten.masked_scatter)
def masked_scatter(self, mask, source):
    """Decompose ``masked_scatter`` on CUDA into a cumsum-based index plus a prim.

    Tensors on any device other than ``"cuda"`` return ``NotImplemented``.
    """
    if self.device.type != "cuda":
        return NotImplemented

    # This two-step algorithm is the same as eager CUDA; eager CPU uses a
    # one-shot serial iteration instead.
    dest, full_mask = aten.broadcast_tensors([self, mask])
    # Running count of set mask entries, shifted to be zero-based.
    source_idx = full_mask.reshape(-1).cumsum(0) - 1
    return inductor_prims.masked_scatter_with_index(dest, full_mask, source_idx, source)
@register_decomposition(quantized_decomposed.choose_qparams.tensor)
def choose_qparams_tensor(
    input: torch.Tensor, quant_min: int, quant_max: int, eps: float, dtype: torch.dtype
):
    """Derive a quantization scale and zero point from the range of ``input``.

    The scale is ``(max - min) / (quant_max - quant_min)``, floored at ``eps``.
    The zero point is ``quant_min - round(min / scale)``, clamped to
    ``[quant_min, quant_max]``. ``dtype`` is accepted but not used.

    Returns:
        ``(scale, zero_point)`` as float64 and int64 tensors respectively.
    """
    min_val, max_val = torch.aminmax(input)
    value_range = max_val - min_val
    scale = value_range / float(quant_max - quant_min)
    # NOTE(review): torch.Tensor([eps]) is created without an explicit device —
    # confirm this is acceptable when `input` is not a CPU tensor.
    scale = torch.max(scale, torch.Tensor([eps]))
    rounded_min = torch.round(min_val / scale).to(torch.int)
    zero_point = torch.clamp(quant_min - rounded_min, quant_min, quant_max)
    return scale.to(torch.float64), zero_point.to(torch.int64)
@register_decomposition(aten.put)
def put(self, index, source, accumulate=False):
    """Decompose ``put`` into ``index_put`` on a flattened copy of ``self``.

    ``source`` is reshaped to ``index.shape`` before being written, and the
    flattened result is reshaped back to ``self.shape``.
    """
    flat_self = self.flatten()
    values = source.reshape(index.shape)
    flat_result = torch.index_put(flat_self, [index], values, accumulate)
    return flat_result.reshape(self.shape)
@register_decomposition(aten.put_)
def put_(self, index, source, accumulate=False):
    """In-place ``put``: compute the result with ``aten.put`` and copy it into ``self``.

    Returns whatever ``self.copy_`` returns.
    """
    updated = aten.put(self, index, source, accumulate=accumulate)
    return self.copy_(updated)