ai-content-maker/.venv/Lib/site-packages/torch/utils/flop_counter.py

import torch
import torch.nn as nn
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from typing import List, Any, Dict, Optional, Union, NamedTuple
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from torch.utils.hooks import RemovableHandle
from torch._decomp import register_decomposition
from math import prod
from functools import wraps


__all__ = ["FlopCounterMode", "register_flop_formula"]

aten = torch.ops.aten

def get_shape(i):
    if isinstance(i, torch.Tensor):
        return i.shape
    return i

flop_registry: Dict[Any, Any] = {}

def shape_wrapper(f):
    @wraps(f)
    def nf(*args, out=None, **kwargs):
        args, kwargs, out_shape = tree_map(get_shape, (args, kwargs, out))
        return f(*args, out_shape=out_shape, **kwargs)
    return nf

def register_flop_formula(targets, get_raw=False):
    def register_fun(flop_formula):
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)
        register_decomposition(targets, registry=flop_registry, unsafe=True)(flop_formula)
        return flop_formula

    return register_fun

@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    # Inputs should be a list of length 2.
    # Inputs contains the shapes of two matrices.
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    # NB(chilli): Should be 2 * k - 1 technically for FLOPs.
    return m * n * 2 * k

@register_flop_formula(aten.addmm)
def addmm_flop(self_shape, a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for addmm."""
    return mm_flop(a_shape, b_shape)

@register_flop_formula(aten.bmm)
def bmm_flop(a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for the bmm operation."""
    # Inputs should be a list of length 2.
    # Inputs contains the shapes of two tensor.
    b, m, k = a_shape
    b2, k2, n = b_shape
    assert b == b2
    assert k == k2
    # NB(chilli): Should be 2 * k - 1 technically for FLOPs.
    flop = b * m * n * 2 * k
    return flop

@register_flop_formula(aten.baddbmm)
def baddbmm_flop(self_shape, a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for the baddbmm operation."""
    # Inputs should be a list of length 3.
    # Inputs contains the shapes of three tensors.
    return bmm_flop(a_shape, b_shape)


def conv_flop_count(
    x_shape: List[int],
    w_shape: List[int],
    out_shape: List[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    """

    batch_size = x_shape[0]
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape

    """
    General idea here is that for a regular conv, for each point in the output
    spatial dimension we convolve the filter with something (hence
    `prod(conv_shape) * prod(filter_size)` ops). Then, this gets multiplied by
    1. batch_size, 2. the cross product of input and weight channels.

    For the transpose, it's not each point in the *output* spatial dimension but
    each point in the *input* spatial dimension.
    """
    # NB(chilli): I don't think this properly accounts for padding :think:
    # NB(chilli): Should be 2 * c_in - 1 technically for FLOPs.
    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop

@register_flop_formula([aten.convolution, aten._convolution])
def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed, *args, out_shape=None, **kwargs) -> int:
    """Count flops for convolution."""
    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)


@register_flop_formula(aten.convolution_backward)
def conv_backward_flop(
        grad_out_shape,
        x_shape,
        w_shape,
        _bias,
        _stride,
        _padding,
        _dilation,
        transposed,
        _output_padding,
        _groups,
        output_mask,
        out_shape) -> int:

    def t(shape):
        return [shape[1], shape[0]] + list(shape[2:])
    flop_count = 0

    """
    Let's say we have a regular 1D conv
    {A, B, C} [inp]
    {i, j} [weight]
    => (conv)
    {Ai + Bj, Bi + Cj} [out]

    And as a reminder, the transposed conv of the above is
    => {Ai, Aj + Bi, Bj + Ci, Cj} [transposed conv out]

    For the backwards of conv, we now have
    {D, E} [grad_out]
    {A, B, C} [inp]
    {i, j} [weight]

    # grad_inp as conv_transpose(grad_out, weight)
    Let's first compute grad_inp. To do so, we can simply look at all the
    multiplications that each element of inp is involved in. For example, A is
    only involved in the first element of the output (and thus only depends upon
    D in grad_out), and C is only involved in the last element of the output
    (and thus only depends upon E in grad_out)

    {Di, Dj + Ei, Ej} [grad_inp]

    Note that this corresponds to the below conv_transpose. This gives us the
    output_mask[0] branch, which is grad_inp.

    {D, E} [inp (grad_out)]
    {i, j} [weight]
    => (conv_transpose)
    {Di, Dj + Ei, Ej} [out (grad_inp)]

    I leave the fact that grad_inp for a transposed conv is just conv(grad_out,
    weight) as an exercise for the reader.

    # grad_weight as conv(inp, grad_out)
    To compute grad_weight, we again look at the terms in the output, which as
    a reminder is:
    => {Ai + Bj, Bi + Cj} [out]
    => {D, E} [grad_out]
    If we manually compute the gradient for the weights, we see it's
    {AD + BE, BD + CE} [grad_weight]

    This corresponds to the below conv
    {A, B, C} [inp]
    {D, E} [weight (grad_out)]
    => (conv)
    {AD + BE, BD + CE} [out (grad_weight)]

    # grad_weight of transposed conv as conv(grad_out, inp)
    As a reminder, the terms of the output of a transposed conv are:
    => {Ai, Aj + Bi, Bj + Ci, Cj} [transposed conv out]
    => {D, E, F, G} [grad_out]

    Manually computing the gradient for the weights, we see it's
    {AD + BE + CF, AE + BF + CG} [grad_weight]

    This corresponds to the below conv
    {D, E, F, G} [inp (grad_out)]
    {A, B, C} [weight (inp)]
    => (conv)
    {AD + BE + CF, AE + BF + CG} [out (grad_weight)]

    For the full backwards formula, there are also some details involving
    transpose of the batch/channel dimensions and groups, but I skip those for
    the sake of brevity (and they're pretty similar to matmul backwards)

    Check [conv backwards decomposition as conv forwards]
    """
    # grad_inp as conv_transpose(grad_out, weight)
    if output_mask[0]:
        grad_input_shape = get_shape(out_shape[0])
        flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not transposed)

    if output_mask[1]:
        grad_weight_shape = get_shape(out_shape[1])
        if transposed:
            # grad_weight of transposed conv as conv(grad_out, inp)
            flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)
        else:
            # grad_weight as conv(inp, grad_out)
            flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)

    return flop_count

def sdpa_flop_count(query_shape, key_shape, value_shape):
    """
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    """
    b, h, s_q, d_q = query_shape
    _b2, _h2, s_k, _d2 = key_shape
    _b3, _h3, _s3, d_v = value_shape
    assert b == _b2 == _b3 and h == _h2 == _h3 and d_q == _d2 and s_k == _s3 and d_q == _d2
    total_flops = 0
    # q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))
    # scores: [b, h, s_q, s_k] @ v: [b, h, s_k, d_v] -> out: [b, h, s_q, d_v]
    total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_v))
    return total_flops


@register_flop_formula([aten._scaled_dot_product_efficient_attention, aten._scaled_dot_product_flash_attention])
def sdpa_flop(query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here
    return sdpa_flop_count(query_shape, key_shape, value_shape)


def sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape):
    total_flops = 0
    b, h, s_q, d_q = query_shape
    _b2, _h2, s_k, _d2 = key_shape
    _b3, _h3, _s3, d_v = value_shape
    _b4, _h4, _s4, _d4 = grad_out_shape
    assert b == _b2 == _b3 == _b4 and h == _h2 == _h3 == _h4 and d_q == _d2
    assert d_v == _d4 and s_k == _s3 and s_q == _s4
    total_flops = 0
    # Step 1: We recompute the scores matrix.
    # q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))

    # Step 2: We propagate the gradients through the score @ v operation.
    # gradOut: [b, h, s_q, d_v] @ v: [b, h, d_v, s_k] -> gradScores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_v), (b * h, d_v, s_k))
    # scores: [b, h, s_k, s_q] @ gradOut: [b, h, s_q, d_v] -> gradV: [b, h, s_k, d_v]
    total_flops += bmm_flop((b * h, s_k, s_q), (b * h, s_q, d_v))

    # Step 3: We propagate th gradients through the k @ v operation
    # gradScores: [b, h, s_q, s_k] @ k: [b, h, s_k, d_q] -> gradQ: [b, h, s_q, d_q]
    total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_q))
    # q: [b, h, d_q, s_q] @ gradScores: [b, h, s_q, s_k] -> gradK: [b, h, d_q, s_k]
    total_flops += bmm_flop((b * h, d_q, s_q), (b * h, s_q, s_k))
    return total_flops


@register_flop_formula([aten._scaled_dot_product_efficient_attention_backward, aten._scaled_dot_product_flash_attention_backward])
def sdpa_backward_flop(grad_out_shape, query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention backward."""
    return sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)

flop_registry = {
    aten.mm: mm_flop,
    aten.addmm: addmm_flop,
    aten.bmm: bmm_flop,
    aten.baddbmm: baddbmm_flop,
    aten.convolution: conv_flop,
    aten._convolution: conv_flop,
    aten.convolution_backward: conv_backward_flop,
    aten._scaled_dot_product_efficient_attention: sdpa_flop,
    aten._scaled_dot_product_flash_attention: sdpa_flop,
    aten._scaled_dot_product_efficient_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_flash_attention_backward: sdpa_backward_flop,
}

def normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# Define the suffixes for different orders of magnitude
suffixes = ["", "K", "M", "B", "T"]
# Thanks BingChat!
def get_suffix_str(number):
    # Find the index of the appropriate suffix based on the number of digits
    # with some additional overflow.
    # i.e. 1.01B should be displayed as 1001M, not 1.001B
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]

def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000^index and format it to two decimal places
    value = f"{number / 1000 ** index:.3f}"
    # Return the value and the suffix as a string
    return value + suffixes[index]

def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"

def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf


class FlopCounterMode(TorchDispatchMode):
    """
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        flop_counter = FlopCounterMode(mod)
        with flop_counter:
            mod.sum().backward()

    """

    def __init__(
            self,
            mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None,
            depth: int = 2,
            display: bool = True,
            custom_mapping: Optional[Dict[Any, Any]] = None):
        self.flop_counts: Dict[str, Dict[Any, int]] = defaultdict(lambda: defaultdict(int))
        self.depth = depth
        self.parents = ["Global"]
        self.in_backward = False
        self.display = display
        if custom_mapping is None:
            custom_mapping = {}
        if isinstance(mods, torch.nn.Module):
            mods = [mods]
        self.mods = mods
        # Keys will include the modules in `mods` and their submodules
        self._module_to_forward_hook_handles: Dict[nn.Module, _ForwardHookHandles] = {}
        self.flop_registry = {
            **flop_registry,
            **{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}
        }

    def _register_forward_hooks(self):
        if self.mods is None:
            return
        for mod in self.mods:
            prefix = type(mod).__name__
            for name, module in dict(mod.named_modules()).items():
                if name == "":
                    name = prefix
                else:
                    name = ".".join([prefix, name])

                forward_pre_hook_handle = module.register_forward_pre_hook(self._enter_module(name))
                forward_hook_handle = module.register_forward_hook(self._exit_module(name))
                self._module_to_forward_hook_handles[module] = _ForwardHookHandles(
                    forward_pre_hook_handle, forward_hook_handle
                )

    def _deregister_forward_hooks(self):
        for forward_hook_handles in self._module_to_forward_hook_handles.values():
            forward_hook_handles[0].remove()
            forward_hook_handles[1].remove()
        self._module_to_forward_hook_handles.clear()

    def _enter_module(self, name):
        def f(module, inputs):
            out = _pytreeify_preserve_structure(self._create_pre_module(name))(inputs)
            return out

        return f

    def _exit_module(self, name):
        def f(module, inputs, outputs):
            outputs = _pytreeify_preserve_structure(self._create_post_module(name))(outputs)
            return outputs
        return f

    def _create_post_module(self, name):
        class PushState(torch.autograd.Function):
            @staticmethod
            def forward(ctx, *args):
                assert self.parents[-1] == name, f"{self.parents[-1]} is not {name}"
                self.parents.pop()
                args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)
                return args

            @staticmethod
            def backward(ctx, *grad_outs):
                self.in_backward = True
                self.parents.append(name)
                return grad_outs

        return PushState.apply

    def _create_pre_module(self, name):
        class PopState(torch.autograd.Function):
            @staticmethod
            def forward(ctx, *args):
                if self.in_backward:
                    self.parents = ["Global"]
                    self.in_backward = True
                self.parents.append(name)
                args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)
                return args

            @staticmethod
            def backward(ctx, *grad_outs):
                assert self.parents[-1] == name
                self.parents.pop()
                return grad_outs

        return PopState.apply

    def get_total_flops(self) -> int:
        return sum(self.flop_counts['Global'].values())

    def get_flop_counts(self) -> Dict[str, Dict[Any, int]]:
        """Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        """
        return {k: dict(v) for k, v in self.flop_counts.items()}

    def get_table(self, depth=None):
        if depth is None:
            depth = self.depth
        if depth is None:
            depth = 999999

        import tabulate
        tabulate.PRESERVE_WHITESPACE = True
        header = ["Module", "FLOP", "% Total"]
        values = []
        global_flops = self.get_total_flops()
        global_suffix = get_suffix_str(global_flops)
        is_global_subsumed = False

        def process_mod(mod_name, depth):
            nonlocal is_global_subsumed

            total_flops = sum(self.flop_counts[mod_name].values())

            is_global_subsumed |= total_flops >= global_flops

            padding = " " * depth
            values = []
            values.append([
                padding + mod_name,
                convert_num_with_suffix(total_flops, global_suffix),
                convert_to_percent_str(total_flops, global_flops)
            ])
            for k, v in self.flop_counts[mod_name].items():
                values.append([
                    padding + " - " + str(k),
                    convert_num_with_suffix(v, global_suffix),
                    convert_to_percent_str(v, global_flops)
                ])
            return values

        for mod in self.flop_counts.keys():
            if mod == 'Global':
                continue
            mod_depth = mod.count(".") + 1
            if mod_depth > depth:
                continue

            cur_values = process_mod(mod, mod_depth - 1)
            values.extend(cur_values)

        # We do a bit of messing around here to only output the "Global" value
        # if there are any FLOPs in there that aren't already fully contained by
        # a module.
        if 'Global' in self.flop_counts and not is_global_subsumed:
            for idx, value in enumerate(values):
                values[idx][0] = " " + values[idx][0]

            values = process_mod('Global', 0) + values

        if len(values) == 0:
            values = [["Global", "0", "0%"]]

        return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))

    def __enter__(self):
        self.flop_counts.clear()
        self._register_forward_hooks()
        super().__enter__()
        return self

    def __exit__(self, *args):
        if self.display:
            print(self.get_table(self.depth))
        self._deregister_forward_hooks()
        super().__exit__(*args)

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs if kwargs else {}
        out = func(*args, **kwargs)
        func_packet = func._overloadpacket
        if func_packet in self.flop_registry:
            flop_count_func = self.flop_registry[func_packet]
            flop_count = flop_count_func(*args, **kwargs, out=out)  # type: ignore[operator]
            if len(set(self.parents)) != len(self.parents):
                print(
                    "The module hierarchy tracking seems to be messed up."
                    "Please file a bug or just run the flop counter without"
                    "tracking the module hierarchy (i.e. `with FlopCounterMode():`)"
                )
            for par in set(self.parents):
                self.flop_counts[par][func_packet] += flop_count

        return out

class _ForwardHookHandles(NamedTuple):
    forward_pre_hook_handle: RemovableHandle
    forward_hook_handle: RemovableHandle
first commit 2024-05-03 04:18:51 +03:00			`import torch`
			`import torch.nn as nn`
			`from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten`
			`from typing import List, Any, Dict, Optional, Union, NamedTuple`
			`from collections import defaultdict`
			`from torch.utils._python_dispatch import TorchDispatchMode`
			`from torch.utils.hooks import RemovableHandle`
			`from torch._decomp import register_decomposition`
			`from math import prod`
			`from functools import wraps`



			`__all__ = ["FlopCounterMode", "register_flop_formula"]`

			`aten = torch.ops.aten`

			`def get_shape(i):`
			`if isinstance(i, torch.Tensor):`
			`return i.shape`
			`return i`

			`flop_registry: Dict[Any, Any] = {}`

			`def shape_wrapper(f):`
			`@wraps(f)`
			`def nf(args, out=None, *kwargs):`
			`args, kwargs, out_shape = tree_map(get_shape, (args, kwargs, out))`
			`return f(args, out_shape=out_shape, *kwargs)`
			`return nf`

			`def register_flop_formula(targets, get_raw=False):`
			`def register_fun(flop_formula):`
			`if not get_raw:`
			`flop_formula = shape_wrapper(flop_formula)`
			`register_decomposition(targets, registry=flop_registry, unsafe=True)(flop_formula)`
			`return flop_formula`

			`return register_fun`

			`@register_flop_formula(aten.mm)`
			`def mm_flop(a_shape, b_shape, args, out_shape=None, *kwargs) -> int:`
			`"""Count flops for matmul."""`
			`# Inputs should be a list of length 2.`
			`# Inputs contains the shapes of two matrices.`
			`m, k = a_shape`
			`k2, n = b_shape`
			`assert k == k2`
			`# NB(chilli): Should be 2 * k - 1 technically for FLOPs.`
			`return m * n * 2 * k`

			`@register_flop_formula(aten.addmm)`
			`def addmm_flop(self_shape, a_shape, b_shape, out_shape=None, **kwargs) -> int:`
			`"""Count flops for addmm."""`
			`return mm_flop(a_shape, b_shape)`

			`@register_flop_formula(aten.bmm)`
			`def bmm_flop(a_shape, b_shape, out_shape=None, **kwargs) -> int:`
			`"""Count flops for the bmm operation."""`
			`# Inputs should be a list of length 2.`
			`# Inputs contains the shapes of two tensor.`
			`b, m, k = a_shape`
			`b2, k2, n = b_shape`
			`assert b == b2`
			`assert k == k2`
			`# NB(chilli): Should be 2 * k - 1 technically for FLOPs.`
			`flop = b * m * n * 2 * k`
			`return flop`

			`@register_flop_formula(aten.baddbmm)`
			`def baddbmm_flop(self_shape, a_shape, b_shape, out_shape=None, **kwargs) -> int:`
			`"""Count flops for the baddbmm operation."""`
			`# Inputs should be a list of length 3.`
			`# Inputs contains the shapes of three tensors.`
			`return bmm_flop(a_shape, b_shape)`


			`def conv_flop_count(`
			`x_shape: List[int],`
			`w_shape: List[int],`
			`out_shape: List[int],`
			`transposed: bool = False,`
			`) -> int:`
			`"""Count flops for convolution.`

			`Note only multiplication is`
			`counted. Computation for bias are ignored.`
			`Flops for a transposed convolution are calculated as`
			`flops = (x_shape[2:] * prod(w_shape) * batch_size).`
			`Args:`
			`x_shape (list(int)): The input shape before convolution.`
			`w_shape (list(int)): The filter shape.`
			`out_shape (list(int)): The output shape after convolution.`
			`transposed (bool): is the convolution transposed`
			`Returns:`
			`int: the number of flops`
			`"""`

			`batch_size = x_shape[0]`
			`conv_shape = (x_shape if transposed else out_shape)[2:]`
			`c_out, c_in, *filter_size = w_shape`

			`"""`
			`General idea here is that for a regular conv, for each point in the output`
			`spatial dimension we convolve the filter with something (hence`
			`prod(conv_shape) * prod(filter_size)` ops). Then, this gets multiplied by
			`1. batch_size, 2. the cross product of input and weight channels.`

			`For the transpose, it's not each point in the output spatial dimension but`
			`each point in the input spatial dimension.`
			`"""`
			`# NB(chilli): I don't think this properly accounts for padding :think:`
			`# NB(chilli): Should be 2 * c_in - 1 technically for FLOPs.`
			`flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2`
			`return flop`

			`@register_flop_formula([aten.convolution, aten._convolution])`
			`def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed, args, out_shape=None, *kwargs) -> int:`
			`"""Count flops for convolution."""`
			`return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)`


			`@register_flop_formula(aten.convolution_backward)`
			`def conv_backward_flop(`
			`grad_out_shape,`
			`x_shape,`
			`w_shape,`
			`_bias,`
			`_stride,`
			`_padding,`
			`_dilation,`
			`transposed,`
			`_output_padding,`
			`_groups,`
			`output_mask,`
			`out_shape) -> int:`

			`def t(shape):`
			`return [shape[1], shape[0]] + list(shape[2:])`
			`flop_count = 0`

			`"""`
			`Let's say we have a regular 1D conv`
			`{A, B, C} [inp]`
			`{i, j} [weight]`
			`=> (conv)`
			`{Ai + Bj, Bi + Cj} [out]`

			`And as a reminder, the transposed conv of the above is`
			`=> {Ai, Aj + Bi, Bj + Ci, Cj} [transposed conv out]`

			`For the backwards of conv, we now have`
			`{D, E} [grad_out]`
			`{A, B, C} [inp]`
			`{i, j} [weight]`

			`# grad_inp as conv_transpose(grad_out, weight)`
			`Let's first compute grad_inp. To do so, we can simply look at all the`
			`multiplications that each element of inp is involved in. For example, A is`
			`only involved in the first element of the output (and thus only depends upon`
			`D in grad_out), and C is only involved in the last element of the output`
			`(and thus only depends upon E in grad_out)`

			`{Di, Dj + Ei, Ej} [grad_inp]`

			`Note that this corresponds to the below conv_transpose. This gives us the`
			`output_mask[0] branch, which is grad_inp.`

			`{D, E} [inp (grad_out)]`
			`{i, j} [weight]`
			`=> (conv_transpose)`
			`{Di, Dj + Ei, Ej} [out (grad_inp)]`

			`I leave the fact that grad_inp for a transposed conv is just conv(grad_out,`
			`weight) as an exercise for the reader.`

			`# grad_weight as conv(inp, grad_out)`
			`To compute grad_weight, we again look at the terms in the output, which as`
			`a reminder is:`
			`=> {Ai + Bj, Bi + Cj} [out]`
			`=> {D, E} [grad_out]`
			`If we manually compute the gradient for the weights, we see it's`
			`{AD + BE, BD + CE} [grad_weight]`

			`This corresponds to the below conv`
			`{A, B, C} [inp]`
			`{D, E} [weight (grad_out)]`
			`=> (conv)`
			`{AD + BE, BD + CE} [out (grad_weight)]`

			`# grad_weight of transposed conv as conv(grad_out, inp)`
			`As a reminder, the terms of the output of a transposed conv are:`
			`=> {Ai, Aj + Bi, Bj + Ci, Cj} [transposed conv out]`
			`=> {D, E, F, G} [grad_out]`

			`Manually computing the gradient for the weights, we see it's`
			`{AD + BE + CF, AE + BF + CG} [grad_weight]`

			`This corresponds to the below conv`
			`{D, E, F, G} [inp (grad_out)]`
			`{A, B, C} [weight (inp)]`
			`=> (conv)`
			`{AD + BE + CF, AE + BF + CG} [out (grad_weight)]`

			`For the full backwards formula, there are also some details involving`
			`transpose of the batch/channel dimensions and groups, but I skip those for`
			`the sake of brevity (and they're pretty similar to matmul backwards)`

			`Check [conv backwards decomposition as conv forwards]`
			`"""`
			`# grad_inp as conv_transpose(grad_out, weight)`
			`if output_mask[0]:`
			`grad_input_shape = get_shape(out_shape[0])`
			`flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not transposed)`

			`if output_mask[1]:`
			`grad_weight_shape = get_shape(out_shape[1])`
			`if transposed:`
			`# grad_weight of transposed conv as conv(grad_out, inp)`
			`flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)`
			`else:`
			`# grad_weight as conv(inp, grad_out)`
			`flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)`

			`return flop_count`

			`def sdpa_flop_count(query_shape, key_shape, value_shape):`
			`"""`
			`Count flops for self-attention.`

			`NB: We can assume that value_shape == key_shape`
			`"""`
			`b, h, s_q, d_q = query_shape`
			`_b2, _h2, s_k, _d2 = key_shape`
			`_b3, _h3, _s3, d_v = value_shape`
			`assert b == _b2 == _b3 and h == _h2 == _h3 and d_q == _d2 and s_k == _s3 and d_q == _d2`
			`total_flops = 0`
			`# q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]`
			`total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))`
			`# scores: [b, h, s_q, s_k] @ v: [b, h, s_k, d_v] -> out: [b, h, s_q, d_v]`
			`total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_v))`
			`return total_flops`


			`@register_flop_formula([aten._scaled_dot_product_efficient_attention, aten._scaled_dot_product_flash_attention])`
			`def sdpa_flop(query_shape, key_shape, value_shape, args, out_shape=None, *kwargs) -> int:`
			`"""Count flops for self-attention."""`
			`# NB: We aren't accounting for causal attention here`
			`return sdpa_flop_count(query_shape, key_shape, value_shape)`


			`def sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape):`
			`total_flops = 0`
			`b, h, s_q, d_q = query_shape`
			`_b2, _h2, s_k, _d2 = key_shape`
			`_b3, _h3, _s3, d_v = value_shape`
			`_b4, _h4, _s4, _d4 = grad_out_shape`
			`assert b == _b2 == _b3 == _b4 and h == _h2 == _h3 == _h4 and d_q == _d2`
			`assert d_v == _d4 and s_k == _s3 and s_q == _s4`
			`total_flops = 0`
			`# Step 1: We recompute the scores matrix.`
			`# q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]`
			`total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))`

			`# Step 2: We propagate the gradients through the score @ v operation.`
			`# gradOut: [b, h, s_q, d_v] @ v: [b, h, d_v, s_k] -> gradScores: [b, h, s_q, s_k]`
			`total_flops += bmm_flop((b * h, s_q, d_v), (b * h, d_v, s_k))`
			`# scores: [b, h, s_k, s_q] @ gradOut: [b, h, s_q, d_v] -> gradV: [b, h, s_k, d_v]`
			`total_flops += bmm_flop((b * h, s_k, s_q), (b * h, s_q, d_v))`

			`# Step 3: We propagate th gradients through the k @ v operation`
			`# gradScores: [b, h, s_q, s_k] @ k: [b, h, s_k, d_q] -> gradQ: [b, h, s_q, d_q]`
			`total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_q))`
			`# q: [b, h, d_q, s_q] @ gradScores: [b, h, s_q, s_k] -> gradK: [b, h, d_q, s_k]`
			`total_flops += bmm_flop((b * h, d_q, s_q), (b * h, s_q, s_k))`
			`return total_flops`


			`@register_flop_formula([aten._scaled_dot_product_efficient_attention_backward, aten._scaled_dot_product_flash_attention_backward])`
			`def sdpa_backward_flop(grad_out_shape, query_shape, key_shape, value_shape, args, out_shape=None, *kwargs) -> int:`
			`"""Count flops for self-attention backward."""`
			`return sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)`

			`flop_registry = {`
			`aten.mm: mm_flop,`
			`aten.addmm: addmm_flop,`
			`aten.bmm: bmm_flop,`
			`aten.baddbmm: baddbmm_flop,`
			`aten.convolution: conv_flop,`
			`aten._convolution: conv_flop,`
			`aten.convolution_backward: conv_backward_flop,`
			`aten._scaled_dot_product_efficient_attention: sdpa_flop,`
			`aten._scaled_dot_product_flash_attention: sdpa_flop,`
			`aten._scaled_dot_product_efficient_attention_backward: sdpa_backward_flop,`
			`aten._scaled_dot_product_flash_attention_backward: sdpa_backward_flop,`
			`}`

			`def normalize_tuple(x):`
			`if not isinstance(x, tuple):`
			`return (x,)`
			`return x`


			`# Define the suffixes for different orders of magnitude`
			`suffixes = ["", "K", "M", "B", "T"]`
			`# Thanks BingChat!`
			`def get_suffix_str(number):`
			`# Find the index of the appropriate suffix based on the number of digits`
			`# with some additional overflow.`
			`# i.e. 1.01B should be displayed as 1001M, not 1.001B`
			`index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))`
			`return suffixes[index]`

			`def convert_num_with_suffix(number, suffix):`
			`index = suffixes.index(suffix)`
			`# Divide the number by 1000^index and format it to two decimal places`
			`value = f"{number / 1000 ** index:.3f}"`
			`# Return the value and the suffix as a string`
			`return value + suffixes[index]`

			`def convert_to_percent_str(num, denom):`
			`if denom == 0:`
			`return "0%"`
			`return f"{num / denom:.2%}"`

			`def _pytreeify_preserve_structure(f):`
			`@wraps(f)`
			`def nf(args):`
			`flat_args, spec = tree_flatten(args)`
			`out = f(*flat_args)`
			`return tree_unflatten(out, spec)`

			`return nf`


			`class FlopCounterMode(TorchDispatchMode):`
			`"""`
			``FlopCounterMode`` is a context manager that counts the number of flops within its context.

			It does this using a ``TorchDispatchMode``.

			`It also supports hierarchical output by passing a module (or list of`
			`modules) to FlopCounterMode on construction. If you do not need hierarchical`
			`output, you do not need to use it with a module.`

			`Example usage`

			`.. code-block:: python`

			`mod = ...`
			`flop_counter = FlopCounterMode(mod)`
			`with flop_counter:`
			`mod.sum().backward()`

			`"""`

			`def __init__(`
			`self,`
			`mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None,`
			`depth: int = 2,`
			`display: bool = True,`
			`custom_mapping: Optional[Dict[Any, Any]] = None):`
			`self.flop_counts: Dict[str, Dict[Any, int]] = defaultdict(lambda: defaultdict(int))`
			`self.depth = depth`
			`self.parents = ["Global"]`
			`self.in_backward = False`
			`self.display = display`
			`if custom_mapping is None:`
			`custom_mapping = {}`
			`if isinstance(mods, torch.nn.Module):`
			`mods = [mods]`
			`self.mods = mods`
			# Keys will include the modules in `mods` and their submodules
			`self._module_to_forward_hook_handles: Dict[nn.Module, _ForwardHookHandles] = {}`
			`self.flop_registry = {`
			`**flop_registry,`
			`**{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}`
			`}`

			`def _register_forward_hooks(self):`
			`if self.mods is None:`
			`return`
			`for mod in self.mods:`
			`prefix = type(mod).__name__`
			`for name, module in dict(mod.named_modules()).items():`
			`if name == "":`
			`name = prefix`
			`else:`
			`name = ".".join([prefix, name])`

			`forward_pre_hook_handle = module.register_forward_pre_hook(self._enter_module(name))`
			`forward_hook_handle = module.register_forward_hook(self._exit_module(name))`
			`self._module_to_forward_hook_handles[module] = _ForwardHookHandles(`
			`forward_pre_hook_handle, forward_hook_handle`
			`)`

			`def _deregister_forward_hooks(self):`
			`for forward_hook_handles in self._module_to_forward_hook_handles.values():`
			`forward_hook_handles[0].remove()`
			`forward_hook_handles[1].remove()`
			`self._module_to_forward_hook_handles.clear()`

			`def _enter_module(self, name):`
			`def f(module, inputs):`
			`out = _pytreeify_preserve_structure(self._create_pre_module(name))(inputs)`
			`return out`

			`return f`

			`def _exit_module(self, name):`
			`def f(module, inputs, outputs):`
			`outputs = _pytreeify_preserve_structure(self._create_post_module(name))(outputs)`
			`return outputs`
			`return f`

			`def _create_post_module(self, name):`
			`class PushState(torch.autograd.Function):`
			`@staticmethod`
			`def forward(ctx, *args):`
			`assert self.parents[-1] == name, f"{self.parents[-1]} is not {name}"`
			`self.parents.pop()`
			`args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)`
			`return args`

			`@staticmethod`
			`def backward(ctx, *grad_outs):`
			`self.in_backward = True`
			`self.parents.append(name)`
			`return grad_outs`

			`return PushState.apply`

			`def _create_pre_module(self, name):`
			`class PopState(torch.autograd.Function):`
			`@staticmethod`
			`def forward(ctx, *args):`
			`if self.in_backward:`
			`self.parents = ["Global"]`
			`self.in_backward = True`
			`self.parents.append(name)`
			`args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)`
			`return args`

			`@staticmethod`
			`def backward(ctx, *grad_outs):`
			`assert self.parents[-1] == name`
			`self.parents.pop()`
			`return grad_outs`

			`return PopState.apply`

			`def get_total_flops(self) -> int:`
			`return sum(self.flop_counts['Global'].values())`

			`def get_flop_counts(self) -> Dict[str, Dict[Any, int]]:`
			`"""Return the flop counts as a dictionary of dictionaries.`

			`The outer`
			`dictionary is keyed by module name, and the inner dictionary is keyed by`
			`operation name.`

			`Returns:`
			`Dict[str, Dict[Any, int]]: The flop counts as a dictionary.`
			`"""`
			`return {k: dict(v) for k, v in self.flop_counts.items()}`

			`def get_table(self, depth=None):`
			`if depth is None:`
			`depth = self.depth`
			`if depth is None:`
			`depth = 999999`

			`import tabulate`
			`tabulate.PRESERVE_WHITESPACE = True`
			`header = ["Module", "FLOP", "% Total"]`
			`values = []`
			`global_flops = self.get_total_flops()`
			`global_suffix = get_suffix_str(global_flops)`
			`is_global_subsumed = False`

			`def process_mod(mod_name, depth):`
			`nonlocal is_global_subsumed`

			`total_flops = sum(self.flop_counts[mod_name].values())`

			`is_global_subsumed \|= total_flops >= global_flops`

			`padding = " " * depth`
			`values = []`
			`values.append([`
			`padding + mod_name,`
			`convert_num_with_suffix(total_flops, global_suffix),`
			`convert_to_percent_str(total_flops, global_flops)`
			`])`
			`for k, v in self.flop_counts[mod_name].items():`
			`values.append([`
			`padding + " - " + str(k),`
			`convert_num_with_suffix(v, global_suffix),`
			`convert_to_percent_str(v, global_flops)`
			`])`
			`return values`

			`for mod in self.flop_counts.keys():`
			`if mod == 'Global':`
			`continue`
			`mod_depth = mod.count(".") + 1`
			`if mod_depth > depth:`
			`continue`

			`cur_values = process_mod(mod, mod_depth - 1)`
			`values.extend(cur_values)`

			`# We do a bit of messing around here to only output the "Global" value`
			`# if there are any FLOPs in there that aren't already fully contained by`
			`# a module.`
			`if 'Global' in self.flop_counts and not is_global_subsumed:`
			`for idx, value in enumerate(values):`
			`values[idx][0] = " " + values[idx][0]`

			`values = process_mod('Global', 0) + values`

			`if len(values) == 0:`
			`values = [["Global", "0", "0%"]]`

			`return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))`

			`def __enter__(self):`
			`self.flop_counts.clear()`
			`self._register_forward_hooks()`
			`super().__enter__()`
			`return self`

			`def __exit__(self, *args):`
			`if self.display:`
			`print(self.get_table(self.depth))`
			`self._deregister_forward_hooks()`
			`super().__exit__(*args)`

			`def __torch_dispatch__(self, func, types, args=(), kwargs=None):`
			`kwargs = kwargs if kwargs else {}`
			`out = func(args, *kwargs)`
			`func_packet = func._overloadpacket`
			`if func_packet in self.flop_registry:`
			`flop_count_func = self.flop_registry[func_packet]`
			`flop_count = flop_count_func(args, *kwargs, out=out) # type: ignore[operator]`
			`if len(set(self.parents)) != len(self.parents):`
			`print(`
			`"The module hierarchy tracking seems to be messed up."`
			`"Please file a bug or just run the flop counter without"`
			"tracking the module hierarchy (i.e. `with FlopCounterMode():`)"
			`)`
			`for par in set(self.parents):`
			`self.flop_counts[par][func_packet] += flop_count`

			`return out`

			`class _ForwardHookHandles(NamedTuple):`
			`forward_pre_hook_handle: RemovableHandle`
			`forward_hook_handle: RemovableHandle`