import json
import math
import os
import re
from typing import Dict, List, Optional, Set

import torch
import torch.utils.benchmark as benchmark
from torch._C._profiler import (
    _EventType,
    _ExtraFields_PyCall,
    _ExtraFields_PyCCall,
    _ExtraFields_TorchOp,
    _ProfilerEvent,
)
from torch.profiler import profile
from torch.profiler._utils import index_of_first_match, traverse_bfs, traverse_dfs


class Pattern:
    """
    Base class for all patterns; subclass this class and implement match()
    to define custom patterns.

    In the subclass, define the description and, if needed, the skip property.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        self.prof = prof
        self.should_benchmark = should_benchmark
        self.name = "Please specify a name for pattern"
        self.description = "Please specify a description for pattern"
        self.url = ""
        assert prof.profiler is not None and prof.profiler.kineto_results is not None
        self.event_tree = prof.profiler.kineto_results.experimental_event_tree()
        self.tid_root: Dict[int, List[_ProfilerEvent]] = {}
        for event in self.event_tree:
            self.tid_root.setdefault(event.start_tid, []).append(event)

    @property
    def skip(self):
        return False

    def report(self, event: _ProfilerEvent):
        msg = (
            f"{self.description}\n[Source Code Location] {source_code_location(event)}"
        )
        return msg

    def eventTreeTraversal(self):
        """
        Traverse the event tree and yield all events.
        Override this method in subclass to customize the traversal.
        """
        yield from traverse_dfs(self.event_tree)

    def summary(self, events: List[_ProfilerEvent]):
        default_summary = f"{self.name}: {len(events)} events matched."
        if self.should_benchmark:
            # Use the benchmark summary if the subclass implements benchmark().
            return (
                self.benchmark_summary(events)
                if hasattr(self, "benchmark")  # type: ignore[attr-defined]
                else default_summary
            )
        return default_summary

    def benchmark_summary(self, events: List[_ProfilerEvent]):
        def format_time(time_ns: int):
            unit_lst = ["ns", "us", "ms"]
            for unit in unit_lst:
                if time_ns < 1000:
                    return f"{time_ns:.2f} {unit}"
                # True division keeps the fractional part for the .2f format.
                time_ns /= 1000
            return f"{time_ns:.2f} s"

        assert hasattr(self, "benchmark"), "Please implement benchmark()"
        shapes_factor_map = self.benchmark(events)  # type: ignore[attr-defined]
        original_time = sum(event.duration_time_ns for event in events)
        new_time = sum(
            shapes_factor_map[input_shapes(event)] * event.duration_time_ns
            for event in events
        )
        return (
            f"{self.name}: {len(events)} events matched. "
            f"Total Estimated Speedup: {format_time(original_time - new_time)} ({round(original_time/new_time, 2)}X)"
        )

    def match(self, event: _ProfilerEvent):
        """
        Return True if the event matches the pattern.
        This method should be overridden in the subclass.
        """
        raise NotImplementedError

    def matched_events(self):
        if self.skip:
            return []
        matched_events = []
        for event in self.eventTreeTraversal():
            if self.match(event):
                matched_events.append(event)
        return matched_events

    def root_of(self, event: _ProfilerEvent):
        while event.parent:
            event = event.parent
        return event

    def siblings_of(self, event: _ProfilerEvent):
        if event.parent:
            children = event.parent.children
        else:
            children = self.tid_root[event.start_tid]
        index = children.index(event)
        return children[:index], children[index + 1 :]

    def next_of(self, event: _ProfilerEvent):
        _, next_events = self.siblings_of(event)
        return next_events[0] if next_events else None

    def prev_of(self, event: _ProfilerEvent):
        prev_events, _ = self.siblings_of(event)
        return prev_events[-1] if prev_events else None

    def go_up_until(self, event: _ProfilerEvent, predicate):
        if not event:
            return None
        while event.parent and not predicate(event):
            event = event.parent
        return event
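

# Example (illustrative, not part of the module): a minimal custom pattern built
# on the Pattern base class. The class name, `name`, and `description` below are
# hypothetical values chosen for this sketch.
#
#     class AtenAddPattern(Pattern):
#         def __init__(self, prof: profile, should_benchmark: bool = False):
#             super().__init__(prof, should_benchmark)
#             self.name = "Aten Add Pattern"
#             self.description = "Matched aten::add calls."
#
#         def match(self, event: _ProfilerEvent):
#             return event.name == "aten::add"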


# Patterns


class NamePattern(Pattern):
    def __init__(self, prof: profile, name: str, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.description = f"Matched Name Event: {name}"
        self.name = name

    def match(self, event: _ProfilerEvent):
        return re.search(self.name, event.name) is not None


class ExtraCUDACopyPattern(Pattern):
    """
    This pattern identifies if we create a constant tensor on CPU and immediately move it to GPU.
    example: torch.zeros((100, 100)).to("cuda")

    Pattern:
    built-in method                 |built-in method
        ...                         |    aten::to
            aten::fill_/aten::zero_ |        aten::_to_copy

    Algorithm:
    We start at the aten::to node, go to its parent's previous sibling,
    and check whether we reach an aten::fill_/aten::zero_ as we keep going down the tree.
    We always select the last child in the children list when we go down the tree.
    If any step fails, it is not a match.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Extra CUDA Copy Pattern"
        self.description = "Filled a CPU tensor and immediately moved it to GPU. Please initialize it on GPU."
        self.url = "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#create-tensors-directly-on-the-target-device"
        self.init_ops = {
            "aten::fill_",
            "aten::zero_",
            "aten::normal_",
            "aten::uniform_",
        }

    @property
    def skip(self):
        return not self.prof.with_stack or not self.prof.record_shapes

    def match(self, event):
        # TODO: We should also check tensor identities
        if event.name != "aten::to":
            return False
        to_event = event
        if not event.children:
            return False
        event = event.children[-1]
        if event.name != "aten::_to_copy":
            return False
        if not event.children:
            return False
        event = event.children[-1]
        if event.name != "aten::copy_":
            return False
        # aten::copy_ should have the same dtype for its first two arguments
        dtypes = input_dtypes(event)
        if len(dtypes) < 2:
            return False
        if dtypes[0] is None or dtypes[0] != dtypes[1]:
            return False
        event = to_event
        # Up one level
        event = event.parent
        if event is None:
            return False
        # Check if we have an aten::fill_ in the previous leaf
        event = self.prev_of(event)
        if event is None:
            return False
        while event.children:
            event = event.children[-1]
            # aten::zero_ is a special optimization case where fill_ is not called
            if event.name in self.init_ops:
                return True
        return event.name in self.init_ops
        # TODO: Check if tensor is reused

    def benchmark(self, events: List[_ProfilerEvent]):
        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
        for shape in shapes_factor_map:
            size = shape[0]
            to_timer = benchmark.Timer(
                stmt='torch.ones(size).to("cuda")', globals={"size": size}
            )
            de_timer = benchmark.Timer(
                stmt='torch.ones(size, device="cuda")', globals={"size": size}
            )
            to_time = to_timer.timeit(10).mean
            de_time = de_timer.timeit(10).mean
            shapes_factor_map[shape] = de_time / to_time
        return shapes_factor_map
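

# Example (illustrative): the anti-pattern this class flags, and the fix.
#
#     # Flagged: fills a CPU tensor, then copies it to GPU.
#     t = torch.zeros((100, 100)).to("cuda")
#
#     # Preferred: create the tensor directly on the target device.
#     t = torch.zeros((100, 100), device="cuda")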


class ForLoopIndexingPattern(Pattern):
    """
    This pattern identifies if we use a for loop to index a tensor that
    can be vectorized.
    example:
    tensor = torch.empty((100, 100))
    for i in range(100):
        tensor[i] = i

    Pattern:
    aten::select | ... | aten::select | ... (Repeat)

    Algorithm:
    We start at node aten::select, and check whether we can find this alternating pattern.
    We also keep a set of visited ids to avoid duplicate matches in the for loop.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "For Loop Indexing Pattern"
        self.description = "For loop indexing detected. Vectorization recommended."
        self.visited: Set[int] = set()

    def eventTreeTraversal(self):
        """
        We need to use BFS traversal order to avoid duplicate matches.
        """
        yield from traverse_bfs(self.event_tree)

    def match(self, event: _ProfilerEvent):
        if event.name != "aten::select":
            return False
        if event.id in self.visited:
            return False
        repeat_count = 1
        # Renamed from `next` to avoid shadowing the built-in.
        _, next_events = self.siblings_of(event)
        if len(next_events) <= 1:
            return False

        # Custom event list matching
        def same_ops(list1, list2):
            if len(list1) != len(list2):
                return False
            for op1, op2 in zip(list1, list2):
                if op1.name != op2.name:
                    return False
            return True

        # Record the ops between two aten::select
        next_select_idx = index_of_first_match(
            next_events, lambda e: e.name == "aten::select"
        )
        if next_select_idx is None:
            return False
        indexing_ops = [event] + next_events[:next_select_idx]
        next_events = next_events[len(indexing_ops) - 1 :]
        for i in range(0, len(next_events), len(indexing_ops)):
            if same_ops(indexing_ops, next_events[i : i + len(indexing_ops)]):
                repeat_count += 1
                self.visited.add(next_events[i].id)
            else:
                break
        return repeat_count >= 10
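

# Example (illustrative): vectorizing the flagged loop from the docstring.
#
#     tensor = torch.empty((100, 100))
#     # Instead of: for i in range(100): tensor[i] = i
#     tensor[:] = torch.arange(100).unsqueeze(-1)  # broadcasts across columns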


class FP32MatMulPattern(Pattern):
    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "FP32 MatMul Pattern"
        self.description = (
            "You are currently using a GPU that supports TF32. "
            "Please enable TF32 by setting 'torch.backends.cuda.matmul.allow_tf32 = True'"
        )
        self.url = "https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"

    @property
    def skip(self):
        if torch.version.hip is not None:
            has_tf32 = False
        else:
            # Anything less than sm_80 is pre-Ampere and does not support TF32
            has_tf32 = all(int(arch[3:]) >= 80 for arch in torch.cuda.get_arch_list())
        return has_tf32 is False or super().skip or not self.prof.record_shapes

    def match(self, event: _ProfilerEvent):
        if event.tag != _EventType.TorchOp:
            return False
        assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
        if event.name == "aten::mm":
            if event.extra_fields.allow_tf32_cublas is False:
                return True
        return False

    def report(self, event: _ProfilerEvent):
        return self.description

    def benchmark(self, events: List[_ProfilerEvent]):
        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
        for shape in shapes_factor_map:
            matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float32)
            matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float32)
            fp32_timer = benchmark.Timer(
                stmt="torch.mm(matrixA, matrixB)",
                globals={"matrixA": matrixA, "matrixB": matrixB},
            )
            tf32_timer = benchmark.Timer(
                stmt="torch.mm(matrixA, matrixB)",
                setup="torch.backends.cuda.matmul.allow_tf32 = True",
                globals={"matrixA": matrixA, "matrixB": matrixB},
            )
            torch.backends.cuda.matmul.allow_tf32 = False
            fp32_time = fp32_timer.timeit(10).mean
            tf32_time = tf32_timer.timeit(10).mean
            shapes_factor_map[shape] = tf32_time / fp32_time
        return shapes_factor_map
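

# Example (illustrative): acting on this report by enabling TF32 for matmuls.
#
#     torch.backends.cuda.matmul.allow_tf32 = True
#     # Optionally also for cuDNN convolutions:
#     torch.backends.cudnn.allow_tf32 = True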


class OptimizerSingleTensorPattern(Pattern):
    """
    This pattern identifies if we are using the single-tensor version of an optimizer.
    example:
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    By adding foreach=True to enable the multi-tensor implementation, we can gain a
    speedup when the kernels are relatively small.

    Pattern:
    XXXXX: _single_tensor_<OPTIMIZER_NAME>

    Algorithm:
    String match
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Optimizer Single Tensor Pattern"
        self.optimizers_with_foreach = ["adam", "sgd", "adamw"]
        self.description = (
            "Detected optimizer running with single tensor implementation. "
            "Please enable multi tensor implementation by passing 'foreach=True' into optimizer."
        )
        self.url = ""

    def match(self, event: _ProfilerEvent):
        for optimizer in self.optimizers_with_foreach:
            if event.name.endswith(f"_single_tensor_{optimizer}"):
                return True
        return False
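

# Example (illustrative): opting into the multi-tensor implementation.
#
#     optimizer = torch.optim.SGD(model.parameters(), lr=0.1, foreach=True)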


class SynchronizedDataLoaderPattern(Pattern):
    """
    This pattern identifies if we are using num_workers=0 in DataLoader.
    example:
    torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    Add num_workers=N to the arguments. N depends on system configuration.

    Pattern:
    dataloader.py(...): __iter__
        dataloader.py(...): _get_iterator
            NOT dataloader.py(...): check_worker_number_rationality

    Algorithm:
    If we don't see the check_worker_number_rationality call in the dataloader's
    __iter__, it is not an asynchronous dataloader.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Synchronized DataLoader Pattern"
        self.description = (
            "Detected DataLoader running with synchronized implementation. "
            "Please enable asynchronous dataloading by setting num_workers > 0 when initializing DataLoader."
        )
        self.url = (
            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
            "#enable-async-data-loading-and-augmentation"
        )

    def match(self, event: _ProfilerEvent):
        def is_dataloader_function(name: str, function_name: str):
            return name.startswith(
                os.path.join("torch", "utils", "data", "dataloader.py")
            ) and name.endswith(function_name)

        # TODO: fixme! Due to lifetime issues of the function name, this field might
        # actually point to an already freed string when the event is a PyCall.
        # Just silently skip this to unblock testing.
        try:
            event.name
        except UnicodeDecodeError:
            return False

        if not is_dataloader_function(event.name, "__iter__"):
            return False
        if not event.children:
            return False
        event = event.children[0]
        if not is_dataloader_function(event.name, "_get_iterator"):
            return False
        if not event.children:
            return False
        event = event.children[0]
        return not is_dataloader_function(event.name, "check_worker_number_rationality")
        # TODO: We should also check if the loader is bottleneck.
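

# Example (illustrative): enabling asynchronous data loading. The worker count
# here is arbitrary; tune it to the host's CPU count and workload.
#
#     loader = torch.utils.data.DataLoader(dataset, batch_size=32, num_workers=4)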


class GradNotSetToNonePattern(Pattern):
    """
    This pattern identifies if we are not setting grad to None in zero_grad.
    example:
    optimizer.zero_grad()
    By setting set_to_none=True, we can gain a speedup.

    Pattern:
    XXXXX: _zero_grad
        NOT aten::zeros
            aten::zero_

    aten::zero_ is called on each parameter in the model.
    We also want to make sure it is not called by aten::zeros.

    Algorithm:
    String match
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Gradient Set To Zero Instead of None Pattern"
        self.description = (
            "Detected gradient set to zero instead of None. "
            "Please add 'set_to_none=True' when calling zero_grad()."
        )
        self.url = (
            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
            "#disable-gradient-calculation-for-validation-or-inference"
        )

    def match(self, event: _ProfilerEvent):
        if not event.name.endswith(": zero_grad"):
            return False
        if not event.children:
            return False

        for sub_event in traverse_dfs(event.children):
            if (
                sub_event.name == "aten::zero_"
                and sub_event.parent.name != "aten::zeros"
            ):
                return True
        # TODO: We should also check if the optimizer's numerical behavior will change.
        return False
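

# Example (illustrative): clearing gradients by setting them to None instead of
# filling them with zeros.
#
#     optimizer.zero_grad(set_to_none=True)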


class Conv2dBiasFollowedByBatchNorm2dPattern(Pattern):
    """
    This pattern identifies if we are enabling bias in Conv2d which is followed by BatchNorm2d.
    Bias doesn't do anything when followed by batchnorm.
    Pattern:
    nn.Module: Conv2d | nn.Module: BatchNorm2d
        ...
            aten::conv2d AND dtype of third argument is not null
    The third argument is the bias.
    Algorithm:
    String match
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Enabling Bias in Conv2d Followed By BatchNorm Pattern"
        self.description = "Detected bias enabled in Conv2d that is followed by BatchNorm2d. Please set 'bias=False' in Conv2d."
        self.url = (
            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
            "#disable-bias-for-convolutions-directly-followed-by-a-batch-norm"
        )

    @property
    def skip(self):
        return self.prof.record_shapes is False or super().skip

    def match(self, event: _ProfilerEvent):
        if event.name != "aten::conv2d":
            return False
        if len(input_dtypes(event)) < 3 or input_dtypes(event)[2] is None:
            return False
        # A non-null dtype for the third argument means bias=True
        event = self.go_up_until(
            event, lambda e: e.name.startswith("nn.Module: Conv2d")
        )
        if not event:
            return False
        event = self.next_of(event)
        if not event:
            return False
        return event.name.startswith("nn.Module: BatchNorm2d")
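

# Example (illustrative): dropping the redundant bias. BatchNorm2d subtracts the
# per-channel mean, so a Conv2d bias directly before it has no effect.
#
#     import torch.nn as nn
#     block = nn.Sequential(
#         nn.Conv2d(3, 64, kernel_size=3, bias=False),
#         nn.BatchNorm2d(64),
#     )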


class MatMulDimInFP16Pattern(Pattern):
    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Matrix Multiplication Dimension Not Aligned Pattern"
        self.description = "Detected matmul with dimensions not aligned. Please use matmul with aligned dimensions."
        self.url = "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#use-mixed-precision-and-amp"

    @property
    def skip(self):
        return not self.prof.with_stack or not self.prof.record_shapes

    def match(self, event: _ProfilerEvent):
        def multiple_of(shapes, multiple):
            return all(dim % multiple == 0 for shape in shapes for dim in shape[-2:])

        if event.name not in ("aten::mm", "aten::bmm", "aten::addmm"):
            return False
        if not input_dtypes(event):
            return False
        arg_dtype = input_dtypes(event)[0]
        if arg_dtype in (torch.bfloat16, torch.half) and not multiple_of(
            input_shapes(event), 8
        ):
            return True
        return False

    def benchmark(self, events: List[_ProfilerEvent]):
        def closest_multiple(shapes, multiple):
            return [multiple * math.ceil(shape / multiple) for shape in shapes]

        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
        for shape in shapes_factor_map:
            matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float16)
            matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float16)
            not_aligned_dim_timer = benchmark.Timer(
                stmt="torch.mm(matrixA, matrixB)",
                globals={"matrixA": matrixA, "matrixB": matrixB},
            )
            matrixA = torch.randn(
                closest_multiple(shape[0], 8), device="cuda", dtype=torch.float16
            )
            matrixB = torch.randn(
                closest_multiple(shape[1], 8), device="cuda", dtype=torch.float16
            )
            aligned_dim_timer = benchmark.Timer(
                stmt="torch.mm(matrixA, matrixB)",
                globals={"matrixA": matrixA, "matrixB": matrixB},
            )
            not_aligned_dim_time = not_aligned_dim_timer.timeit(10).mean
            aligned_dim_time = aligned_dim_timer.timeit(10).mean
            shapes_factor_map[shape] = aligned_dim_time / not_aligned_dim_time
        return shapes_factor_map
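

# Example (illustrative): zero-padding matmul operands so the last two
# dimensions are multiples of 8, as the match() check above requires. The
# padded rows/columns are zero, so the top-left block of the result is exact.
#
#     import torch.nn.functional as F
#     a = torch.randn(33, 65, device="cuda", dtype=torch.half)
#     b = torch.randn(65, 129, device="cuda", dtype=torch.half)
#     a_pad = F.pad(a, (0, 7, 0, 7))   # (40, 72)
#     b_pad = F.pad(b, (0, 7, 0, 7))   # (72, 136)
#     c = torch.mm(a_pad, b_pad)[:33, :129]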


def source_code_location(event: Optional[_ProfilerEvent]):
    while event:
        if event.tag == _EventType.PyCall or event.tag == _EventType.PyCCall:
            assert isinstance(
                event.extra_fields, (_ExtraFields_PyCall, _ExtraFields_PyCCall)
            )
            if not event.extra_fields.caller.file_name.startswith("torch" + os.sep):
                return f"{event.extra_fields.caller.file_name}:{event.extra_fields.caller.line_number}"
        event = event.parent
    return "No source code location found"


def input_shapes(event: _ProfilerEvent):
    assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
    return tuple(tuple(getattr(i, "sizes", ())) for i in event.extra_fields.inputs)


def input_dtypes(event: _ProfilerEvent):
    assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
    return tuple(getattr(i, "dtype", None) for i in event.extra_fields.inputs)


def report_all_anti_patterns(
    prof,
    should_benchmark: bool = False,
    print_enable: bool = True,
    json_report_dir: Optional[str] = None,
):
    report_dict: Dict = {}
    anti_patterns = [
        ExtraCUDACopyPattern(prof, should_benchmark),
        # ForLoopIndexingPattern(prof, should_benchmark),
        FP32MatMulPattern(prof, should_benchmark),
        OptimizerSingleTensorPattern(prof, should_benchmark),
        SynchronizedDataLoaderPattern(prof, should_benchmark),
        GradNotSetToNonePattern(prof, should_benchmark),
        Conv2dBiasFollowedByBatchNorm2dPattern(prof, should_benchmark),
        MatMulDimInFP16Pattern(prof, should_benchmark),
    ]
    reported = set()
    summaries = []
    message_list = [f"{'-'*40}TorchTidy Report{'-'*40}"]
    message_list.append("Matched Events:")

    for anti_pattern in anti_patterns:
        matched_events = anti_pattern.matched_events()
        if not matched_events:
            continue
        summaries.append(anti_pattern.summary(matched_events))
        for event in matched_events:
            report_msg = anti_pattern.report(event)
            if report_msg not in reported:
                message_list.append(report_msg)
                reported.add(report_msg)
                src_location, line_no = source_code_location(event).split(":")
                report_dict.setdefault(src_location, []).append(
                    {
                        "line_number": int(line_no),
                        "name": anti_pattern.name,
                        "url": anti_pattern.url,
                        "message": anti_pattern.description,
                    }
                )

    if json_report_dir is not None:
        json_report_path = os.path.join(json_report_dir, "torchtidy_report.json")
        if os.path.exists(json_report_path):
            with open(json_report_path) as f:
                existing_report = json.load(f)
            existing_report.update(report_dict)
            report_dict = existing_report
        with open(json_report_path, "w") as f:
            json.dump(report_dict, f, indent=4)

    message_list.append("Summary:")
    message_list += summaries
    message_list.append(f"{'-'*40}TorchTidy Report{'-'*40}")
    if print_enable:
        print("\n".join(message_list))
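

# Example (illustrative): running TorchTidy over a profile. `model` and `inputs`
# are placeholders for the user's own workload; with_stack and record_shapes are
# needed so the patterns above do not skip themselves.
#
#     with torch.profiler.profile(with_stack=True, record_shapes=True) as prof:
#         model(inputs)
#     report_all_anti_patterns(prof, json_report_dir=".")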