272 lines
10 KiB
Python
272 lines
10 KiB
Python
|
# coding=utf-8
|
||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
"""
|
||
|
Benchmarking the library on inference and training in PyTorch.
|
||
|
"""
|
||
|
|
||
|
|
||
|
import timeit
|
||
|
from typing import Callable, Optional
|
||
|
|
||
|
from ..configuration_utils import PretrainedConfig
|
||
|
from ..models.auto.modeling_auto import MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING
|
||
|
from ..utils import is_py3nvml_available, is_torch_available, logging
|
||
|
from .benchmark_utils import (
|
||
|
Benchmark,
|
||
|
Memory,
|
||
|
MemorySummary,
|
||
|
measure_peak_memory_cpu,
|
||
|
start_memory_tracing,
|
||
|
stop_memory_tracing,
|
||
|
)
|
||
|
|
||
|
|
||
|
if is_torch_available():
|
||
|
import torch
|
||
|
|
||
|
from .benchmark_args import PyTorchBenchmarkArguments
|
||
|
|
||
|
|
||
|
if is_py3nvml_available():
|
||
|
import py3nvml.py3nvml as nvml
|
||
|
|
||
|
|
||
|
logger = logging.get_logger(__name__)
|
||
|
|
||
|
|
||
|
class PyTorchBenchmark(Benchmark):
|
||
|
args: PyTorchBenchmarkArguments
|
||
|
configs: PretrainedConfig
|
||
|
framework: str = "PyTorch"
|
||
|
|
||
|
@property
|
||
|
def framework_version(self):
|
||
|
return torch.__version__
|
||
|
|
||
|
def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
|
||
|
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
|
||
|
return self._measure_speed(_inference)
|
||
|
|
||
|
def _inference_memory(
|
||
|
self, model_name: str, batch_size: int, sequence_length: int
|
||
|
) -> [Memory, Optional[MemorySummary]]:
|
||
|
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
|
||
|
return self._measure_memory(_inference)
|
||
|
|
||
|
def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
|
||
|
_train = self._prepare_train_func(model_name, batch_size, sequence_length)
|
||
|
return self._measure_speed(_train)
|
||
|
|
||
|
def _train_memory(
|
||
|
self, model_name: str, batch_size: int, sequence_length: int
|
||
|
) -> [Memory, Optional[MemorySummary]]:
|
||
|
_train = self._prepare_train_func(model_name, batch_size, sequence_length)
|
||
|
return self._measure_memory(_train)
|
||
|
|
||
|
def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
|
||
|
config = self.config_dict[model_name]
|
||
|
|
||
|
if self.args.torchscript:
|
||
|
config.torchscript = True
|
||
|
|
||
|
has_model_class_in_config = (
|
||
|
hasattr(config, "architectures")
|
||
|
and isinstance(config.architectures, list)
|
||
|
and len(config.architectures) > 0
|
||
|
)
|
||
|
if not self.args.only_pretrain_model and has_model_class_in_config:
|
||
|
try:
|
||
|
model_class = config.architectures[0]
|
||
|
transformers_module = __import__("transformers", fromlist=[model_class])
|
||
|
model_cls = getattr(transformers_module, model_class)
|
||
|
model = model_cls(config)
|
||
|
except ImportError:
|
||
|
raise ImportError(
|
||
|
f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
|
||
|
" set `--only_pretrain_model` or `args.only_pretrain_model=True`."
|
||
|
)
|
||
|
else:
|
||
|
model = MODEL_MAPPING[config.__class__](config)
|
||
|
|
||
|
model.eval()
|
||
|
model.to(self.args.device)
|
||
|
|
||
|
# encoder-decoder has vocab size saved differently
|
||
|
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
|
||
|
input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
|
||
|
|
||
|
if self.args.fp16:
|
||
|
logger.info("Running training in Mixed Precision...")
|
||
|
if not self.args.is_gpu:
|
||
|
raise ValueError("Mixed precision is possible only for GPU.")
|
||
|
# amp seems to have memory leaks so that memory usage
|
||
|
# is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
|
||
|
model.half()
|
||
|
|
||
|
if self.args.torchscript:
|
||
|
with torch.no_grad():
|
||
|
inference_model = torch.jit.trace(model, input_ids)
|
||
|
else:
|
||
|
inference_model = model
|
||
|
|
||
|
def encoder_decoder_forward():
|
||
|
with torch.no_grad():
|
||
|
outputs = inference_model(input_ids, decoder_input_ids=input_ids)
|
||
|
return outputs
|
||
|
|
||
|
def encoder_forward():
|
||
|
with torch.no_grad():
|
||
|
outputs = inference_model(input_ids)
|
||
|
return outputs
|
||
|
|
||
|
_forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
|
||
|
return _forward
|
||
|
|
||
|
def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
|
||
|
config = self.config_dict[model_name]
|
||
|
|
||
|
has_model_class_in_config = (
|
||
|
hasattr(config, "architectures")
|
||
|
and isinstance(config.architectures, list)
|
||
|
and len(config.architectures) > 0
|
||
|
)
|
||
|
if not self.args.only_pretrain_model and has_model_class_in_config:
|
||
|
try:
|
||
|
model_class = config.architectures[0]
|
||
|
transformers_module = __import__("transformers", fromlist=[model_class])
|
||
|
model_cls = getattr(transformers_module, model_class)
|
||
|
model = model_cls(config)
|
||
|
except ImportError:
|
||
|
raise ImportError(
|
||
|
f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
|
||
|
" set `--only_pretrain_model` or `args.only_pretrain_model=True`."
|
||
|
)
|
||
|
else:
|
||
|
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
|
||
|
|
||
|
if self.args.torchscript:
|
||
|
raise NotImplementedError("Training for torchscript is currently not implemented")
|
||
|
else:
|
||
|
train_model = model
|
||
|
|
||
|
model.train()
|
||
|
model.to(self.args.device)
|
||
|
|
||
|
# encoder-decoder has vocab size saved differently
|
||
|
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
|
||
|
input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
|
||
|
|
||
|
if self.args.fp16:
|
||
|
logger.info("Running training in Mixed Precision...")
|
||
|
if not self.args.is_gpu:
|
||
|
raise ValueError("Mixed precision is possible only for GPU.")
|
||
|
|
||
|
# amp seems to have memory leaks so that memory usage
|
||
|
# is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
|
||
|
model.half()
|
||
|
|
||
|
def compute_loss_and_backprob_encoder():
|
||
|
loss = train_model(input_ids, labels=input_ids)[0]
|
||
|
loss.backward()
|
||
|
return loss
|
||
|
|
||
|
def compute_loss_and_backprob_encoder_decoder():
|
||
|
loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0]
|
||
|
loss.backward()
|
||
|
return loss
|
||
|
|
||
|
_train = (
|
||
|
compute_loss_and_backprob_encoder_decoder
|
||
|
if config.is_encoder_decoder
|
||
|
else compute_loss_and_backprob_encoder
|
||
|
)
|
||
|
return _train
|
||
|
|
||
|
def _measure_speed(self, func) -> float:
|
||
|
try:
|
||
|
if self.args.is_tpu or self.args.torchscript:
|
||
|
# run additional 10 times to stabilize compilation for tpu and torchscript
|
||
|
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
|
||
|
timeit.repeat(
|
||
|
func,
|
||
|
repeat=1,
|
||
|
number=5,
|
||
|
)
|
||
|
|
||
|
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
|
||
|
runtimes = timeit.repeat(
|
||
|
func,
|
||
|
repeat=self.args.repeat,
|
||
|
number=10,
|
||
|
)
|
||
|
|
||
|
if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
|
||
|
import torch_xla.debug.metrics as met
|
||
|
|
||
|
self.print_fn(met.metrics_report())
|
||
|
|
||
|
return min(runtimes) / 10.0
|
||
|
except RuntimeError as e:
|
||
|
self.print_fn(f"Doesn't fit on GPU. {e}")
|
||
|
return "N/A"
|
||
|
|
||
|
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
|
||
|
try:
|
||
|
if self.args.trace_memory_line_by_line:
|
||
|
trace = start_memory_tracing("transformers")
|
||
|
|
||
|
if self.args.is_tpu:
|
||
|
# tpu
|
||
|
raise NotImplementedError(
|
||
|
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with"
|
||
|
" `--no-memory` or `args.memory=False`"
|
||
|
)
|
||
|
elif self.args.is_gpu:
|
||
|
if not is_py3nvml_available():
|
||
|
logger.warning(
|
||
|
"py3nvml not installed, we won't log GPU memory usage. "
|
||
|
"Install py3nvml (pip install py3nvml) to log information about GPU."
|
||
|
)
|
||
|
memory = "N/A"
|
||
|
else:
|
||
|
logger.info(
|
||
|
"Measuring total GPU usage on GPU device. Make sure to not have additional processes running"
|
||
|
" on the same GPU."
|
||
|
)
|
||
|
# init nvml
|
||
|
nvml.nvmlInit()
|
||
|
func()
|
||
|
handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
|
||
|
meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
|
||
|
max_bytes_in_use = meminfo.used
|
||
|
memory = Memory(max_bytes_in_use)
|
||
|
# shutdown nvml
|
||
|
nvml.nvmlShutdown()
|
||
|
else:
|
||
|
# cpu
|
||
|
memory_bytes = measure_peak_memory_cpu(func)
|
||
|
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
|
||
|
|
||
|
if self.args.trace_memory_line_by_line:
|
||
|
summary = stop_memory_tracing(trace)
|
||
|
else:
|
||
|
summary = None
|
||
|
|
||
|
return memory, summary
|
||
|
except RuntimeError as e:
|
||
|
self.print_fn(f"Doesn't fit on GPU. {e}")
|
||
|
return "N/A", None
|