ai-content-maker/.venv/Lib/site-packages/transformers/training_args_tf.py

300 lines
14 KiB
Python

# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from dataclasses import dataclass, field
from typing import Optional, Tuple
from .training_args import TrainingArguments
from .utils import cached_property, is_tf_available, logging, requires_backends
logger = logging.get_logger(__name__)
if is_tf_available():
import tensorflow as tf
from .modeling_tf_utils import keras
@dataclass
class TFTrainingArguments(TrainingArguments):
"""
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
itself**.
Using [`HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.
Parameters:
output_dir (`str`):
The output directory where the model predictions and checkpoints will be written.
overwrite_output_dir (`bool`, *optional*, defaults to `False`):
If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir`
points to a checkpoint directory.
do_train (`bool`, *optional*, defaults to `False`):
Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used
by your training/evaluation scripts instead. See the [example
scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details.
do_eval (`bool`, *optional*):
Whether to run evaluation on the validation set or not. Will be set to `True` if `evaluation_strategy` is
different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your
training/evaluation scripts instead. See the [example
scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details.
do_predict (`bool`, *optional*, defaults to `False`):
Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's
intended to be used by your training/evaluation scripts instead. See the [example
scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details.
evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
The evaluation strategy to adopt during training. Possible values are:
- `"no"`: No evaluation is done during training.
- `"steps"`: Evaluation is done (and logged) every `eval_steps`.
- `"epoch"`: Evaluation is done at the end of each epoch.
per_device_train_batch_size (`int`, *optional*, defaults to 8):
The batch size per GPU/TPU core/CPU for training.
per_device_eval_batch_size (`int`, *optional*, defaults to 8):
The batch size per GPU/TPU core/CPU for evaluation.
gradient_accumulation_steps (`int`, *optional*, defaults to 1):
Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
<Tip warning={true}>
When using gradient accumulation, one step is counted as one step with backward pass. Therefore, logging,
evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training examples.
</Tip>
learning_rate (`float`, *optional*, defaults to 5e-5):
The initial learning rate for Adam.
weight_decay (`float`, *optional*, defaults to 0):
The weight decay to apply (if not zero).
adam_beta1 (`float`, *optional*, defaults to 0.9):
The beta1 hyperparameter for the Adam optimizer.
adam_beta2 (`float`, *optional*, defaults to 0.999):
The beta2 hyperparameter for the Adam optimizer.
adam_epsilon (`float`, *optional*, defaults to 1e-8):
The epsilon hyperparameter for the Adam optimizer.
max_grad_norm (`float`, *optional*, defaults to 1.0):
Maximum gradient norm (for gradient clipping).
num_train_epochs(`float`, *optional*, defaults to 3.0):
Total number of training epochs to perform.
max_steps (`int`, *optional*, defaults to -1):
If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
`max_steps` is reached.
warmup_ratio (`float`, *optional*, defaults to 0.0):
Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
warmup_steps (`int`, *optional*, defaults to 0):
Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`.
logging_dir (`str`, *optional*):
[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to
*runs/**CURRENT_DATETIME_HOSTNAME***.
logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
The logging strategy to adopt during training. Possible values are:
- `"no"`: No logging is done during training.
- `"epoch"`: Logging is done at the end of each epoch.
- `"steps"`: Logging is done every `logging_steps`.
logging_first_step (`bool`, *optional*, defaults to `False`):
Whether to log and evaluate the first `global_step` or not.
logging_steps (`int`, *optional*, defaults to 500):
Number of update steps between two logs if `logging_strategy="steps"`.
save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
The checkpoint save strategy to adopt during training. Possible values are:
- `"no"`: No save is done during training.
- `"epoch"`: Save is done at the end of each epoch.
- `"steps"`: Save is done every `save_steps`.
save_steps (`int`, *optional*, defaults to 500):
Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
save_total_limit (`int`, *optional*):
If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
`output_dir`.
no_cuda (`bool`, *optional*, defaults to `False`):
Whether to not use CUDA even when it is available or not.
seed (`int`, *optional*, defaults to 42):
Random seed that will be set at the beginning of training.
fp16 (`bool`, *optional*, defaults to `False`):
Whether to use 16-bit (mixed) precision training (through NVIDIA Apex) instead of 32-bit training.
fp16_opt_level (`str`, *optional*, defaults to 'O1'):
For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on
the [Apex documentation](https://nvidia.github.io/apex/amp).
local_rank (`int`, *optional*, defaults to -1):
During distributed training, the rank of the process.
tpu_num_cores (`int`, *optional*):
When training on TPU, the number of TPU cores (automatically passed by launcher script).
debug (`bool`, *optional*, defaults to `False`):
Whether to activate the trace to record computation graphs and profiling information or not.
dataloader_drop_last (`bool`, *optional*, defaults to `False`):
Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
or not.
eval_steps (`int`, *optional*, defaults to 1000):
Number of update steps before two evaluations.
past_index (`int`, *optional*, defaults to -1):
Some models like [TransformerXL](../model_doc/transformerxl) or :doc*XLNet <../model_doc/xlnet>* can make
use of the past hidden states for their predictions. If this argument is set to a positive int, the
`Trainer` will use the corresponding output (usually index 2) as the past state and feed it to the model at
the next training step under the keyword argument `mems`.
tpu_name (`str`, *optional*):
The name of the TPU the process is running on.
tpu_zone (`str`, *optional*):
The zone of the TPU the process is running on. If not specified, we will attempt to automatically detect
from metadata.
gcp_project (`str`, *optional*):
Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
automatically detect from metadata.
run_name (`str`, *optional*):
A descriptor for the run. Notably used for wandb logging.
xla (`bool`, *optional*):
Whether to activate the XLA compilation or not.
"""
framework = "tf"
tpu_name: Optional[str] = field(
default=None,
metadata={"help": "Name of TPU"},
)
tpu_zone: Optional[str] = field(
default=None,
metadata={"help": "Zone of TPU"},
)
gcp_project: Optional[str] = field(
default=None,
metadata={"help": "Name of Cloud TPU-enabled project"},
)
poly_power: float = field(
default=1.0,
metadata={"help": "Power for the Polynomial decay LR scheduler."},
)
xla: bool = field(default=False, metadata={"help": "Whether to activate the XLA compilation or not"})
@cached_property
def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]:
requires_backends(self, ["tf"])
logger.info("Tensorflow: setting up strategy")
gpus = tf.config.list_physical_devices("GPU")
# Set to float16 at first
if self.fp16:
keras.mixed_precision.set_global_policy("mixed_float16")
if self.no_cuda:
strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
else:
try:
if self.tpu_name:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver(
self.tpu_name, zone=self.tpu_zone, project=self.gcp_project
)
else:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
if self.tpu_name:
raise RuntimeError(f"Couldn't connect to TPU {self.tpu_name}!")
else:
tpu = None
if tpu:
# Set to bfloat16 in case of TPU
if self.fp16:
keras.mixed_precision.set_global_policy("mixed_bfloat16")
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.TPUStrategy(tpu)
elif len(gpus) == 0:
strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
elif len(gpus) == 1:
strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
elif len(gpus) > 1:
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
strategy = tf.distribute.MirroredStrategy()
else:
raise ValueError("Cannot find the proper strategy, please check your environment properties.")
return strategy
@property
def strategy(self) -> "tf.distribute.Strategy":
"""
The strategy used for distributed training.
"""
requires_backends(self, ["tf"])
return self._setup_strategy
@property
def n_replicas(self) -> int:
"""
The number of replicas (CPUs, GPUs or TPU cores) used in this training.
"""
requires_backends(self, ["tf"])
return self._setup_strategy.num_replicas_in_sync
@property
def should_log(self):
"""
Whether or not the current process should produce log.
"""
return False # TF Logging is handled by Keras not the Trainer
@property
def train_batch_size(self) -> int:
"""
The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training).
"""
if self.per_gpu_train_batch_size:
logger.warning(
"Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
"version. Using `--per_device_train_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
return per_device_batch_size * self.n_replicas
@property
def eval_batch_size(self) -> int:
"""
The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training).
"""
if self.per_gpu_eval_batch_size:
logger.warning(
"Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
"version. Using `--per_device_eval_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
return per_device_batch_size * self.n_replicas
@property
def n_gpu(self) -> int:
"""
The number of replicas (CPUs, GPUs or TPU cores) used in this training.
"""
requires_backends(self, ["tf"])
warnings.warn(
"The n_gpu argument is deprecated and will be removed in a future version, use n_replicas instead.",
FutureWarning,
)
return self._setup_strategy.num_replicas_in_sync