# spacy_loggers/clearml.py

from typing import Dict, Any, Tuple, Callable, List, Optional, IO
from types import ModuleType
import os
import sys
from spacy import Language
from spacy.util import SimpleFrozenList
from .util import dict_to_dot, dot_to_dict, matcher_for_regex_patterns
from .util import setup_default_console_logger, LoggerT


# entry point: spacy.ClearMLLogger.v2
def clearml_logger_v2(
    project_name: str,
    task_name: str,
    remove_config_values: List[str] = SimpleFrozenList(),
    model_log_interval: Optional[int] = None,
    log_dataset_dir: Optional[str] = None,
    log_best_dir: Optional[str] = None,
    log_latest_dir: Optional[str] = None,
    log_custom_stats: Optional[List[str]] = None,
) -> LoggerT:
"""Creates a logger that interoperates with the ClearML framework.
Args:
project_name (str):
The name of the project in the ClearML interface. The project will be created automatically if it doesn't exist yet.
task_name (str):
The name of the ClearML task. A task is an experiment that lives inside a project. Can be non-unique.
remove_config_values (List[str]):
A list of values to exclude from the config before it is uploaded to ClearML. Defaults to [].
model_log_interval (Optional[int]):
Steps to wait between logging model checkpoints to the ClearML dasboard (default: `None`). Will have no effect without also setting `log_best_dir` or `log_latest_dir`. Defaults to None.
log_dataset_dir (Optional[str]):
Directory containing the dataset to be logged and versioned as a ClearML Dataset. Defaults to None.
log_best_dir (Optional[str]):
Directory containing the best trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None.
log_latest_dir (Optional[str]):
Directory containing the latest trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None.
log_custom_stats (Optional[List[str]]):
A list of regular expressions that will be applied to the info dictionary passed to the logger. Statistics and metrics that match these regexps will be automatically logged. Defaults to None.
Returns:
LoggerT: Logger instance.
"""
    clearml = _import_clearml()

    def setup_logger(
        nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
        match_stat = matcher_for_regex_patterns(log_custom_stats)
        task, best_model, last_model = _setup_clearml(
            clearml,
            nlp,
            project_name,
            task_name,
            log_dataset_dir,
            log_best_dir,
            log_latest_dir,
            remove_config_values,
        )

        def log_step(info: Optional[Dict[str, Any]]):
            _log_step_clearml(
                info,
                task,
                best_model,
                last_model,
                model_log_interval,
                log_best_dir,
                log_latest_dir,
            )
            _log_custom_stats(task, info, match_stat)

        def finalize():
            _finalize_clearml(task)

        return log_step, finalize

    return setup_logger
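
# A minimal sketch of how this factory is typically referenced from a spaCy
# training config. The project/task names, paths, interval and regex below are
# illustrative values, not defaults:
#
#   [training.logger]
#   @loggers = "spacy.ClearMLLogger.v2"
#   project_name = "my_project"
#   task_name = "my_experiment"
#   remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
#   model_log_interval = 1000
#   log_best_dir = "training/model-best"
#   log_latest_dir = "training/model-last"
#   log_custom_stats = ["^epoch$"]
#
# "spacy.ClearMLLogger.v1" below is configured the same way, minus `log_custom_stats`.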


# entry point: spacy.ClearMLLogger.v1
def clearml_logger_v1(
    project_name: str,
    task_name: str,
    remove_config_values: List[str] = SimpleFrozenList(),
    model_log_interval: Optional[int] = None,
    log_dataset_dir: Optional[str] = None,
    log_best_dir: Optional[str] = None,
    log_latest_dir: Optional[str] = None,
) -> LoggerT:
"""Creates a logger that interoperates with the ClearML framework.
Args:
project_name (str):
The name of the project in the ClearML interface. The project will be created automatically if it doesn't exist yet.
task_name (str):
The name of the ClearML task. A task is an experiment that lives inside a project. Can be non-unique.
remove_config_values (List[str]):
A list of values to exclude from the config before it is uploaded to ClearML. Defaults to [].
model_log_interval (Optional[int]):
Steps to wait between logging model checkpoints to the ClearML dasboard (default: `None`). Will have no effect without also setting `log_best_dir` or `log_latest_dir`. Defaults to None.
log_dataset_dir (Optional[str]):
Directory containing the dataset to be logged and versioned as a ClearML Dataset. Defaults to None.
log_best_dir (Optional[str]):
Directory containing the best trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None.
log_latest_dir (Optional[str]):
Directory containing the latest trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None.
Returns:
LoggerT: Logger instance.
"""
    clearml = _import_clearml()

    def setup_logger(
        nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
        console_log_step, console_finalize = setup_default_console_logger(
            nlp, stdout, stderr
        )
        task, best_model, last_model = _setup_clearml(
            clearml,
            nlp,
            project_name,
            task_name,
            log_dataset_dir,
            log_best_dir,
            log_latest_dir,
            remove_config_values,
        )

        def log_step(info: Optional[Dict[str, Any]]):
            console_log_step(info)
            _log_step_clearml(
                info,
                task,
                best_model,
                last_model,
                model_log_interval,
                log_best_dir,
                log_latest_dir,
            )

        def finalize():
            console_finalize()
            _finalize_clearml(task)

        return log_step, finalize

    return setup_logger
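
# Rough shape of the logger contract both factories return, as exercised by
# spaCy's training loop (a sketch inferred from how `_log_step_clearml` reads
# the `info` dict, not an exhaustive specification):
#
#   log_step, finalize = setup_logger(nlp)
#   log_step(info)   # called during training with None or a dict containing
#                    # keys such as "step", "score", "other_scores", "losses",
#                    # "checkpoints" and "output_path"
#   finalize()       # called once when training ends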


def _import_clearml() -> ModuleType:
    try:
        import clearml

        # test that these are available
        from clearml import Task, Dataset, OutputModel  # noqa: F401
    except ImportError as exc:
        raise ImportError(
            "The 'clearml' library could not be found - did you install it? "
            "Alternatively, specify the 'ConsoleLogger' in the "
            "'training.logger' config section, instead of the 'ClearMLLogger'."
        ) from exc
    return clearml


def _setup_clearml(
    clearml: ModuleType,
    nlp: "Language",
    project_name: str,
    task_name: str,
    log_dataset_dir: Optional[str] = None,
    log_best_dir: Optional[str] = None,
    log_latest_dir: Optional[str] = None,
    remove_config_values: List[str] = SimpleFrozenList(),
) -> Tuple[Any, Any, Any]:
    config = nlp.config.interpolate()
    # Flatten the config to dotted keys so entries listed in `remove_config_values`
    # can be deleted by their dotted path, then re-nest it before uploading
    # (see the sketch after this function).
    config_dot = dict_to_dot(config)
    for field in remove_config_values:
        del config_dot[field]
    config = dot_to_dict(config_dot)

    task = clearml.Task.init(
        project_name=project_name,
        task_name=task_name,
        output_uri=True,
    )
    for config_section, subconfig_or_value in config.items():
        task.connect(subconfig_or_value, name=config_section)

    if log_dataset_dir:
        dataset = clearml.Dataset.create(
            dataset_project=project_name,
            dataset_name=os.path.basename(log_dataset_dir),
        )
        dataset.add_files(log_dataset_dir)
        dataset.finalize(auto_upload=True)
        task.set_user_properties(
            {
                "name": "Created Dataset ID",
                "value": dataset.id,
            }
        )

    # Connect two models to the task; their weights are updated periodically later on.
    if log_best_dir:
        best_model = clearml.OutputModel(
            task=task, framework="spaCy", name="Best Model"
        )
    else:
        best_model = None

    if log_latest_dir:
        last_model = clearml.OutputModel(
            task=task, framework="spaCy", name="Last Model"
        )
    else:
        last_model = None

    return task, best_model, last_model
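
# For reference, the config flattening above is assumed to behave like spaCy's
# own `util.dict_to_dot` / `util.dot_to_dict` round trip, e.g.:
#
#   dict_to_dot({"paths": {"train": "corpus/train.spacy"}})
#       -> {"paths.train": "corpus/train.spacy"}
#   dot_to_dict({"paths.train": "corpus/train.spacy"})
#       -> {"paths": {"train": "corpus/train.spacy"}}
#
# which is why `remove_config_values` entries are dotted paths like "paths.train".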


def _log_step_clearml(
    info: Optional[Dict[str, Any]],
    task: Any,
    best_model: Optional[Any] = None,
    last_model: Optional[Any] = None,
    model_log_interval: Optional[int] = None,
    log_best_dir: Optional[str] = None,
    log_latest_dir: Optional[str] = None,
):
    if info is None:
        return

    score = info.get("score")
    other_scores = info.get("other_scores")
    losses = info.get("losses")

    if score:
        task.get_logger().report_scalar(
            "Score", "Score", iteration=info["step"], value=score
        )

    if losses:
        for metric, metric_value in losses.items():
            task.get_logger().report_scalar(
                title=f"loss_{metric}",
                series=f"loss_{metric}",
                iteration=info["step"],
                value=metric_value,
            )

    if isinstance(other_scores, dict):
        # other_scores is usually a nested dict, so group the entries by their
        # first key and flatten the rest; flattened submetrics that share the
        # same first key end up on the same ClearML graph.
        for metric, metric_value in other_scores.items():
            if isinstance(metric_value, dict):
                sub_metrics_dict = dict_to_dot(metric_value)
                for (
                    sub_metric,
                    sub_metric_value,
                ) in sub_metrics_dict.items():
                    # Scalars with the same title get plotted on the same graph as multiple traces.
                    # This saves a lot of space in the UI.
                    task.get_logger().report_scalar(
                        title=metric,
                        series=sub_metric,
                        iteration=info["step"],
                        value=sub_metric_value,
                    )
            elif isinstance(metric_value, (float, int)):
                task.get_logger().report_scalar(
                    metric,
                    metric,
                    iteration=info["step"],
                    value=metric_value,
                )

    if model_log_interval and info.get("output_path"):
        if info["step"] % model_log_interval == 0 and info["step"] != 0:
            if log_latest_dir:
                assert last_model is not None
                last_model.update_weights_package(
                    weights_path=log_latest_dir,
                    auto_delete_file=False,
                    target_filename="last_model",
                )
            # `info["checkpoints"]` holds (score, step) tuples, so this only fires
            # when the current score is the best seen so far.
            if log_best_dir and info["score"] == max(info["checkpoints"])[0]:
                assert best_model is not None
                best_model.update_weights_package(
                    weights_path=log_best_dir,
                    auto_delete_file=False,
                    target_filename="best_model",
                )


def _finalize_clearml(task: Any):
    task.flush(wait_for_uploads=True)
    task.close()


def _log_custom_stats(
    task: Any, info: Optional[Dict[str, Any]], matcher: Callable[[str], bool]
):
    if info is not None:
        for k, v in info.items():
            if matcher(k):
                task.get_logger().report_scalar(
                    title=f"loss_{k}",
                    series=f"loss_{k}",
                    iteration=info["step"],
                    value=v,
                )