from typing import Dict, Any, Tuple, Callable, List, Optional, IO from types import ModuleType import os import sys from spacy import Language from spacy.util import SimpleFrozenList from .util import dict_to_dot, dot_to_dict, matcher_for_regex_patterns from .util import setup_default_console_logger, LoggerT # entry point: spacy.ClearMLLogger.v2 def clearml_logger_v2( project_name: str, task_name: str, remove_config_values: List[str] = SimpleFrozenList(), model_log_interval: Optional[int] = None, log_dataset_dir: Optional[str] = None, log_best_dir: Optional[str] = None, log_latest_dir: Optional[str] = None, log_custom_stats: Optional[List[str]] = None, ) -> LoggerT: """Creates a logger that interoperates with the ClearML framework. Args: project_name (str): The name of the project in the ClearML interface. The project will be created automatically if it doesn't exist yet. task_name (str): The name of the ClearML task. A task is an experiment that lives inside a project. Can be non-unique. remove_config_values (List[str]): A list of values to exclude from the config before it is uploaded to ClearML. Defaults to []. model_log_interval (Optional[int]): Steps to wait between logging model checkpoints to the ClearML dasboard (default: `None`). Will have no effect without also setting `log_best_dir` or `log_latest_dir`. Defaults to None. log_dataset_dir (Optional[str]): Directory containing the dataset to be logged and versioned as a ClearML Dataset. Defaults to None. log_best_dir (Optional[str]): Directory containing the best trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None. log_latest_dir (Optional[str]): Directory containing the latest trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None. log_custom_stats (Optional[List[str]]): A list of regular expressions that will be applied to the info dictionary passed to the logger. Statistics and metrics that match these regexps will be automatically logged. Defaults to None. Returns: LoggerT: Logger instance. """ clearml = _import_clearml() def setup_logger( nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: match_stat = matcher_for_regex_patterns(log_custom_stats) task, best_model, last_model = _setup_clearml( clearml, nlp, project_name, task_name, log_dataset_dir, log_best_dir, log_latest_dir, remove_config_values, ) def log_step(info: Optional[Dict[str, Any]]): _log_step_clearml( info, task, best_model, last_model, model_log_interval, log_best_dir, log_latest_dir, ) _log_custom_stats(clearml, info, match_stat) def finalize(): _finalize_clearml(task) return log_step, finalize return setup_logger # entry point: spacy.ClearMLLogger.v1 def clearml_logger_v1( project_name: str, task_name: str, remove_config_values: List[str] = SimpleFrozenList(), model_log_interval: Optional[int] = None, log_dataset_dir: Optional[str] = None, log_best_dir: Optional[str] = None, log_latest_dir: Optional[str] = None, ) -> LoggerT: """Creates a logger that interoperates with the ClearML framework. Args: project_name (str): The name of the project in the ClearML interface. The project will be created automatically if it doesn't exist yet. task_name (str): The name of the ClearML task. A task is an experiment that lives inside a project. Can be non-unique. remove_config_values (List[str]): A list of values to exclude from the config before it is uploaded to ClearML. Defaults to []. model_log_interval (Optional[int]): Steps to wait between logging model checkpoints to the ClearML dasboard (default: `None`). Will have no effect without also setting `log_best_dir` or `log_latest_dir`. Defaults to None. log_dataset_dir (Optional[str]): Directory containing the dataset to be logged and versioned as a ClearML Dataset. Defaults to None. log_best_dir (Optional[str]): Directory containing the best trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None. log_latest_dir (Optional[str]): Directory containing the latest trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None. Returns: LoggerT: Logger instance. """ clearml = _import_clearml() def setup_logger( nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: console_log_step, console_finalize = setup_default_console_logger( nlp, stdout, stderr ) task, best_model, last_model = _setup_clearml( clearml, nlp, project_name, task_name, log_dataset_dir, log_best_dir, log_latest_dir, remove_config_values, ) def log_step(info: Optional[Dict[str, Any]]): console_log_step(info) _log_step_clearml( info, task, best_model, last_model, model_log_interval, log_best_dir, log_latest_dir, ) def finalize(): console_finalize() _finalize_clearml(task) return log_step, finalize return setup_logger def _import_clearml() -> ModuleType: try: import clearml # test that these are available from clearml import Task, Dataset, OutputModel # noqa: F401 except ImportError as exc: raise ImportError( "The 'clearml' library could not be found - did you install it? " "Alternatively, specify the 'ConsoleLogger' in the " "'training.logger' config section, instead of the 'ClearMLLogger'." ) from exc return clearml def _setup_clearml( clearml: ModuleType, nlp: "Language", project_name: str, task_name: str, log_dataset_dir: Optional[str] = None, log_best_dir: Optional[str] = None, log_latest_dir: Optional[str] = None, remove_config_values: List[str] = SimpleFrozenList(), ) -> Tuple[Any, Any, Any]: config = nlp.config.interpolate() config_dot = dict_to_dot(config) for field in remove_config_values: del config_dot[field] config = dot_to_dict(config_dot) task = clearml.Task.init( project_name=project_name, task_name=task_name, output_uri=True, ) for config_section, subconfig_or_value in config.items(): task.connect(subconfig_or_value, name=config_section) if log_dataset_dir: dataset = clearml.Dataset.create( dataset_project=project_name, dataset_name=os.path.basename(log_dataset_dir), ) dataset.add_files(log_dataset_dir) dataset.finalize(auto_upload=True) task.set_user_properties( { "name": "Created Dataset ID", "value": dataset.id, } ) # Connect 2 models to the task, we will periodically update their weights later on if log_best_dir: best_model = clearml.OutputModel( task=task, framework="spaCy", name="Best Model" ) else: best_model = None if log_latest_dir: last_model = clearml.OutputModel( task=task, framework="spaCy", name="Last Model" ) else: last_model = None return task, best_model, last_model def _log_step_clearml( info: Optional[Dict[str, Any]], task: Any, best_model: Optional[Any] = None, last_model: Optional[Any] = None, model_log_interval: Optional[int] = None, log_best_dir: Optional[str] = None, log_latest_dir: Optional[str] = None, ): if info is None: return score = info.get("score") other_scores = info.get("other_scores") losses = info.get("losses") if score: task.get_logger().report_scalar( "Score", "Score", iteration=info["step"], value=score ) if losses: for metric, metric_value in losses.items(): task.get_logger().report_scalar( title=f"loss_{metric}", series=f"loss_{metric}", iteration=info["step"], value=metric_value, ) if isinstance(other_scores, dict): # other_scores is usually a nested dict, so group they by the first key and flatten the rest # combine flattened submetrics on the same ClearML graph when they have the same first key for metric, metric_value in other_scores.items(): if isinstance(metric_value, dict): sub_metrics_dict = dict_to_dot(metric_value) for ( sub_metric, sub_metric_value, ) in sub_metrics_dict.items(): # Scalars with the same title get plotted on the same graph as multiple traces # This saves a lot of space in the UI task.get_logger().report_scalar( title=metric, series=sub_metric, iteration=info["step"], value=sub_metric_value, ) elif isinstance(metric_value, (float, int)): task.get_logger().report_scalar( metric, metric, iteration=info["step"], value=metric_value, ) if model_log_interval and info.get("output_path"): if info["step"] % model_log_interval == 0 and info["step"] != 0: if log_latest_dir: assert last_model is not None last_model.update_weights_package( weights_path=log_latest_dir, auto_delete_file=False, target_filename="last_model", ) if log_best_dir and info["score"] == max(info["checkpoints"])[0]: assert best_model is not None best_model.update_weights_package( weights_path=log_best_dir, auto_delete_file=False, target_filename="best_model", ) def _finalize_clearml(task: Any): task.flush(wait_for_uploads=True) task.close() def _log_custom_stats( task: Any, info: Optional[Dict[str, Any]], matcher: Callable[[str], bool] ): if info is not None: for k, v in info.items(): if matcher(k): task.get_logger().report_scalar( title=f"loss_{k}", series=f"loss_{k}", iteration=info["step"], value=v, )