ai-content-maker/.venv/Lib/site-packages/spacy_loggers/clearml.py

from typing import Dict, Any, Tuple, Callable, List, Optional, IO
from types import ModuleType
import os
import sys

from spacy import Language
from spacy.util import SimpleFrozenList
from .util import dict_to_dot, dot_to_dict, matcher_for_regex_patterns
from .util import setup_default_console_logger, LoggerT


# entry point: spacy.ClearMLLogger.v2
def clearml_logger_v2(
    project_name: str,
    task_name: str,
    remove_config_values: List[str] = SimpleFrozenList(),
    model_log_interval: Optional[int] = None,
    log_dataset_dir: Optional[str] = None,
    log_best_dir: Optional[str] = None,
    log_latest_dir: Optional[str] = None,
    log_custom_stats: Optional[List[str]] = None,
) -> LoggerT:
    """Creates a logger that interoperates with the ClearML framework.

    Args:
        project_name (str):
            The name of the project in the ClearML interface. The project will be created automatically if it doesn't exist yet.
        task_name (str):
            The name of the ClearML task. A task is an experiment that lives inside a project. Can be non-unique.
        remove_config_values (List[str]):
            A list of values to exclude from the config before it is uploaded to ClearML. Defaults to [].
        model_log_interval (Optional[int]):
            Steps to wait between logging model checkpoints to the ClearML dasboard (default: `None`). Will have no effect without also setting `log_best_dir` or `log_latest_dir`. Defaults to None.
        log_dataset_dir (Optional[str]):
            Directory containing the dataset to be logged and versioned as a ClearML Dataset. Defaults to None.
        log_best_dir (Optional[str]):
            Directory containing the best trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None.
        log_latest_dir (Optional[str]):
            Directory containing the latest trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None.
        log_custom_stats (Optional[List[str]]):
            A list of regular expressions that will be applied to the info dictionary passed to the logger. Statistics and metrics that match these regexps will be automatically logged. Defaults to None.

    Returns:
        LoggerT: Logger instance.
    """
    clearml = _import_clearml()

    def setup_logger(
        nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
        match_stat = matcher_for_regex_patterns(log_custom_stats)
        task, best_model, last_model = _setup_clearml(
            clearml,
            nlp,
            project_name,
            task_name,
            log_dataset_dir,
            log_best_dir,
            log_latest_dir,
            remove_config_values,
        )

        def log_step(info: Optional[Dict[str, Any]]):
            _log_step_clearml(
                info,
                task,
                best_model,
                last_model,
                model_log_interval,
                log_best_dir,
                log_latest_dir,
            )
            _log_custom_stats(clearml, info, match_stat)

        def finalize():
            _finalize_clearml(task)

        return log_step, finalize

    return setup_logger


# entry point: spacy.ClearMLLogger.v1
def clearml_logger_v1(
    project_name: str,
    task_name: str,
    remove_config_values: List[str] = SimpleFrozenList(),
    model_log_interval: Optional[int] = None,
    log_dataset_dir: Optional[str] = None,
    log_best_dir: Optional[str] = None,
    log_latest_dir: Optional[str] = None,
) -> LoggerT:
    """Creates a logger that interoperates with the ClearML framework.

    Args:
        project_name (str):
            The name of the project in the ClearML interface. The project will be created automatically if it doesn't exist yet.
        task_name (str):
            The name of the ClearML task. A task is an experiment that lives inside a project. Can be non-unique.
        remove_config_values (List[str]):
            A list of values to exclude from the config before it is uploaded to ClearML. Defaults to [].
        model_log_interval (Optional[int]):
            Steps to wait between logging model checkpoints to the ClearML dasboard (default: `None`). Will have no effect without also setting `log_best_dir` or `log_latest_dir`. Defaults to None.
        log_dataset_dir (Optional[str]):
            Directory containing the dataset to be logged and versioned as a ClearML Dataset. Defaults to None.
        log_best_dir (Optional[str]):
            Directory containing the best trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None.
        log_latest_dir (Optional[str]):
            Directory containing the latest trained model as saved by spaCy, to be logged and versioned as a ClearML artifact. Defaults to None.

    Returns:
        LoggerT: Logger instance.
    """
    clearml = _import_clearml()

    def setup_logger(
        nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
        console_log_step, console_finalize = setup_default_console_logger(
            nlp, stdout, stderr
        )
        task, best_model, last_model = _setup_clearml(
            clearml,
            nlp,
            project_name,
            task_name,
            log_dataset_dir,
            log_best_dir,
            log_latest_dir,
            remove_config_values,
        )

        def log_step(info: Optional[Dict[str, Any]]):
            console_log_step(info)
            _log_step_clearml(
                info,
                task,
                best_model,
                last_model,
                model_log_interval,
                log_best_dir,
                log_latest_dir,
            )

        def finalize():
            console_finalize()
            _finalize_clearml(task)

        return log_step, finalize

    return setup_logger


def _import_clearml() -> ModuleType:
    try:
        import clearml

        # test that these are available
        from clearml import Task, Dataset, OutputModel  # noqa: F401
    except ImportError as exc:
        raise ImportError(
            "The 'clearml' library could not be found - did you install it? "
            "Alternatively, specify the 'ConsoleLogger' in the "
            "'training.logger' config section, instead of the 'ClearMLLogger'."
        ) from exc
    return clearml


def _setup_clearml(
    clearml: ModuleType,
    nlp: "Language",
    project_name: str,
    task_name: str,
    log_dataset_dir: Optional[str] = None,
    log_best_dir: Optional[str] = None,
    log_latest_dir: Optional[str] = None,
    remove_config_values: List[str] = SimpleFrozenList(),
) -> Tuple[Any, Any, Any]:
    config = nlp.config.interpolate()
    config_dot = dict_to_dot(config)
    for field in remove_config_values:
        del config_dot[field]
    config = dot_to_dict(config_dot)
    task = clearml.Task.init(
        project_name=project_name,
        task_name=task_name,
        output_uri=True,
    )
    for config_section, subconfig_or_value in config.items():
        task.connect(subconfig_or_value, name=config_section)

    if log_dataset_dir:
        dataset = clearml.Dataset.create(
            dataset_project=project_name,
            dataset_name=os.path.basename(log_dataset_dir),
        )
        dataset.add_files(log_dataset_dir)
        dataset.finalize(auto_upload=True)
        task.set_user_properties(
            {
                "name": "Created Dataset ID",
                "value": dataset.id,
            }
        )

    # Connect 2 models to the task, we will periodically update their weights later on
    if log_best_dir:
        best_model = clearml.OutputModel(
            task=task, framework="spaCy", name="Best Model"
        )
    else:
        best_model = None
    if log_latest_dir:
        last_model = clearml.OutputModel(
            task=task, framework="spaCy", name="Last Model"
        )
    else:
        last_model = None

    return task, best_model, last_model


def _log_step_clearml(
    info: Optional[Dict[str, Any]],
    task: Any,
    best_model: Optional[Any] = None,
    last_model: Optional[Any] = None,
    model_log_interval: Optional[int] = None,
    log_best_dir: Optional[str] = None,
    log_latest_dir: Optional[str] = None,
):
    if info is None:
        return

    score = info.get("score")
    other_scores = info.get("other_scores")
    losses = info.get("losses")
    if score:
        task.get_logger().report_scalar(
            "Score", "Score", iteration=info["step"], value=score
        )
    if losses:
        for metric, metric_value in losses.items():
            task.get_logger().report_scalar(
                title=f"loss_{metric}",
                series=f"loss_{metric}",
                iteration=info["step"],
                value=metric_value,
            )
    if isinstance(other_scores, dict):
        # other_scores is usually a nested dict, so group they by the first key and flatten the rest
        # combine flattened submetrics on the same ClearML graph when they have the same first key
        for metric, metric_value in other_scores.items():
            if isinstance(metric_value, dict):
                sub_metrics_dict = dict_to_dot(metric_value)
                for (
                    sub_metric,
                    sub_metric_value,
                ) in sub_metrics_dict.items():
                    # Scalars with the same title get plotted on the same graph as multiple traces
                    # This saves a lot of space in the UI
                    task.get_logger().report_scalar(
                        title=metric,
                        series=sub_metric,
                        iteration=info["step"],
                        value=sub_metric_value,
                    )
            elif isinstance(metric_value, (float, int)):
                task.get_logger().report_scalar(
                    metric,
                    metric,
                    iteration=info["step"],
                    value=metric_value,
                )

    if model_log_interval and info.get("output_path"):
        if info["step"] % model_log_interval == 0 and info["step"] != 0:
            if log_latest_dir:
                assert last_model is not None
                last_model.update_weights_package(
                    weights_path=log_latest_dir,
                    auto_delete_file=False,
                    target_filename="last_model",
                )
            if log_best_dir and info["score"] == max(info["checkpoints"])[0]:
                assert best_model is not None
                best_model.update_weights_package(
                    weights_path=log_best_dir,
                    auto_delete_file=False,
                    target_filename="best_model",
                )


def _finalize_clearml(task: Any):
    task.flush(wait_for_uploads=True)
    task.close()


def _log_custom_stats(
    task: Any, info: Optional[Dict[str, Any]], matcher: Callable[[str], bool]
):
    if info is not None:
        for k, v in info.items():
            if matcher(k):
                task.get_logger().report_scalar(
                    title=f"loss_{k}",
                    series=f"loss_{k}",
                    iteration=info["step"],
                    value=v,
                )