ai-content-maker/.venv/Lib/site-packages/spacy/cli/init_pipeline.py

import logging
from pathlib import Path
from typing import Optional

import srsly
import typer
from wasabi import msg

from .. import util
from ..language import Language
from ..training.initialize import convert_vectors, init_nlp
from ._util import (
    Arg,
    Opt,
    import_code,
    init_cli,
    parse_config_overrides,
    setup_gpu,
    show_validation_error,
)


@init_cli.command("vectors")
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    convert_vectors(
        nlp,
        vectors_loc,
        truncate=truncate,
        prune=prune,
        name=name,
        mode=mode,
        attr=attr,
    )
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
    nlp.to_disk(output_dir)
    msg.good(
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize].",
        output_dir.resolve(),
    )


def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    # Mostly used for backwards-compatibility and may be removed in the future
    lex_attrs = srsly.read_jsonl(jsonl_loc)
    for attrs in lex_attrs:
        if "settings" in attrs:
            continue
        lexeme = nlp.vocab[attrs["orth"]]
        lexeme.set_attrs(**attrs)


@init_cli.command(
    "nlp",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,
)
def init_pipeline_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the prepared data"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    nlp.to_disk(output_path)
    msg.good(f"Saved initialized pipeline to {output_path}")


@init_cli.command(
    "labels",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    _init_labels(nlp, output_path)


def _init_labels(nlp, output_path):
    for name, component in nlp.pipeline:
        if getattr(component, "label_data", None) is not None:
            output_file = output_path / f"{name}.json"
            srsly.write_json(output_file, component.label_data)
            msg.good(f"Saving label data for component '{name}' to {output_file}")
        else:
            msg.info(f"No label data found for component '{name}'")
first commit 2024-05-03 04:18:51 +03:00			`import logging`
			`from pathlib import Path`
			`from typing import Optional`

			`import srsly`
			`import typer`
			`from wasabi import msg`

			`from .. import util`
			`from ..language import Language`
			`from ..training.initialize import convert_vectors, init_nlp`
			`from ._util import (`
			`Arg,`
			`Opt,`
			`import_code,`
			`init_cli,`
			`parse_config_overrides,`
			`setup_gpu,`
			`show_validation_error,`
			`)`


			`@init_cli.command("vectors")`
			`def init_vectors_cli(`
			`# fmt: off`
			`lang: str = Arg(..., help="The language of the nlp object to create"),`
			`vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),`
			`output_dir: Path = Arg(..., help="Pipeline output directory"),`
			`prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),`
			`truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),`
			`mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),`
			`name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),`
			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
			`jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),`
			`attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),`
			`# fmt: on`
			`):`
			`"""Convert word vectors for use with spaCy. Will export an nlp object that`
			`you can use in the [initialize] block of your config to initialize`
			`a model with vectors.`
			`"""`
			`if verbose:`
			`util.logger.setLevel(logging.DEBUG)`
			`msg.info(f"Creating blank nlp object for language '{lang}'")`
			`nlp = util.get_lang_class(lang)()`
			`if jsonl_loc is not None:`
			`update_lexemes(nlp, jsonl_loc)`
			`convert_vectors(`
			`nlp,`
			`vectors_loc,`
			`truncate=truncate,`
			`prune=prune,`
			`name=name,`
			`mode=mode,`
			`attr=attr,`
			`)`
			`msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")`
			`nlp.to_disk(output_dir)`
			`msg.good(`
			`"Saved nlp object with vectors to output directory. You can now use the "`
			`"path to it in your config as the 'vectors' setting in [initialize].",`
			`output_dir.resolve(),`
			`)`


			`def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:`
			`# Mostly used for backwards-compatibility and may be removed in the future`
			`lex_attrs = srsly.read_jsonl(jsonl_loc)`
			`for attrs in lex_attrs:`
			`if "settings" in attrs:`
			`continue`
			`lexeme = nlp.vocab[attrs["orth"]]`
			`lexeme.set_attrs(**attrs)`


			`@init_cli.command(`
			`"nlp",`
			`context_settings={"allow_extra_args": True, "ignore_unknown_options": True},`
			`hidden=True,`
			`)`
			`def init_pipeline_cli(`
			`# fmt: off`
			`ctx: typer.Context, # This is only used to read additional arguments`
			`config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),`
			`output_path: Path = Arg(..., help="Output directory for the prepared data"),`
			`code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),`
			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
			`use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")`
			`# fmt: on`
			`):`
			`if verbose:`
			`util.logger.setLevel(logging.DEBUG)`
			`overrides = parse_config_overrides(ctx.args)`
			`import_code(code_path)`
			`setup_gpu(use_gpu)`
			`with show_validation_error(config_path):`
			`config = util.load_config(config_path, overrides=overrides)`
			`with show_validation_error(hint_fill=False):`
			`nlp = init_nlp(config, use_gpu=use_gpu)`
			`nlp.to_disk(output_path)`
			`msg.good(f"Saved initialized pipeline to {output_path}")`


			`@init_cli.command(`
			`"labels",`
			`context_settings={"allow_extra_args": True, "ignore_unknown_options": True},`
			`)`
			`def init_labels_cli(`
			`# fmt: off`
			`ctx: typer.Context, # This is only used to read additional arguments`
			`config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),`
			`output_path: Path = Arg(..., help="Output directory for the labels"),`
			`code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),`
			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
			`use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")`
			`# fmt: on`
			`):`
			`"""Generate JSON files for the labels in the data. This helps speed up the`
			`training process, since spaCy won't have to preprocess the data to`
			`extract the labels."""`
			`if verbose:`
			`util.logger.setLevel(logging.DEBUG)`
			`if not output_path.exists():`
			`output_path.mkdir(parents=True)`
			`overrides = parse_config_overrides(ctx.args)`
			`import_code(code_path)`
			`setup_gpu(use_gpu)`
			`with show_validation_error(config_path):`
			`config = util.load_config(config_path, overrides=overrides)`
			`with show_validation_error(hint_fill=False):`
			`nlp = init_nlp(config, use_gpu=use_gpu)`
			`_init_labels(nlp, output_path)`


			`def _init_labels(nlp, output_path):`
			`for name, component in nlp.pipeline:`
			`if getattr(component, "label_data", None) is not None:`
			`output_file = output_path / f"{name}.json"`
			`srsly.write_json(output_file, component.label_data)`
			`msg.good(f"Saving label data for component '{name}' to {output_file}")`
			`else:`
			`msg.info(f"No label data found for component '{name}'")`