ai-content-maker/.venv/Lib/site-packages/sudachipy/command_line.py

# Copyright (c) 2019 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import fileinput
import logging
import os
import sys
from pathlib import Path
from . import Dictionary, SplitMode
from . import __version__
from . import sudachipy
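

# argparse has no built-in notion of a default sub-command; the helper below is
# monkey-patched onto ArgumentParser so that main() can fall back to the
# `tokenize` sub-command when none is given (see parser.set_default_subparser).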
def _set_default_subparser(self, name, args=None):
    """
    copy and modify code from https://bitbucket.org/ruamel/std.argparse
    """
    subparser_found = False
    for arg in sys.argv[1:]:
        if arg in ['-h', '--help']:  # global help if no subparser
            break
    else:
        for x in self._subparsers._actions:
            if not isinstance(x, argparse._SubParsersAction):
                continue
            for sp_name in x._name_parser_map.keys():
                if sp_name in sys.argv[1:]:
                    subparser_found = True
    if not subparser_found:
        # insert default in first position, this implies no
        # global options without a sub_parsers specified
        if args is None:
            sys.argv.insert(1, name)
        else:
            args.insert(0, name)


argparse.ArgumentParser.set_default_subparser = _set_default_subparser


def run(tokenizer, input_, output, print_all, morphs, is_stdout):
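    """Tokenize each line of ``input_`` and write one morpheme per line to ``output``.

    Each line carries the surface form, the part-of-speech string looked up in
    ``morphs``, and the normalized form, separated by tabs; with ``print_all``
    the dictionary form, reading, dictionary id, synonym group ids and an
    ``(OOV)`` marker are appended.  ``EOS`` is written after every sentence and
    the output is flushed per line when writing to stdout.
    """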
    # get an empty MorphemeList for memory reuse
    mlist = tokenizer.tokenize("")
    for line in input_:
        line = line.rstrip('\n')
        # out parameter means we are reusing memory here
        for m in tokenizer.tokenize(line, out=mlist):
            list_info = [
                m.surface(),
                morphs[m.part_of_speech_id()],
                m.normalized_form()]
            if print_all:
                list_info += [
                    m.dictionary_form(),
                    m.reading_form(),
                    str(m.dictionary_id()),
                    '[{}]'.format(','.join([str(synonym_group_id) for synonym_group_id in m.synonym_group_ids()]))]
                if m.is_oov():
                    list_info.append("(OOV)")
            output.write("\t".join(list_info))
            output.write("\n")
        output.write("EOS\n")
        if is_stdout:
            output.flush()


def _input_files_checker(args, print_usage):
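    """Print usage and exit with an error if any input file does not exist."""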
    for file in args.in_files:
        if not os.path.exists(file):
            print_usage()
            print('{}: error: {} doesn\'t exist'.format(
                __name__, file), file=sys.stderr)
            sys.exit(1)


def _command_tokenize(args, print_usage):
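    """Handle the ``tokenize`` sub-command: load the dictionary, tokenize the
    input files (or stdin) with the requested split mode, and write the result
    to stdout or to the file given with ``-o``."""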
    if args.version:
        print_version()
        return

    _input_files_checker(args, print_usage)

    if args.mode == "A":
        mode = SplitMode.A
    elif args.mode == "B":
        mode = SplitMode.B
    else:
        mode = SplitMode.C

    output = sys.stdout
    if args.fpath_out:
        output = open(args.fpath_out, "w", encoding="utf-8")

    stdout_logger = logging.getLogger(__name__)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    stdout_logger.addHandler(handler)
    stdout_logger.setLevel(logging.DEBUG)
    stdout_logger.propagate = False

    print_all = args.a

    try:
        dict_ = Dictionary(config_path=args.fpath_setting,
                           dict_type=args.system_dict_type)
        # empty matcher - get all POS tags
        all_morphs = dict_.pos_matcher([()])
        # precompute output POS strings
        morphs = [",".join(ms) for ms in all_morphs]
        tokenizer_obj = dict_.create(mode=mode)
        input_ = fileinput.input(
            args.in_files, openhook=fileinput.hook_encoded("utf-8"))
        run(tokenizer_obj, input_, output, print_all, morphs, is_stdout=args.fpath_out is None)
    finally:
        if args.fpath_out:
            output.close()


def _command_build(args, print_usage):
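    """Handle the ``build`` sub-command: build a system dictionary from a
    MeCab-style connection matrix and one or more CSV lexicon files."""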
    matrix = Path(args.matrix_file)
    if not matrix.exists():
        print("Matrix file", matrix, "does not exist", file=sys.stderr)
        return print_usage()

    in_files = []
    for file in args.in_files:
        file = Path(file)
        if not file.exists():
            print("Input file", file, "does not exist", file=sys.stderr)
            return print_usage()
        in_files.append(file)

    out_file = Path(args.out_file)
    if out_file.exists():
        print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr)
        return

    description = args.description or ""
    if len(description.encode("utf-8")) > 255:
        print("Description is longer than 255 bytes in utf-8, please shorten it", file=sys.stderr)
        return

    stats = sudachipy.build_system_dic(
        matrix=matrix,
        lex=in_files,
        output=out_file,
        description=description,
    )
    for (name, size, time) in stats:
        print("{} -> {} in {:.2F} sec".format(name, size, time))


def _command_user_build(args, print_usage):
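    """Handle the ``ubuild`` sub-command: build a user dictionary against an
    existing system dictionary from one or more CSV lexicon files."""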
    system = Path(args.system_dic)
    if not system.exists():
        print("System dictionary file", system, "does not exist", file=sys.stderr)
        return print_usage()

    in_files = []
    for file in args.in_files:
        file = Path(file)
        if not file.exists():
            print("Input file", file, "does not exist", file=sys.stderr)
            return print_usage()
        in_files.append(file)

    out_file = Path(args.out_file)
    if out_file.exists():
        print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr)
        return

    description = args.description or ""
    if len(description.encode("utf-8")) > 255:
        print("Description is longer than 255 bytes in utf-8, please shorten it", file=sys.stderr)
        return

    stats = sudachipy.build_user_dic(
        system=system,
        lex=in_files,
        output=out_file,
        description=description,
    )
    for (name, size, time) in stats:
        print("{} -> {} in {:.2F} sec".format(name, size, time))


def print_version():
    print('sudachipy {}'.format(__version__))


def main():
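    """Parse command-line arguments and dispatch to the selected sub-command,
    defaulting to ``tokenize`` when none is given (the module is typically
    exposed as the ``sudachipy`` console script)."""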
    parser = argparse.ArgumentParser(
        description="Japanese Morphological Analyzer")
    subparsers = parser.add_subparsers(description='')

    # root, tokenizer parser
    parser_tk = subparsers.add_parser(
        'tokenize', help='(default) see `tokenize -h`', description='Tokenize Text')
    parser_tk.add_argument("-r", dest="fpath_setting",
                           metavar="file", help="the setting file in JSON format")
    parser_tk.add_argument(
        "-m", dest="mode", choices=["A", "B", "C"], default="C", help="the mode of splitting")
    parser_tk.add_argument("-o", dest="fpath_out",
                           metavar="file", help="the output file")
    parser_tk.add_argument("-s", dest="system_dict_type", metavar='string', choices=["small", "core", "full"],
                           help="sudachidict type")
    parser_tk.add_argument("-a", action="store_true",
                           help="print all of the fields")
    parser_tk.add_argument("-d", action="store_true",
                           help="print the debug information")
    parser_tk.add_argument("-v", "--version", action="store_true",
                           dest="version", help="print sudachipy version")
    parser_tk.add_argument("in_files", metavar="file",
                           nargs=argparse.ZERO_OR_MORE, help='text written in utf-8')
    parser_tk.set_defaults(handler=_command_tokenize,
                           print_usage=parser_tk.print_usage)

    # build dictionary parser
    parser_bd = subparsers.add_parser(
        'build', help='see `build -h`', description='Build Sudachi Dictionary')
    parser_bd.add_argument('-o', dest='out_file', metavar='file', default='system.dic',
                           help='output file (default: system.dic)')
    parser_bd.add_argument('-d', dest='description', default='', metavar='string', required=False,
                           help='description comment to be embedded on dictionary')
    required_named_bd = parser_bd.add_argument_group(
        'required named arguments')
    required_named_bd.add_argument('-m', dest='matrix_file', metavar='file', required=True,
                                   help='connection matrix file with MeCab\'s matrix.def format')
    parser_bd.add_argument("in_files", metavar="file", nargs=argparse.ONE_OR_MORE,
                           help='source files with CSV format (one or more)')
    parser_bd.set_defaults(handler=_command_build,
                           print_usage=parser_bd.print_usage)

    # build user-dictionary parser
    parser_ubd = subparsers.add_parser(
        'ubuild', help='see `ubuild -h`', description='Build User Dictionary')
    parser_ubd.add_argument('-d', dest='description', default='', metavar='string', required=False,
                            help='description comment to be embedded on dictionary')
    parser_ubd.add_argument('-o', dest='out_file', metavar='file', default='user.dic',
                            help='output file (default: user.dic)')
    parser_ubd.add_argument('-s', dest='system_dic', metavar='file', required=False,
                            help='system dictionary path (default: system core dictionary path)')
    parser_ubd.add_argument("in_files", metavar="file", nargs=argparse.ONE_OR_MORE,
                            help='source files with CSV format (one or more)')
    parser_ubd.set_defaults(handler=_command_user_build,
                            print_usage=parser_ubd.print_usage)

    parser.set_default_subparser('tokenize')

    args = parser.parse_args()

    if hasattr(args, 'handler'):
        args.handler(args, args.print_usage)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()