# Copyright (c) 2019 Works Applications Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse import fileinput import logging import os import sys from pathlib import Path from . import Dictionary, SplitMode from . import __version__ from . import sudachipy def _set_default_subparser(self, name, args=None): """ copy and modify code from https://bitbucket.org/ruamel/std.argparse """ subparser_found = False for arg in sys.argv[1:]: if arg in ['-h', '--help']: # global help if no subparser break else: for x in self._subparsers._actions: if not isinstance(x, argparse._SubParsersAction): continue for sp_name in x._name_parser_map.keys(): if sp_name in sys.argv[1:]: subparser_found = True if not subparser_found: # insert default in first position, this implies no # global options without a sub_parsers specified if args is None: sys.argv.insert(1, name) else: args.insert(0, name) argparse.ArgumentParser.set_default_subparser = _set_default_subparser def run(tokenizer, input_, output, print_all, morphs, is_stdout): # get an empty MorphemeList for memory reuse mlist = tokenizer.tokenize("") for line in input_: line = line.rstrip('\n') # out parameter means we are reusing memory here for m in tokenizer.tokenize(line, out=mlist): list_info = [ m.surface(), morphs[m.part_of_speech_id()], m.normalized_form()] if print_all: list_info += [ m.dictionary_form(), m.reading_form(), str(m.dictionary_id()), '[{}]'.format(','.join([str(synonym_group_id) for synonym_group_id in m.synonym_group_ids()]))] if m.is_oov(): list_info.append("(OOV)") output.write("\t".join(list_info)) output.write("\n") output.write("EOS\n") if is_stdout: output.flush() def _input_files_checker(args, print_usage): for file in args.in_files: if not os.path.exists(file): print_usage() print('{}: error: {} doesn\'t exist'.format( __name__, file), file=sys.stderr) exit(1) def _command_tokenize(args, print_usage): if args.version: print_version() return _input_files_checker(args, print_usage) if args.mode == "A": mode = SplitMode.A elif args.mode == "B": mode = SplitMode.B else: mode = SplitMode.C output = sys.stdout if args.fpath_out: output = open(args.fpath_out, "w", encoding="utf-8") stdout_logger = logging.getLogger(__name__) handler = logging.StreamHandler(sys.stdout) handler.setLevel(logging.DEBUG) stdout_logger.addHandler(handler) stdout_logger.setLevel(logging.DEBUG) stdout_logger.propagate = False print_all = args.a try: dict_ = Dictionary(config_path=args.fpath_setting, dict_type=args.system_dict_type) # empty matcher - get all POS tags all_morphs = dict_.pos_matcher([()]) # precompute output POS strings morphs = [",".join(ms) for ms in all_morphs] tokenizer_obj = dict_.create(mode=mode) input_ = fileinput.input( args.in_files, openhook=fileinput.hook_encoded("utf-8")) run(tokenizer_obj, input_, output, print_all, morphs, is_stdout=args.fpath_out is None) finally: if args.fpath_out: output.close() def _command_build(args, print_usage): matrix = Path(args.matrix_file) if not matrix.exists(): print("Matrix file", matrix, "does not exist", file=sys.stderr) return print_usage() in_files = [] for file in args.in_files: file = Path(file) if not file.exists(): print("Input file", file, "does not exists", file=sys.stderr) return print_usage() in_files.append(file) out_file = Path(args.out_file) if out_file.exists(): print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr) return description = args.description or "" if len(description.encode("utf-8")) > 255: print("Description is longer than 255 bytes in utf-8, it will be truncated") return stats = sudachipy.build_system_dic( matrix=matrix, lex=in_files, output=out_file, description=description, ) for (name, size, time) in stats: print("{} -> {} in {:.2F} sec".format(name, size, time)) def _command_user_build(args, print_usage): system = Path(args.system_dic) if not system.exists(): print("System dictionary file", system, "does not exist", file=sys.stderr) return print_usage() in_files = [] for file in args.in_files: file = Path(file) if not file.exists(): print("Input file", file, "does not exists", file=sys.stderr) return print_usage() in_files.append(file) out_file = Path(args.out_file) if out_file.exists(): print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr) return description = args.description or "" if len(description.encode("utf-8")) > 255: print("Description is longer than 255 bytes in utf-8, it will be truncated") return stats = sudachipy.build_user_dic( system=system, lex=in_files, output=out_file, description=description, ) for (name, size, time) in stats: print("{} -> {} in {:.2F} sec".format(name, size, time)) def print_version(): print('sudachipy {}'.format(__version__)) def main(): parser = argparse.ArgumentParser( description="Japanese Morphological Analyzer") subparsers = parser.add_subparsers(description='') # root, tokenizer parser parser_tk = subparsers.add_parser( 'tokenize', help='(default) see `tokenize -h`', description='Tokenize Text') parser_tk.add_argument("-r", dest="fpath_setting", metavar="file", help="the setting file in JSON format") parser_tk.add_argument( "-m", dest="mode", choices=["A", "B", "C"], default="C", help="the mode of splitting") parser_tk.add_argument("-o", dest="fpath_out", metavar="file", help="the output file") parser_tk.add_argument("-s", dest="system_dict_type", metavar='string', choices=["small", "core", "full"], help="sudachidict type") parser_tk.add_argument("-a", action="store_true", help="print all of the fields") parser_tk.add_argument("-d", action="store_true", help="print the debug information") parser_tk.add_argument("-v", "--version", action="store_true", dest="version", help="print sudachipy version") parser_tk.add_argument("in_files", metavar="file", nargs=argparse.ZERO_OR_MORE, help='text written in utf-8') parser_tk.set_defaults(handler=_command_tokenize, print_usage=parser_tk.print_usage) # build dictionary parser parser_bd = subparsers.add_parser( 'build', help='see `build -h`', description='Build Sudachi Dictionary') parser_bd.add_argument('-o', dest='out_file', metavar='file', default='system.dic', help='output file (default: system.dic)') parser_bd.add_argument('-d', dest='description', default='', metavar='string', required=False, help='description comment to be embedded on dictionary') required_named_bd = parser_bd.add_argument_group( 'required named arguments') required_named_bd.add_argument('-m', dest='matrix_file', metavar='file', required=True, help='connection matrix file with MeCab\'s matrix.def format') parser_bd.add_argument("in_files", metavar="file", nargs=argparse.ONE_OR_MORE, help='source files with CSV format (one of more)') parser_bd.set_defaults(handler=_command_build, print_usage=parser_bd.print_usage) # build user-dictionary parser parser_ubd = subparsers.add_parser( 'ubuild', help='see `ubuild -h`', description='Build User Dictionary') parser_ubd.add_argument('-d', dest='description', default='', metavar='string', required=False, help='description comment to be embedded on dictionary') parser_ubd.add_argument('-o', dest='out_file', metavar='file', default='user.dic', help='output file (default: user.dic)') parser_ubd.add_argument('-s', dest='system_dic', metavar='file', required=False, help='system dictionary path (default: system core dictionary path)') parser_ubd.add_argument("in_files", metavar="file", nargs=argparse.ONE_OR_MORE, help='source files with CSV format (one or more)') parser_ubd.set_defaults(handler=_command_user_build, print_usage=parser_ubd.print_usage) parser.set_default_subparser('tokenize') args = parser.parse_args() if hasattr(args, 'handler'): args.handler(args, args.print_usage) else: parser.print_help() if __name__ == '__main__': main()