# Typing stub: declarations only; every body is elided with `...`.
from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set
from sudachipy.config import Config


# Fully specified part-of-speech tag: a tuple of exactly six strings.
POS = Tuple[str, str, str, str, str, str]

# POS element: a string, or None (used by Dictionary.pos_matcher as a
# sentinel value that matches any POS).
PE = Optional[str]

# Partially specified POS: a prefix of zero to six POS elements.
PartialPOS = Union[
    Tuple[PE, PE, PE, PE, PE, PE],
    Tuple[PE, PE, PE, PE, PE],
    Tuple[PE, PE, PE, PE],
    Tuple[PE, PE, PE],
    Tuple[PE, PE],
    Tuple[PE],
    Tuple[()],
]

# Optional set of field names, accepted as the `fields` argument by
# Dictionary.create and Dictionary.pre_tokenizer.
FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form",
                                "word_structure", "split_a", "split_b", "synonym_group_id"]]]


class SplitMode:
    """
    Unit to split text.

    A == short mode

    B == middle mode

    C == long mode
    """

    # The three split modes, exposed as class-level constants.
    A: ClassVar[SplitMode] = ...
    B: ClassVar[SplitMode] = ...
    C: ClassVar[SplitMode] = ...

    # NOTE(review): __init__ is declared as a classmethod in the original stub;
    # kept unchanged so the declared interface stays the same.
    @classmethod
    def __init__(cls) -> None: ...


class Dictionary:
    """
    A sudachi dictionary.
    """

    # NOTE(review): __init__ is declared as a classmethod in the original stub;
    # kept unchanged so the declared interface stays the same.
    @classmethod
    def __init__(cls, config_path: Optional[Union[str, Config]] = ..., resource_dir: Optional[str] = ...,
                 dict: Optional[str] = None, dict_type: Optional[str] = None, *,
                 config: Optional[Union[str, Config]] = ...) -> None:
        """
        Creates a sudachi dictionary.

        If both config.systemDict and dict are not given, `sudachidict_core` is used.
        If both config.systemDict and dict are given, dict_type is used.

        :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.config.Config] object
        :param config: alias to config_path, only one of them can be specified at the same time
        :param resource_dir: path to the resource directory folder
        :param dict: type of pre-packaged system dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
            Also, can be an _absolute_ path to a compiled dictionary file.
        :param dict_type: deprecated alias to dict
        """
        ...

    def close(self) -> None:
        """
        Close this dictionary.
        """
        ...

    def create(self,
               mode: SplitMode = SplitMode.C,
               fields: FieldSet = None,
               *,
               projection: Optional[str] = None) -> Tokenizer:
        """
        Creates a Sudachi Tokenizer.

        :param mode: sets the analysis mode for this Tokenizer
        :param fields: load only a subset of fields.
            See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
        :param projection: Projection override for created Tokenizer. See Config.projection for values.
        """
        ...

    def pos_matcher(self, target: Union[Iterable[PartialPOS], Callable[[POS], bool]]) -> PosMatcher:
        """
        Creates a new POS matcher.

        If target is a function, then it must return whether a POS should match or not.
        If target is a list, it should contain partially specified POS.
        By partially specified it means that it is possible to omit POS fields or
        use None as a sentinel value that matches any POS.

        For example, ('名詞',) will match any noun and
        (None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form.

        :param target: can be either a function or a list of POS tuples.
        """
        ...

    def pre_tokenizer(self,
                      mode: SplitMode = SplitMode.C,
                      fields: FieldSet = None,
                      handler: Optional[Callable[[int, object, MorphemeList], list]] = None,
                      *,
                      projection: Optional[str] = None) -> object:
        """
        Creates HuggingFace Tokenizers-compatible PreTokenizer.
        Requires package `tokenizers` to be installed.

        :param mode: Use this split mode (C by default)
        :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
        :param handler: custom callable to transform MorphemeList into list of tokens. See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
            First two parameters are the index (int) and HuggingFace NormalizedString.
            The handler must return a List[NormalizedString]. By default, just segment the tokens.
        :param projection: Projection override for created Tokenizer. See Config.projection for values.
        """
        ...

    def pos_of(self, pos_id: int) -> Optional[POS]:
        """
        Returns POS with the given id.

        :param pos_id: POS id
        :return: POS tuple with the given id. The declared return type also allows None
            (presumably when no POS has this id; confirm against the implementation).
        """
        ...

    # NOTE(review): undocumented in the original stub. `out` presumably follows the same
    # output-parameter convention as Tokenizer.tokenize; confirm against the implementation.
    def lookup(self, query: str, out: Optional[MorphemeList] = None) -> MorphemeList: ...


class Morpheme:
    """
    A morpheme (basic semantic unit of language).
    """

    def __init__(self) -> None: ...

    def begin(self) -> int:
        """
        Returns the begin index of this morpheme in the input text.
        """
        ...

    def dictionary_form(self) -> str:
        """
        Returns the dictionary form.
        """
        ...

    def dictionary_id(self) -> int:
        """
        Returns the id of the dictionary to which this word belongs.
        """
        ...

    def end(self) -> int:
        """
        Returns the end index of this morpheme in the input text.
        """
        ...

    def get_word_info(self) -> WordInfo:
        """
        Returns the word info.
        """
        ...

    def is_oov(self) -> bool:
        """
        Returns whether this is an out-of-vocabulary word.
        """
        ...

    def normalized_form(self) -> str:
        """
        Returns the normalized form.
        """
        ...

    def part_of_speech(self) -> POS:
        """
        Returns the part of speech.
        """
        ...

    def part_of_speech_id(self) -> int:
        """
        Returns the id of the part of speech in the dictionary.
        """
        ...

    def reading_form(self) -> str:
        """
        Returns the reading form.
        """
        ...

    def split(self, mode: SplitMode, out: Optional[MorphemeList] = None, add_single: bool = True) -> MorphemeList:
        """
        Returns sub-morphemes in the provided split mode.

        :param mode: mode of new split
        :param out: write results to this MorphemeList instead of creating new one
            See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
            more information on output parameters.
            Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
        :param add_single: return lists with the current morpheme if the split hasn't produced any elements.
            When False is passed, empty lists are returned instead.
        """
        ...

    def surface(self) -> str:
        """
        Returns the projected string for the given morpheme (by default surface).
        See Config.projection
        """
        ...

    def raw_surface(self) -> str:
        """
        Returns the surface string no matter the value of Config.projection
        """
        ...

    def synonym_group_ids(self) -> List[int]:
        """
        Returns the list of synonym group ids.
        """
        ...

    def word_id(self) -> int:
        """
        Returns word id of this word in the dictionary.
        """
        ...

    def __len__(self) -> int:
        """
        Returns morpheme length in codepoints
        """
        ...


class MorphemeList:
    """
    A list of morphemes.
    An object can not be instantiated manually.
    Use Tokenizer.tokenize("") to create an empty morpheme list.
    """

    def __init__(self) -> None: ...

    @classmethod
    def empty(cls, dict: Dictionary) -> MorphemeList:
        """
        Returns an empty morpheme list with dictionary.

        :param dict: the dictionary the new list is associated with
        """
        ...

    def get_internal_cost(self) -> int:
        """
        Returns the total cost of the path.
        """
        ...

    def size(self) -> int:
        """
        Returns the number of morphemes in this list.
        """
        ...

    # Sequence protocol: indexing yields a single Morpheme, iteration yields
    # Morpheme objects, len() yields an int.
    def __getitem__(self, index: int) -> Morpheme: ...

    def __iter__(self) -> Iterator[Morpheme]: ...

    def __len__(self) -> int: ...


class Tokenizer:
    # Class-level attribute named after, and annotated with, the SplitMode type.
    SplitMode: ClassVar[SplitMode] = ...

    # NOTE(review): __init__ is declared as a classmethod in the original stub;
    # kept unchanged so the declared interface stays the same.
    @classmethod
    def __init__(cls) -> None: ...

    def tokenize(self, text: str,
                 mode: SplitMode = ...,
                 out: Optional[MorphemeList] = None) -> MorphemeList:
        """
        Break text into morphemes.

        SudachiPy 0.5.* had logger parameter, it is accepted, but ignored.

        :param text: text to analyze
        :param mode: analysis mode.
            This parameter is deprecated.
            Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes.
            If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead.
        :param out: if given, tokenization results will be written into this MorphemeList;
            otherwise a new one will be created instead.
            See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
        """
        ...


class WordInfo:
    # NOTE(review): every field is annotated as ClassVar in the original stub;
    # presumably these are per-instance values. Confirm before changing the annotations.
    a_unit_split: ClassVar[List[int]] = ...
    b_unit_split: ClassVar[List[int]] = ...
    dictionary_form: ClassVar[str] = ...
    dictionary_form_word_id: ClassVar[int] = ...
    head_word_length: ClassVar[int] = ...
    normalized_form: ClassVar[str] = ...
    pos_id: ClassVar[int] = ...
    reading_form: ClassVar[str] = ...
    surface: ClassVar[str] = ...
    synonym_group_ids: ClassVar[List[int]] = ...
    word_structure: ClassVar[List[int]] = ...

    # Declared as a classmethod in the original stub, so the implicit first
    # argument is the class and is named `cls` accordingly.
    @classmethod
    def __init__(cls) -> None: ...

    def length(self) -> int: ...


class PosMatcher:
    # Iterating yields POS tuples; len() yields an int.
    def __iter__(self) -> Iterator[POS]: ...

    def __len__(self) -> int: ...

    def __call__(self, m: Morpheme) -> bool:
        """
        Checks whether a morpheme has matching POS

        :param m: morpheme
        :return: if morpheme has matching POS
        """
        ...

    def __or__(self, other: PosMatcher) -> PosMatcher:
        """
        Returns a POS matcher which matches a POS if any of two matchers would match it

        :param other: the second matcher
        :return: PosMatcher
        """
        ...

    def __and__(self, other: PosMatcher) -> PosMatcher:
        """
        Returns a POS matcher which matches a POS if both matchers would match it at the same time

        :param other: the second matcher
        :return: PosMatcher
        """
        ...

    def __sub__(self, other: PosMatcher) -> PosMatcher:
        """
        Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS

        :param other: matcher whose POS are excluded
        :return: PosMatcher
        """
        ...

    def __invert__(self) -> PosMatcher:
        """
        Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher

        :return: PosMatcher
        """
        ...