ai-content-maker/.venv/Lib/site-packages/sudachipy/sudachipy.pyi

351 lines
11 KiB
Python

from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set
from sudachipy.config import Config
# Fully specified part-of-speech tag: a tuple of exactly six strings.
POS = Tuple[str, str, str, str, str, str]

# A single POS element; None is permitted so that a position can be left
# unspecified inside a PartialPOS.
PE = Optional[str]

# Partially specified POS: a tuple holding anywhere from zero to six
# POS elements.
PartialPOS = Union[
    Tuple[PE, PE, PE, PE, PE, PE],
    Tuple[PE, PE, PE, PE, PE],
    Tuple[PE, PE, PE, PE],
    Tuple[PE, PE, PE],
    Tuple[PE, PE],
    Tuple[PE],
    Tuple[()],
]

# Optional set of field names, used by the `fields` parameters in this module
# to load only a subset of fields.
FieldSet = Optional[
    Set[
        Literal[
            "surface",
            "pos",
            "normalized_form",
            "dictionary_form",
            "reading_form",
            "word_structure",
            "split_a",
            "split_b",
            "synonym_group_id",
        ]
    ]
]
class SplitMode:
    """
    Unit used to split text.

    A -- short mode
    B -- middle mode
    C -- long mode
    """

    # The three available modes, exposed as class-level constants.
    A: ClassVar[SplitMode] = ...
    B: ClassVar[SplitMode] = ...
    C: ClassVar[SplitMode] = ...

    # NOTE(review): a classmethod `__init__` looks like a stub-generator
    # artifact for a native class -- confirm before relying on it.
    @classmethod
    def __init__(cls) -> None: ...
class Dictionary:
    """
    A sudachi dictionary.
    """

    @classmethod
    def __init__(
        cls,
        config_path: Optional[str | Config] = ...,
        resource_dir: Optional[str] = ...,
        dict: Optional[str] = None,
        dict_type: Optional[str] = None,
        *,
        config: Optional[str | Config] = ...,
    ) -> None:
        """
        Creates a sudachi dictionary.

        If both config.systemDict and dict are not given, `sudachidict_core` is used.
        If both config.systemDict and dict are given, dict_type is used.

        :param config_path: path to the configuration JSON file, config json as a string,
            or a [sudachipy.config.Config] object
        :param config: alias to config_path, only one of them can be specified at the same time
        :param resource_dir: path to the resource directory folder
        :param dict: type of pre-packaged system dictionary, referring to sudachidict_<dict>
            packages on PyPI: https://pypi.org/search/?q=sudachidict.
            Also, can be an _absolute_ path to a compiled dictionary file.
        :param dict_type: deprecated alias to dict
        """
        # NOTE(review): the precedence sentence above names `dict_type`, which is
        # documented as a deprecated alias of `dict`; presumably the value passed
        # as `dict` wins over config.systemDict -- confirm.
        ...

    def close(self) -> None:
        """
        Close this dictionary.
        """
        ...

    def create(
        self,
        mode: SplitMode = SplitMode.C,
        fields: FieldSet = None,
        *,
        projection: Optional[str] = None,
    ) -> Tokenizer:
        """
        Creates a Sudachi Tokenizer.

        :param mode: sets the analysis mode for this Tokenizer
        :param fields: load only a subset of fields.
            See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
        :param projection: Projection override for created Tokenizer.
            See Config.projection for values. None (the default) means no override is passed.
        """
        ...

    def pos_matcher(self, target: Union[Iterable[PartialPOS], Callable[[POS], bool]]) -> PosMatcher:
        """
        Creates a new POS matcher.

        If target is a function, then it must return whether a POS should match or not.
        If target is a list, it should contain partially specified POS.
        By partially specified it means that it is possible to omit POS fields or
        use None as a sentinel value that matches any POS.

        For example, ('名詞',) will match any noun and
        (None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form.

        :param target: can be either a function or a list of POS tuples.
        """
        ...

    def pre_tokenizer(
        self,
        mode: SplitMode = SplitMode.C,
        fields: FieldSet = None,
        handler: Optional[Callable[[int, object, MorphemeList], list]] = None,
        *,
        projection: Optional[str] = None,
    ) -> object:
        """
        Creates HuggingFace Tokenizers-compatible PreTokenizer.
        Requires package `tokenizers` to be installed.

        :param mode: Use this split mode (C by default)
        :param fields: ask Sudachi to load only a subset of fields.
            See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
        :param handler: custom callable to transform MorphemeList into list of tokens.
            See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
            First two parameters are the index (int) and HuggingFace NormalizedString.
            The handler must return a List[NormalizedString]. By default, just segment the tokens.
        :param projection: Projection override for created Tokenizer.
            See Config.projection for values. None (the default) means no override is passed.
        """
        ...

    def pos_of(self, pos_id: int) -> Optional[POS]:
        """
        Returns POS with the given id.

        :param pos_id: POS id
        :return: POS tuple with the given id; the declared return type also allows None.
        """
        ...

    def lookup(self, query: str, out: Optional[MorphemeList] = None) -> MorphemeList:
        """
        Looks up `query` in this dictionary and returns the result as a MorphemeList.

        NOTE(review): undocumented upstream in this stub; presumably `out` is an
        output parameter like the one accepted by Tokenizer.tokenize -- confirm.

        :param query: string to look up
        :param out: optional MorphemeList; None by default
        """
        ...
class Morpheme:
    """A morpheme (basic semantic unit of language)."""

    def __init__(self) -> None: ...

    def begin(self) -> int:
        """Return the index in the input text at which this morpheme begins."""
        ...

    def dictionary_form(self) -> str:
        """Return the dictionary form."""
        ...

    def dictionary_id(self) -> int:
        """Return the id of the dictionary this word belongs to."""
        ...

    def end(self) -> int:
        """Return the index in the input text at which this morpheme ends."""
        ...

    def get_word_info(self) -> WordInfo:
        """Return the word info."""
        ...

    def is_oov(self) -> bool:
        """Return whether this is an out-of-vocabulary word."""
        ...

    def normalized_form(self) -> str:
        """Return the normalized form."""
        ...

    def part_of_speech(self) -> POS:
        """Return the part of speech."""
        ...

    def part_of_speech_id(self) -> int:
        """Return the id of the part of speech in the dictionary."""
        ...

    def reading_form(self) -> str:
        """Return the reading form."""
        ...

    def split(self, mode: SplitMode, out: Optional[MorphemeList] = None, add_single: bool = True) -> MorphemeList:
        """
        Return the sub-morphemes of this morpheme in the provided split mode.

        :param mode: mode of new split
        :param out: write results to this MorphemeList instead of creating a new one.
            See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
            more information on output parameters.
            Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
        :param add_single: return lists with the current morpheme if the split hasn't produced any elements.
            When False is passed, empty lists are returned instead.
        """
        ...

    def surface(self) -> str:
        """
        Return the projected string for this morpheme (the surface by default).

        See Config.projection
        """
        ...

    def raw_surface(self) -> str:
        """Return the surface string regardless of the value of Config.projection."""
        ...

    def synonym_group_ids(self) -> List[int]:
        """Return the list of synonym group ids."""
        ...

    def word_id(self) -> int:
        """Return the word id of this word in the dictionary."""
        ...

    def __len__(self) -> int:
        """Return the morpheme length in codepoints."""
        ...
class MorphemeList:
    """
    A list of morphemes.

    An object can not be instantiated manually.
    Use Tokenizer.tokenize("") to create an empty morpheme list.
    """

    def __init__(self) -> None: ...

    @classmethod
    def empty(cls, dict: Dictionary) -> MorphemeList:
        """
        Returns an empty morpheme list with dictionary.

        :param dict: the dictionary the new list is created with
        """
        ...

    def get_internal_cost(self) -> int:
        """
        Returns the total cost of the path.
        """
        ...

    def size(self) -> int:
        """
        Returns the number of morphemes in this list.
        """
        ...

    # NOTE(review): `index` is left unannotated as in the original stub;
    # presumably an int position -- confirm whether slices are accepted.
    def __getitem__(self, index) -> Morpheme: ...

    def __iter__(self) -> Iterator[Morpheme]: ...

    def __len__(self) -> int: ...
class Tokenizer:
    """Breaks text into morphemes; instances are returned by Dictionary.create."""

    SplitMode: ClassVar[SplitMode] = ...

    @classmethod
    def __init__(cls) -> None: ...

    def tokenize(
        self,
        text: str,
        mode: SplitMode = ...,
        out: Optional[MorphemeList] = None,
    ) -> MorphemeList:
        """
        Break text into morphemes.

        SudachiPy 0.5.* had logger parameter, it is accepted, but ignored.

        :param text: text to analyze
        :param mode: analysis mode.
            This parameter is deprecated.
            Pass the analysis mode at the Tokenizer creation time and create different
            tokenizers for different modes.
            If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead.
        :param out: tokenization results will be written into this MorphemeList;
            when it is not given, a new one will be created instead.
            See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
        """
        ...
class WordInfo:
    """Dictionary information about a word, as returned by Morpheme.get_word_info."""

    # NOTE(review): every field is declared as ClassVar in this stub; presumably
    # these are per-instance values of a native object -- confirm before
    # changing the declarations.
    a_unit_split: ClassVar[List[int]] = ...
    b_unit_split: ClassVar[List[int]] = ...
    dictionary_form: ClassVar[str] = ...
    dictionary_form_word_id: ClassVar[int] = ...
    head_word_length: ClassVar[int] = ...
    normalized_form: ClassVar[str] = ...
    pos_id: ClassVar[int] = ...
    reading_form: ClassVar[str] = ...
    surface: ClassVar[str] = ...
    synonym_group_ids: ClassVar[List[int]] = ...
    word_structure: ClassVar[List[int]] = ...

    # First parameter is named `cls` because the method is a classmethod,
    # matching the other classmethod `__init__` declarations in this stub.
    @classmethod
    def __init__(cls) -> None: ...

    def length(self) -> int: ...
class PosMatcher:
    """
    Matcher over part-of-speech tags, as returned by Dictionary.pos_matcher.

    Matchers can be combined with the `|`, `&`, `-` and `~` operators.
    """

    def __iter__(self) -> Iterator[POS]: ...

    def __len__(self) -> int: ...

    def __call__(self, m: Morpheme) -> bool:
        """
        Tests a morpheme against this matcher.

        :param m: morpheme
        :return: whether the morpheme has a matching POS
        """
        ...

    def __or__(self, other: PosMatcher) -> PosMatcher:
        """
        Union: the result matches a POS if either of the two matchers would match it.

        :return: PosMatcher
        """
        ...

    def __and__(self, other: PosMatcher) -> PosMatcher:
        """
        Intersection: the result matches a POS only if both matchers would match it.

        :return: PosMatcher
        """
        ...

    def __sub__(self, other: PosMatcher) -> PosMatcher:
        """
        Difference: the result matches a POS if self would match it and other would not.

        :return: PosMatcher
        """
        ...

    def __invert__(self) -> PosMatcher:
        """
        Complement: the result matches all POS tags except the ones defined in this matcher.

        :return: PosMatcher
        """
        ...