from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set
from sudachipy.config import Config

# A fully specified part-of-speech tag: a tuple of exactly six strings.
POS = Tuple[str, str, str, str, str, str]

# A single POS element of a partially specified POS; either a string or None.
PE = Optional[str]

# A partially specified POS: any tuple of zero up to six POS elements.
PartialPOS = Union[
    Tuple[PE, PE, PE, PE, PE, PE],
    Tuple[PE, PE, PE, PE, PE],
    Tuple[PE, PE, PE, PE],
    Tuple[PE, PE, PE],
    Tuple[PE, PE],
    Tuple[PE],
    Tuple[()],
]

# Type of the `fields` arguments in this file: a set of field names, or None.
FieldSet = Optional[
    Set[
        Literal[
            "surface",
            "pos",
            "normalized_form",
            "dictionary_form",
            "reading_form",
            "word_structure",
            "split_a",
            "split_b",
            "synonym_group_id",
        ]
    ]
]

class SplitMode:
    """
    Unit that text is split into.

    Three modes are declared:

    * ``A`` -- short mode
    * ``B`` -- middle mode
    * ``C`` -- long mode
    """

    # One class-level constant per split mode.
    A: ClassVar[SplitMode] = ...
    B: ClassVar[SplitMode] = ...
    C: ClassVar[SplitMode] = ...

    @classmethod
    def __init__(cls) -> None:
        ...


class Dictionary:
    """
    A sudachi dictionary.

    Serves as the factory for tokenizers (`create`), HuggingFace-compatible
    pre-tokenizers (`pre_tokenizer`) and POS matchers (`pos_matcher`).
    """

    @classmethod
    def __init__(cls,
                 config_path: Optional[Union[str, Config]] = ...,
                 resource_dir: Optional[str] = ...,
                 dict: Optional[str] = None,
                 dict_type: Optional[str] = None,
                 *,
                 config: Optional[Union[str, Config]] = ...) -> None:
        """
        Creates a sudachi dictionary.

        If both config.systemDict and dict are not given, `sudachidict_core` is used.
        If both config.systemDict and dict are given, dict_type is used.
        (NOTE(review): the sentence above is kept from the original text; since
        dict_type is documented as an alias of dict, confirm which value wins.)

        :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.config.Config] object
        :param config: alias to config_path, only one of them can be specified at the same time
        :param resource_dir: path to the resource directory folder
        :param dict: type of pre-packaged system dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
            Also, can be an _absolute_ path to a compiled dictionary file.
        :param dict_type: deprecated alias to dict
        """
        ...

    def close(self) -> None:
        """
        Close this dictionary.
        """
        ...

    def create(self,
               mode: SplitMode = SplitMode.C,
               fields: FieldSet = None,
               *,
               projection: Optional[str] = None) -> Tokenizer:
        """
        Creates a Sudachi Tokenizer.

        :param mode: sets the analysis mode for this Tokenizer (C by default)
        :param fields: load only a subset of fields.
            See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
        :param projection: Projection override for created Tokenizer. See Config.projection for values.
            Keyword-only; defaults to None.
        :return: the created Tokenizer
        """
        ...

    def pos_matcher(self, target: Union[Iterable[PartialPOS], Callable[[POS], bool]]) -> PosMatcher:
        """
        Creates a new POS matcher.

        If target is a function, then it must return whether a POS should match or not.
        If target is a list, it should contain partially specified POS.
        By partially specified it means that it is possible to omit POS fields or
        use None as a sentinel value that matches any POS.

        For example, ('名詞',) will match any noun and
        (None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form.

        :param target: can be either a function or a list of POS tuples.
        :return: the created PosMatcher
        """
        ...

    def pre_tokenizer(self,
                      mode: SplitMode = SplitMode.C,
                      fields: FieldSet = None,
                      handler: Optional[Callable[[int, object, MorphemeList], list]] = None,
                      *,
                      projection: Optional[str] = None) -> object:
        """
        Creates HuggingFace Tokenizers-compatible PreTokenizer.
        Requires package `tokenizers` to be installed.

        :param mode: Use this split mode (C by default)
        :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
        :param handler: custom callable to transform MorphemeList into list of tokens. See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
            First two parameters are the index (int) and HuggingFace NormalizedString.
            The handler must return a List[NormalizedString]. By default, just segment the tokens.
        :param projection: Projection override for created Tokenizer. See Config.projection for values.
            Keyword-only; defaults to None.
        """
        ...

    def pos_of(self, pos_id: int) -> Optional[POS]:
        """
        Returns POS with the given id.

        :param pos_id: POS id
        :return: POS tuple with the given id. The annotation also allows None
            (presumably for an id without a POS -- confirm against the implementation).
        """
        ...

    def lookup(self, query: str, out: Optional[MorphemeList] = None) -> MorphemeList:
        """
        Looks up `query` in this dictionary and returns the result as a MorphemeList.

        NOTE(review): the exact matching semantics are not visible from this stub;
        confirm against the implementation.

        :param query: string to look up
        :param out: optional MorphemeList; presumably used as an output parameter
            in the same way as in Tokenizer.tokenize -- confirm
        """
        ...

class Morpheme:
    """
    A morpheme (basic semantic unit of language).
    """

    def __init__(self) -> None:
        ...

    def begin(self) -> int:
        """Returns the index in the input text at which this morpheme begins."""
        ...

    def dictionary_form(self) -> str:
        """Returns the dictionary form."""
        ...

    def dictionary_id(self) -> int:
        """Returns the id of the dictionary this word belongs to."""
        ...

    def end(self) -> int:
        """Returns the index in the input text at which this morpheme ends."""
        ...

    def get_word_info(self) -> WordInfo:
        """Returns the word info."""
        ...

    def is_oov(self) -> bool:
        """Returns whether this is an out-of-vocabulary word."""
        ...

    def normalized_form(self) -> str:
        """Returns the normalized form."""
        ...

    def part_of_speech(self) -> POS:
        """Returns the part of speech."""
        ...

    def part_of_speech_id(self) -> int:
        """Returns the id of the part of speech in the dictionary."""
        ...

    def reading_form(self) -> str:
        """Returns the reading form."""
        ...

    def split(
        self,
        mode: SplitMode,
        out: Optional[MorphemeList] = None,
        add_single: bool = True,
    ) -> MorphemeList:
        """
        Returns sub-morphemes in the provided split mode.

        :param mode: mode of the new split
        :param out: write results to this MorphemeList instead of creating a new one.
            See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
            more information on output parameters.
            Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
        :param add_single: return lists with the current morpheme if the split hasn't produced any elements.
            When False is passed, empty lists are returned instead.
        """
        ...

    def surface(self) -> str:
        """
        Returns the projected string for the given morpheme (by default surface).
        See Config.projection
        """
        ...

    def raw_surface(self) -> str:
        """Returns the surface string no matter the value of Config.projection."""
        ...

    def synonym_group_ids(self) -> List[int]:
        """Returns the list of synonym group ids."""
        ...

    def word_id(self) -> int:
        """Returns the word id of this word in the dictionary."""
        ...

    def __len__(self) -> int:
        """Returns morpheme length in codepoints."""
        ...


class MorphemeList:
    """
    A list of morphemes.

    An object cannot be instantiated manually.
    Use Tokenizer.tokenize("") to create an empty morpheme list.
    """
    def __init__(self) -> None: ...

    @classmethod
    def empty(cls, dict: Dictionary) -> MorphemeList:
        """
        Returns an empty morpheme list with dictionary.

        :param dict: dictionary for the new list
        :return: an empty MorphemeList
        """
        ...

    def get_internal_cost(self) -> int:
        """
        Returns the total cost of the path.
        """
        ...

    def size(self) -> int:
        """
        Returns the number of morphemes in this list.
        """
        ...

    # Sequence-style access: index, iterate and take the length of the list.
    def __getitem__(self, index: int) -> Morpheme: ...
    def __iter__(self) -> Iterator[Morpheme]: ...
    def __len__(self) -> int: ...


class Tokenizer:
    """
    Breaks text into morphemes.

    Dictionary.create returns instances of this class.
    """

    SplitMode: ClassVar[SplitMode] = ...

    @classmethod
    def __init__(cls) -> None:
        ...

    def tokenize(
        self,
        text: str,
        mode: SplitMode = ...,
        out: Optional[MorphemeList] = None,
    ) -> MorphemeList:
        """
        Break text into morphemes.

        SudachiPy 0.5.* had logger parameter, it is accepted, but ignored.

        :param text: text to analyze
        :param mode: analysis mode.
            This parameter is deprecated.
            Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes.
            If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead.
        :param out: tokenization results will be written into this MorphemeList;
            if it is not given, a new one will be created instead.
            See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
        """
        ...


class WordInfo:
    """
    Word information, as returned by Morpheme.get_word_info().
    """

    # NOTE(review): every field below is declared as a ClassVar; presumably these
    # are per-word values -- confirm against the implementation.
    a_unit_split: ClassVar[List[int]] = ...
    b_unit_split: ClassVar[List[int]] = ...
    dictionary_form: ClassVar[str] = ...
    dictionary_form_word_id: ClassVar[int] = ...
    head_word_length: ClassVar[int] = ...
    normalized_form: ClassVar[str] = ...
    pos_id: ClassVar[int] = ...
    reading_form: ClassVar[str] = ...
    surface: ClassVar[str] = ...
    synonym_group_ids: ClassVar[List[int]] = ...
    word_structure: ClassVar[List[int]] = ...

    # The receiver of a classmethod is named `cls`, matching the other
    # classmethod `__init__` declarations in this file.
    @classmethod
    def __init__(cls) -> None: ...
    def length(self) -> int: ...

class PosMatcher:
    """
    A matcher over POS tags.

    Dictionary.pos_matcher returns instances of this class. Matchers can be
    combined with the ``|``, ``&``, ``-`` and ``~`` operators.
    """

    def __iter__(self) -> Iterator[POS]:
        ...

    def __len__(self) -> int:
        ...

    def __call__(self, m: Morpheme) -> bool:
        """
        Checks whether a morpheme has a matching POS.

        :param m: morpheme
        :return: if morpheme has matching POS
        """
        ...

    def __or__(self, other: PosMatcher) -> PosMatcher:
        """
        Union: the returned matcher matches a POS if any of the two matchers would match it.

        :return: PosMatcher
        """
        ...

    def __and__(self, other: PosMatcher) -> PosMatcher:
        """
        Intersection: the returned matcher matches a POS if both matchers would match it at the same time.

        :return: PosMatcher
        """
        ...

    def __sub__(self, other: PosMatcher) -> PosMatcher:
        """
        Difference: the returned matcher matches a POS if self would match it and other would not.

        :return: PosMatcher
        """
        ...

    def __invert__(self) -> PosMatcher:
        """
        Complement: the returned matcher matches all POS tags except ones defined in the current POS matcher.

        :return: PosMatcher
        """
        ...