#-*- coding: utf-8 -*-
"""
@author:Bengali.AI
"""
from __future__ import print_function
#-------------------------------------------
from .langs import languages
from bnunicodenormalizer import langs
#-------------------------------------------
# cleaner class
#-------------------------------------------
class BaseNormalizer(object):
    def __init__(self,
                 language,
                 allow_english=False,
                 keep_legacy_symbols=False,
                 legacy_maps=None):
        '''
            initialize a normalizer
            args:
                language            : language identifier/name type(str)
                allow_english       : allow english letters, numbers and punctuations [default:False] type(bool)
                keep_legacy_symbols : treat legacy symbols as valid unicodes [default:False] type(bool)
                legacy_maps         : a dictionary mapping legacy symbols to more commonly used unicodes [default:None] type(dict/None)
        '''
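        # Usage sketch (comments only, not executed here). The "bangla" key and
        # the sample word are illustrative assumptions; valid keys are whatever
        # the languages dict from .langs exposes:
        #
        #   norm   = BaseNormalizer(language="bangla", allow_english=True)
        #   result = norm("কতো")   # -> {"given":..., "normalized":..., "ops":[...]}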
        # error handling
        assert type(language)==str,"language is not string type!!!"
        assert language in languages.keys(),"Language is not available"
        assert type(allow_english)==bool,"allow_english is not of type boolean [True/False]"
        assert type(keep_legacy_symbols)==bool,"keep_legacy_symbols is not of type boolean [True/False]"

        self.lang = languages[language]

        if legacy_maps is not None:
            assert type(legacy_maps)==dict,"legacy_maps is not of type dict or None"
            assert len(legacy_maps.keys())>0,"legacy_maps is an empty dict"
            for k,v in legacy_maps.items():
                assert k in self.lang.legacy_symbols,f"{k} is not a legacy symbol. See the README.md initialization section for legacy symbols"
                assert v in self.lang.used,f"{v} is not a valid legacy map. See the README.md initialization section for legacy symbols"
        # assignments
        self.valid       = self.lang.valid
        self.roots       = self.lang.complex_roots
        self.legacy_maps = legacy_maps
        #------------------------- update valid and complex roots-----------------------------------
        if allow_english:
            self.valid=sorted(list(set(self.valid+languages["english"].valid)))
            self.roots=sorted(list(set(self.roots+languages["english"].valid)))
        if keep_legacy_symbols:
            self.valid=sorted(list(set(self.valid+self.lang.legacy_symbols)))
            self.roots=sorted(list(set(self.roots+self.lang.legacy_symbols)))

        self.word_level_ops  ={"LegacySymbols"   :self.mapLegacySymbols,
                               "BrokenDiacritics":self.fixBrokenDiacritics}

        self.decomp_level_ops={"BrokenNukta"              :self.fixBrokenNukta,
                               "InvalidUnicode"           :self.cleanInvalidUnicodes,
                               "InvalidConnector"         :self.cleanInvalidConnector,
                               "FixDiacritics"            :self.cleanDiacritics,
                               "VowelDiacriticAfterVowel" :self.cleanVowelDiacriticComingAfterVowel}
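        # Note: __call__ iterates these dicts, so the ops execute in
        # registration order (insertion order is guaranteed for dicts on
        # Python 3.7+); e.g. BrokenNukta always runs before InvalidUnicode.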
    #-------------------------common ops-----------------------------------------------------------------------------
    def checkDecomp(self):
        # True while the decomposition still holds characters
        return len(self.decomp)>0

    def replaceMaps(self,map_dict):
        # replace every key of map_dict found in the word with its mapped value
        for k,v in map_dict.items():
            self.word=self.word.replace(k,v)

    def swapIdxs(self,idx1,idx2):
        # swap two entries of the decomposition
        self.decomp[idx1],self.decomp[idx2]=self.decomp[idx2],self.decomp[idx1]

    def safeop(self,op):
        # flatten the decomposition and drop deleted (None) entries, run the op,
        # then reform the same way so every op sees a clean per-unicode list
        self.decomp=[x for x in self.decomp if x is not None]
        self.decomp=[x for x in "".join(self.decomp)]
        op()
        # reform
        self.decomp=[x for x in self.decomp if x is not None]
        self.decomp=[x for x in "".join(self.decomp)]

    def constructComplexDecomp(self):
        '''
            creates grapheme root based decomp:
            merges runs of unicodes joined by the connector into single complex roots
        '''
        if self.lang.connector in self.decomp:
            c_idxs = [i for i, x in enumerate(self.decomp) if x == self.lang.connector]
            # component wise index map: every connector claims itself and both neighbours
            comps=[[cid-1,cid,cid+1] for cid in c_idxs]
            # merge multi root: union overlapping index groups until none intersect
            r_decomp = []
            while len(comps)>0:
                first, *rest = comps
                first = set(first)

                lf = -1
                while len(first)>lf:
                    lf = len(first)

                    rest2 = []
                    for r in rest:
                        if len(first.intersection(set(r)))>0:
                            first |= set(r)
                        else:
                            rest2.append(r)
                    rest = rest2

                r_decomp.append(sorted(list(first)))
                comps = rest

            # collapse each merged index group into a single root string,
            # keeping the combined root at the group's last index
            combs=[]
            for ridx in r_decomp:
                comb=''
                for i in ridx:
                    comb+=self.decomp[i]
                combs.append(comb)
                for i in ridx:
                    if i==ridx[-1]:
                        self.decomp[i]=comb
                    else:
                        self.decomp[i]=None
            self.decomp=[d for d in self.decomp if d is not None]
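    # Worked example for constructComplexDecomp (illustrative, assuming the
    # Bengali connector hasanta '্'): for decomp=['ক','্','ত','ি'] the single
    # connector at index 1 yields comps=[[0,1,2]], which merges to one group
    # and collapses to the complex root 'ক্ত', leaving decomp=['ক্ত','ি'].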
    #-------------------------common ops-----------------------------------------------------------------------------
    #-------------------------word ops-----------------------------------------------------------------------------
    def mapLegacySymbols(self):
        # replace legacy symbols using the user supplied map
        if self.legacy_maps is not None:
            self.replaceMaps(self.legacy_maps)

    def fixBrokenDiacritics(self):
        # fix diacritics that are written as broken multi-unicode sequences
        if self.lang.diacritic_map is not None:
            self.replaceMaps(self.lang.diacritic_map)

    #-------------------------word ops-----------------------------------------------------------------------------
    #-------------------------unicode ops-----------------------------------------------------------------------------
    def fixBrokenNukta(self):
        # fold a standalone nukta back into the nearest preceding base
        # character that has a nukta-composed form
        if self.lang.nukta_map is not None:
            for idx,d in enumerate(self.decomp):
                if d==self.lang.nukta:
                    for cidx in range(idx-1,-1,-1):
                        if self.decomp[cidx] in self.lang.nukta_map.keys():
                            self.decomp[cidx]=self.lang.nukta_map[self.decomp[cidx]]
                            self.decomp[idx]=None
                            break
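    # Example (illustrative, assuming Bengali and that nukta_map maps 'য'):
    # 'য' followed by a loose nukta '়' becomes the composed 'য়' and the
    # standalone nukta entry is dropped.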

    def cleanInvalidUnicodes(self):
        # clean starts
        while self.checkDecomp() and self.decomp[0] in self.lang.invalid_starts:
            self.decomp=self.decomp[1:]
        # clean endings
        while self.checkDecomp() and self.decomp[-1] == self.lang.connector:
            self.decomp=self.decomp[:-1]
        if self.checkDecomp():
            # clean invalid unicodes
            for idx,d in enumerate(self.decomp):
                if d not in self.valid:
                    self.decomp[idx]=None

    def cleanInvalidConnector(self):
        # drop a connector whose neighbours cannot legally be connected;
        # idx>0 guards against decomp[idx-1] wrapping around to the last entry
        for idx,d in enumerate(self.decomp):
            if 0<idx<len(self.decomp)-1:
                if d==self.lang.connector:
                    if self.decomp[idx-1] in self.lang.invalid_connectors or self.decomp[idx+1] in self.lang.invalid_connectors:
                        self.decomp[idx]=None

    def cleanVowelDiacritics(self):
        # vowel diacritics
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-1:
                if d in self.lang.vowel_diacritics and self.decomp[idx+1] in self.lang.vowel_diacritics:
                    # if they are the same, delete the current one
                    if d==self.decomp[idx+1]:
                        self.decomp[idx]=None
                    # if they are not the same --> remove the latter one
                    else:
                        self.decomp[idx+1]=None
    def cleanConsonantDiacritics(self):
        # consonant diacritics
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-1:
                if d in self.lang.consonant_diacritics and self.decomp[idx+1] in self.lang.consonant_diacritics:
                    # if they are the same, delete the current one
                    if d==self.decomp[idx+1]:
                        self.decomp[idx]=None

    def fixDiacriticOrder(self):
        # a consonant diacritic written before a vowel diacritic is reordered
        # so that the vowel diacritic comes first
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-1:
                if d in self.lang.consonant_diacritics and self.decomp[idx+1] in self.lang.vowel_diacritics:
                    self.swapIdxs(idx,idx+1)

    def cleanNonCharDiacs(self):
        # a diacritic directly after a non-character symbol carries no meaning
        for idx,d in enumerate(self.decomp):
            if idx>0:
                if d in self.lang.diacritics and self.decomp[idx-1] in self.lang.non_chars:
                    self.decomp[idx]=None

    def cleanDiacritics(self):
        self.safeop(self.cleanVowelDiacritics)
        self.safeop(self.cleanConsonantDiacritics)
        self.safeop(self.fixDiacriticOrder)
        self.safeop(self.cleanNonCharDiacs)

    def cleanVowelDiacriticComingAfterVowel(self):
        for idx,d in enumerate(self.decomp):
            # idx>0 guards against decomp[idx-1] wrapping around to the last entry
            if idx>0 and d in self.lang.vowel_diacritics and self.decomp[idx-1] in self.lang.vowels:
                # remove diacritic
                self.decomp[idx]=None

    def fixNoSpaceChar(self):
        # \u200c (ZWNJ) and \u200d (ZWJ) handling
        for idx,d in enumerate(self.decomp):
            if idx==0 and self.decomp[idx] in ["\u200c","\u200d"]:
                # a word can not start with a zero-width char
                self.decomp[idx]=None
            else:
                if idx<len(self.decomp)-1 and self.decomp[idx] in ["\u200c","\u200d"] and self.decomp[idx+1] in [self.lang.connector]+["\u200c","\u200d"]:
                    # if the next char is either the connector or a repeated zero-width char
                    self.decomp[idx+1]=None
                if idx>0 and self.decomp[idx] in ["\u200c","\u200d"] and self.decomp[idx-1] in [None]+["\u200c","\u200d"]:
                    # if the previous char is deleted (None) or another zero-width char
                    self.decomp[idx]=None

    #----------------------composite ops-----------------------------------------------------------------------
    def baseCompose(self):
        self.safeop(self.cleanInvalidUnicodes)
        self.safeop(self.cleanInvalidConnector)
        self.safeop(self.cleanDiacritics)
        self.safeop(self.cleanVowelDiacriticComingAfterVowel)
        self.safeop(self.fixNoSpaceChar)

    #----------------------entry-----------------------------------------------------------------------
    def __call__(self,word):
        '''
            normalizes a given word
            args:
                word : the string to normalize
            returns:
                a dictionary-
                * "given"      = provided text
                * "normalized" = normalized text (None if the text becomes empty during the operations)
                * "ops"        = list of operations (dictionaries) that were executed on the given text to create the normalized text
                * each dictionary in ops has:
                    * "operation": the name of the operation / problem in the given text
                    * "before"   : what the text looked like before the specific operation
                    * "after"    : what the text looks like after the specific operation
        '''
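        # Return shape sketch (values are illustrative):
        #   {"given"     : "original",
        #    "normalized": "cleaned",
        #    "ops"       : [{"operation":"BrokenDiacritics","before":"...","after":"..."}]}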
        # algorithm:
        # * execute word level ops
        # > create unicode based decomp
        # * execute unicode level ops
        # > create root based decomp
        # * execute root level ops

        details=[]
        self.check=word
        if not isinstance(self.check, str):
            raise TypeError("The provided argument/word is not a string")
        if len(self.check.strip().split(" "))>1:
            raise ValueError(f"The provided string has multiple words. Make sure no space exists in the middle of the text. Probable word:{self.check.replace(' ','')}")
        self.word=word
        #---------------------------------------------word ops-------------------------
        for op_id,op in self.word_level_ops.items():
            word_before_op=self.word[:]
            op()
            word_after_op=self.word[:]
            if word_before_op!=word_after_op:
                details.append({"operation":op_id,"before":word_before_op,"after":word_after_op})
        #---------------------------------------------word ops-------------------------
        self.decomp=[ch for ch in self.word]
        #---------------------------------------------unicode ops-------------------------
        for op_id,op in self.decomp_level_ops.items():
            word_before_op="".join(self.decomp)
            # execute
            self.safeop(op)
            # check length
            if not self.checkDecomp():
                return {"normalized":None,"given":self.check,"ops":details}

            word_after_op="".join(self.decomp)
            if word_before_op!=word_after_op:
                details.append({"operation":op_id,"before":word_before_op,"after":word_after_op})

        #----------------------------------------------------------------------------------------------------------
        self.safeop(self.baseCompose)
        self.word="".join([x for x in self.decomp if x is not None])

        return {"normalized":self.word,"given":self.check,"ops":details}