#-*- coding: utf-8 -*-
"""
@author:Bengali.AI
"""
# NOTE(review): this file was recovered from a markup-mangled copy (newlines
# collapsed, every span between a '<' and the next '>' stripped).  Code that
# survived in the damaged source is reproduced faithfully; spans that were
# destroyed are reconstructed and individually marked with NOTE(review) —
# confirm those against the upstream package before relying on them.
from __future__ import print_function
#-------------------------------------------
# globals
#-------------------------------------------
from .base import BaseNormalizer, languages
#-------------------------------------------
# cleaner class
#-------------------------------------------


class Normalizer(BaseNormalizer):
    '''Bangla-specific text normalizer built on top of BaseNormalizer.'''

    def __init__(self,
                 allow_english=False,
                 keep_legacy_symbols=False,
                 legacy_maps=None):
        '''
        initialize a normalizer

        args:
            allow_english       : allow english letters, numbers and punctuations [default:False]
            keep_legacy_symbols : legacy symbols will be considered as valid unicodes [default:False]
                                    '৺':Isshar
                                    '৻':Ganda
                                    'ঀ':Anji (not '৭')
                                    'ঌ':li
                                    'ৡ':dirgho li
                                    'ঽ':Avagraha
                                    'ৠ':Vocalic Rr (not 'ঋ')
                                    '৲':rupi
                                    '৴':currency numerator 1
                                    '৵':currency numerator 2
                                    '৶':currency numerator 3
                                    '৷':currency numerator 4
                                    '৸':currency numerator one less than the denominator
                                    '৹':Currency Denominator Sixteen
            legacy_maps         : a dictionary for changing legacy symbols into a more used unicode.
                                  a default legacy map is included in the language class as well:
                                    legacy_maps={'ঀ':'৭','ঌ':'৯','ৡ':'৯','৵':'৯','৻':'ৎ','ৠ':'ঋ','ঽ':'ই'}
                                  pass:
                                    * legacy_maps=None      : keep the legacy symbols as they are
                                    * legacy_maps="default" : use the default legacy map
                                    * legacy_maps=dict      : a custom dictionary mapping legacy
                                                              symbols to the symbols you want
                                  the keys of a custom dict must belong to the legacy symbols and
                                  the values must belong to vowels, consonants, numbers or diacritics:
                                    vowels               = ['অ','আ','ই','ঈ','উ','ঊ','ঋ','এ','ঐ','ও','ঔ']
                                    consonants           = ['ক','খ','গ','ঘ','ঙ','চ','ছ','জ','ঝ','ঞ',
                                                            'ট','ঠ','ড','ঢ','ণ','ত','থ','দ','ধ','ন',
                                                            'প','ফ','ব','ভ','ম','য','র','ল','শ','ষ',
                                                            'স','হ','ড়','ঢ়','য়','ৎ']
                                    numbers              = ['০','১','২','৩','৪','৫','৬','৭','৮','৯']
                                    vowel_diacritics     = ['া','ি','ী','ু','ূ','ৃ','ে','ৈ','ো','ৌ']
                                    consonant_diacritics = ['ঁ','ং','ঃ']
                                  for example you may want to map 'ঽ':Avagraha as 'হ' based on
                                  visual similarity (default:'ই')

            ** legacy conditions: keep_legacy_symbols and legacy_maps operate as follows
                case-1) keep_legacy_symbols=True  and legacy_maps=None :
                        all legacy symbols are valid unicodes; none of them is changed
                case-2) keep_legacy_symbols=True  and legacy_maps={'ঀ':'ক'} :
                        all legacy symbols are valid unicodes; only 'ঀ' is changed to 'ক'
                case-3) keep_legacy_symbols=False and legacy_maps=None :
                        all legacy symbols are removed
                case-4) keep_legacy_symbols=False and legacy_maps={'ঽ':'ই','ৠ':'ঋ'} :
                        'ঽ'->'ই' and 'ৠ'->'ঋ'; all other legacy symbols are removed
        '''
        # resolve the "default" sentinel against the language definition
        if legacy_maps == "default":
            legacy_maps = languages["bangla"].legacy_maps
        self.complex_roots = languages["bangla"].complex_roots

        super(Normalizer, self).__init__(language="bangla",
                                         allow_english=allow_english,
                                         keep_legacy_symbols=keep_legacy_symbols,
                                         legacy_maps=legacy_maps)
        #-------------------------------------------------extended ops----------------------
        # assamese: visually close assamese letters mapped to their bangla forms
        self.assamese_map = {'ৰ': 'র', 'ৱ': 'ব'}
        self.word_level_ops["AssameseReplacement"] = self.replaceAssamese
        # punctuations: typographic variants mapped to canonical ascii/bangla forms
        # NOTE(review): the damaged source showed identity mappings and a duplicate
        # "'" key — the smart-quote keys below are reconstructed; confirm upstream.
        self.punctuations_map = {'”': '"',
                                 '৷': '।',
                                 '–': '-',
                                 '‘': "'",
                                 '’': "'"}
        self.word_level_ops["PunctuationReplacement"] = self.replacePunctuations
        # to+hosonto case:
        #   case-1: if 'ত'+hosonto is followed by anything other than a consonant,
        #           the word is an invalid word
        #   case-2: the ত্‍ symbol, which should be replaced by 'ৎ', occurs for all
        #           consonants except: ত,থ,ন,ব,ম,য,র
        #   (verify manually: for c in self.lang.consonants: print('ত'+self.lang.connector+c))
        self.valid_consonants_after_to_and_hosonto = ['ত', 'থ', 'ন', 'ব', 'ম', 'য', 'র']
        self.decomp_level_ops["base_bangla_compose"] = self.baseCompose
        self.decomp_level_ops["ToAndHosontoNormalize"] = self.normalizeToandHosonto
        # invalid folas
        self.decomp_level_ops["NormalizeConjunctsDiacritics"] = self.cleanInvalidConjunctDiacritics
        # complex root cleanup
        self.decomp_level_ops["ComplexRootNormalization"] = self.convertComplexRoots

    #-------------------------word ops---------------------------------------------------------------
    def replaceAssamese(self):
        '''word-level op: swap assamese glyphs for their bangla equivalents.'''
        self.replaceMaps(self.assamese_map)

    def replacePunctuations(self):
        '''word-level op: canonicalize punctuation variants.'''
        self.replaceMaps(self.punctuations_map)

    #-------------------------unicode ops------------------------------------------------------------
    def cleanConsonantDiacritics(self):
        '''
        Removes misplaced zero-width-joiner (ZWJ, U+200D) marks from the decomp.
        A ZWJ is only meaningful directly after 'র' and directly before a
        connector (the র‍্য case); every other occurrence is dropped.
        '''
        for idx, d in enumerate(self.decomp):
            # NOTE(review): the guard read "if idx...0:" in the damaged source;
            # reconstructed as the minimal splice — confirm upstream.
            if idx < len(self.decomp) and idx > 0:
                if self.decomp[idx] == "\u200d":
                    # trailing ZWJ carries no glyph information
                    if idx == len(self.decomp) - 1:
                        self.decomp[idx] = None
                    else:
                        # if the previous one is a connector: drop both
                        if self.decomp[idx - 1] == self.lang.connector:
                            self.decomp[idx] = None
                            self.decomp[idx - 1] = None
                        # if the previous one is not 'র': ZWJ is meaningless
                        elif self.decomp[idx - 1] != 'র':
                            self.decomp[idx] = None
                        else:
                            # prev is 'র' but it is itself conjoined (reph) --> drop
                            if idx > 1 and self.decomp[idx - 2] == self.lang.connector:
                                self.decomp[idx] = None
                            # NOTE(review): tail of this branch was lost; reconstructed:
                            # if the next is not a connector the ZWJ forms no jo-fola --> drop
                            elif idx < len(self.decomp) - 1 and self.decomp[idx + 1] != self.lang.connector:
                                self.decomp[idx] = None

    def normalizeToandHosonto(self):
        '''
        Normalizes 'ত'+hosonto into 'ৎ' where applicable.
            # Example-1: (a)বুত্পত্তি==(b)বুৎপত্তি-->False
                (a) breaks as ['ব','ু','ত','্','প','ত','্','ত','ি']
                (b) breaks as ['ব','ু','ৎ','প','ত','্','ত','ি']
            # Example-2: (a)উত্স==(b)উৎস-->False
                (a) breaks as ['উ','ত','্','স']
                (b) breaks as ['উ','ৎ','স']
        '''
        for idx, d in enumerate(self.decomp):
            if idx < len(self.decomp) - 2:
                # NOTE(review): body below was lost in the damaged source;
                # reconstructed from the constructor's case-1/case-2 comment —
                # confirm upstream.
                if d == 'ত' and self.decomp[idx + 1] == self.lang.connector:
                    nxt = self.decomp[idx + 2]
                    # case-2: ত+hosonto followed by a consonant outside the
                    # valid list renders as ত্‍ and should be 'ৎ'
                    if nxt in self.lang.consonants and nxt not in self.valid_consonants_after_to_and_hosonto:
                        self.decomp[idx] = 'ৎ'
                        self.decomp[idx + 1] = None

    def baseCompose(self):
        '''
        Base bangla composition pass.
        Removes a vowel diacritic that directly follows a vowel, except after
        'এ' where the pair is a known typo for 'ত্র'+diacritic.
            # Example-1: (a)উুলু==(b)উলু-->False
                (a) breaks as ['উ','ু','ল','ু']
                (b) breaks as ['উ','ল','ু']
            # Example-2: (a)আর্কিওোলজি==(b)আর্কিওলজি-->False
            Also normalizes 'এ' and 'ত্র':
            # Example-1: (a)একএে==(b)একত্রে-->False
                (a) breaks as ['এ','ক','এ','ে']
                (b) breaks as ['এ','ক','ত','্','র','ে']
        '''
        # NOTE(review): the def line of this method was lost; the body below is
        # the surviving vowel-diacritic loop, preceded by the ZWJ cleanup which
        # nothing else visibly invokes — confirm upstream.
        self.safeop(self.cleanConsonantDiacritics)
        for idx, d in enumerate(self.decomp):
            # current one is a vowel diacritic and the previous char is a vowel
            # (note: at idx==0 the original relied on python's -1 indexing; preserved)
            if d in self.lang.vowel_diacritics and self.decomp[idx - 1] in self.lang.vowels:
                if self.decomp[idx - 1] != 'এ':
                    # plain vowel + diacritic: the diacritic is invalid — drop it
                    self.decomp[idx] = None
                else:
                    # 'এ'+diacritic is a typo for 'ত্র'+diacritic
                    self.decomp[idx - 1] = 'ত' + '্' + 'র'
        self.decomp = [x for x in self.decomp if x is not None]

    ##------------------------------------------------------------------------------------------------------
    def fixTypoForJoFola(self):
        '''
        NOTE(review): only the loop header of this method survived the damaged
        source; reconstructed to fix the common typo of 'য়' written where a
        jo-fola 'য' is intended (i.e. right after the connector) — confirm upstream.
        '''
        for idx, d in enumerate(self.decomp):
            if idx < len(self.decomp) - 1:
                if d == self.lang.connector and self.decomp[idx + 1] == 'য়':
                    self.decomp[idx + 1] = 'য'

    def cleanDoubleCC(self):
        '''
        NOTE(review): body lost in the damaged source; reconstructed from the
        গ্র্রামকে example to collapse a doubled connector+consonant run
        (e.g. গ+্+র+্+র --> গ+্+র) — confirm upstream.
        '''
        for idx in range(len(self.decomp) - 3):
            if (self.decomp[idx] == self.lang.connector
                    and self.decomp[idx + 2] == self.lang.connector
                    and self.decomp[idx + 1] == self.decomp[idx + 3]
                    and self.decomp[idx + 1] in self.lang.consonants):
                self.decomp[idx + 2] = None
                self.decomp[idx + 3] = None
        self.decomp = [x for x in self.decomp if x is not None]

    def cleanDoubleRef(self):
        '''
        NOTE(review): body lost in the damaged source; reconstructed to collapse
        a doubled reph ('র'+'্'+'র'+'্' --> 'র'+'্') — confirm upstream.
        '''
        for idx in range(len(self.decomp) - 3):
            if (self.decomp[idx] == 'র'
                    and self.decomp[idx + 1] == self.lang.connector
                    and self.decomp[idx + 2] == 'র'
                    and self.decomp[idx + 3] == self.lang.connector):
                self.decomp[idx + 2] = None
                self.decomp[idx + 3] = None
        self.decomp = [x for x in self.decomp if x is not None]

    def cleanConnectotForJoFola(self):
        '''
        NOTE(review): body lost in the damaged source; reconstructed so that a
        connector introducing a jo-fola ('্'+'য') is dropped when it does not
        follow a consonant (nothing to conjoin) — confirm upstream.
        '''
        for idx, d in enumerate(self.decomp):
            if 0 < idx < len(self.decomp) - 1:
                if (d == self.lang.connector
                        and self.decomp[idx + 1] == 'য'
                        and self.decomp[idx - 1] not in self.lang.consonants):
                    self.decomp[idx] = None
        self.decomp = [x for x in self.decomp if x is not None]

    def cleanInvalidConjunctDiacritics(self):
        '''
        Normalizes invalid folas (conjunct forms).
            # Example-1: (a)গ্র্রামকে==(b)গ্রামকে-->False
                (a) breaks as ['গ','্','র','্','র','া','ম','ক','ে']
                (b) breaks as ['গ','্','র','া','ম','ক','ে']
        '''
        self.safeop(self.fixTypoForJoFola)
        self.safeop(self.cleanDoubleCC)
        self.safeop(self.cleanDoubleRef)
        # (fixRefOrder / fixOrdersForCC were present but disabled in the original)
        self.safeop(self.cleanConnectotForJoFola)
        self.baseCompose()

    ##------------------------------------------------------------------------------------------------------
    def checkComplexRoot(self, root):
        '''
        Greedily re-validates a consonant cluster against the known complex roots.
        args:
            root : a string of consonants joined by the connector ('্')
        returns:
            the cluster with every unknown conjunct prefix broken back into its
            parts (dangling connectors dropped)
        '''
        formed = []       # output pieces
        formed_idx = []   # consonant positions already consumed by a conjunct
        for i, c in enumerate(root):
            if c != '্' and i not in formed_idx:
                r = c
                # a lone trailing consonant stands as-is
                if i == len(root) - 1:
                    formed.append(r)
                    continue
                # try to grow r by one connector+consonant at a time
                for j in range(i + 2, len(root), 2):
                    d = root[j]
                    k = r + '্' + d
                    if k not in self.complex_roots:
                        # unknown conjunct: emit what was valid so far
                        formed.append(r)
                        break
                    else:
                        r = k
                        formed_idx.append(j)
                        if j == len(root) - 1:
                            # cluster consumed to the end and is valid
                            formed.append(k)
        return "".join(formed)

    def convertComplexRoots(self):
        '''
        decomp-level op: re-validates every conjunct cluster in the decomp
        against the language's known complex roots.
        '''
        self.fixNoSpaceChar()
        self.decomp = [x for x in self.decomp if x is not None]
        self.constructComplexDecomp()
        for idx, d in enumerate(self.decomp):
            # only clusters (entries containing a connector) that are not
            # already known complex roots need re-checking
            if d not in self.complex_roots and self.lang.connector in d:
                self.decomp[idx] = self.checkComplexRoot(d)
    #-------------------------unicode ops------------------------------------------------------------