394 lines
20 KiB
Python
394 lines
20 KiB
Python
#-*- coding: utf-8 -*-
|
||
"""
|
||
@author:Bengali.AI
|
||
"""
|
||
from __future__ import print_function
|
||
#-------------------------------------------
|
||
# globals
|
||
#-------------------------------------------
|
||
from .base import BaseNormalizer,languages
|
||
|
||
#-------------------------------------------
|
||
# cleaner class
|
||
#-------------------------------------------
|
||
|
||
class Normalizer(BaseNormalizer):
|
||
def __init__(self,
             allow_english=False,
             keep_legacy_symbols=False,
             legacy_maps=None):
    '''
        Build a Bangla normalizer and register the Bangla specific operations.

        args:
            allow_english       : keep english letters, numbers and punctuations [default:False]
            keep_legacy_symbols : treat the legacy symbols below as valid unicodes [default:False]
                                    '৺':Isshar
                                    '৻':Ganda
                                    'ঀ':Anji (not '৭')
                                    'ঌ':li
                                    'ৡ':dirgho li
                                    'ঽ':Avagraha
                                    'ৠ':Vocalic Rr (not 'ঋ')
                                    '৲':rupi
                                    '৴':currency numerator 1
                                    '৵':currency numerator 2
                                    '৶':currency numerator 3
                                    '৷':currency numerator 4
                                    '৸':currency numerator one less than the denominator
                                    '৹':Currency Denominator Sixteen
            legacy_maps         : a dictionary that rewrites legacy symbols into more commonly used unicodes.
                                  A default legacy map is included in the language class as well:
                                    legacy_maps={'ঀ':'৭',
                                                 'ঌ':'৯',
                                                 'ৡ':'৯',
                                                 '৵':'৯',
                                                 '৻':'ৎ',
                                                 'ৠ':'ঋ',
                                                 'ঽ':'ই'}

                                  pass-
                                    * legacy_maps=None      ; to keep the legacy symbols as they are
                                    * legacy_maps="default" ; to use the default legacy map
                                    * legacy_maps=custom dictionary (type-dict) ; to map a legacy symbol to any symbol you want
                                        * the keys of a custom dict must be legacy symbols
                                        * the values of a custom dict must be vowels, consonants, numbers or diacritics
                                            vowels               = ['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ']
                                            consonants           = ['ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ','জ', 'ঝ', 'ঞ',
                                                                    'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন',
                                                                    'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ',
                                                                    'স', 'হ','ড়', 'ঢ়', 'য়','ৎ']
                                            numbers              = ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
                                            vowel_diacritics     = ['া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ']
                                            consonant_diacritics = ['ঁ', 'ং', 'ঃ']

                                        > for example you may want to map 'ঽ':Avagraha to 'হ' based on visual similarity
                                          (default:'ই')

        ** legacy conditions: keep_legacy_symbols and legacy_maps operate together as follows
            case-1) keep_legacy_symbols=True and legacy_maps=None
                : all legacy symbols are considered valid unicodes. None of them is changed
            case-2) keep_legacy_symbols=True and legacy_maps=valid dictionary example:{'ঀ':'ক'}
                : all legacy symbols are considered valid unicodes. Only 'ঀ' is changed to 'ক', the others are untouched
            case-3) keep_legacy_symbols=False and legacy_maps=None
                : all legacy symbols are removed
            case-4) keep_legacy_symbols=False and legacy_maps=valid dictionary example:{'ঽ':'ই','ৠ':'ঋ'}
                : 'ঽ' is changed to 'ই' and 'ৠ' is changed to 'ঋ'. All other legacy symbols are removed
    '''
    bangla = languages["bangla"]

    # the string "default" is a shorthand for the map shipped with the language definition
    if legacy_maps == "default":
        legacy_maps = bangla.legacy_maps

    # set before the base initializer runs, exactly as the statement order requires
    self.complex_roots = bangla.complex_roots

    super(Normalizer, self).__init__(language="bangla",
                                     allow_english=allow_english,
                                     keep_legacy_symbols=keep_legacy_symbols,
                                     legacy_maps=legacy_maps)

    #-------------------------------------------------extended ops----------------------
    # assamese letters are replaced by their Bangla counterparts
    self.assamese_map = {'ৰ': 'র', 'ৱ': 'ব'}

    # punctuations
    # NOTE(review): the quote entries below are identity mappings and the "'" key
    # appears twice (the later entry wins). Presumably these were typographic quotes
    # before the file was copied -- TODO confirm against the canonical source.
    self.punctuations_map = {'"': '"',
                             '৷': '।',
                             '–': '-',
                             "'": "'",
                             "'": "'"}

    # word level operations, registered in the same order as before
    self.word_level_ops["AssameseReplacement"] = self.replaceAssamese
    self.word_level_ops["PunctuationReplacement"] = self.replacePunctuations

    # to+hosonto case
    #   case-1: if 'ত'+hosonto is followed by anything other than a consonant the word is an invalid word
    #   case-2: the ত্ symbol which should be replaced by a 'ৎ' occurs for all consonants except: ত,থ,ন,ব,ম,য,র
    #   code to verify this manually:
    #       for c in self.consonants:
    #           print('ত'+ self.lang.connector+c)
    self.valid_consonants_after_to_and_hosonto = ['ত', 'থ', 'ন', 'ব', 'ম', 'য', 'র']

    # decomposition level operations; insertion order is preserved on purpose:
    # base composition, to+hosonto, invalid folas, complex root cleanup
    decomp_ops = (
        ("base_bangla_compose", self.baseCompose),
        ("ToAndHosontoNormalize", self.normalizeToandHosonto),
        ("NormalizeConjunctsDiacritics", self.cleanInvalidConjunctDiacritics),
        ("ComplexRootNormalization", self.convertComplexRoots),
    )
    for op_name, op in decomp_ops:
        self.decomp_level_ops[op_name] = op
|
||
|
||
|
||
#-------------------------word ops-----------------------------------------------------------------------------
|
||
def replaceAssamese(self):
    '''
        word level op: hands self.assamese_map ({'ৰ':'র','ৱ':'ব'}) to self.replaceMaps.
        NOTE(review): replaceMaps is defined outside this file; presumably it substitutes
        every key of the map with its value in the current word -- confirm in the base class.
    '''
    self.replaceMaps(self.assamese_map)
|
||
|
||
def replacePunctuations(self):
    '''
        word level op: hands self.punctuations_map to self.replaceMaps.
        NOTE(review): replaceMaps is defined outside this file; presumably it substitutes
        every key of the map with its value in the current word -- confirm in the base class.
    '''
    self.replaceMaps(self.punctuations_map)
|
||
|
||
#-------------------------unicode ops-----------------------------------------------------------------------------
|
||
def cleanConsonantDiacritics(self):
    '''
        resolves two consonant diacritics that sit next to each other in self.decomp
            * identical pair              : the first one is marked for deletion (None)
            * 'ং' or 'ঃ' followed by 'ঁ'   : the two positions are swapped via self.swapIdxs
            * 'ং' followed by 'ঃ' (or the reverse) : the second one is marked for deletion (None)
        deleted positions are only set to None here; they are not filtered out.
    '''
    for pos, current in enumerate(self.decomp):
        # the last element has no follower to compare with
        if pos >= len(self.decomp) - 1:
            continue

        follower = self.decomp[pos + 1]
        marks = self.lang.consonant_diacritics
        if current not in marks or follower not in marks:
            continue

        if current == follower:
            # same diacritic twice: drop the current one
            self.decomp[pos] = None
        elif follower == 'ঁ' and current in ['ং', 'ঃ']:
            self.swapIdxs(pos, pos + 1)
        elif (current, follower) in [('ং', 'ঃ'), ('ঃ', 'ং')]:
            self.decomp[pos + 1] = None
|
||
|
||
def fixNoSpaceChar(self):
    '''
        normalizes the zero width characters ZWNJ ("\u200c") and ZWJ ("\u200d") in self.decomp

        pass-1 (replace):
            * None entries are dropped
            * every ZWNJ is turned into a ZWJ
            * every joiner at the start of the word is removed
              (fix: previously only index 0 was checked, so a run of leading
               joiners left one ZWJ at the start of the word)
        pass-2 (strict): a ZWJ survives only in the sequence 'র' + ZWJ + connector + 'য',
            and only when that 'র' is not itself preceded by a connector. In that case the
            ZWJ is merged into the preceding 'র' element. Every other ZWJ is removed:
            * ZWJ at the end of the word
            * ZWJ right after a connector (the connector is removed as well)
            * ZWJ after anything other than 'র'
            * ZWJ after 'র' when that 'র' follows a connector
            * ZWJ after 'র' when the next element is not a connector
            * ZWJ after 'র' + connector when the element after the connector is missing or not 'য'
              (fix: the original test for this case also required the next element
               NOT to be a connector, which the previous branch had already ruled
               out, so the branch could never run)

        self.decomp is rebuilt without None entries at the end.
    '''
    zwnj = "\u200c"
    zwj = "\u200d"
    connector = self.lang.connector

    # pass-1: replace ZWNJ by ZWJ and strip the joiners that start the word
    replaced = [zwj if ch == zwnj else ch for ch in self.decomp if ch is not None]
    start = 0
    while start < len(replaced) and replaced[start] == zwj:
        start += 1
    self.decomp = replaced[start:]

    # pass-2: strict validation of every remaining ZWJ
    last = len(self.decomp) - 1
    for idx in range(1, len(self.decomp)):
        if self.decomp[idx] != zwj:
            continue
        prev = self.decomp[idx-1]
        if idx == last:
            # last one
            self.decomp[idx] = None
        elif prev == connector:
            # previous one is a connector: drop both
            self.decomp[idx] = None
            self.decomp[idx-1] = None
        elif prev != 'র':
            self.decomp[idx] = None
        elif idx > 1 and self.decomp[idx-2] == connector:
            # the 'র' itself hangs on a connector
            self.decomp[idx] = None
        elif self.decomp[idx+1] != connector:
            self.decomp[idx] = None
        elif idx + 2 > last or self.decomp[idx+2] != "য":
            # connector is not followed by 'য'
            self.decomp[idx] = None
        else:
            # the actual allowed case: keep the joiner attached to 'র'
            self.decomp[idx-1] += self.decomp[idx]
            self.decomp[idx] = None

    self.decomp = [x for x in self.decomp if x is not None]
|
||
|
||
|
||
|
||
##------------------------------------------------------------------------------------------------------
|
||
def cleanInvalidConnector(self):
    '''
        removes connectors (hosonto) that cannot form a valid conjunct and normalizes
        the 'অ্যা' spelling

        a connector that has a following element is removed when
            * it is not followed by 'য', is not preceded by 'অ'/'এ' (exception) and
              either neighbour belongs to self.lang.invalid_connectors
            * it is preceded by 'য' and not followed by 'য'
            * it is preceded by 'ব' and not followed by one of 'জ','দ','ধ','ব','য','র','ল'

        afterwards the word is re-joined, "এ্যা" is rewritten as "অ্যা", every "অ্য"
        is given exactly one 'া', and self.decomp is rebuilt as the list of
        characters of the resulting word.

        fixes:
            * a connector at index 0 has no previous element; the old code read
              self.decomp[-1] (the LAST element) as its predecessor
            * "অ্যা" was turned into "অ্যাা" because 'া' was appended even when it
              was already there
    '''
    connector = self.lang.connector
    invalid = self.lang.invalid_connectors

    for idx, d in enumerate(self.decomp):
        if d != connector or idx >= len(self.decomp) - 1:
            continue
        has_prev = idx > 0
        prev = self.decomp[idx-1] if has_prev else None
        nxt = self.decomp[idx+1]

        if nxt != "য" and prev not in ['অ', 'এ']:  # exception
            if (has_prev and prev in invalid) or nxt in invalid:
                self.decomp[idx] = None
        if prev == "য" and nxt != "য":
            self.decomp[idx] = None
        if prev == "ব" and nxt not in ['জ', 'দ', 'ধ', 'ব', 'য', 'র', 'ল']:
            self.decomp[idx] = None

    # handle exception
    word = "".join(d for d in self.decomp if d is not None)
    word = word.replace("এ্যা", "অ্যা")
    # strip an existing 'া' first so that it ends up present exactly once
    word = word.replace("অ্যা", 'অ্য').replace('অ্য', "অ্যা")
    self.decomp = [ch for ch in word]
|
||
|
||
def convertToAndHosonto(self):
    '''
        rewrites 'ত' + hosonto as 'ৎ' unless the following character is one of
        ['ত','থ','ন','ব','ম','য','র'] (self.valid_consonants_after_to_and_hosonto)
        # Example-1:
        (a)বুত্পত্তি==(b)বুৎপত্তি-->False
            (a) breaks as ['ব', 'ু', 'ত', '্', 'প', 'ত', '্', 'ত', 'ি']
            (b) breaks as ['ব', 'ু', 'ৎ', 'প', 'ত', '্', 'ত', 'ি']
        # Example-2:
        (a)উত্স==(b)উৎস-->False
            (a) breaks as ['উ', 'ত', '্', 'স']
            (b) breaks as ['উ', 'ৎ', 'স']

        double case ('ত' + hosonto + 'ত' + hosonto + X):
            * X not in ['ব','য','র'] : the first 'ত' + hosonto becomes 'ৎ'
            * X == 'র'               : the second hosonto is removed

        removed positions are only set to None here; they are not filtered out.
    '''
    hosonto = self.lang.connector
    for pos, ch in enumerate(self.decomp):
        size = len(self.decomp)
        # a character must follow the 'ত' + hosonto pair for any rule to apply
        if pos >= size - 2:
            continue
        # to + hosonto
        if ch != 'ত' or self.decomp[pos + 1] != hosonto:
            continue

        follower = self.decomp[pos + 2]

        # single case
        if follower not in self.valid_consonants_after_to_and_hosonto:
            self.decomp[pos] = 'ৎ'          # replace
            self.decomp[pos + 1] = None     # delete
            continue

        # double case needs to + hos + to + hos + one more character
        if pos >= size - 4:
            continue
        if follower != 'ত' or self.decomp[pos + 3] != hosonto:
            continue

        after_double = self.decomp[pos + 4]
        if after_double not in ['ব', 'য', 'র']:
            # the character after the double to+hos is something other than 'ব','য','র'
            self.decomp[pos] = 'ৎ'          # replace
            self.decomp[pos + 1] = None     # delete
        if after_double == 'র':
            self.decomp[pos + 3] = None     # delete
|
||
|
||
def swapToAndHosontoDiacritics(self):
    '''
        puts diacritics in the right place: whenever 'ৎ' is directly followed by an
        element of self.lang.diacritics the two positions are exchanged through
        self.swapIdxs
    '''
    for pos, ch in enumerate(self.decomp):
        if ch != 'ৎ':
            continue
        nxt = pos + 1
        if nxt < len(self.decomp) and self.decomp[nxt] in self.lang.diacritics:
            self.swapIdxs(pos, nxt)
|
||
###------------------------------------------------------------------------------------------------------
|
||
def normalizeToandHosonto(self):
    '''
        decomp level op registered as "ToAndHosontoNormalize":
        runs convertToAndHosonto, then swapToAndHosontoDiacritics (both through
        self.safeop) and finally self.baseCompose.
        NOTE(review): safeop and baseCompose are defined outside this file; presumably
        safeop runs the step and discards the None placeholders it leaves in
        self.decomp -- confirm in the base class.
    '''
    # 'ত' + hosonto --> 'ৎ' where applicable
    self.safeop(self.convertToAndHosonto)
    # move diacritics that ended up after 'ৎ' in front of it
    self.safeop(self.swapToAndHosontoDiacritics)
    self.baseCompose()
|
||
|
||
###------------------------------------------------------------------------------------------------------
|
||
##------------------------------------------------------------------------------------------------------
|
||
|
||
def cleanVowelDiacriticComingAfterVowel(self):
    '''
        takes care of vowels followed by vowel diacritics
        # Example-1:
        (a)উুলু==(b)উলু-->False
            (a) breaks as ['উ', 'ু', 'ল', 'ু']
            (b) breaks as ['উ', 'ল', 'ু']
        # Example-2:
        (a)আর্কিওোলজি==(b)আর্কিওলজি-->False
            (a) breaks as ['আ', 'র', '্', 'ক', 'ি', 'ও', 'ো', 'ল', 'জ', 'ি']
            (b) breaks as ['আ', 'র', '্', 'ক', 'ি', 'ও', 'ল', 'জ', 'ি']

        Also normalizes 'এ' to 'ত্র' when it is followed by a vowel diacritic
        # Example-1:
        (a)একএে==(b)একত্রে-->False
            (a) breaks as ['এ', 'ক', 'এ', 'ে']
            (b) breaks as ['এ', 'ক', 'ত', '্', 'র', 'ে']

        removed diacritics are only set to None here; they are not filtered out.

        fix: the first element has no predecessor. The old code looked at
        self.decomp[idx-1] for idx==0 as well, i.e. at the LAST element, so a word
        starting with a vowel diacritic and ending with a vowel lost the diacritic
        or had its final 'এ' rewritten to 'ত্র'.
    '''
    for idx, d in enumerate(self.decomp):
        # nothing precedes the first element
        if idx == 0:
            continue
        # if the current one is a VD and the previous char is a vowel
        if d in self.lang.vowel_diacritics and self.decomp[idx-1] in self.lang.vowels:
            if self.decomp[idx-1] != 'এ':
                # remove diacritic
                self.decomp[idx] = None
            else:
                # normalization case: 'এ' here is a typo for the look-alike 'ত্র'
                self.decomp[idx-1] = 'ত' + '্' + 'র'
|
||
|
||
##------------------------------------------------------------------------------------------------------
|
||
def fixTypoForJoFola(self):
    '''
        jo-fola typo: a 'য়' that directly follows a connector is replaced by 'য'
    '''
    hosonto = self.lang.connector
    for pos in range(len(self.decomp) - 1):
        follower = pos + 1
        if self.decomp[pos] == hosonto and self.decomp[follower] == 'য়':
            self.decomp[follower] = 'য'
|
||
|
||
def cleanDoubleCC(self):
    '''
        repeated fola: for the pattern connector, consonant, connector, consonant
        (c,cc,c,cc) with the SAME consonant twice, the first connector and the
        first consonant are marked for deletion (None)
    '''
    for pos, ch in enumerate(self.decomp):
        # three more elements are needed after the current one
        if pos >= len(self.decomp) - 3:
            continue
        if ch != self.lang.connector:
            continue
        first = self.decomp[pos + 1]
        if first not in self.lang.consonants:
            continue
        if self.decomp[pos + 2] != self.lang.connector:
            continue
        second = self.decomp[pos + 3]
        if second not in self.lang.consonants:
            continue
        if second == first:
            self.decomp[pos] = None
            self.decomp[pos + 1] = None
|
||
|
||
|
||
|
||
def cleanDoubleRef(self):
|
||
for idx,d in enumerate(self.decomp):
|
||
if idx<len(self.decomp)-3:
|
||
if d=='র' and self.decomp[idx+1]==self.lang.connector\
|
||
and self.decomp[idx+2]=='র' and self.decomp[idx+3]== self.lang.connector:
|
||
self.decomp[idx]=None
|
||
self.decomp[idx+1]=None
|
||
|
||
def cleanConnectotForJoFola(self):
    '''
        a connector that comes right after a jo-fola (connector + 'য') is marked
        for deletion (None)
    '''
    hosonto = self.lang.connector
    for pos, ch in enumerate(self.decomp):
        # two more elements are needed after the current one
        if pos >= len(self.decomp) - 2:
            continue
        if ch != hosonto or self.decomp[pos + 1] != 'য':
            continue
        if self.decomp[pos + 2] == hosonto:
            self.decomp[pos + 2] = None
|
||
|
||
|
||
|
||
def cleanInvalidConjunctDiacritics(self):
    '''
        decomp level op registered as "NormalizeConjunctsDiacritics": cleans repeated folas
        # Example-1:
        (a)গ্র্রামকে==(b)গ্রামকে-->False
            (a) breaks as ['গ', '্', 'র', '্', 'র', 'া', 'ম', 'ক', 'ে']
            (b) breaks as ['গ', '্', 'র', 'া', 'ম', 'ক', 'ে']

        steps, in this order (each one goes through self.safeop):
            fixTypoForJoFola, cleanDoubleCC, cleanDoubleRef, cleanConnectotForJoFola
        followed by self.baseCompose.
        NOTE(review): safeop and baseCompose are defined outside this file; presumably
        safeop runs the step and discards the None placeholders it leaves in
        self.decomp -- confirm in the base class.
    '''
    # connector + 'য়' --> connector + 'য'
    self.safeop(self.fixTypoForJoFola)
    # connector,consonant,connector,same consonant --> single fola
    self.safeop(self.cleanDoubleCC)
    # 'র',connector,'র',connector --> single ref
    self.safeop(self.cleanDoubleRef)
    # disabled steps, kept for reference:
    # self.safeop(self.fixRefOrder)
    # print("".join(self.decomp))
    # #self.safeop(self.fixOrdersForCC)
    #print("".join(self.decomp))
    # connector right after a jo-fola is dropped
    self.safeop(self.cleanConnectotForJoFola)
    self.baseCompose()
|
||
##------------------------------------------------------------------------------------------------------
|
||
def checkComplexRoot(self,root):
    '''
        splits a conjunct that is not a known complex root into the longest
        prefixes that ARE listed in self.complex_roots

        args:
            root : string of characters joined by '্'
        returns:
            the concatenation of the accepted pieces; a connector is kept only
            inside a piece found in self.complex_roots

        starting at every character that is not '্' and was not already absorbed,
        the candidate is extended by '্' + the character two positions further as
        long as the extended candidate is in self.complex_roots.

        NOTE(review): a starting character whose extension loop ends without
        hitting the final position (e.g. a root that ends with '্') is not
        emitted at all -- behaviour kept as is, confirm whether intended.
    '''
    hosonto = '্'
    final_pos = len(root) - 1
    pieces = []
    absorbed = []
    for start, char in enumerate(root):
        if char == hosonto or start in absorbed:
            continue
        # the last character can not be extended any further
        if start == final_pos:
            pieces.append(char)
            continue

        current = char
        for nxt in range(start + 2, len(root), 2):
            candidate = current + hosonto + root[nxt]
            if candidate not in self.complex_roots:
                # extension failed: keep what has been formed so far
                pieces.append(current)
                break
            current = candidate
            absorbed.append(nxt)
            if nxt == final_pos:
                # the whole remainder forms a valid root
                pieces.append(candidate)
    return "".join(pieces)
|
||
|
||
|
||
|
||
def convertComplexRoots(self):
    '''
        decomp level op registered as "ComplexRootNormalization"
            * fixes the zero width characters (fixNoSpaceChar) and drops None entries
            * calls self.constructComplexDecomp (defined outside this file;
              presumably it regroups self.decomp into conjunct level elements -- TODO confirm)
            * every element that contains a connector but is not listed in
              self.complex_roots is replaced by the result of checkComplexRoot
    '''
    self.fixNoSpaceChar()
    self.decomp = [part for part in self.decomp if part is not None]
    self.constructComplexDecomp()

    for pos, part in enumerate(self.decomp):
        # known roots are left alone
        if part in self.complex_roots:
            continue
        if self.lang.connector in part:
            self.decomp[pos] = self.checkComplexRoot(part)
|
||
|
||
|
||
|
||
#-------------------------unicode ops-----------------------------------------------------------------------------
|
||
|
||
|
||
|
||
|