ai-content-maker/.venv/Lib/site-packages/bnunicodenormalizer/normalizer.py

#-*- coding: utf-8 -*-
"""
@author:Bengali.AI
"""
from __future__ import print_function
#-------------------------------------------
# globals
#-------------------------------------------
from .base import BaseNormalizer,languages
#-------------------------------------------
# cleaner class
#-------------------------------------------
class Normalizer(BaseNormalizer):
    def __init__(self,
                 allow_english=False,
                 keep_legacy_symbols=False,
                 legacy_maps=None):
        '''
        initialize a normalizer
        args:
            allow_english       :   allow english letters, numbers and punctuations [default:False]
            keep_legacy_symbols :   legacy symbols will be considered as valid unicodes [default:False]
                                        '':Isshar
                                        '':Ganda
                                        '':Anji (not '')
                                        '':li
                                        '':dirgho li
                                        '':Avagraha
                                        '':Vocalic Rr (not '')
                                        '':rupi
                                        '':currency numerator 1
                                        '':currency numerator 2
                                        '':currency numerator 3
                                        '':currency numerator 4
                                        '':currency numerator one less than the denominator
                                        '':currency denominator sixteen
            legacy_maps         :   a dictionary for changing legacy symbols into a more commonly used unicode
                                    a default legacy map is included in the language class as well,
                                        legacy_maps={'':'',
                                                     '':'',
                                                     '':'',
                                                     '':'',
                                                     '':'',
                                                     '':'',
                                                     '':''}
                                    pass:
                                        * legacy_maps=None                      : keep the legacy symbols as they are
                                        * legacy_maps="default"                 : use the default legacy map
                                        * legacy_maps=custom dictionary (dict)  : map your desired legacy symbols to any symbols you want
                                            * the keys in the custom dict must belong to the legacy symbols
                                            * the values in the custom dict must belong to either vowels, consonants, numbers or diacritics
                                                vowels               = ['', '', '', '', '', '', '', '', '', '', '']
                                                consonants           = ['', '', '', '', '', '', '','', '', '',
                                                                        '', '', '', '', '', '', '', '', '', '',
                                                                        '', '', '', '', '', '', '', '', '', '',
                                                                        '', '','', '', '','']
                                                numbers              = ['', '', '', '', '', '', '', '', '', '']
                                                vowel_diacritics     = ['', 'ি', '', '', '', '', '', '', '', '']
                                                consonant_diacritics = ['', '', '']
                                    > for example you may want to map '':Avagraha as '' based on visual similarity
                                      (default:'')

        ** legacy conditions: keep_legacy_symbols and legacy_maps operate as follows
            case-1) keep_legacy_symbols=True and legacy_maps=None
                    : all legacy symbols will be considered valid unicodes. None of them will be changed
            case-2) keep_legacy_symbols=True and legacy_maps=valid dictionary, example:{'':''}
                    : all legacy symbols will be considered valid unicodes. Only '' will be changed to '', others will be untouched
            case-3) keep_legacy_symbols=False and legacy_maps=None
                    : all legacy symbols will be removed
            case-4) keep_legacy_symbols=False and legacy_maps=valid dictionary, example:{'':'','':''}
                    : '' will be changed to '' and '' will be changed to ''. All other legacy symbols will be removed
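
        example (an illustrative sketch; the call interface and the result keys come from BaseNormalizer):
            norm   = Normalizer(keep_legacy_symbols=False, legacy_maps="default")
            result = norm(word)                # returns a dict
            clean  = result["normalized"]      # the normalized word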
        '''
        if legacy_maps=="default":
            legacy_maps=languages["bangla"].legacy_maps
        self.complex_roots=languages["bangla"].complex_roots
        super(Normalizer,self).__init__(language="bangla",
                                        allow_english=allow_english,
                                        keep_legacy_symbols=keep_legacy_symbols,
                                        legacy_maps=legacy_maps)
        #-------------------------------------------------extended ops----------------------
        # assamese
        self.assamese_map = {'':'','':''}
        self.word_level_ops["AssameseReplacement"] = self.replaceAssamese
        # punctuations
        self.punctuations_map = {'"': '"' ,
                                 '' : '',
                                 '' : '-',
                                 "'" : "'",
                                 "'" : "'"}
        self.word_level_ops["PunctuationReplacement"] = self.replacePunctuations
        # to+hosonto case
        '''
        case-1: if ''+hosonto is followed by anything other than a consonant, the word is an invalid word
        case-2: the ত্‍ symbol, which should be replaced by a '', occurs for all consonants except: ত,থ,ন,ব,ম,য,র
        # code to verify this manually
        for c in self.consonants:
            print(''+ self.lang.connector+c)
        '''
        self.valid_consonants_after_to_and_hosonto = ['ত','থ','ন','ব','ম','য','র']
self.decomp_level_ops["base_bangla_compose"] = self.baseCompose
self.decomp_level_ops["ToAndHosontoNormalize"] = self.normalizeToandHosonto
# invalid folas
self.decomp_level_ops["NormalizeConjunctsDiacritics"] = self.cleanInvalidConjunctDiacritics
# complex root cleanup
self.decomp_level_ops["ComplexRootNormalization"] = self.convertComplexRoots
    #-------------------------word ops-----------------------------------------------------------------------------
    def replaceAssamese(self):
        self.replaceMaps(self.assamese_map)
    def replacePunctuations(self):
        self.replaceMaps(self.punctuations_map)
    #-------------------------unicode ops-----------------------------------------------------------------------------
    def cleanConsonantDiacritics(self):
        # consonant diacritics
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-1:
                if d in self.lang.consonant_diacritics and self.decomp[idx+1] in self.lang.consonant_diacritics:
                    # if they are same delete the current one
                    if d==self.decomp[idx+1]:
                        self.decomp[idx]=None
                    elif d in ['', ''] and self.decomp[idx+1]=='':
                        self.swapIdxs(idx,idx+1)
                    elif d=='' and self.decomp[idx+1]== '':
                        self.decomp[idx+1]=None
                    elif d=='' and self.decomp[idx+1]== '':
                        self.decomp[idx+1]=None
    def fixNoSpaceChar(self):
        # replace
        for idx,d in enumerate(self.decomp):
            if idx==0 and self.decomp[idx] in ["\u200c","\u200d"]:
                self.decomp[idx]=None
            else:
                if self.decomp[idx]=="\u200c":
                    self.decomp[idx]="\u200d"
        self.decomp=[x for x in self.decomp if x is not None]
        # strict
        for idx,d in enumerate(self.decomp):
            if idx>0:
                if self.decomp[idx]=="\u200d":
                    # last one
                    if idx==len(self.decomp)-1:
                        self.decomp[idx]=None
                    else:
                        # if previous one is a connector
                        if self.decomp[idx-1]==self.lang.connector:
                            self.decomp[idx]=None
                            self.decomp[idx-1]=None
                        # if previous one is not 'র'
                        elif self.decomp[idx-1]!='র':
                            self.decomp[idx]=None
                        else:
                            # if prev='র' and the prev-1 is not a connector
                            if idx>1 and self.decomp[idx-2]==self.lang.connector:
                                self.decomp[idx]=None
                            # if the next is not a connector
                            elif idx<len(self.decomp)-1 and self.decomp[idx+1]!=self.lang.connector:
                                self.decomp[idx]=None
                            # if the next one to connector is not "য"
                            elif idx<len(self.decomp)-2 and self.decomp[idx+1]==self.lang.connector and self.decomp[idx+2]!="য":
                                self.decomp[idx]=None
                            else:
                                # the actual allowed case
                                self.decomp[idx-1]+=self.decomp[idx]
                                self.decomp[idx]=None
        self.decomp=[x for x in self.decomp if x is not None]
    ##------------------------------------------------------------------------------------------------------
    def cleanInvalidConnector(self):
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-1:
                if d==self.lang.connector and self.decomp[idx+1]!="" and self.decomp[idx-1] not in ['','']: # exception
                    if self.decomp[idx-1] in self.lang.invalid_connectors or self.decomp[idx+1] in self.lang.invalid_connectors:
                        self.decomp[idx]=None
                if d==self.lang.connector and self.decomp[idx-1]=="" and self.decomp[idx+1]!="":
                    self.decomp[idx]=None
                if d==self.lang.connector and self.decomp[idx-1]=="" and self.decomp[idx+1] not in ['', '', '', '', '', '', '']:
                    self.decomp[idx]=None
        # handle exception
        self.decomp=[d for d in self.decomp if d is not None]
        word="".join(self.decomp)
        if "এ্যা" in word:
            word=word.replace("এ্যা","অ্যা")
        if 'অ্য' in word:
            word=word.replace('অ্য',"অ্যা")
        self.decomp=[ch for ch in word]
    def convertToAndHosonto(self):
        '''
        normalizes to+hosonto for ['ত','থ','ন','ব','ম','য','র']
        # Example-1:
            (a)বুত্পত্তি==(b)বুৎপত্তি-->False
                (a) breaks as ['', '', '', '', '', '', '', '', 'ি']
                (b) breaks as ['', '', '', '', '', '', '', 'ি']
        # Example-2:
            (a)উত্স==(b)উৎস-->False
                (a) breaks as ['', '', '', '']
                (b) breaks as ['', '', '']
        '''
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-1:
                # to + hosonto
                if d=='' and self.decomp[idx+1]== self.lang.connector:
                    # for single case
                    if idx<len(self.decomp)-2:
                        if self.decomp[idx+2] not in self.valid_consonants_after_to_and_hosonto:
                            # replace
                            self.decomp[idx]=''
                            # delete
                            self.decomp[idx+1]=None
                        else:
                            # valid replacement for to+hos double case
                            if idx<len(self.decomp)-3:
                                if self.decomp[idx+2]=='' and self.decomp[idx+3]== self.lang.connector:
                                    if idx<len(self.decomp)-4:
                                        if self.decomp[idx+4] not in ['','','']:
                                            # if the next character after the double to+hos+to+hos is within ['ত','থ','ন','ম']
                                            # replace
                                            self.decomp[idx]=''
                                            # delete
                                            self.decomp[idx+1]=None
                                    if idx<len(self.decomp)-4:
                                        if self.decomp[idx+4]=='':
                                            # delete
                                            self.decomp[idx+3]=None
    def swapToAndHosontoDiacritics(self):
        '''
        puts diacritics in the right place
        '''
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-1:
                if d=='' and self.decomp[idx+1] in self.lang.diacritics:
                    self.swapIdxs(idx,idx+1)
    ###------------------------------------------------------------------------------------------------------
    def normalizeToandHosonto(self):
        self.safeop(self.convertToAndHosonto)
        self.safeop(self.swapToAndHosontoDiacritics)
        self.baseCompose()
    ###------------------------------------------------------------------------------------------------------
    ##------------------------------------------------------------------------------------------------------
    def cleanVowelDiacriticComingAfterVowel(self):
        '''
        takes care of vowels and modifiers followed by vowel diacritics
        # Example-1:
            (a)উুলু==(b)উলু-->False
                (a) breaks as ['', '', '', '']
                (b) breaks as ['', '', '']
        # Example-2:
            (a)আর্কিওোলজি==(b)আর্কিওলজি-->False
                (a) breaks as ['', '', '', '', 'ি', '', '', '', '', 'ি']
                (b) breaks as ['', '', '', '', 'ি', '', '', '', 'ি']
        also normalizes '' and 'ত্র'
        # Example-1:
            (a)একএে==(b)একত্রে-->False
                (a) breaks as ['', '', '', '']
                (b) breaks as ['', '', '', '', '', '']
        '''
        for idx,d in enumerate(self.decomp):
            # if the current one is a VD and the previous char is a vowel
            if d in self.lang.vowel_diacritics and self.decomp[idx-1] in self.lang.vowels:
                # if the vowel is not 'এ'
                if self.decomp[idx-1] !='এ':
                    # remove diacritic
                    self.decomp[idx]=None
                # normalization case
                else:
                    self.decomp[idx-1]=''+''+''
    ##------------------------------------------------------------------------------------------------------
    def fixTypoForJoFola(self):
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-1:
                if d== self.lang.connector and self.decomp[idx+1]=='':
                    self.decomp[idx+1]=''
    def cleanDoubleCC(self):
        # c,cc,c,cc
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-3:
                if d== self.lang.connector and self.decomp[idx+1] in self.lang.consonants \
                   and self.decomp[idx+2]==self.lang.connector and self.decomp[idx+3] in self.lang.consonants:
                    if self.decomp[idx+3]==self.decomp[idx+1]:
                        self.decomp[idx]=None
                        self.decomp[idx+1]=None
    def cleanDoubleRef(self):
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-3:
                if d=='' and self.decomp[idx+1]==self.lang.connector\
                   and self.decomp[idx+2]=='' and self.decomp[idx+3]== self.lang.connector:
                    self.decomp[idx]=None
                    self.decomp[idx+1]=None
    def cleanConnectotForJoFola(self):
        for idx,d in enumerate(self.decomp):
            if idx<len(self.decomp)-2:
                if d== self.lang.connector and self.decomp[idx+1]=='' and self.decomp[idx+2]==self.lang.connector:
                    self.decomp[idx+2]=None
    def cleanInvalidConjunctDiacritics(self):
        '''
        cleans repeated folas
        # Example-1:
            (a)গ্র্রামকে==(b)গ্রামকে-->False
                (a) breaks as ['', '', '', '', '', '', '', '', '']
                (b) breaks as ['', '', '', '', '', '', '']
        '''
        self.safeop(self.fixTypoForJoFola)
        self.safeop(self.cleanDoubleCC)
        self.safeop(self.cleanDoubleRef)
        # self.safeop(self.fixRefOrder)
        # print("".join(self.decomp))
        # #self.safeop(self.fixOrdersForCC)
        #print("".join(self.decomp))
        self.safeop(self.cleanConnectotForJoFola)
        self.baseCompose()
    ##------------------------------------------------------------------------------------------------------
    def checkComplexRoot(self,root):
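        '''
        greedily grows conjunct clusters from a decomposed root: starting at each
        consonant, connector+consonant pairs are attached for as long as the
        combined cluster is still found in self.complex_roots; a connector whose
        combination falls outside that inventory is dropped from the returned
        string (the inventory itself comes from languages["bangla"].complex_roots)
        '''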
        formed=[]
        formed_idx=[]
        for i,c in enumerate(root):
            if c !='' and i not in formed_idx:
                r=c
                if i==len(root)-1:
                    formed.append(r)
                    continue
                for j in range(i+2,len(root),2):
                    d=root[j]
                    k=r+''+d
                    #if k==
                    if k not in self.complex_roots:
                        formed.append(r)
                        break
                    else:
                        if j!=len(root)-1:
                            r=k
                            formed_idx.append(j)
                        else:
                            r=k
                            formed_idx.append(j)
                            formed.append(k)
        return "".join(formed)
    def convertComplexRoots(self):
        self.fixNoSpaceChar()
        self.decomp=[x for x in self.decomp if x is not None]
        self.constructComplexDecomp()
        for idx,d in enumerate(self.decomp):
            if d not in self.complex_roots and self.lang.connector in d:
                self.decomp[idx]=self.checkComplexRoot(d)
    #-------------------------unicode ops-----------------------------------------------------------------------------
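#-------------------------------------------
# illustrative usage (a minimal sketch; it assumes the callable interface and the
# "normalized" result key implemented in BaseNormalizer from .base, which are not
# shown in this file)
#-------------------------------------------
if __name__ == "__main__":
    # the relative import above means this demo must be run as a module,
    # e.g. python -m bnunicodenormalizer.normalizer
    # build a normalizer that drops english characters and legacy symbols,
    # remapping legacy symbols with the default map from the language class
    norm = Normalizer(allow_english=False,
                      keep_legacy_symbols=False,
                      legacy_maps="default")
    # normalize a single word; the result is a dict whose "normalized" entry
    # holds the cleaned word (or None if the word could not be normalized)
    result = norm("বাংলা")
    print(result["normalized"])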