# ai-content-maker/.venv/Lib/site-packages/bnunicodenormalizer/normalizer.py

#-*- coding: utf-8 -*-
"""
@author:Bengali.AI
"""
from __future__ import print_function
#-------------------------------------------
# globals
#-------------------------------------------
from .base import BaseNormalizer,languages
#-------------------------------------------
# cleaner class
#-------------------------------------------
class Normalizer(BaseNormalizer):
def __init__(self,
allow_english=False,
keep_legacy_symbols=False,
legacy_maps=None):
'''
initialize a normalizer
args:
allow_english : allow english letters, numbers and punctuations [default:False]
keep_legacy_symbols : legacy symbols will be considered as valid unicodes [default:False]
'৺':Isshar
'৻':Ganda
'ঀ':Anji (not '৭')
'ঌ':li
'ৡ':dirgho li
'ঽ':Avagraha
'ৠ':Vocalic Rr (not 'ঋ')
'৲':rupi
'৴':currency numerator 1
'৵':currency numerator 2
'৶':currency numerator 3
'৷':currency numerator 4
'৸':currency numerator one less than the denominator
'৹':Currency Denominator Sixteen
legacy_maps : a dictionary for changing legacy symbols into a more commonly used unicode
a default legacy map is included in the language class as well,
legacy_maps={'':'',
'':'',
'':'',
'':'',
'':'',
'':'',
'':''}
pass:
* legacy_maps=None; for keeping the legacy symbols as they are
* legacy_maps="default"; for using the default legacy map
* legacy_maps=custom dictionary (type: dict); which will map your desired legacy symbol to any symbol you want
* the keys in the custom dict must belong to the legacy symbols
* the values in the custom dict must belong to either vowels, consonants, numbers or diacritics
vowels = ['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ']
consonants = ['ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ',
'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন',
'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ',
'স', 'হ', 'ড়', 'ঢ়', 'য়', 'ৎ']
numbers = ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
vowel_diacritics = ['া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ']
consonant_diacritics = ['ঁ', 'ং', 'ঃ']
> for example you may want to map 'ঽ':Avagraha as 'হ' based on visual similarity
(default:'হ')
** legacy conditions: keep_legacy_symbols and legacy_maps operate as follows
case-1) keep_legacy_symbols=True and legacy_maps=None
: all legacy symbols will be considered valid unicodes. None of them will be changed
case-2) keep_legacy_symbols=True and legacy_maps=valid dictionary (example: {legacy_symbol:replacement})
: all legacy symbols will be considered valid unicodes. Only the symbols given as keys in the dictionary will be changed to their mapped values; the others will be untouched
case-3) keep_legacy_symbols=False and legacy_maps=None
: all legacy symbols will be removed
case-4) keep_legacy_symbols=False and legacy_maps=valid dictionary (example: {legacy_symbol:replacement})
: only the symbols given as keys in the dictionary will be changed to their mapped values; all other legacy symbols will be removed
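example usage (a minimal sketch; assumes the package's callable-instance
interface, where calling the normalizer on a single word returns a dict
with a "normalized" entry):
    from bnunicodenormalizer import Normalizer
    bnorm = Normalizer(keep_legacy_symbols=False, legacy_maps="default")
    result = bnorm("<bengali word>")
    print(result["normalized"])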
'''
if legacy_maps=="default":
legacy_maps=languages["bangla"].legacy_maps
self.complex_roots=languages["bangla"].complex_roots
super(Normalizer,self).__init__(language="bangla",
allow_english=allow_english,
keep_legacy_symbols=keep_legacy_symbols,
legacy_maps=legacy_maps)
#-------------------------------------------------extended ops----------------------
# assamese
self.assamese_map = {'ৰ':'র','ৱ':'ব'}
self.word_level_ops["AssameseReplacement"] = self.replaceAssamese
# punctuations
self.punctuations_map = {'“': '"',
'”' : '"',
'–' : '-',
'‘' : "'",
'’' : "'"}
self.word_level_ops["PunctuationReplacement"] = self.replacePunctuations
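# e.g. PunctuationReplacement maps curly quotes to straight quotes and a long
# dash to '-', per punctuations_map above, so “...” becomes "..." inside a word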
# to+hosonto case
'''
case-1: if 'ত'+hosonto is followed by anything other than a consonant, the word is an invalid word
case-2: 'ত'+hosonto should be replaced by 'ৎ' when followed by any consonant except: 'ত','থ','ন','ব','ম','য','র'
# code to verify this manually
for c in self.consonants:
print('ত'+ self.lang.connector+c)
'''
self.valid_consonants_after_to_and_hosonto = ['ত','থ','ন','ব','ম','য','র']
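# e.g. ত+্+স is rewritten as ৎ+স (as in উৎস), while ত+্+ত is left as a valid
# conjunct (as in উত্তর) since 'ত' is in the list above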
self.decomp_level_ops["base_bangla_compose"] = self.baseCompose
self.decomp_level_ops["ToAndHosontoNormalize"] = self.normalizeToandHosonto
# invalid folas
self.decomp_level_ops["NormalizeConjunctsDiacritics"] = self.cleanInvalidConjunctDiacritics
# complex root cleanup
self.decomp_level_ops["ComplexRootNormalization"] = self.convertComplexRoots
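# note: the decomp-level ops registered above are assumed to run in insertion
# order: base composition -> ত+hosonto normalization -> conjunct/diacritic
# cleanup -> complex root normalization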
#-------------------------word ops-----------------------------------------------------------------------------
def replaceAssamese(self):
self.replaceMaps(self.assamese_map)
def replacePunctuations(self):
self.replaceMaps(self.punctuations_map)
#-------------------------unicode ops-----------------------------------------------------------------------------
def cleanConsonantDiacritics(self):
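# drops a consonant diacritic (ঁ/ং/ঃ) that is immediately repeated and resolves
# the ordering/redundancy of two different consonant diacritics standing together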
# consonant diacritics
for idx,d in enumerate(self.decomp):
if idx<len(self.decomp)-1:
if d in self.lang.consonant_diacritics and self.decomp[idx+1] in self.lang.consonant_diacritics:
# if they are same delete the current one
if d==self.decomp[idx+1]:
self.decomp[idx]=None
elif d in ['', ''] and self.decomp[idx+1]=='':
self.swapIdxs(idx,idx+1)
elif d=='' and self.decomp[idx+1]== '':
self.decomp[idx+1]=None
elif d=='' and self.decomp[idx+1]== '':
self.decomp[idx+1]=None
def fixNoSpaceChar(self):
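# ZWNJ/ZWJ handling: a leading ZWNJ/ZWJ is dropped, any other ZWNJ is converted
# to ZWJ, and a ZWJ is then kept only inside the র + ZWJ + ্ + য sequence
# (merged into the preceding 'র'); every other ZWJ is removed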
# replace
for idx,d in enumerate(self.decomp):
if idx==0 and self.decomp[idx] in ["\u200c","\u200d"]:
self.decomp[idx]=None
else:
if self.decomp[idx]=="\u200c":
self.decomp[idx]="\u200d"
self.decomp=[x for x in self.decomp if x is not None]
# strict
for idx,d in enumerate(self.decomp):
if idx>0:
if self.decomp[idx]=="\u200d":
# last one
if idx==len(self.decomp)-1:
self.decomp[idx]=None
else:
# if previous one is a connector
if self.decomp[idx-1]==self.lang.connector:
self.decomp[idx]=None
self.decomp[idx-1]=None
# if previous one is not 'র'
elif self.decomp[idx-1]!='র':
self.decomp[idx]=None
else:
# if prev='র' and the prev-1 is not a connector
if idx>1 and self.decomp[idx-2]==self.lang.connector:
self.decomp[idx]=None
# if the next is not a connector
elif idx<len(self.decomp)-1 and self.decomp[idx+1]!=self.lang.connector:
self.decomp[idx]=None
# if the next one to connector is not "য"
elif idx<len(self.decomp)-2 and self.decomp[idx+2]!="য" and self.decomp[idx+1]!=self.lang.connector:
self.decomp[idx]=None
else:
# the actual allowed case
self.decomp[idx-1]+=self.decomp[idx]
self.decomp[idx]=None
self.decomp=[x for x in self.decomp if x is not None]
##------------------------------------------------------------------------------------------------------
def cleanInvalidConnector(self):
for idx,d in enumerate(self.decomp):
if idx<len(self.decomp)-1:
if d==self.lang.connector and self.decomp[idx+1]!="" and self.decomp[idx-1] not in ['','']: # exception
if self.decomp[idx-1] in self.lang.invalid_connectors or self.decomp[idx+1] in self.lang.invalid_connectors:
self.decomp[idx]=None
if d==self.lang.connector and self.decomp[idx-1]=="" and self.decomp[idx+1]!="":
self.decomp[idx]=None
if d==self.lang.connector and self.decomp[idx-1]=="" and self.decomp[idx+1] not in ['', '', '', '', '', '', '']:
self.decomp[idx]=None
# handle exception
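# the spelling 'এ্যা' is rewritten as the canonical 'অ্যা', and a bare 'অ্য' is completed to 'অ্যা'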
self.decomp=[d for d in self.decomp if d is not None]
word="".join(self.decomp)
if "এ্যা" in word:
word=word.replace("এ্যা","অ্যা")
if 'অ্য' in word:
word=word.replace('অ্য',"অ্যা")
self.decomp=[ch for ch in word]
def convertToAndHosonto(self):
'''
normalizes to+hosonto for ['ত','থ','ন','ব','ম','য','র']
# Example-1:
(a)পতি==(b)ৎপতি-->False
(a) breaks as ['', '', '', '', '', '', '', '', 'ি']
(b) breaks as ['', '', '', '', '', '', '', 'ি']
# Example-2:
(a)উত্স==(b)উৎস-->False
(a) breaks as ['উ', 'ত', '্', 'স']
(b) breaks as ['উ', 'ৎ', 'স']
'''
for idx,d in enumerate(self.decomp):
if idx<len(self.decomp)-1:
# to + hosonto
if d=='ত' and self.decomp[idx+1]== self.lang.connector:
# for single case
if idx<len(self.decomp)-2:
if self.decomp[idx+2] not in self.valid_consonants_after_to_and_hosonto:
# replace
self.decomp[idx]='ৎ'
# delete
self.decomp[idx+1]=None
else:
# valid replacement for to+hos double case
if idx<len(self.decomp)-3:
if self.decomp[idx+2]=='ত' and self.decomp[idx+3]== self.lang.connector:
if idx<len(self.decomp)-4:
if self.decomp[idx+4] not in ['ত','থ','ন','ম']:
# replace only if the character after the double ত+hosonto+ত+hosonto is not within ['ত','থ','ন','ম']
# replace
self.decomp[idx]='ৎ'
# delete
self.decomp[idx+1]=None
if idx<len(self.decomp)-4:
if self.decomp[idx+4]=='':
# delete
self.decomp[idx+3]=None
def swapToAndHosontoDiacritics(self):
'''
puts diacritics in right place
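a diacritic that ends up right after 'ৎ' is swapped to the other side, since
'ৎ' normally does not take a diacritic; the sign then attaches to the preceding character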
'''
for idx,d in enumerate(self.decomp):
if idx<len(self.decomp)-1:
if d=='ৎ' and self.decomp[idx+1] in self.lang.diacritics:
self.swapIdxs(idx,idx+1)
###------------------------------------------------------------------------------------------------------
def normalizeToandHosonto(self):
self.safeop(self.convertToAndHosonto)
self.safeop(self.swapToAndHosontoDiacritics)
self.baseCompose()
###------------------------------------------------------------------------------------------------------
##------------------------------------------------------------------------------------------------------
def cleanVowelDiacriticComingAfterVowel(self):
'''
takes care of vowels and modifier followed by vowel diacritics
# Example-1:
(a)==(b)উল-->False
(a) breaks as ['', '', '', '']
(b) breaks as ['', '', '']
# Example-2:
(a)আরিলজি==(b)আরিওলজি-->False
(a) breaks as ['', '', '', '', 'ি', '', '', '', '', 'ি']
(b) breaks as ['', '', '', '', 'ি', '', '', '', 'ি']
Also Normalizes 'এ' and 'ত্র'
# Example-1:
(a)একএে==(b)একত্রে-->False
(a) breaks as ['এ', 'ক', 'এ', 'ে']
(b) breaks as ['এ', 'ক', 'ত', '্', 'র', 'ে']
'''
for idx,d in enumerate(self.decomp):
# if the current one is a VD and the previous char is a vowel
if d in self.lang.vowel_diacritics and self.decomp[idx-1] in self.lang.vowels:
# if the vowel is not 'এ'
if self.decomp[idx-1] !='এ':
# remove diacritic
self.decomp[idx]=None
# normalization case
else:
self.decomp[idx-1]='ত'+'্'+'র'
##------------------------------------------------------------------------------------------------------
def fixTypoForJoFola(self):
for idx,d in enumerate(self.decomp):
if idx<len(self.decomp)-1:
if d== self.lang.connector and self.decomp[idx+1]=='য়':
self.decomp[idx+1]='য'
def cleanDoubleCC(self):
# pattern: connector + consonant + connector + same consonant (a doubled fola); drop the first connector+consonant pair
for idx,d in enumerate(self.decomp):
if idx<len(self.decomp)-3:
if d== self.lang.connector and self.decomp[idx+1] in self.lang.consonants \
and self.decomp[idx+2]==self.lang.connector and self.decomp[idx+3] in self.lang.consonants:
if self.decomp[idx+3]==self.decomp[idx+1]:
self.decomp[idx]=None
self.decomp[idx+1]=None
def cleanDoubleRef(self):
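# a doubled reph (র + ্ + র + ্) keeps only the second র + ্ pair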
for idx,d in enumerate(self.decomp):
if idx<len(self.decomp)-3:
if d=='র' and self.decomp[idx+1]==self.lang.connector\
and self.decomp[idx+2]=='র' and self.decomp[idx+3]== self.lang.connector:
self.decomp[idx]=None
self.decomp[idx+1]=None
def cleanConnectotForJoFola(self):
for idx,d in enumerate(self.decomp):
if idx<len(self.decomp)-2:
if d== self.lang.connector and self.decomp[idx+1]=='য' and self.decomp[idx+2]==self.lang.connector:
self.decomp[idx+2]=None
def cleanInvalidConjunctDiacritics(self):
'''
cleans repeated folas
# Example-1:
(a)মক==(b)মক-->False
(a) breaks as ['', '', '', '', '', '', '', '', '']
(b) breaks as ['', '', '', '', '', '', '']
'''
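# order of sub-steps below: jo-fola typo fix -> doubled fola removal -> doubled
# reph removal -> stray connector after jo-fola removal -> recompose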
self.safeop(self.fixTypoForJoFola)
self.safeop(self.cleanDoubleCC)
self.safeop(self.cleanDoubleRef)
# self.safeop(self.fixRefOrder)
# print("".join(self.decomp))
# #self.safeop(self.fixOrdersForCC)
#print("".join(self.decomp))
self.safeop(self.cleanConnectotForJoFola)
self.baseCompose()
##------------------------------------------------------------------------------------------------------
def checkComplexRoot(self,root):
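# greedily grows a conjunct from the connector-joined cluster: characters are merged
# with '্' as long as the grown string stays in self.complex_roots; pieces that do not
# form a known complex root are emitted without the joining '্'
# illustration (hypothetical cluster): 'ক'+'্'+'ষ'+'্'+'ম' is kept together only if both
# 'ক্ষ' and 'ক্ষ্ম' are present in self.complex_roots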
formed=[]
formed_idx=[]
for i,c in enumerate(root):
if c !='্' and i not in formed_idx:
r=c
if i==len(root)-1:
formed.append(r)
continue
for j in range(i+2,len(root),2):
d=root[j]
k=r+'্'+d
#if k==
if k not in self.complex_roots:
formed.append(r)
break
else:
if j!=len(root)-1:
r=k
formed_idx.append(j)
else:
r=k
formed_idx.append(j)
formed.append(k)
return "".join(formed)
def convertComplexRoots(self):
self.fixNoSpaceChar()
self.decomp=[x for x in self.decomp if x is not None]
self.constructComplexDecomp()
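# assumption: constructComplexDecomp (from BaseNormalizer) regroups connector-joined
# characters into single cluster strings so that each candidate conjunct below can be
# checked and normalized as a whole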
for idx,d in enumerate(self.decomp):
if d not in self.complex_roots and self.lang.connector in d:
self.decomp[idx]=self.checkComplexRoot(d)
#-------------------------unicode ops-----------------------------------------------------------------------------