ai-content-maker/.venv/Lib/site-packages/bnunicodenormalizer/base.py

306 lines
14 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
#-*- coding: utf-8 -*-
"""
@author:Bengali.AI
"""
from __future__ import print_function
#-------------------------------------------
from .langs import languages
from bnunicodenormalizer import langs
#-------------------------------------------
# cleaner class
#-------------------------------------------
class BaseNormalizer(object):
def __init__(self,
             language,
             allow_english=False,
             keep_legacy_symbols=False,
             legacy_maps=None):
    '''
    Create a normalizer for one language.

    args:
        language            : language identifier/name (str)
        allow_english       : allow english letters, numbers and punctuations [default:False] (bool)
        keep_legacy_symbols : treat legacy symbols as valid unicodes [default:False] (bool)
        legacy_maps         : dictionary for changing legacy symbols into a more used unicode [default:None] (dict/None)
    '''
    # ---- argument validation (assertion messages preserved verbatim) ----
    assert type(language)==str,"language is not string type!!!"
    assert language in languages.keys(),"Language is not available"
    assert type(allow_english)==bool,"allow_english is not of type boolean [True/False]"
    assert type(keep_legacy_symbols)==bool,"keep_legacy_symbols is not of type boolean [True/False]"
    self.lang=languages[language]
    if legacy_maps is not None:
        assert type(legacy_maps)==dict,"legacy_maps is not of type dict or None"
        assert len(legacy_maps.keys())>0,"legacy_maps is an empty dict"
        # every mapped symbol must be a known legacy symbol and every
        # replacement must be a unicode the language actually uses
        for sym,rep in legacy_maps.items():
            assert sym in self.lang.legacy_symbols,f"{sym} is not a legacy symbol.See README.md initialization section for legacy symbols"
            assert rep in self.lang.used,f"{rep} is not a valid legacy map.See README.md initialization section for legacy symbols"
    # ---- base assignments ----
    self.valid=self.lang.valid
    self.roots=self.lang.complex_roots
    self.legacy_maps=legacy_maps
    # ---- optionally widen the valid unicode / root sets ----
    if allow_english:
        extra=languages["english"].valid
        self.valid=sorted(set(self.valid)|set(extra))
        self.roots=sorted(set(self.roots)|set(extra))
    if keep_legacy_symbols:
        legacy=self.lang.legacy_symbols
        self.valid=sorted(set(self.valid)|set(legacy))
        self.roots=sorted(set(self.roots)|set(legacy))
    # word level operations run on the raw string (order matters)
    self.word_level_ops={"LegacySymbols"   :self.mapLegacySymbols,
                         "BrokenDiacritics":self.fixBrokenDiacritics}
    # unicode level operations run on the character decomposition (order matters)
    self.decomp_level_ops={"BrokenNukta"              :self.fixBrokenNukta,
                           "InvalidUnicode"           :self.cleanInvalidUnicodes,
                           "InvalidConnector"         :self.cleanInvalidConnector,
                           "FixDiacritics"            :self.cleanDiacritics,
                           "VowelDiacriticAfterVowel" :self.cleanVowelDiacriticComingAfterVowel}
#-------------------------common ops-----------------------------------------------------------------------------
def checkDecomp(self):
    '''Return True while the decomposition still holds at least one entry.'''
    return len(self.decomp)>0
def replaceMaps(self,map_dict):
    '''Apply every key->value substring substitution in map_dict to self.word.'''
    for old,new in map_dict.items():
        self.word=self.word.replace(old,new)
def swapIdxs(self,idx1,idx2):
    '''Exchange the decomposition entries at idx1 and idx2 in place.'''
    self.decomp[idx1],self.decomp[idx2]=self.decomp[idx2],self.decomp[idx1]
def safeop(self,op):
    '''
    Execute a decomposition operation with normalization around it.

    args:
        op : a zero-argument callable that mutates self.decomp

    self.decomp is flattened both before and after the call:
    None entries (markers for deleted unicodes) are dropped and any
    multi-character entries are split back into single characters
    (joining then listing the string re-splits them).
    '''
    # removed: dead commented-out IndexError handler and a vacuous
    # "if x is not None" filter over the characters of a string
    self.decomp=list("".join(d for d in self.decomp if d is not None))
    op()
    self.decomp=list("".join(d for d in self.decomp if d is not None))
def constructComplexDecomp(self):
    '''
    creates grapheme root based decomp

    Every connector joins its two neighbouring entries; overlapping
    connector triplets are merged transitively so chained clusters
    (e.g. c + connector + c + connector + c) collapse into a single
    grapheme-root string.
    '''
    if self.lang.connector not in self.decomp:
        return
    conn_positions=[pos for pos,ch in enumerate(self.decomp) if ch==self.lang.connector]
    # each connector spans [previous char, connector, next char]
    clusters=[[pos-1,pos,pos+1] for pos in conn_positions]
    # transitively merge index groups that share any position
    merged=[]
    while clusters:
        current,*pending=clusters
        current=set(current)
        prev_size=-1
        while len(current)>prev_size:
            prev_size=len(current)
            keep=[]
            for grp in pending:
                if current & set(grp):
                    current|=set(grp)
                else:
                    keep.append(grp)
            pending=keep
        merged.append(sorted(current))
        clusters=pending
    # collapse each merged index group into one combined entry at its
    # last position; the other positions are marked deleted (None)
    for idx_group in merged:
        combined="".join(self.decomp[i] for i in idx_group)
        for i in idx_group:
            self.decomp[i]=combined if i==idx_group[-1] else None
    self.decomp=[entry for entry in self.decomp if entry is not None]
#-------------------------common ops-----------------------------------------------------------------------------
#-------------------------word ops-----------------------------------------------------------------------------
def mapLegacySymbols(self):
    '''Substitute legacy symbols in self.word when a mapping was configured.'''
    if self.legacy_maps is None:
        return
    self.replaceMaps(self.legacy_maps)
def fixBrokenDiacritics(self):
    '''Repair broken diacritic sequences in self.word via the language diacritic map.'''
    if self.lang.diacritic_map is None:
        return
    self.replaceMaps(self.lang.diacritic_map)
#-------------------------word ops-----------------------------------------------------------------------------
#-------------------------unicode ops-----------------------------------------------------------------------------
def fixBrokenNukta(self):
    '''
    Fold a loose nukta mark into the nearest preceding mappable
    character: that character is replaced with its nukta_map form and
    the loose nukta entry is deleted (set to None).
    '''
    nukta_map=self.lang.nukta_map
    if nukta_map is None:
        return
    for pos,ch in enumerate(self.decomp):
        if ch!=self.lang.nukta:
            continue
        # scan backwards for the first character that has a nukta form
        for back in range(pos-1,-1,-1):
            if self.decomp[back] in nukta_map:
                self.decomp[back]=nukta_map[self.decomp[back]]
                self.decomp[pos]=None
                break
def cleanInvalidUnicodes(self):
    '''
    Trim invalid-start unicodes from the front and dangling connectors
    from the back, then mark every remaining out-of-vocabulary unicode
    as deleted (None).
    '''
    while self.decomp and self.decomp[0] in self.lang.invalid_starts:
        del self.decomp[0]
    while self.decomp and self.decomp[-1]==self.lang.connector:
        del self.decomp[-1]
    for pos,ch in enumerate(self.decomp):
        if ch not in self.valid:
            self.decomp[pos]=None
def cleanInvalidConnector(self):
    '''
    Delete a connector whose neighbour on either side is an invalid
    connector context. NOTE: for a connector at index 0 the "previous"
    neighbour wraps around to the last entry (original behaviour, kept).
    '''
    last=len(self.decomp)-1
    bad=self.lang.invalid_connectors
    for pos,ch in enumerate(self.decomp):
        if pos>=last or ch!=self.lang.connector:
            continue
        if self.decomp[pos-1] in bad or self.decomp[pos+1] in bad:
            self.decomp[pos]=None
def cleanVowelDiacritics(self):
    '''
    Collapse two adjacent vowel diacritics: an identical pair keeps the
    later one, a differing pair keeps the earlier one.
    '''
    vds=self.lang.vowel_diacritics
    for pos,ch in enumerate(self.decomp):
        if pos>=len(self.decomp)-1:
            continue
        nxt=self.decomp[pos+1]
        if ch in vds and nxt in vds:
            if ch==nxt:
                self.decomp[pos]=None
            else:
                self.decomp[pos+1]=None
def cleanConsonantDiacritics(self):
    '''Delete the first of two identical adjacent consonant diacritics (differing pairs are left alone).'''
    cds=self.lang.consonant_diacritics
    for pos,ch in enumerate(self.decomp):
        # an equal neighbour implies both are the same consonant diacritic
        if pos<len(self.decomp)-1 and ch in cds and self.decomp[pos+1]==ch:
            self.decomp[pos]=None
def fixDiacriticOrder(self):
    '''Swap a consonant diacritic that directly precedes a vowel diacritic so the vowel diacritic comes first.'''
    for pos in range(len(self.decomp)-1):
        if self.decomp[pos] in self.lang.consonant_diacritics and self.decomp[pos+1] in self.lang.vowel_diacritics:
            self.swapIdxs(pos,pos+1)
def cleanNonCharDiacs(self):
    '''Delete a diacritic that directly follows a non-character unicode.'''
    for pos in range(1,len(self.decomp)):
        if self.decomp[pos] in self.lang.diacritics and self.decomp[pos-1] in self.lang.non_chars:
            self.decomp[pos]=None
def cleanDiacritics(self):
    '''Run all diacritic repair passes in their fixed order, each wrapped in safeop.'''
    for repair in (self.cleanVowelDiacritics,
                   self.cleanConsonantDiacritics,
                   self.fixDiacriticOrder,
                   self.cleanNonCharDiacs):
        self.safeop(repair)
def cleanVowelDiacriticComingAfterVowel(self):
    '''
    Delete a vowel diacritic that sits right after a full vowel.
    NOTE: at index 0 the "previous" entry wraps around to the last
    element (original indexing behaviour, deliberately kept).
    '''
    vds=self.lang.vowel_diacritics
    vowels=self.lang.vowels
    for pos,ch in enumerate(self.decomp):
        if ch not in vds:
            continue
        if self.decomp[pos-1] in vowels:
            self.decomp[pos]=None
def fixNoSpaceChar(self):
    '''
    Repair zero-width joiner/non-joiner (\u200d / \u200c) placement:
    * a zero-width char at position 0 is deleted
    * a zero-width char followed by a connector or another zero-width
      char deletes that follower
    * a zero-width char preceded by a deleted entry (None) or another
      zero-width char is itself deleted
    '''
    zw=("\u200c","\u200d")
    follow_bad=[self.lang.connector]+list(zw)
    prev_bad=[None]+list(zw)
    last=len(self.decomp)-1
    for pos in range(len(self.decomp)):
        cur=self.decomp[pos]
        if pos==0:
            if cur in zw:
                self.decomp[0]=None
            continue
        if cur not in zw:
            continue
        if pos<last and self.decomp[pos+1] in follow_bad:
            self.decomp[pos+1]=None
        if self.decomp[pos-1] in prev_bad:
            self.decomp[pos]=None
#----------------------composite ops-----------------------------------------------------------------------
def baseCompose(self):
    '''Run the final composite cleanup passes in their fixed order, each wrapped in safeop.'''
    for stage in (self.cleanInvalidUnicodes,
                  self.cleanInvalidConnector,
                  self.cleanDiacritics,
                  self.cleanVowelDiacriticComingAfterVowel,
                  self.fixNoSpaceChar):
        self.safeop(stage)
#----------------------entry-----------------------------------------------------------------------
def __call__(self,word):
'''
normalizes a given word
args:
word : the string to normalize
returns:
a dictionary-
* "given" = provided text
* "normalized = normalized text (gives None if during the operation length of the text becomes 0)
* "ops" = list of operations (dictionary) that were executed in given text to create normalized text
* each dictionary in ops has:
* "operation": the name of the operation / problem in given text
* "before" : what the text looked like before the specific operation
* "after" : what the text looks like after the specific operation
'''
# algorithm:
# * execute word level ops
# > create unicode based decomp
# * execute unicode level ops
# > create root based decomp
# * execute root level ops
details=[]
self.check=word
if not isinstance(self.check, str):
raise TypeError("The provided argument/ word is not a string")
if len(self.check.strip().split(" "))>1:
raise ValueError(f"The provided string has hultiple words.Make sure no space exists in the middle of the text.probable word:{self.check.replace(' ','')}")
self.word=word
#---------------------------------------------word ops-------------------------
for op_id,op in self.word_level_ops.items():
word_before_op=self.word[:]
op()
word_after_op=self.word[:]
if word_before_op!=word_after_op:
details.append({"operation":op_id,"before":word_before_op,"after":word_after_op})
#---------------------------------------------word ops-------------------------
self.decomp=[ch for ch in self.word]
#---------------------------------------------unicode ops-------------------------
for op_id,op in self.decomp_level_ops.items():
word_before_op="".join(self.decomp)
# execute
self.safeop(op)
# check length
if not self.checkDecomp():
return {"normalized":None,"given":self.check,"ops":details}
word_after_op="".join(self.decomp)
if word_before_op!=word_after_op:
details.append({"operation":op_id,"before":word_before_op,"after":word_after_op})
#----------------------------------------------------------------------------------------------------------
self.safeop(self.baseCompose)
self.word="".join([x for x in self.decomp if x is not None])
return {"normalized":self.word,"given":self.check,"ops":details}