122 lines
3.7 KiB
Python
122 lines
3.7 KiB
Python
import re
|
||
|
||
import bangla
|
||
from bnnumerizer import numerize
|
||
from bnunicodenormalizer import Normalizer
|
||
|
||
# initialize
# Module-wide normalizer instance; normalize() calls it once per token.
bnorm = Normalizer()


# Substring -> replacement table consumed by expand_full_attribution().
# Entries are applied in insertion order using plain substring replacement,
# and each replacement value is passed through normalize() before use.
attribution_dict = {
    # Abbreviations (written with a trailing visarga) mapped to the full
    # phrase that should be used in their place.
    "সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
    "আঃ": "আলাইহিস সালাম",
    "রাঃ": "রাদিআল্লাহু আনহু",
    "রহঃ": "রহমাতুল্লাহি আলাইহি",
    "রহিঃ": "রহিমাহুল্লাহ",
    "হাফিঃ": "হাফিযাহুল্লাহ",
    "বায়ান": "বাইআন",
    # NOTE(review): this value holds two comma-separated phrases, so both
    # end up in the output; confirm that is intended.
    "দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
    # Disabled respelling rules, kept for reference.
    # "আয়াত" : "আইআত",#আইআত
    # "ওয়া" : "ওআ",
    # "ওয়াসাল্লাম" : "ওআসাল্লাম",
    # "কেন" : "কেনো",
    # "কোন" : "কোনো",
    # "বল" : "বলো",
    # "চল" : "চলো",
    # "কর" : "করো",
    # "রাখ" : "রাখো",
    # Curly single quotes map to the empty string, i.e. they are deleted.
    "’": "",
    "‘": "",
    # More disabled respelling rules.
    # "য়" : "অ",
    # "সম্প্রদায়" : "সম্প্রদাই",
    # "রয়েছে" : "রইছে",
    # "রয়েছ" : "রইছ",
    # NOTE(review): normalize() splits on whitespace and re-joins, so the
    # padding spaces in this value are not preserved when it is applied.
    "/": " বাই ",
}
|
||
|
||
|
||
def tag_text(text: str) -> str:
    """Collapse every run of ASCII spaces in ``text`` into a single space.

    The previous implementation wrapped the text in the sentinel words
    "start" and "end", split it on Arabic-script runs (U+0600-U+06FF) and
    then replaced each part with itself, which changes nothing. The only
    lasting effects were the space collapsing and the removal of *every*
    "start"/"end" substring, which corrupted legitimate input (for example
    "weekend" became "week"). The sentinels are therefore dropped and the
    input's content is preserved.

    Args:
        text: Input string; may mix scripts.

    Returns:
        ``text`` with each run of one or more spaces replaced by one space.
        No markers are inserted and no other characters are altered.
    """
    # Only the literal space character is collapsed here; tabs and newlines
    # are left untouched.
    return re.sub(" +", " ", text)
|
||
|
||
|
||
def normalize(sen):
    """Run the module-level ``bnorm`` normalizer over each token of ``sen``.

    The input is split on whitespace and every token is normalized on its
    own. Tokens whose "normalized" entry is ``None`` are discarded, and the
    survivors are joined back together with single spaces.
    """
    kept = []
    for token in sen.split():
        cleaned = bnorm(token)["normalized"]
        if cleaned is None:
            continue
        kept.append(cleaned)
    return " ".join(kept)
|
||
|
||
|
||
def expand_full_attribution(text):
    """Replace every ``attribution_dict`` key found in ``text``.

    Keys are tried in the dictionary's own order and matched as plain
    substrings. Each replacement value is passed through ``normalize``
    before it is substituted.

    NOTE(review): ``normalize`` splits on whitespace and re-joins, so any
    leading or trailing spaces in a replacement value are lost.
    """
    for abbreviation in attribution_dict:
        if abbreviation not in text:
            continue
        expansion = normalize(attribution_dict[abbreviation])
        text = text.replace(abbreviation, expansion)
    return text
|
||
|
||
|
||
def collapse_whitespace(text):
    """Return ``text`` with each run of whitespace replaced by one space.

    Leading and trailing whitespace is collapsed as well, not stripped.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)
|
||
|
||
|
||
# A colon that sits between two Bangla digits, optionally padded with spaces.
# Lookarounds keep the digits out of the match so chained forms such as
# "১:২:৩" have every colon rewritten.
_BN_DIGIT_COLON_RE = re.compile(r"(?<=[০-৯]) *: *(?=[০-৯])")
# Characters treated as sentence terminators.
_SENTENCE_END_RE = re.compile("[।!?]")


def bangla_text_to_phonemes(text: str) -> str:
    """Normalise Bangla text sentence by sentence.

    Despite the name, the visible code returns cleaned-up Bangla text rather
    than a phoneme sequence. The steps are:

    1. Convert ASCII digits to Bangla digits, if any are present.
    2. Rewrite a colon between two Bangla digits as " এর ".
    3. Spell out numbers with ``numerize`` and run ``tag_text``.
    4. Split on the sentence terminators ``।``, ``!`` and ``?``; for each
       segment remove newlines, normalise the tokens, expand the entries of
       ``attribution_dict`` and collapse whitespace.
    5. Re-join the segments, ending each with ``।``.

    Args:
        text: Raw input text.

    Returns:
        The processed text with every segment terminated by ``।``.

    NOTE(review): every split segment gets a terminator, including empty
    ones. Input that already ends with a terminator therefore yields a
    doubled ``।``, and empty input yields a lone ``।``. This is kept as-is
    because downstream consumers may rely on it.
    """
    # english numbers to bangla conversion
    if re.search("[0-9]", text) is not None:
        text = bangla.convert_english_digit_to_bangla_digit(text)

    # replace ':' in between two bangla numbers with ' এর '
    # The old character class also contained "," and " ", so a colon between
    # ordinary words was rewritten too; digits are now required on both sides.
    text = _BN_DIGIT_COLON_RE.sub(" এর ", text)

    # numerize text
    text = numerize(text)

    # tag sections
    text = tag_text(text)

    # Split based on sentence ending characters
    pieces = []
    for sent in _SENTENCE_END_RE.split(text.strip()):
        # Newlines are removed outright, not replaced by a space.
        res = sent.replace("\n", "")
        res = normalize(res)
        # expand attributes
        res = expand_full_attribution(res)
        res = collapse_whitespace(res)
        pieces.append(res + "।")
    return "".join(pieces)
|