#-*- coding: utf-8 -*-
"""
@author:Bengali.AI
Indic scripts general reference: https://r12a.github.io/scripts/
"""
from __future__ import print_function
#--------------------------------------------------------------------------------------------------------------------------------------------
# language classes
#--------------------------------------------------------------------------------------------------------------------------------------------
class english:
lower = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
upper = ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"]
punctuations = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?',
'@', '[', '\\', ']', '^', '_', '`','{', '|', '}', '~']
numbers = ["0","1","2","3","4","5","6","7","8","9"]
valid = sorted(lower+upper+numbers+punctuations)
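#---------------------------------------------------illustrative example--------------------------------------------------------
# A minimal sketch (not part of the upstream bnunicodenormalizer API) of how the
# `valid` whitelist above can be used. The helper name `_demo_is_valid_english`
# is hypothetical; it only checks character membership and ignores spaces.
def _demo_is_valid_english(text):
    """Return True if every non-space character of `text` appears in english.valid."""
    return all(ch in english.valid for ch in text if ch != " ")
# e.g. _demo_is_valid_english("Hello, World!") -> True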
#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class bangla:
'''
* vowel and consonant division according to : http://bn.wikipedia.org/wiki/%E0%A7%8E
* consonant conjuncts according to: http://bn.wiktionary.org/wiki/উইকিঅভিধান:বাংলা_যুক্তবর্ণের_তালিকা
* punctuations according to: https://bn.wikipedia.org/wiki/যতিচিহ্ন
'''
#-----------------------------------------------------basic-------------------------------------------------------------------
iden = "bangla"
nukta = '়'
vowels = ['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ']
consonants = ['ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন',
'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', 'ড়', 'ঢ়', 'য়', 'ৎ']
vowel_diacritics = ['া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ']
consonant_diacritics = ['ঁ', 'ং', 'ঃ']
numbers = ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
punctuations = ['!', '"', "'", '(', ')', ',', '-', '.', '...', ':', ':-', ';', '<', '=', '>', '?', '[', ']', '{', '}', '', '', '', '', '', '']
symbols = ['']
connector = '্'
# based on unicode range : \u0980-\u09FF
non_gylph_unicodes = ['\u0984', '\u098d','\u098e','\u0991','\u0992','\u09a9','\u09b1','\u09b3','\u09b4','\u09b5',
'\u09ba','\u09bb', '\u09c5','\u09c6','\u09c9','\u09ca','\u09cf','\u09d0','\u09d1','\u09d2',
'\u09d3','\u09d4','\u09d5','\u09d6', '\u09d8','\u09d9','\u09da','\u09db','\u09de', '\u09e4',
'\u09e5', '','','','\u09ff']
legacy_symbols = ['','','','','','','','','','','','','','']
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
#-----------------------------------------------------basic-------------------------------------------------------------------
#---------------------------------------------------changeables----------------------------------------------------------------
conjuncts = ['এ্য','অ্য','ক্ক','ক্ট','ক্ট্য','ক্ট্র','ক্ত','ক্ত্র','ক্ব','ক্ম','ক্য','ক্র','ক্র্য','ক্ল','ক্ল্য','ক্ষ','ক্ষ্ণ','ক্ষ্ব','ক্ষ্ম','ক্ষ্ম্য','ক্ষ্য',
'ক্স','ক্স্য','খ্য','খ্র','গ্ণ','গ্ধ','গ্ধ্য','গ্ধ্র','গ্ন','গ্ন্য','গ্ব','গ্ম','গ্য','গ্র','গ্র্য','গ্ল','গ্ল্য','ঘ্ন','ঘ্য','ঘ্র',
'ঙ্ক','ঙ্ক্ত','ঙ্ক্য','ঙ্ক্ষ','ঙ্খ','ঙ্খ্য','ঙ্গ','ঙ্গ্য','ঙ্ঘ','ঙ্ঘ্য','ঙ্ঘ্র','ঙ্ম','চ্চ','চ্ছ','চ্ছ্ব','চ্ছ্র','চ্ঞ','চ্ব','চ্য','জ্জ',
'জ্জ্ব','জ্ঝ','জ্ঞ','জ্ব','জ্য','জ্র','ঞ্চ','ঞ্ছ','ঞ্জ','ঞ্ঝ','ট্ট','ট্ব','ট্ম','ট্য','ট্র','ট্র্য','ড্ড','ড্ব','ড্য','ড্র',
'ড্র্য','ঢ্য','ঢ্র','ণ্ট','ণ্ঠ','ণ্ঠ্য','ণ্ড','ণ্ড্য','ণ্ড্র','ণ্ঢ','ণ্ণ','ণ্ব','ণ্ম','ণ্য','ত্ত','ত্ত্ব','ত্ত্য','ত্থ','ত্ন','ত্ব',
'ত্ম','ত্ম্য','ত্য','ত্র','ত্র্য','থ্ব','থ্য','থ্র','থ্র্য','দ্গ','দ্ঘ','দ্দ','দ্দ্ব','দ্ধ','দ্ব','দ্ভ','দ্ভ্র','দ্ম','দ্য','দ্র',
'দ্র্য','ধ্ন','ধ্ব','ধ্ম','ধ্য','ধ্র','ন্ক','ন্ট','ন্ট্য','ন্ট্র','ন্ট্র্য','ন্ঠ','ন্ড','ন্ড্ব','ন্ড্য','ন্ড্র','ন্ত','ন্ত্ব','ন্ত্য','ন্ত্র',
'ন্ত্র্য','ন্থ','ন্থ্য','ন্থ্র','ন্দ','ন্দ্ব','ন্দ্য','ন্দ্র','ন্ধ','ন্ধ্য','ন্ধ্র','ন্ন','ন্ব','ন্ম','ন্য','ন্শ্য','ন্স','ন্স্য','প্ট','প্ট্য',
'প্ত','প্ন','প্প','প্য','প্র','প্র্য','প্ল','প্ল্য','প্স','ফ্য','ফ্র','ফ্র্য','ফ্ল','ফ্ল্য','ব্জ','ব্দ','ব্ধ','ব্ব','ব্য','ব্র',
'ব্র্য','ব্ল','ভ্ব','ভ্য','ভ্র','ম্ন','ম্ন্য','ম্প','ম্প্য','ম্প্র','ম্ফ','ম্ব','ম্ব্র','ম্ভ','ম্ভ্র','ম্ম','ম্য','ম্র','ম্ল','য্য',
'র্ক','র্ক্ট','র্ক্য','র্খ','র্গ','র্গ্য','র্গ্র','র্ঘ','র্ঘ্য','র্চ','র্চ্য','র্ছ','র্জ','র্জ্ঞ','র্জ্য','র্ঝ','র্ট','র্ট্য','র্ট্র','র্ড',
'র্ড্র','র্ঢ্য','র্ণ','র্ণ্য','র্ত','র্ত্ম','র্ত্য','র্ত্র','র্থ','র্থ্য','র্দ','র্দ্ব','র্দ্র','র্ধ','র্ধ্ব','র্ন','র্ন্ড','র্প','র্প্ট','র্প্ল',
'র্ফ','র্ব','র্ব্য','র্ভ','র্ম','র্ম্থ','র্ম্প','র্ম্য','র্য','র্ল','র্ল্ড','র্ল্য','র্শ','র্শ্ব','র্শ্য','র্ষ','র্ষ্য','র্স','র্স','র্স্ট',
'র্স্ম','র্স্য','র্হ','র্হ্য','র্হ্য','র‍্য','ল্ক','ল্ক্য','ল্গ','ল্চ','ল্ট','ল্ট্য','ল্ট্র','ল্ড','ল্ড্য','ল্ড্র','ল্প','ল্ফ','ল্ব','ল্ব্য',
'ল্ভ','ল্ম','ল্য','ল্ল','শ্চ','শ্ছ','শ্ন','শ্ব','শ্ম','শ্য','শ্র','শ্র্য','শ্ল','ষ্ক','ষ্ক্র','ষ্ট','ষ্ট্য','ষ্ট্র','ষ্ঠ','ষ্ঠ্য',
'ষ্ণ','ষ্প','ষ্প্র','ষ্ফ','ষ্ব','ষ্ম','ষ্য','স্ক','স্ক্য','স্ক্র','স্ক্র্য','স্খ','স্চ','স্ট','স্ট্য','স্ট্র','স্ট্র্য','স্ত','স্ত্ব','স্ত্য',
'স্ত্র','স্থ','স্থ্য','স্ন','স্ন্য','স্প','স্প্য','স্প্র','স্প্র্য','স্প্ল','স্প্ল্য','স্ফ','স্ব','স্ম','স্ম্য','স্য','স্র','স্ল','স্ল্য','হ্ণ',
'হ্ন','হ্ব','হ্ম','হ্য','হ্র','হ্ল','য়্য','ব্ল্য','র্ন্ত','ঠ্য','ভ্ল']
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {'':'',
'':'',
'':'',
'':'',
'':'',
'':'',
'':''}
#---------------------------------------------------changeables----------------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'':'',
'':'',
'':'',
'':''}
diacritic_map = {'ো':'',
'ৌ':'',
'অা':'',
'':''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
'''
a connector cannot be surrounded by / cannot come immediately after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
* khanda ta (ৎ)
'''
invalid_connectors = sorted(invalid_starts+vowels+['ৎ']+numbers+punctuations)
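#---------------------------------------------------illustrative example--------------------------------------------------------
# A minimal sketch (not part of the upstream bnunicodenormalizer API) showing how
# `invalid_starts` and `invalid_connectors` encode the rules documented above:
# a word may not begin with a diacritic or the connector, and the connector
# (hosonto) may not sit next to any character in `invalid_connectors`.
# The helper name `_demo_check_bangla_word` is hypothetical.
def _demo_check_bangla_word(word):
    """Return False if `word` breaks the start rule or the connector rule."""
    if not word:
        return True
    if word[0] in bangla.invalid_starts:
        return False
    for idx, ch in enumerate(word):
        if ch == bangla.connector:
            prev_ch = word[idx - 1] if idx > 0 else None
            next_ch = word[idx + 1] if idx + 1 < len(word) else None
            if prev_ch in bangla.invalid_connectors or next_ch in bangla.invalid_connectors:
                return False
    return True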
class devanagari:
'''
* vowel and consonant division according to: https://unicode-table.com/en/blocks/devanagari/
* consonant conjuncts according to: https://en.wikipedia.org/wiki/Devanagari_conjuncts
* punctuations according to: https://www.learnsanskrit.org/guide/devanagari/numerals-and-punctuation/
'''
#-----------------------------------------------------basic-------------------------------------------------------------------
iden = "devanagari"
nukta = '़'
vowels = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '','', '', '', '','','', '', '','', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '','', '', '', '', '', '', '', '','', '', '']
vowel_diacritics = ['', '','', 'ि', '', '', '', '', '', '', '', '', '', '', '', '', '','', '','','', '']
consonant_diacritics = ['', '', '', '']
numbers = ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९']
punctuations = ['', '', ':', ';', '!', '', '?', '']
symbols = []
connector = '्'
non_gylph_unicodes = []
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
#-----------------------------------------------------basic-------------------------------------------------------------------
#---------------------------------------------------changeables----------------------------------------------------------------
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------changeables----------------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'':'',
'':'',
'':'',
'':'',
'':'',
'':'',
'':'',
'':'',
'':'',
'':''}
diacritic_map = {'ाे':'',
'ाै':'',
'अा':'',
'अो':'',
'अौ': '',
'एे': ''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
'''
a connector cannot be surrounded by / cannot come immediately after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
'''
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
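#---------------------------------------------------illustrative example--------------------------------------------------------
# A minimal sketch (not part of the upstream bnunicodenormalizer API) of how the
# normalization maps above can be applied: each key is a decomposed spelling and
# each value its canonical form, so a chain of str.replace calls is enough for a
# naive pass. The helper name `_demo_apply_maps` is hypothetical; the real
# normalizer may resolve these cases differently.
def _demo_apply_maps(text, language):
    """Replace every key of the language's nukta/diacritic/legacy maps with its value."""
    for mapping in (language.nukta_map, language.diacritic_map, language.legacy_maps):
        for decomposed, composed in mapping.items():
            if not decomposed:  # defensively skip empty keys
                continue
            text = text.replace(decomposed, composed)
    return text
# e.g. _demo_apply_maps(some_text, devanagari)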
#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class gujarati:
'''
* vowel and consonant division according to : https://unicode-table.com/en/blocks/gujarati/
* consonant conjuncts according to: https://en.wikipedia.org/wiki/Gujarati_script
* punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
'''
#-----------------------------------------------------basic-------------------------------------------------------------------
iden = "gujarati"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '', '', '', '','', '', '', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '','']
vowel_diacritics = ['', 'િ', '', '', '', '', '', '', '', '', '', '', '']
consonant_diacritics = ['', '', '']
numbers = ['૦', '૧', '૨', '૩', '૪', '૫', '૬', '૭', '૮', '૯']
punctuations = ['',',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = ['']
connector = '્'
# based on unicode range : \u0A80-\u0AFF
non_gylph_unicodes = ['\u0a80', '\u0a84', '\u0aa9', '\u0ab1', '\u0ab4', '\u0aba', '\u0abb', '\u0ac6', '\u0aca', '\u0ace', '\u0acf',
'\u0ad1', '\u0ad2', '\u0ad3', '\u0ad4', '\u0ad5', '\u0ad6', '\u0ad7', '\u0ad8', '\u0ad9', '\u0ada', '\u0adb',
'\u0adc', '\u0add', '\u0ade', '\u0adf', '\u0ae4', '\u0ae5', '\u0af1', '\u0af2', '\u0af3', '\u0af4', '\u0af5',
'\u0af6', '\u0af7', '\u0af8']
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
#-----------------------------------------------------basic-------------------------------------------------------------------
#---------------------------------------------------changeables----------------------------------------------------------------
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------changeables----------------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {} # NONE
diacritic_map = {'ાે': '',
'ાૅ': '',
'ાૈ': '',
'અા': '',
'અે': '',
'અો': '',
'અૅ': '',
'અૉ': '',
'અૈ': '',
'અૌ': ''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
'''
a connector cannot be surrounded by / cannot come immediately after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
'''
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class odiya:
'''
* vowel and consonant division according to : https://unicode-table.com/en/blocks/oriya/
* consonant conjuncts according to: https://en.wikipedia.org/wiki/Odia_script
* punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
'''
#-----------------------------------------------------basic-------------------------------------------------------------------
iden = "odiya"
nukta = '଼'
vowels = ['', '', '', '', '', '', '', '', '', '', '', '','', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '','', '', '','']
vowel_diacritics = ['', 'ି', '', '', '', '', '', '', '','', '','', '']
consonant_diacritics = ['', '', '']
numbers = ['୦', '୧', '୨', '୩', '୪', '୫', '୬', '୭', '୮', '୯']
punctuations = [',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = ['', '', '', '', '', '']
connector = '୍'
# based on unicode range : \u0B00-\u0B7F
non_gylph_unicodes = ['\u0b00', '\u0b04', '\u0b0d', '\u0b0e', '\u0b11', '\u0b12', '\u0b29', '\u0b31', '\u0b34', '\u0b3a', '\u0b3b',
'\u0b45', '\u0b46', '\u0b49', '\u0b4a', '\u0b4e', '\u0b4f', '\u0b50', '\u0b51', '\u0b52', '\u0b53', '\u0b54',
'\u0b58', '\u0b59', '\u0b5a', '\u0b5b', '\u0b5e', '\u0b64', '\u0b65', '\u0b78', '\u0b79', '\u0b7a', '\u0b7b',
'\u0b7c', '\u0b7d', '\u0b7e', '\u0b7f']
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
#-----------------------------------------------------basic-------------------------------------------------------------------
#---------------------------------------------------changeables----------------------------------------------------------------
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------changeables----------------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'':'',
'':''}
diacritic_map = {'ୋ':'',
'ୈା':'',
'ଓୗ':'',
'ଏୗ':'',
'ଅା':''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
'''
a connector cannot be surrounded by / cannot come immediately after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
'''
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
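#---------------------------------------------------illustrative example--------------------------------------------------------
# A minimal sketch (not part of the upstream bnunicodenormalizer API): the
# `non_gylph_unicodes` list above collects codepoints inside the script's block
# that do not correspond to a usable glyph, so a simple cleanup step can drop
# them. The helper name `_demo_strip_non_glyphs` is hypothetical.
def _demo_strip_non_glyphs(text, language):
    """Remove every codepoint listed in the language's non_gylph_unicodes."""
    return "".join(ch for ch in text if ch not in language.non_gylph_unicodes)
# e.g. _demo_strip_non_glyphs(some_text, odiya)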
#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class tamil:
'''
* vowel and consonant division according to : https://unicode-table.com/en/blocks/tamil/
* consonant conjuncts according to: https://en.wikipedia.org/wiki/Tamil_script
* punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
'''
#-----------------------------------------------------basic-------------------------------------------------------------------
iden = "tamil"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
vowel_diacritics = ['', 'ி', '', '', '', '', '', '','', '', '']
consonant_diacritics = ['', '']
numbers = ['௦', '௧', '௨', '௩', '௪', '௫', '௬', '௭', '௮', '௯', '௰', '௱', '௲']
punctuations = [',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = ['', '', '','','', '', '','']
connector = '்'
# based on unicode range : \u0B80-\u0BFF
non_gylph_unicodes = ['\u0b80', '\u0b81', '\u0b84', '\u0b8b', '\u0b8c', '\u0b8d', '\u0b91', '\u0b96', '\u0b97', '\u0b98', '\u0b9b',
'\u0b9d', '\u0ba0', '\u0ba1', '\u0ba2', '\u0ba5', '\u0ba6', '\u0ba7', '\u0bab', '\u0bac', '\u0bad', '\u0bba',
'\u0bbb', '\u0bbc', '\u0bbd', '\u0bc3', '\u0bc4', '\u0bc5', '\u0bc9', '\u0bce', '\u0bcf', '\u0bd8', '\u0bd9',
'\u0bda', '\u0bdb', '\u0bdc', '\u0bdd', '\u0bde', '\u0bdf', '\u0be0', '\u0be1', '\u0be2', '\u0be3', '\u0bd1',
'\u0bd2', '\u0bd3', '\u0bd4', '\u0bd5', '\u0bd6', '\u0bd8', '\u0bd9', '\u0bda', '\u0bdb', '\u0bdc', '\u0bdd',
'\u0bde', '\u0bdf', '\u0be0', '\u0be1', '\u0be2', '\u0be3', '\u0be4', '\u0be5', '\u0bfb', '\u0bfc', '\u0bfd',
'\u0bfe', '\u0bff']
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
#-----------------------------------------------------basic-------------------------------------------------------------------
#---------------------------------------------------changeables----------------------------------------------------------------
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------changeables----------------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {} # NONE
diacritic_map = {'ொ':'',
"ோ":''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
'''
a connector cannot be surrounded by / cannot come immediately after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
'''
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class panjabi:
'''
* vowel and consonant division according to : https://unicode-table.com/en/blocks/gurmukhi/
* consonant conjuncts according to: NOT USED
* punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
'''
#-----------------------------------------------------basic-------------------------------------------------------------------
iden = "panjabi"
nukta = '਼'
vowels = ['', '', '', '', '', '', '', '', '', '','', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '','',
'', '', '', '']
vowel_diacritics = ['', 'ਿ', '', '', '', '', '', '', '']
consonant_diacritics = ['', '', ''] # Not found!
numbers = ['੦', '੧', '੨', '੩', '੪', '੫', '੬', '੭', '੮', '੯']
punctuations = [',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = []
connector = '੍'
# based on unicode range : \u0A00-\u0A7F
non_gylph_unicodes = ['\u0a00', '\u0a04', '\u0a0b', '\u0a0c', '\u0a0d', '\u0a0e', '\u0a11', '\u0a12', '\u0a29', '\u0a31',
'\u0a34', '\u0a37', '\u0a3a', '\u0a3b', '\u0a3d', '\u0a43', '\u0a44', '\u0a45', '\u0a46', '\u0a49',
'\u0a4a', '\u0a4e', '\u0a4f', '\u0a50', '\u0a52', '\u0a53', '\u0a54', '\u0a55', '\u0a56', '\u0a57',
'\u0a58', '\u0a5d', '\u0a5f', '\u0a60', '\u0a61', '\u0a62', '\u0a63', '\u0a64', '\u0a65', '\u0a76',
'\u0a77', '\u0a78', '\u0a79', '\u0a7a', '\u0a7b', '\u0a7c', '\u0a7d', '\u0a7e', '\u0a7f']
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
#-----------------------------------------------------basic-------------------------------------------------------------------
#---------------------------------------------------changeables----------------------------------------------------------------
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------changeables----------------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'':'',
'':'',
'':'',
'':'',
'':'',
'':''
}
diacritic_map = {'ੇੋ': '',
'ਾੇ': '',
'ਾੋ': '',
'ਅੈ': '',
'ਅੌ': '',
'ਅਾ': ''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
'''
a connector cannot be surrounded by / cannot come immediately after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
'''
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class malayalam:
'''
* vowel and consonant division according to: https://unicode-table.com/en/blocks/malayalam/
* consonant conjuncts according to: Self Generated
* punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
'''
#-----------------------------------------------------basic-------------------------------------------------------------------
iden = "malayalam"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '', '', '', '','', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '']
vowel_diacritics = ['', 'ി', '', '', '', '', '', '', '', '','', '', '','','', '']
consonant_diacritics = ['', '', '', '', '']
numbers = ['', '', '', '', '', '', '', '', '', '','', '', '', '', '', '', '','', '', '','', '', '', '', '', '']
punctuations = [',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = ['', '', '']
connector = '്' # note: the Malayalam block contains two other virama-like signs: '഻', '഼'
# based on unicode range : \u0D00-\u0D7F
non_gylph_unicodes = ['\u0d64', '\u0d65', '\u0d50', '\u0d51', '\u0d52', '\u0d53', '\u0d49', '\u0d45', '\u0d11', '\u0d0d']
legacy_symbols = ['']
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
#-----------------------------------------------------basic-------------------------------------------------------------------
#---------------------------------------------------changeables----------------------------------------------------------------
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------changeables----------------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {}
diacritic_map = {} #NONE--<>
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
'''
a connector cannot be surrounded by / cannot come immediately after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
'''
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class sylhetinagri:
'''
* according to Asif Sushmit
'''
#-----------------------------------------------------basic-------------------------------------------------------------------
iden = "sylhetinagri"
nukta = '' #done
vowels = ['', '', '', '', ''] #done
consonants = ['', '', '', '',
'', '', '', '',
'', '', '', '',
'', '', '', '', '',
'', '', '', '', '',
'', '', '', '', ''] #done
vowel_diacritics = ['', '', '', '', ''] #done
consonant_diacritics = ['', '', ''] #done
numbers = ['', '', '', '', '', '', '', '', '', '', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '', '', '', '', '', '', '', '', '', '']
punctuations = ['', '', ':', ';', '!', '', '?', '', '.', ',']
symbols = ['', '', '', '']
connector = '꠆' #done
non_gylph_unicodes = []
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
#-----------------------------------------------------basic-------------------------------------------------------------------
#---------------------------------------------------changeables----------------------------------------------------------------
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------changeables----------------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {}
diacritic_map = {'ꠦꠣ':'',
'ꠣꠦ':''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
'''
a connector cannot be surrounded by / cannot come immediately after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
'''
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
languages={}
languages["english"] =english
languages["bangla"] =bangla
languages["devanagari"] =devanagari
languages["gujarati"] =gujarati
languages["odiya"] =odiya
languages["tamil"] =tamil
languages["panjabi"] =panjabi
languages["malayalam"] =malayalam
languages["sylhetinagri"]=sylhetinagri
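#--------------------------------------------------------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream bnunicodenormalizer API):
# `languages` is a registry keyed by each class's `iden` string, so callers can
# select a character inventory by name; unknown keys raise KeyError.
if __name__ == "__main__":
    lang = languages["bangla"]
    print(lang.iden, "->", len(lang.valid), "valid characters,", len(lang.conjuncts), "conjuncts")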